Initial commit: bibfetch v1.0

2026-01-17 02:38:39 +01:00
commit 32c0037434
3 changed files with 307 additions and 0 deletions
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 Dávid Ali
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,92 @@
+# bibfetch
+
+Prosty skrypt do automatycznego pobierania plików PDF z bibliografii (`references.bib`).
+
+Obsługuje:
+* Bezpośrednie linki do PDF
+* ArXiv
+* DOI (z przekierowaniami)
+* IEEE Xplore (wyciąganie PDF z ramek/stron HTML)
+
+## Wymagania
+
+Zainstaluj potrzebne biblioteki jedną komendą (wymagana starsza wersja bibtexparser):
+
+```bash
+pip install "bibtexparser<2.0" requests beautifulsoup4
+```
+
+## Użycie
+
+Wejdź do katalogu ze swoją pracą (tam, gdzie masz `references.bib`) i uruchom skrypt (jeśli `bibfetch.py` leży w innym miejscu, podaj pełną ścieżkę do niego):
+
+```bash
+python3 bibfetch.py .
+```
+
+Skrypt utworzy folder `references/` i pobierze tam brakujące pliki, nazywając je zgodnie z kluczem cytowania (np. `Kowalski2023.pdf`).
+
+---
+
+## Instalacja globalna (Opcjonalnie)
+
+Wybierz jedną z dwóch metod, aby używać komendy `bibfetch` w całym systemie.
+
+### Opcja A: Szybka (Symlink)
+
+*Dobra, jeśli masz biblioteki zainstalowane w głównym Pythonie systemowym.*
+
+1. Nadaj uprawnienia: `chmod +x bibfetch.py`
+2. Zrób link (zastąp ścieżkę swoją):
+
+```bash
+sudo ln -s /pełna/ścieżka/do/bibfetch.py /usr/local/bin/bibfetch
+```
+
+3. Gotowe. Użycie: `bibfetch .`
+
+### Opcja B: Bezpieczna (Dedykowany Venv)
+
+*Dobra, jeśli chcesz trzymać biblioteki w izolacji.*
+
+**Krok 1: Stwórz środowisko**
+
+```bash
+# Tworzymy venv
+mkdir -p ~/venvs
+python3 -m venv ~/venvs/bibfetch
+
+# Instalujemy zależności (wersja <2.0 jest kluczowa!)
+~/venvs/bibfetch/bin/pip install "bibtexparser<2.0" requests beautifulsoup4
+```
+
+**Krok 2: Utwórz wrapper**
+
+Zamiast linkować plik bezpośrednio, tworzymy skrypt pomocniczy.
+
+1. Edytuj plik: `sudo nano /usr/local/bin/bibfetch`
+2. Wklej poniższą treść (**podmień ścieżki na swoje!**):
+
+```bash
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Ścieżki (dostosuj do siebie)
+VENV_PYTHON="$HOME/venvs/bibfetch/bin/python"
+SCRIPT_PATH="$HOME/projekty/scripts/bibfetch.py"
+
+exec "$VENV_PYTHON" "$SCRIPT_PATH" "$@"
+```
+
+**Krok 3: Nadaj uprawnienia**
+
+```bash
+sudo chmod +x /usr/local/bin/bibfetch
+```
+
+Teraz możesz wpisać `bibfetch .` w dowolnym katalogu.
+
+## Licencja
+
+MIT
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+import os
+import sys
+import argparse
+import bibtexparser
+import requests
+from bs4 import BeautifulSoup
+import warnings
+import re
+from urllib.parse import urljoin
+
+# Silence all library warnings (e.g. those emitted by BS4) to keep the output clean
+warnings.filterwarnings("ignore")
+
+# ANSI escape sequences used to colour terminal output
+C_RESET  = "\033[0m"
+C_RED    = "\033[91m"
+C_GREEN  = "\033[92m"
+C_BLUE   = "\033[94m"
+C_GRAY   = "\033[90m"
+
+# Request headers that imitate a desktop browser; sent with every HTTP request
+GLOBAL_HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+    'Connection': 'keep-alive',
+}
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Pobieracz PDF z pliku BibTeX.")
+    parser.add_argument('work_dir', type=str, help="Ścieżka do katalogu projektu")
+    return parser.parse_args()
+
+def fix_ieee_url(link):
+    """Naprawia ścieżki względne z IEEE."""
+    if not link:
+        return None
+    if link.startswith('/'):
+        return f"https://ieeexplore.ieee.org{link}"
+    return link
+
+def get_real_pdf_url_from_ieee_html(content):
+    """
+    Wyciąga link do PDF z ramki (iframe) lub zmiennych JS na stronie IEEE.
+    """
+    try:
+        soup = BeautifulSoup(content, 'html.parser')
+        
+        # 1. Szukamy iframe (często zawiera /stamp/stamp.jsp...)
+        iframe = soup.find('iframe', src=re.compile(r'stamp\.jsp|\.pdf'))
+        if iframe:
+            return fix_ieee_url(iframe['src'])
+        
+        # 2. Szukamy w JavaScript (zmienna pdfUrl)
+        scripts = soup.find_all('script')
+        for script in scripts:
+            if script.string and '"pdfUrl":' in script.string:
+                match = re.search(r'"pdfUrl":"([^"]+)"', script.string)
+                if match:
+                    return fix_ieee_url(match.group(1))
+    except:
+        pass
+    return None
+
+def download_file(url, save_path, referer=None):
+    try:
+        # Normalizacja URL
+        if url.startswith('/'):
+            url = f"https://ieeexplore.ieee.org{url}"
+
+        headers = GLOBAL_HEADERS.copy()
+        if referer:
+            headers['Referer'] = referer
+        elif 'ieeexplore.ieee.org' in url:
+            headers['Referer'] = 'https://ieeexplore.ieee.org/'
+
+        print(f"    ...próba: {url[:80]}...")
+        
+        r = requests.get(url, headers=headers, stream=True, timeout=20, allow_redirects=True)
+        
+        if r.status_code != 200:
+            return False
+
+        content_type = r.headers.get('content-type', '').lower()
+        final_url = r.url
+
+        # A. To jest PDF
+        if 'application/pdf' in content_type:
+            with open(save_path, 'wb') as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    f.write(chunk)
+            return True
+
+        # B. To jest HTML z IEEE (Wrapper)
+        if 'text/html' in content_type and 'ieeexplore.ieee.org' in final_url:
+            # IEEE często robi pętlę: Link -> Strona(HTML) -> Iframe(PDF)
+            # Sprawdzamy czy nie pobieramy w kółko tego samego
+            if url.endswith('stamp.jsp') or 'arnumber=' in url:
+                html_content = r.content
+                pdf_link = get_real_pdf_url_from_ieee_html(html_content)
+                
+                # Zabezpieczenie przed pętlą
+                if pdf_link and pdf_link != url:
+                    print(f"    ...znaleziono głębszy link: {pdf_link[:40]}...")
+                    return download_file(pdf_link, save_path, referer='https://ieeexplore.ieee.org/')
+            
+    except Exception:
+        pass
+    return False
+
+def get_pdf_url_from_meta_tags(page_url):
+    try:
+        r = requests.get(page_url, headers=GLOBAL_HEADERS, allow_redirects=True, timeout=10)
+        if r.status_code == 200:
+            soup = BeautifulSoup(r.content, 'html.parser')
+            meta_pdf = soup.find('meta', attrs={'name': 'citation_pdf_url'})
+            if meta_pdf:
+                return meta_pdf['content']
+    except:
+        pass
+    return None
+
+def main():
+    args = parse_args()
+    work_dir = args.work_dir
+    bib_file = os.path.join(work_dir, 'references.bib')
+    out_dir = os.path.join(work_dir, 'references')
+
+    if not os.path.isfile(bib_file):
+        print(f"{C_RED}Błąd: Nie znaleziono {bib_file}{C_RESET}")
+        sys.exit(1)
+    if not os.path.exists(out_dir):
+        os.makedirs(out_dir)
+
+    print(f"Wczytywanie: {bib_file}...")
+    with open(bib_file, encoding='utf-8') as bf:
+        db = bibtexparser.load(bf)
+    
+    total = len(db.entries)
+    print(f"Znaleziono wpisów: {total}\n")
+
+    for i, entry in enumerate(db.entries, 1):
+        key = entry.get('ID')
+        if not key: continue
+
+        pdf_name = f"{key}.pdf"
+        save_path = os.path.join(out_dir, pdf_name)
+
+        if os.path.exists(save_path):
+            continue
+
+        print(f"[{i}/{total}] Pobieranie: {C_BLUE}{key}{C_RESET}")
+        success = False
+
+        url = entry.get('url')
+        doi = entry.get('doi')
+        eprint = entry.get('eprint')
+
+        # 1. URL bezpośredni
+        if url:
+            success = download_file(url, save_path)
+
+        # 2. ArXiv
+        if not success and eprint:
+             success = download_file(f"https://arxiv.org/pdf/{eprint}.pdf", save_path)
+
+        # 3. DOI (przekierowanie + szukanie w meta/iframe)
+        if not success and doi:
+            doi_url = f"https://doi.org/{doi}"
+            # Najpierw zwykłe pobranie (może przekieruje na PDF)
+            success = download_file(doi_url, save_path)
+            
+            # Jak nie, szukamy w meta tagach
+            if not success:
+                meta_link = get_pdf_url_from_meta_tags(doi_url)
+                if meta_link:
+                    # Fix dla IEEE (meta tagi czasem dają względne linki)
+                    if 'ieee.org' in meta_link and meta_link.startswith('/'):
+                        meta_link = f"https://ieeexplore.ieee.org{meta_link}"
+                    success = download_file(meta_link, save_path)
+
+        # 4. URL Scraping (ostatnia deska ratunku)
+        if not success and url:
+             meta_link = get_pdf_url_from_meta_tags(url)
+             if meta_link:
+                 success = download_file(meta_link, save_path)
+
+        if success:
+            print(f" {C_GREEN}-> SUKCES{C_RESET}")
+        else:
+            print(f" {C_RED}-> PORAŻKA.{C_RESET}")
+
+# Run the downloader only when this file is executed as a script
+if __name__ == "__main__":
+    main()