Files
bibfetch/bibfetch.py
2026-01-17 02:38:39 +01:00

194 lines
6.4 KiB
Python

#!/usr/bin/env python3
import os
import sys
import argparse
import bibtexparser
import requests
from bs4 import BeautifulSoup
import warnings
import re
from urllib.parse import urljoin
# Silence library warnings (e.g. from BS4) to keep the console output clean
warnings.filterwarnings("ignore")
# ANSI terminal color codes for status output
C_RESET = "\033[0m"
C_RED = "\033[91m"
C_GREEN = "\033[92m"
C_BLUE = "\033[94m"
C_GRAY = "\033[90m"
# HTTP headers that mimic a regular desktop browser (some publishers
# block requests with the default python-requests User-Agent)
GLOBAL_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Connection': 'keep-alive',
}
def parse_args():
    """Parse command-line arguments: one positional project-directory path."""
    cli = argparse.ArgumentParser(description="Pobieracz PDF z pliku BibTeX.")
    cli.add_argument('work_dir', type=str, help="Ścieżka do katalogu projektu")
    return cli.parse_args()
def fix_ieee_url(link):
    """Make an IEEE site-relative path absolute.

    A falsy link yields None; already-absolute links pass through unchanged.
    """
    if not link:
        return None
    return f"https://ieeexplore.ieee.org{link}" if link.startswith('/') else link
def get_real_pdf_url_from_ieee_html(content):
    """Extract the real PDF link from an IEEE landing page.

    Looks first for an <iframe> whose ``src`` points at stamp.jsp or a
    ``.pdf``, then falls back to scanning inline <script> blocks for a
    ``"pdfUrl"`` JSON value.  Returns an absolute URL, or None when no
    link can be found or the page cannot be parsed.
    """
    try:
        soup = BeautifulSoup(content, 'html.parser')
        # 1. The viewer iframe (usually /stamp/stamp.jsp?... or *.pdf).
        iframe = soup.find('iframe', src=re.compile(r'stamp\.jsp|\.pdf'))
        if iframe:
            return fix_ieee_url(iframe['src'])
        # 2. Fall back to the "pdfUrl" variable embedded in page JavaScript.
        for script in soup.find_all('script'):
            if script.string and '"pdfUrl":' in script.string:
                match = re.search(r'"pdfUrl":"([^"]+)"', script.string)
                if match:
                    return fix_ieee_url(match.group(1))
    except Exception:
        # Best-effort scraping: a malformed page simply yields no link.
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed here.
        pass
    return None
def download_file(url, save_path, referer=None):
    """Download *url* to *save_path*, following IEEE's HTML-wrapper indirection.

    Returns True when a PDF was written to disk, False otherwise.  Failures
    are deliberately swallowed (best-effort download); the caller falls back
    to other sources.
    """
    try:
        # Normalize IEEE site-relative paths.
        if url.startswith('/'):
            url = f"https://ieeexplore.ieee.org{url}"
        headers = GLOBAL_HEADERS.copy()
        if referer:
            headers['Referer'] = referer
        elif 'ieeexplore.ieee.org' in url:
            headers['Referer'] = 'https://ieeexplore.ieee.org/'
        print(f" ...próba: {url[:80]}...")
        # `with` guarantees the streaming connection is released on every
        # exit path (the original leaked it for non-200 and HTML responses).
        with requests.get(url, headers=headers, stream=True, timeout=20,
                          allow_redirects=True) as r:
            if r.status_code != 200:
                return False
            content_type = r.headers.get('content-type', '').lower()
            final_url = r.url
            # A. The response is the PDF itself: stream it to disk.
            if 'application/pdf' in content_type:
                with open(save_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
                return True
            # B. An IEEE HTML wrapper: Link -> page(HTML) -> iframe(PDF).
            if 'text/html' in content_type and 'ieeexplore.ieee.org' in final_url:
                if url.endswith('stamp.jsp') or 'arnumber=' in url:
                    pdf_link = get_real_pdf_url_from_ieee_html(r.content)
                    # Guard against re-requesting the same wrapper in a loop.
                    if pdf_link and pdf_link != url:
                        print(f" ...znaleziono głębszy link: {pdf_link[:40]}...")
                        return download_file(pdf_link, save_path,
                                             referer='https://ieeexplore.ieee.org/')
    except Exception:
        # Best-effort: any network/parse failure means "not downloaded".
        pass
    return False
def get_pdf_url_from_meta_tags(page_url):
    """Fetch *page_url* and return its ``citation_pdf_url`` meta tag, if any.

    Many publishers expose the direct PDF link through this Google Scholar
    meta tag.  Returns None on any failure (best-effort).
    """
    try:
        r = requests.get(page_url, headers=GLOBAL_HEADERS, allow_redirects=True, timeout=10)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, 'html.parser')
            meta_pdf = soup.find('meta', attrs={'name': 'citation_pdf_url'})
            if meta_pdf:
                return meta_pdf['content']
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not silently swallowed.
        pass
    return None
def main():
    """Read ``<work_dir>/references.bib`` and fetch a PDF for every entry
    into ``<work_dir>/references/<ID>.pdf``.

    For each entry the sources are tried in order: direct ``url``, arXiv
    ``eprint``, DOI redirect, then ``citation_pdf_url`` meta-tag scraping.
    Already-downloaded PDFs are skipped.
    """
    args = parse_args()
    work_dir = args.work_dir
    bib_file = os.path.join(work_dir, 'references.bib')
    out_dir = os.path.join(work_dir, 'references')
    if not os.path.isfile(bib_file):
        print(f"{C_RED}Błąd: Nie znaleziono {bib_file}{C_RESET}")
        sys.exit(1)
    # exist_ok avoids the check-then-create race of the exists()/makedirs() pair.
    os.makedirs(out_dir, exist_ok=True)
    print(f"Wczytywanie: {bib_file}...")
    with open(bib_file, encoding='utf-8') as bf:
        db = bibtexparser.load(bf)
    total = len(db.entries)
    print(f"Znaleziono wpisów: {total}\n")
    for i, entry in enumerate(db.entries, 1):
        key = entry.get('ID')
        if not key:
            continue
        save_path = os.path.join(out_dir, f"{key}.pdf")
        # Skip PDFs already fetched on a previous run.
        if os.path.exists(save_path):
            continue
        print(f"[{i}/{total}] Pobieranie: {C_BLUE}{key}{C_RESET}")
        success = False
        url = entry.get('url')
        doi = entry.get('doi')
        eprint = entry.get('eprint')
        # 1. Direct URL from the BibTeX entry.
        if url:
            success = download_file(url, save_path)
        # 2. arXiv preprint.
        if not success and eprint:
            success = download_file(f"https://arxiv.org/pdf/{eprint}.pdf", save_path)
        # 3. DOI: follow the redirect, then look for meta-tag links.
        if not success and doi:
            doi_url = f"https://doi.org/{doi}"
            # A plain request first — it may redirect straight to a PDF.
            success = download_file(doi_url, save_path)
            if not success:
                meta_link = get_pdf_url_from_meta_tags(doi_url)
                if meta_link:
                    # IEEE meta tags sometimes carry site-relative links.
                    if 'ieee.org' in meta_link and meta_link.startswith('/'):
                        meta_link = f"https://ieeexplore.ieee.org{meta_link}"
                    success = download_file(meta_link, save_path)
        # 4. Last resort: scrape meta tags from the entry URL itself.
        if not success and url:
            meta_link = get_pdf_url_from_meta_tags(url)
            if meta_link:
                success = download_file(meta_link, save_path)
        if success:
            print(f" {C_GREEN}-> SUKCES{C_RESET}")
        else:
            print(f" {C_RED}-> PORAŻKA.{C_RESET}")


if __name__ == "__main__":
    main()