#!/usr/bin/env python3
"""Download PDFs for every entry in a project's references.bib.

For each BibTeX entry the script tries, in order: the entry's ``url``,
an arXiv ``eprint`` link, the DOI redirect (plus ``citation_pdf_url``
meta-tag scraping), and finally meta-tag scraping of the ``url`` page.
IEEE Xplore wrapper pages (stamp.jsp / iframe) are followed one level
deeper to reach the actual PDF.
"""
import os
import sys
import argparse
import bibtexparser
import requests
from bs4 import BeautifulSoup
import warnings
import re
from urllib.parse import urljoin

# Silence library warnings (e.g. BS4 parser warnings) for clean output.
warnings.filterwarnings("ignore")

# Terminal colors
C_RESET = "\033[0m"
C_RED = "\033[91m"
C_GREEN = "\033[92m"
C_BLUE = "\033[94m"
C_GRAY = "\033[90m"

# Browser-like headers so publisher sites don't reject us outright.
GLOBAL_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Connection': 'keep-alive',
}


def parse_args():
    """Parse the single positional argument: the project directory."""
    parser = argparse.ArgumentParser(description="Pobieracz PDF z pliku BibTeX.")
    parser.add_argument('work_dir', type=str, help="Ścieżka do katalogu projektu")
    return parser.parse_args()


def fix_ieee_url(link):
    """Turn an IEEE-relative path into an absolute URL.

    Returns None for falsy input; absolute links pass through unchanged.
    """
    if not link:
        return None
    if link.startswith('/'):
        return f"https://ieeexplore.ieee.org{link}"
    return link


def get_real_pdf_url_from_ieee_html(content):
    """Extract the PDF link from an IEEE wrapper page.

    Looks first for an <iframe> pointing at stamp.jsp or a .pdf, then for
    the ``"pdfUrl"`` variable embedded in inline JavaScript. Returns an
    absolute URL or None if nothing was found.
    """
    try:
        soup = BeautifulSoup(content, 'html.parser')
        # 1. An iframe usually wraps /stamp/stamp.jsp?... or a direct .pdf.
        iframe = soup.find('iframe', src=re.compile(r'stamp\.jsp|\.pdf'))
        if iframe:
            return fix_ieee_url(iframe.get('src'))
        # 2. Fall back to the "pdfUrl" variable in inline JavaScript.
        for script in soup.find_all('script'):
            if script.string and '"pdfUrl":' in script.string:
                match = re.search(r'"pdfUrl":"([^"]+)"', script.string)
                if match:
                    return fix_ieee_url(match.group(1))
    except Exception:
        # Best-effort scraping: any parse failure just means "not found".
        pass
    return None


def download_file(url, save_path, referer=None):
    """Download *url* to *save_path*; return True on success.

    Follows one level of IEEE HTML wrapper pages (stamp.jsp -> iframe/JS
    link) via a guarded recursive call. All network/IO failures are
    swallowed and reported as False — callers try the next source.
    """
    try:
        # Normalize IEEE-relative URLs.
        if url.startswith('/'):
            url = f"https://ieeexplore.ieee.org{url}"

        headers = GLOBAL_HEADERS.copy()
        if referer:
            headers['Referer'] = referer
        elif 'ieeexplore.ieee.org' in url:
            headers['Referer'] = 'https://ieeexplore.ieee.org/'

        print(f" ...próba: {url[:80]}...")
        # Context manager closes the streamed connection on every exit path
        # (the original leaked the response object).
        with requests.get(url, headers=headers, stream=True, timeout=20,
                          allow_redirects=True) as r:
            if r.status_code != 200:
                return False

            content_type = r.headers.get('content-type', '').lower()
            final_url = r.url

            # A. The response is the PDF itself — stream it to disk.
            if 'application/pdf' in content_type:
                with open(save_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
                return True

            # B. An IEEE HTML wrapper: Link -> page(HTML) -> iframe(PDF).
            if 'text/html' in content_type and 'ieeexplore.ieee.org' in final_url:
                # Only dig deeper for wrapper-looking URLs.
                if url.endswith('stamp.jsp') or 'arnumber=' in url:
                    pdf_link = get_real_pdf_url_from_ieee_html(r.content)
                    # Guard against re-fetching the same URL in a loop.
                    if pdf_link and pdf_link != url:
                        print(f" ...znaleziono głębszy link: {pdf_link[:40]}...")
                        return download_file(pdf_link, save_path,
                                             referer='https://ieeexplore.ieee.org/')
    except Exception:
        # Deliberate best-effort: a failed attempt is simply "not downloaded".
        pass
    return False


def get_pdf_url_from_meta_tags(page_url):
    """Return the ``citation_pdf_url`` meta tag of *page_url*, or None."""
    try:
        # Non-streamed GET; response is closed by the context manager.
        with requests.get(page_url, headers=GLOBAL_HEADERS,
                          allow_redirects=True, timeout=10) as r:
            if r.status_code == 200:
                soup = BeautifulSoup(r.content, 'html.parser')
                meta_pdf = soup.find('meta', attrs={'name': 'citation_pdf_url'})
                if meta_pdf:
                    return meta_pdf.get('content')
    except Exception:
        pass
    return None


def main():
    """Iterate over references.bib entries and fetch each missing PDF."""
    args = parse_args()
    work_dir = args.work_dir
    bib_file = os.path.join(work_dir, 'references.bib')
    out_dir = os.path.join(work_dir, 'references')

    if not os.path.isfile(bib_file):
        print(f"{C_RED}Błąd: Nie znaleziono {bib_file}{C_RESET}")
        sys.exit(1)

    # exist_ok avoids the check-then-create race of the original.
    os.makedirs(out_dir, exist_ok=True)

    print(f"Wczytywanie: {bib_file}...")
    with open(bib_file, encoding='utf-8') as bf:
        db = bibtexparser.load(bf)

    total = len(db.entries)
    print(f"Znaleziono wpisów: {total}\n")

    for i, entry in enumerate(db.entries, 1):
        key = entry.get('ID')
        if not key:
            continue

        save_path = os.path.join(out_dir, f"{key}.pdf")
        # Skip PDFs already downloaded on a previous run.
        if os.path.exists(save_path):
            continue

        print(f"[{i}/{total}] Pobieranie: {C_BLUE}{key}{C_RESET}")
        success = False
        url = entry.get('url')
        doi = entry.get('doi')
        eprint = entry.get('eprint')

        # 1. Direct URL from the entry.
        if url:
            success = download_file(url, save_path)

        # 2. arXiv eprint identifier.
        if not success and eprint:
            success = download_file(f"https://arxiv.org/pdf/{eprint}.pdf", save_path)

        # 3. DOI redirect, then citation_pdf_url meta-tag scraping.
        if not success and doi:
            doi_url = f"https://doi.org/{doi}"
            # The redirect chain may land directly on a PDF.
            success = download_file(doi_url, save_path)
            if not success:
                meta_link = get_pdf_url_from_meta_tags(doi_url)
                if meta_link:
                    # IEEE meta tags sometimes carry relative paths.
                    if 'ieee.org' in meta_link and meta_link.startswith('/'):
                        meta_link = f"https://ieeexplore.ieee.org{meta_link}"
                    success = download_file(meta_link, save_path)

        # 4. Last resort: scrape the entry URL's meta tags.
        if not success and url:
            meta_link = get_pdf_url_from_meta_tags(url)
            if meta_link:
                success = download_file(meta_link, save_path)

        if success:
            print(f" {C_GREEN}-> SUKCES{C_RESET}")
        else:
            print(f" {C_RED}-> PORAŻKA.{C_RESET}")


if __name__ == "__main__":
    main()