diff --git a/app/workers/paper_fulltext_promote.py b/app/workers/paper_fulltext_promote.py index ccd390a..1af5084 100644 --- a/app/workers/paper_fulltext_promote.py +++ b/app/workers/paper_fulltext_promote.py @@ -18,7 +18,7 @@ import random from pathlib import Path import httpx -from sqlalchemy import select +from sqlalchemy import or_, select from core.config import settings from core.crawl_politeness import CRAWL_UA @@ -35,6 +35,9 @@ _DOWNLOAD_DELAY = (2.0, 5.0) _RUN_CAP = 10 # 1회 승격 상한(marker/embed GPU 보호). bulk 시 해제. _ARXIV_ID_EXPR = Document.extract_meta[("paper", "arxiv_id")].astext +_OA_URL_EXPR = Document.extract_meta[("paper", "oa_url")].astext +_OA_STATUS_EXPR = Document.extract_meta[("paper", "oa_status")].astext +_REAL_OA = ("gold", "hybrid", "green", "diamond") async def _download(url: str, dest: Path) -> int: @@ -62,7 +65,10 @@ async def run(bulk: bool = False, limit: int = 0) -> None: .where( Document.material_type == "paper", Document.file_format == "article", - _ARXIV_ID_EXPR.isnot(None), + or_( + _ARXIV_ID_EXPR.isnot(None), + Document.extract_meta[("paper", "oa_url")].astext.isnot(None), + ), ) .order_by(Document.id.desc()) .limit(cap) @@ -75,15 +81,23 @@ async def run(bulk: bool = False, limit: int = 0) -> None: doc = await session.get(Document, doc_id) if doc is None or doc.file_format != "article": continue - arxiv_id = ((doc.extract_meta or {}).get("paper") or {}).get("arxiv_id") - if not arxiv_id: + paper = (doc.extract_meta or {}).get("paper") or {} + arxiv_id = paper.get("arxiv_id") + oa_status = (paper.get("oa_status") or "").lower() + if arxiv_id: + url = _ARXIV_PDF.format(id=arxiv_id) + key = arxiv_id.replace("/", "_") + elif paper.get("oa_url") and oa_status in _REAL_OA: + url = paper["oa_url"] # doi.org/KISTI/PMC (friendly OA). 비-OA·paywall 은 헤더검증서 skip + key = (paper.get("openalex_id") or paper.get("doi") or "oa").replace("/", "_") + else: continue - rel_path = f"crawl_raw/papers/arxiv/{arxiv_id.replace('/', '_')}.pdf" + rel_path = f"crawl_raw/papers/{key}.pdf" dest = Path(settings.nas_mount_path) / rel_path try: - size = await _download(_ARXIV_PDF.format(id=arxiv_id), dest) + size = await _download(url, dest) except Exception as e: # noqa: BLE001 — 다운로드 실패 격리 - logger.error(f"[promote] {arxiv_id} 다운로드 실패: {e}") + logger.error(f"[promote] {key} 다운로드 실패: {e}") failed += 1 continue # in-place 승격: 초록 행 → 전문 PDF 행 (file_hash·extract_meta.paper 보존) @@ -96,7 +110,7 @@ async def run(bulk: bool = False, limit: int = 0) -> None: await enqueue_stage(session, doc.id, "extract") await session.commit() promoted += 1 - logger.info(f"[promote] {arxiv_id} → 전문 PDF in-place (doc {doc.id}, {size}b)") + logger.info(f"[promote] {key} → 전문 PDF in-place (doc {doc.id}, {size}b)") logger.info(f"[paper_fulltext_promote] 승격 {promoted} · 실패 {failed} (cap {cap})")