feat(papers): B-3 P2-PR1 oa_url 승격 분기 (arXiv 외 doi.org/KISTI/PMC OA)

arxiv_id 없는 OA 논문(oa_status gold/hybrid/green/diamond + oa_url)도 전문 승격 대상에 포함.
url = arxiv.org/pdf 또는 oa_url(friendly OA host). paywall/비-PDF 는 헤더 검증에서 skip(실패 격리).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude Code
2026-06-14 03:16:47 +00:00
parent 73c6f123b8
commit 6d978289b8
+22 -8
View File
@@ -18,7 +18,7 @@ import random
from pathlib import Path
import httpx
from sqlalchemy import select
from sqlalchemy import or_, select
from core.config import settings
from core.crawl_politeness import CRAWL_UA
@@ -35,6 +35,9 @@ _DOWNLOAD_DELAY = (2.0, 5.0)
_RUN_CAP = 10 # 1회 승격 상한(marker/embed GPU 보호). bulk 시 해제.
_ARXIV_ID_EXPR = Document.extract_meta[("paper", "arxiv_id")].astext
_OA_URL_EXPR = Document.extract_meta[("paper", "oa_url")].astext
_OA_STATUS_EXPR = Document.extract_meta[("paper", "oa_status")].astext
_REAL_OA = ("gold", "hybrid", "green", "diamond")
async def _download(url: str, dest: Path) -> int:
@@ -62,7 +65,10 @@ async def run(bulk: bool = False, limit: int = 0) -> None:
.where(
Document.material_type == "paper",
Document.file_format == "article",
_ARXIV_ID_EXPR.isnot(None),
or_(
_ARXIV_ID_EXPR.isnot(None),
Document.extract_meta[("paper", "oa_url")].astext.isnot(None),
),
)
.order_by(Document.id.desc())
.limit(cap)
@@ -75,15 +81,23 @@ async def run(bulk: bool = False, limit: int = 0) -> None:
doc = await session.get(Document, doc_id)
if doc is None or doc.file_format != "article":
continue
arxiv_id = ((doc.extract_meta or {}).get("paper") or {}).get("arxiv_id")
if not arxiv_id:
paper = (doc.extract_meta or {}).get("paper") or {}
arxiv_id = paper.get("arxiv_id")
oa_status = (paper.get("oa_status") or "").lower()
if arxiv_id:
url = _ARXIV_PDF.format(id=arxiv_id)
key = arxiv_id.replace("/", "_")
elif paper.get("oa_url") and oa_status in _REAL_OA:
url = paper["oa_url"] # doi.org/KISTI/PMC (friendly OA). 비-OA·paywall 은 헤더검증서 skip
key = (paper.get("openalex_id") or paper.get("doi") or "oa").replace("/", "_")
else:
continue
rel_path = f"crawl_raw/papers/arxiv/{arxiv_id.replace('/', '_')}.pdf"
rel_path = f"crawl_raw/papers/{key}.pdf"
dest = Path(settings.nas_mount_path) / rel_path
try:
size = await _download(_ARXIV_PDF.format(id=arxiv_id), dest)
size = await _download(url, dest)
except Exception as e: # noqa: BLE001 — 다운로드 실패 격리
logger.error(f"[promote] {arxiv_id} 다운로드 실패: {e}")
logger.error(f"[promote] {key} 다운로드 실패: {e}")
failed += 1
continue
# in-place 승격: 초록 행 → 전문 PDF 행 (file_hash·extract_meta.paper 보존)
@@ -96,7 +110,7 @@ async def run(bulk: bool = False, limit: int = 0) -> None:
await enqueue_stage(session, doc.id, "extract")
await session.commit()
promoted += 1
logger.info(f"[promote] {arxiv_id} → 전문 PDF in-place (doc {doc.id}, {size}b)")
logger.info(f"[promote] {key} → 전문 PDF in-place (doc {doc.id}, {size}b)")
logger.info(f"[paper_fulltext_promote] 승격 {promoted} · 실패 {failed} (cap {cap})")