feat(papers): B-3 P2-PR1 oa_url 승격 분기 (arXiv 외 doi.org/KISTI/PMC OA)
arxiv_id 없는 OA 논문(oa_status gold/hybrid/green/diamond + oa_url)도 전문 승격 대상에 포함. url = arxiv.org/pdf 또는 oa_url(friendly OA host). paywall/비-PDF 는 헤더검증서 skip(실패 격리). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -18,7 +18,7 @@ import random
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import or_, select
|
||||
|
||||
from core.config import settings
|
||||
from core.crawl_politeness import CRAWL_UA
|
||||
@@ -35,6 +35,9 @@ _DOWNLOAD_DELAY = (2.0, 5.0)
|
||||
_RUN_CAP = 10 # 1회 승격 상한(marker/embed GPU 보호). bulk 시 해제.
|
||||
|
||||
_ARXIV_ID_EXPR = Document.extract_meta[("paper", "arxiv_id")].astext
|
||||
_OA_URL_EXPR = Document.extract_meta[("paper", "oa_url")].astext
|
||||
_OA_STATUS_EXPR = Document.extract_meta[("paper", "oa_status")].astext
|
||||
_REAL_OA = ("gold", "hybrid", "green", "diamond")
|
||||
|
||||
|
||||
async def _download(url: str, dest: Path) -> int:
|
||||
@@ -62,7 +65,10 @@ async def run(bulk: bool = False, limit: int = 0) -> None:
|
||||
.where(
|
||||
Document.material_type == "paper",
|
||||
Document.file_format == "article",
|
||||
_ARXIV_ID_EXPR.isnot(None),
|
||||
or_(
|
||||
_ARXIV_ID_EXPR.isnot(None),
|
||||
Document.extract_meta[("paper", "oa_url")].astext.isnot(None),
|
||||
),
|
||||
)
|
||||
.order_by(Document.id.desc())
|
||||
.limit(cap)
|
||||
@@ -75,15 +81,23 @@ async def run(bulk: bool = False, limit: int = 0) -> None:
|
||||
doc = await session.get(Document, doc_id)
|
||||
if doc is None or doc.file_format != "article":
|
||||
continue
|
||||
arxiv_id = ((doc.extract_meta or {}).get("paper") or {}).get("arxiv_id")
|
||||
if not arxiv_id:
|
||||
paper = (doc.extract_meta or {}).get("paper") or {}
|
||||
arxiv_id = paper.get("arxiv_id")
|
||||
oa_status = (paper.get("oa_status") or "").lower()
|
||||
if arxiv_id:
|
||||
url = _ARXIV_PDF.format(id=arxiv_id)
|
||||
key = arxiv_id.replace("/", "_")
|
||||
elif paper.get("oa_url") and oa_status in _REAL_OA:
|
||||
url = paper["oa_url"] # doi.org/KISTI/PMC (friendly OA). 비-OA·paywall 은 헤더검증서 skip
|
||||
key = (paper.get("openalex_id") or paper.get("doi") or "oa").replace("/", "_")
|
||||
else:
|
||||
continue
|
||||
rel_path = f"crawl_raw/papers/arxiv/{arxiv_id.replace('/', '_')}.pdf"
|
||||
rel_path = f"crawl_raw/papers/{key}.pdf"
|
||||
dest = Path(settings.nas_mount_path) / rel_path
|
||||
try:
|
||||
size = await _download(_ARXIV_PDF.format(id=arxiv_id), dest)
|
||||
size = await _download(url, dest)
|
||||
except Exception as e: # noqa: BLE001 — 다운로드 실패 격리
|
||||
logger.error(f"[promote] {arxiv_id} 다운로드 실패: {e}")
|
||||
logger.error(f"[promote] {key} 다운로드 실패: {e}")
|
||||
failed += 1
|
||||
continue
|
||||
# in-place 승격: 초록 행 → 전문 PDF 행 (file_hash·extract_meta.paper 보존)
|
||||
@@ -96,7 +110,7 @@ async def run(bulk: bool = False, limit: int = 0) -> None:
|
||||
await enqueue_stage(session, doc.id, "extract")
|
||||
await session.commit()
|
||||
promoted += 1
|
||||
logger.info(f"[promote] {arxiv_id} → 전문 PDF in-place (doc {doc.id}, {size}b)")
|
||||
logger.info(f"[promote] {key} → 전문 PDF in-place (doc {doc.id}, {size}b)")
|
||||
|
||||
logger.info(f"[paper_fulltext_promote] 승격 {promoted} · 실패 {failed} (cap {cap})")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user