feat(papers): B-3 P2-PR1 — arXiv 논문 전문 in-place 승격 + classify paper 요약-스킵

plan safety-library-b3-1 Phase-2. 논문을 초록(signal-only)에서 전문 md/검색으로 승격. - paper_fulltext_promote.py: 미승격 arXiv 논문(file_format='article') → arxiv.org/pdf/{id} 다운로드 (kosha 패턴·50MB cap·PDF 헤더검증) → NAS crawl_raw/papers/arxiv/ → in-place 갱신 (file_format=pdf·file_type=immutable·file_path·md_status=pending, file_hash·extract_meta.paper 보존) → 'extract' enqueue. 1-Document(2행 분리 회피, 기존 display 스택 재사용). per-run cap 10(GPU 보호). arXiv=공개 프리프린트라 전문 검색/RAG 무난(restricted 불요; 유료 구매분만 Papers_Purchased restricted). - classify_worker: material_type='paper' 가드 추가 — 요약/분류 LLM 스킵(맥미니 큐 무접촉), queue_consumer 가 embed/chunk/markdown 은 chain. law_monitor 스킵 패턴 동형. CLI 전용(Phase-2 deliberate 승격·GPU 부하 사용자 통제). 파이프라인=extract→classify[skip]→embed/chunk/markdown, marker 표시 md + hier 절구조 + 전문 검색 청크. 배포 후 라이브 검증. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-14 03:04:02 +00:00
parent 57c1805a8d
commit 73c6f123b8
2 changed files with 118 additions and 0 deletions
@@ -411,6 +411,15 @@ async def process(
        logger.info(f"doc {document_id}: devonagent → classify skip")
        return

+    # 논문(material_type='paper') — 요약/분류 LLM 스킵(맥미니 큐 무접촉, B-3 signal-only 유지).
+    # embed/chunk/markdown 은 queue_consumer 가 chain (early-return 후에도 다음 stage enqueue).
+    if doc.material_type == "paper":
+        if not doc.ai_domain:
+            doc.ai_domain = "논문"
+        await session.commit()
+        logger.info(f"doc {document_id}: paper → classify skip (no summarize)")
+        return
+
    if not doc.extracted_text:
        raise ValueError(f"문서 ID {document_id}: extracted_text가 비어있음")

@@ -0,0 +1,109 @@
+"""논문 arXiv 전문 승격 (in-place) — B-3 Phase-2 P2-PR1 (plan safety-library-b3-1).
+
+arXiv 프리프린트 초록 행(file_format='article', signal-only)을 전문 PDF로 **in-place 승격**:
+PDF 다운로드 → file_format/file_type/file_path/md_status 갱신 → 'extract' enqueue → 기존 파이프라인
+(extract → classify[paper skip summarize] → embed/chunk/markdown)이 전문 검색 청크 + md_content(marker 표시)
+ hier 절구조를 생성. 1-Document(2행 분리 회피, 기존 display 스택 재사용).
+
+- arXiv = 공개 프리프린트(arxiv.org/pdf/{id}, friendly host) → 전문 검색/RAG 무난, restricted 불요.
+  (유료 구매 논문은 Papers_Purchased 경로가 restricted=true 로 별개 처리.)
+- per-run cap (marker GPU ~10GB + embed 부하 보호, 4070 16GB 빡빡 → idle-unload·증분). keyless.
+- 요약 0 (classify paper-skip 가드). file_hash·extract_meta.paper 보존(수집기 dedup 무영향).
+- CLI 전용(Phase-2 deliberate 승격, GPU 부하 사용자 통제). 스케줄 잡 미등록.
+"""
+
+import argparse
+import asyncio
+import random
+from pathlib import Path
+
+import httpx
+from sqlalchemy import select
+
+from core.config import settings
+from core.crawl_politeness import CRAWL_UA
+from core.database import async_session
+from core.utils import setup_logger
+from models.document import Document
+from models.queue import enqueue_stage
+
+logger = setup_logger("paper_fulltext_promote")
+
+_ARXIV_PDF = "https://arxiv.org/pdf/{id}"  # URL template; {id} is filled with the arXiv id
+_MAX_FILE_BYTES = 50 * 1024 * 1024  # 50 MiB size cap for one downloaded PDF
+_DOWNLOAD_DELAY = (2.0, 5.0)  # (min, max) seconds of random sleep before each download
+_RUN_CAP = 10  # Per-run promotion cap (protects marker/embed GPU load); not applied in bulk mode.
+
+_ARXIV_ID_EXPR = Document.extract_meta[("paper", "arxiv_id")].astext  # SQL expr: extract_meta -> paper -> arxiv_id as text
+
+
+async def _download(url: str, dest: Path) -> int:
+    """Download an arXiv PDF to *dest* and return its size in bytes.
+
+    Sleeps a random interval from ``_DOWNLOAD_DELAY`` first so consecutive
+    downloads are spaced out. The body is streamed so that the
+    ``_MAX_FILE_BYTES`` cap is enforced while receiving: an oversized
+    response is aborted as soon as the cap is exceeded instead of being
+    buffered in full and only then rejected.
+
+    Args:
+        url: PDF URL to fetch (redirects are followed).
+        dest: Target file path; missing parent directories are created.
+
+    Returns:
+        Number of bytes written to *dest*.
+
+    Raises:
+        RuntimeError: On a non-200 status, when the body exceeds
+            ``_MAX_FILE_BYTES`` (the reported size is the byte count at the
+            moment of abort), or when the body does not start with ``%PDF-``.
+        httpx.HTTPError: Transport-level failures propagate from httpx.
+    """
+    await asyncio.sleep(random.uniform(*_DOWNLOAD_DELAY))
+    chunks: list[bytes] = []
+    size = 0
+    async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
+        async with client.stream("GET", url, headers={"User-Agent": CRAWL_UA}) as resp:
+            if resp.status_code != 200:
+                raise RuntimeError(f"arXiv PDF {resp.status_code}: {url}")
+            async for chunk in resp.aiter_bytes():
+                size += len(chunk)
+                # Stop reading the moment the cap is crossed; leaving the
+                # stream context closes the connection.
+                if size > _MAX_FILE_BYTES:
+                    raise RuntimeError(f"크기 초과 {size}b: {url}")
+                chunks.append(chunk)
+    content = b"".join(chunks)
+    # Reject HTML error pages and other non-PDF payloads before touching disk.
+    if content[:5] != b"%PDF-":
+        raise RuntimeError(f"PDF 아님(헤더 {content[:8]!r}): {url}")
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    dest.write_bytes(content)
+    return size
+
+
+async def run(bulk: bool = False, limit: int = 0) -> None:
+    """Promote not-yet-promoted arXiv papers (file_format='article') to full-text PDFs in place.
+
+    Selects the newest matching documents (highest id first), downloads each
+    PDF below ``settings.nas_mount_path``, rewrites the row's file fields and
+    enqueues the 'extract' stage. A failed download is logged and counted; it
+    does not stop the run.
+
+    Args:
+        bulk: When true, the built-in ``_RUN_CAP`` ceiling is not applied.
+        limit: Maximum number of rows to select. 0 or a negative value means
+            "not set": ``_RUN_CAP`` in normal mode, effectively unlimited in
+            bulk mode.
+    """
+    # A negative limit would otherwise reach the query as a negative SQL LIMIT,
+    # which (depending on the backend) is rejected or means "no limit" and so
+    # bypasses the cap. Treat it like the documented 0.
+    limit = max(limit, 0)
+    cap = (limit or 10**9) if bulk else (min(limit, _RUN_CAP) if limit else _RUN_CAP)
+    async with async_session() as session:
+        q = (
+            select(Document.id)
+            .where(
+                Document.material_type == "paper",
+                Document.file_format == "article",
+                _ARXIV_ID_EXPR.isnot(None),
+            )
+            .order_by(Document.id.desc())
+            .limit(cap)
+        )
+        ids = [r[0] for r in (await session.execute(q)).all()]
+
+    # NOTE(review): a row whose download fails keeps file_format='article', so
+    # it is selected again on the next run.
+    promoted = failed = 0
+    for doc_id in ids:
+        # One session per document so each promotion is committed independently.
+        async with async_session() as session:
+            doc = await session.get(Document, doc_id)
+            # Re-check: the row may have changed since the id list was read.
+            if doc is None or doc.file_format != "article":
+                continue
+            arxiv_id = ((doc.extract_meta or {}).get("paper") or {}).get("arxiv_id")
+            if not arxiv_id:
+                continue
+            # '/' in the id is replaced so it yields a flat file name.
+            rel_path = f"crawl_raw/papers/arxiv/{arxiv_id.replace('/', '_')}.pdf"
+            dest = Path(settings.nas_mount_path) / rel_path
+            try:
+                size = await _download(_ARXIV_PDF.format(id=arxiv_id), dest)
+            except Exception as e:  # noqa: BLE001 — isolate download failures
+                logger.error(f"[promote] {arxiv_id} 다운로드 실패: {e}")
+                failed += 1
+                continue
+            # In-place promotion: abstract row -> full-text PDF row.
+            # file_hash and extract_meta.paper are left untouched.
+            doc.file_path = rel_path
+            doc.file_format = "pdf"
+            doc.file_type = "immutable"
+            doc.file_size = size
+            doc.md_status = "pending"  # re-run marker (clears a previous 'skipped')
+            doc.md_extraction_error = None
+            await enqueue_stage(session, doc.id, "extract")
+            await session.commit()
+            promoted += 1
+            logger.info(f"[promote] {arxiv_id} → 전문 PDF in-place (doc {doc.id}, {size}b)")
+
+    logger.info(f"[paper_fulltext_promote] 승격 {promoted} · 실패 {failed} (cap {cap})")
+
+
+if __name__ == "__main__":
+    # CLI entry point: parse the flags, then run a single promotion pass.
+    cli = argparse.ArgumentParser(description="논문 arXiv 전문 승격 (in-place, keyless)")
+    cli.add_argument("--bulk", action="store_true", help="cap 해제(전건 백필 — GPU 부하 주의)")
+    cli.add_argument("--limit", type=int, default=0, help="승격 상한(0=기본 cap 10)")
+    opts = cli.parse_args()
+    asyncio.run(run(bulk=opts.bulk, limit=opts.limit))