From 73c6f123b85b56ebbb8010cec8c993909472f46d Mon Sep 17 00:00:00 2001 From: Claude Code Date: Sun, 14 Jun 2026 03:04:02 +0000 Subject: [PATCH] =?UTF-8?q?feat(papers):=20B-3=20P2-PR1=20=E2=80=94=20arXi?= =?UTF-8?q?v=20=EB=85=BC=EB=AC=B8=20=EC=A0=84=EB=AC=B8=20in-place=20?= =?UTF-8?q?=EC=8A=B9=EA=B2=A9=20+=20classify=20paper=20=EC=9A=94=EC=95=BD-?= =?UTF-8?q?=EC=8A=A4=ED=82=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit plan safety-library-b3-1 Phase-2. 논문을 초록(signal-only)에서 전문 md/검색으로 승격. - paper_fulltext_promote.py: 미승격 arXiv 논문(file_format='article') → arxiv.org/pdf/{id} 다운로드 (kosha 패턴·50MB cap·PDF 헤더검증) → NAS crawl_raw/papers/arxiv/ → in-place 갱신 (file_format=pdf·file_type=immutable·file_path·md_status=pending, file_hash·extract_meta.paper 보존) → 'extract' enqueue. 1-Document(2행 분리 회피, 기존 display 스택 재사용). per-run cap 10(GPU 보호). arXiv=공개 프리프린트라 전문 검색/RAG 무난(restricted 불요; 유료 구매분만 Papers_Purchased restricted). - classify_worker: material_type='paper' 가드 추가 — 요약/분류 LLM 스킵(맥미니 큐 무접촉), queue_consumer 가 embed/chunk/markdown 은 chain. law_monitor 스킵 패턴 동형. CLI 전용(Phase-2 deliberate 승격·GPU 부하 사용자 통제). 파이프라인=extract→classify[skip]→embed/chunk/markdown, marker 표시 md + hier 절구조 + 전문 검색 청크. 배포 후 라이브 검증. Co-Authored-By: Claude Opus 4.8 (1M context) --- app/workers/classify_worker.py | 9 +++ app/workers/paper_fulltext_promote.py | 109 ++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 app/workers/paper_fulltext_promote.py diff --git a/app/workers/classify_worker.py b/app/workers/classify_worker.py index 5f8470c..f0f0a96 100644 --- a/app/workers/classify_worker.py +++ b/app/workers/classify_worker.py @@ -411,6 +411,15 @@ async def process( logger.info(f"doc {document_id}: devonagent → classify skip") return + # 논문(material_type='paper') — 요약/분류 LLM 스킵(맥미니 큐 무접촉, B-3 signal-only 유지). + # embed/chunk/markdown 은 queue_consumer 가 chain (early-return 후에도 다음 stage enqueue). 
+ if doc.material_type == "paper": + if not doc.ai_domain: + doc.ai_domain = "논문" + await session.commit() + logger.info(f"doc {document_id}: paper → classify skip (no summarize)") + return + if not doc.extracted_text: raise ValueError(f"문서 ID {document_id}: extracted_text가 비어있음") diff --git a/app/workers/paper_fulltext_promote.py b/app/workers/paper_fulltext_promote.py new file mode 100644 index 0000000..ccd390a --- /dev/null +++ b/app/workers/paper_fulltext_promote.py @@ -0,0 +1,109 @@ +"""논문 arXiv 전문 승격 (in-place) — B-3 Phase-2 P2-PR1 (plan safety-library-b3-1). + +arXiv 프리프린트 초록 행(file_format='article', signal-only)을 전문 PDF로 **in-place 승격**: +PDF 다운로드 → file_format/file_type/file_path/md_status 갱신 → 'extract' enqueue → 기존 파이프라인 +(extract → classify[paper skip summarize] → embed/chunk/markdown)이 전문 검색 청크 + md_content(marker 표시) ++ hier 절구조를 생성. 1-Document(2행 분리 회피, 기존 display 스택 재사용). + +- arXiv = 공개 프리프린트(arxiv.org/pdf/{id}, friendly host) → 전문 검색/RAG 무난, restricted 불요. + (유료 구매 논문은 Papers_Purchased 경로가 restricted=true 로 별개 처리.) +- per-run cap (marker GPU ~10GB + embed 부하 보호, 4070 16GB 빡빡 → idle-unload·증분). keyless. +- 요약 0 (classify paper-skip 가드). file_hash·extract_meta.paper 보존(수집기 dedup 무영향). +- CLI 전용(Phase-2 deliberate 승격, GPU 부하 사용자 통제). 스케줄 잡 미등록. +""" + +import argparse +import asyncio +import random +from pathlib import Path + +import httpx +from sqlalchemy import select + +from core.config import settings +from core.crawl_politeness import CRAWL_UA +from core.database import async_session +from core.utils import setup_logger +from models.document import Document +from models.queue import enqueue_stage + +logger = setup_logger("paper_fulltext_promote") + +_ARXIV_PDF = "https://arxiv.org/pdf/{id}" +_MAX_FILE_BYTES = 50 * 1024 * 1024 +_DOWNLOAD_DELAY = (2.0, 5.0) +_RUN_CAP = 10 # 1회 승격 상한(marker/embed GPU 보호). bulk 시 해제. 
+
+_ARXIV_ID_EXPR = Document.extract_meta[("paper", "arxiv_id")].astext
+
+
+async def _download(url: str, dest: Path) -> int:
+    """Download an arXiv PDF to ``dest`` and return its size in bytes.
+
+    Sleeps a random interval drawn from ``_DOWNLOAD_DELAY`` before the request
+    so consecutive downloads are spaced out. The body is streamed and the
+    transfer is aborted as soon as more than ``_MAX_FILE_BYTES`` have been
+    received, so an oversized response is never buffered in full.
+
+    Raises:
+        RuntimeError: on a non-200 status, a body larger than
+            ``_MAX_FILE_BYTES``, or a body that does not start with ``%PDF-``.
+        httpx.HTTPError: transport/timeout errors propagate unchanged.
+    """
+    await asyncio.sleep(random.uniform(*_DOWNLOAD_DELAY))
+    body = bytearray()
+    async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
+        async with client.stream("GET", url, headers={"User-Agent": CRAWL_UA}) as resp:
+            if resp.status_code != 200:
+                raise RuntimeError(f"arXiv PDF {resp.status_code}: {url}")
+            async for chunk in resp.aiter_bytes():
+                body += chunk
+                # Enforce the cap while reading instead of after buffering everything.
+                if len(body) > _MAX_FILE_BYTES:
+                    raise RuntimeError(f"크기 초과 >{_MAX_FILE_BYTES}b: {url}")
+    # Error pages served with status 200 are rejected by the magic-number check.
+    if body[:5] != b"%PDF-":
+        raise RuntimeError(f"PDF 아님(헤더 {bytes(body[:8])!r}): {url}")
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    dest.write_bytes(body)
+    return len(body)
+
+
+async def run(bulk: bool = False, limit: int = 0) -> None:
+    """Promote arXiv paper rows (``file_format='article'``) to full-text PDF rows in place.
+
+    For each selected row the PDF is downloaded under ``settings.nas_mount_path``,
+    the row's file fields are rewritten, and an ``extract`` stage is enqueued.
+
+    Args:
+        bulk: lift the per-run cap; a non-zero ``limit`` still applies.
+        limit: maximum number of rows to select. ``0`` (or a negative value)
+            means "use the default"; without ``bulk`` it is clamped to
+            ``_RUN_CAP``.
+    """
+    # A negative limit would otherwise reach SQL as a negative LIMIT.
+    limit = max(limit, 0)
+    if bulk:
+        cap = limit or 10**9
+    else:
+        cap = min(limit, _RUN_CAP) if limit else _RUN_CAP
+
+    # NOTE(review): rows whose download fails keep file_format='article', so the
+    # same newest rows are re-selected on every run and can occupy the whole cap.
+    async with async_session() as session:
+        q = (
+            select(Document.id)
+            .where(
+                Document.material_type == "paper",
+                Document.file_format == "article",
+                _ARXIV_ID_EXPR.isnot(None),
+            )
+            .order_by(Document.id.desc())
+            .limit(cap)
+        )
+        ids = [r[0] for r in (await session.execute(q)).all()]
+
+    promoted = failed = 0
+    for doc_id in ids:
+        async with async_session() as session:
+            doc = await session.get(Document, doc_id)
+            # Re-check state: the row may have changed since the id query.
+            if doc is None or doc.file_format != "article":
+                continue
+            arxiv_id = ((doc.extract_meta or {}).get("paper") or {}).get("arxiv_id")
+            if not arxiv_id:
+                continue
+            # Old-style ids contain '/', which must not create sub-directories.
+            rel_path = f"crawl_raw/papers/arxiv/{arxiv_id.replace('/', '_')}.pdf"
+            dest = Path(settings.nas_mount_path) / rel_path
+            try:
+                size = await _download(_ARXIV_PDF.format(id=arxiv_id), dest)
+            except Exception as e:  # noqa: BLE001 — isolate per-paper download failures
+                logger.error(f"[promote] {arxiv_id} 다운로드 실패: {e}")
+                failed += 1
+                continue
+            # In-place promotion: abstract row -> full-text PDF row.
+            # file_hash and extract_meta are deliberately left untouched.
+            doc.file_path = rel_path
+            doc.file_format = "pdf"
+            doc.file_type = "immutable"
+            doc.file_size = size
+            doc.md_status = "pending"  # per the original note: re-run marker, clearing a prior 'skipped'
+            doc.md_extraction_error = None
+            await enqueue_stage(session, doc_id, "extract")
+            await session.commit()
+            promoted += 1
+            # Use doc_id, not doc.id: attributes may be expired after commit, and
+            # reloading them implicitly is not allowed on an async session.
+            logger.info(f"[promote] {arxiv_id} → 전문 PDF in-place (doc {doc_id}, {size}b)")
+
+    logger.info(f"[paper_fulltext_promote] 승격 {promoted} · 실패 {failed} (cap {cap})")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="논문 arXiv 전문 승격 (in-place, keyless)")
+    parser.add_argument("--bulk", action="store_true", help="cap 해제(전건 백필 — GPU 부하 주의)")
+    parser.add_argument("--limit", type=int, default=0, help="승격 상한(0=기본 cap 10)")
+    args = parser.parse_args()
+    asyncio.run(run(bulk=args.bulk, limit=args.limit))