"""paper DOI reconcile — B-3 PR4(레거시 arXiv) + PR5(구매 PDF) (plan safety-library-b3-1). paper.doi/parent_doi 둘 다 없는 paper 행을 두 갈래로 정리: - 레거시 arXiv 초록(holder): arXiv id → arxiv_doi(10.48550/arxiv.{id}) 스탬프 → partial-unique 인덱스 편입 → 재유입 차단('동일-DOI 재유입 차단만'). - 구매 PDF(child, license.restricted=true — Papers_Purchased 드롭): 본문 DOI 파싱 → paper.parent_doi 링크(서지 holder 와 DOI 공유로 연결). child 는 doi 미보유(인덱스 밖) → unique 무충돌. - KEYLESS·결정적(OpenAlex 호출 0)·in-DB·enqueue 0(콘텐츠 무변경). dedup_reconcile(file_hash 캐시)와 별 worker(적대리뷰 B·C major). 선재 DOI holder 존재 시 arXiv 행도 parent_doi 마킹(unique 위반 회피). """ import asyncio from sqlalchemy import select from core.database import async_session from core.utils import setup_logger from models.document import Document from services.papers.doi import ( arxiv_doi, parse_arxiv_id, parse_doi_from_text, with_paper_doi, with_parent_doi, ) from services.papers.holder import find_paper_holder logger = setup_logger("paper_doi_reconcile") _DOI_TEXT = Document.extract_meta[("paper", "doi")].astext _PARENT_DOI_TEXT = Document.extract_meta[("paper", "parent_doi")].astext def _is_restricted(meta: dict) -> bool: return (meta.get("license") or {}).get("restricted") in (True, "true") async def run(limit: int = 0) -> None: """paper.doi/parent_doi 없는 paper 행 reconcile(멱등). limit=0 = 전건.""" stamped = marked_dup = skipped_no_arxiv = 0 linked_purchased = skipped_purchased_no_doi = 0 async with async_session() as session: q = ( select(Document) .where( Document.material_type == "paper", _DOI_TEXT.is_(None), _PARENT_DOI_TEXT.is_(None), ) .order_by(Document.id) ) if limit: q = q.limit(limit) rows = (await session.execute(q)).scalars().all() for row in rows: meta = dict(row.extract_meta or {}) paper = dict(meta.get("paper") or {}) # PR5: 구매 PDF(restricted) = child → 본문 DOI 파싱 → parent_doi 링크 if _is_restricted(meta): doi = parse_doi_from_text(row.extracted_text) if not doi: skipped_purchased_no_doi += 1 continue row.extract_meta = with_parent_doi(meta, doi) linked_purchased += 1 continue # PR4: 레거시 arXiv 초록(holder) = arXiv DataCite DOI 스탬프 arxiv_id = paper.get("arxiv_id") or parse_arxiv_id(row.extracted_text) doi = arxiv_doi(arxiv_id) if not doi: skipped_no_arxiv += 1 continue paper["arxiv_id"] = arxiv_id meta["paper"] = paper holder = await find_paper_holder(session, doi) if holder is not None and holder.id != row.id: row.extract_meta = with_parent_doi(meta, doi) # 선재 중복 → child 마킹 marked_dup += 1 else: row.extract_meta = with_paper_doi(meta, doi) # holder 스탬프, 인덱스 진입 stamped += 1 # 콘텐츠 무변경 → enqueue 없음(summarize/embed/chunk 0) await session.commit() logger.info( f"[paper_doi_reconcile] {len(rows)}행 → arXiv 스탬프 {stamped} · 선재중복 {marked_dup} · " f"arXiv id 없음 skip {skipped_no_arxiv} / 구매PDF parent_doi 링크 {linked_purchased} · " f"구매PDF DOI 없음 skip {skipped_purchased_no_doi}" ) if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="paper DOI reconcile (arXiv 레거시 + 구매 PDF, keyless)") parser.add_argument("--limit", type=int, default=0, help="처리 상한(0=전건)") args = parser.parse_args() asyncio.run(run(limit=args.limit))