Files
hyungi_document_server/tests/test_paper_doi_units.py
Claude Code bf0348a3e0 feat(papers): B-3 PR5 — 구매 PDF parent_doi 스탬프 (paper_doi_reconcile 통합)
plan safety-library-b3-1 PR5. Papers_Purchased 수동 드롭 PDF(license.restricted=true)를 서지 holder 에
연결: 본문 DOI 파싱 → paper.parent_doi 링크(child, doi 미보유=인덱스 밖, unique 무충돌).
- doi.py: parse_doi_from_text(본문 전체 DOI 정규식 — PDF 구조 무관).
- paper_doi_reconcile: restricted 분기 — restricted 행은 본문 DOI→parent_doi(child),
  그 외(레거시 arXiv)는 holder 스탬프(PR4). 쿼리에 parent_doi IS NULL 추가(링크분 재처리 회피).
- file_watcher merge-only license 주입 clobber-safe 존중. enqueue 0(콘텐츠 무변경).

단위 29 passed(+parse_doi_from_text). ephemeral PASS: 합성 restricted 행 → parent_doi 링크·
paper.doi 부재·restricted 보존·스키마 수용(insert+rollback). reconcile 멱등(재실행 0 변경).
실 구매 PDF 라이브 검증 = 사용자 첫 논문 구매·드롭 시(로직 검증 완료).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-13 22:58:19 +00:00

142 lines
5.5 KiB
Python

"""B-3 PR1 — 논문 DOI 코어 순수 단위 테스트 (plan safety-library-b3-1).
holder.find_paper_holder(DB 조회)는 PR2 arXiv 실수집 시 라이브 검증 — 여기선 순수 함수만.
"""
import sys
from pathlib import Path
# Prepend the ``app`` directory that sits beside this file's parent folder to
# the import path. The ``services.papers.doi`` import that follows appears to
# rely on this, which is why it comes after a statement and carries
# ``noqa: E402``.
sys.path.insert(0, str(Path(__file__).parent.parent / "app"))
from services.papers.doi import ( # noqa: E402
arxiv_doi,
normalize_doi,
paper_doi_hash,
parse_arxiv_id,
parse_doi_from_text,
read_paper_doi,
with_paper_doi,
with_parent_doi,
)
# ─── normalize_doi: 단일 함수(저장=조회) ───
def test_normalize_strips_url_and_lowercases():
    """Resolver URLs and ``doi:`` labels are dropped and the DOI is lower-cased."""
    expected_by_input = {
        "https://doi.org/10.1585/PFR.15.2402039": "10.1585/pfr.15.2402039",
        "http://dx.doi.org/10.1115/1.4045678": "10.1115/1.4045678",
        "doi:10.1016/j.jlp.2020.104321": "10.1016/j.jlp.2020.104321",
        "DOI: 10.1234/ABC": "10.1234/abc",
    }
    for raw, expected in expected_by_input.items():
        assert normalize_doi(raw) == expected
def test_normalize_trims_whitespace_and_citation_noise():
    """Surrounding spaces and a trailing ``.`` or ``;`` are removed."""
    noisy_forms = (
        " https://doi.org/10.1234/abc ",
        "10.1234/abc.",
        "10.1234/abc;",
    )
    for noisy in noisy_forms:
        assert normalize_doi(noisy) == "10.1234/abc"
def test_normalize_preserves_parens_in_doi():
    """Parentheses inside the DOI suffix come through normalization intact."""
    # Parentheses can be a genuine part of a DOI, so they are kept. Stripping
    # too much would merge different papers (data corruption), which is more
    # dangerous than leaving a near-duplicate behind.
    canonical = "10.1016/s0010-8650(00)80003-2"
    assert normalize_doi("10.1016/s0010-8650(00)80003-2") == canonical
    assert normalize_doi("https://doi.org/10.1016/S0010-8650(00)80003-2") == canonical
def test_normalize_rejects_non_doi():
    """Inputs that carry no DOI normalize to ``None``."""
    non_dois = (
        None,
        "",
        " ",
        "not-a-doi",
        "arXiv:2606.08108",  # an arXiv id is not a DOI
    )
    for candidate in non_dois:
        assert normalize_doi(candidate) is None
def test_normalize_is_idempotent_store_equals_lookup():
    """Different spellings collapse to one value, and re-normalizing is a no-op."""
    # When the store side and the lookup side both go through the same
    # function, notation differences collapse into a single value; dedup
    # depends on that.
    spellings = (
        "https://doi.org/10.1/X",
        "doi:10.1/x",
        "10.1/X",
        " HTTPS://DOI.ORG/10.1/x ",
    )
    collapsed = set(map(normalize_doi, spellings))
    assert collapsed == {"10.1/x"}
    # Idempotence: feeding the output back in yields the same value.
    once = normalize_doi("https://doi.org/10.1/X")
    assert normalize_doi(once) == "10.1/x"
# ─── paper_doi_hash: holder file_hash 키 ───
def test_paper_doi_hash_deterministic_len32():
    """The hash has length 32 and is the same on a repeated call."""
    doi = "10.1234/abc"
    digest = paper_doi_hash(doi)
    assert len(digest) == 32
    repeat = paper_doi_hash(doi)
    assert digest == repeat
def test_paper_doi_hash_distinct_per_doi():
    """Two different DOIs hash to different values."""
    hash_a = paper_doi_hash("10.1/a")
    hash_b = paper_doi_hash("10.1/b")
    assert hash_a != hash_b
# ─── 2-Document extract_meta 계약 (holder doi / child parent_doi 상호 배타) ───
def test_with_paper_doi_holder_shape_and_merge_safe():
    """with_paper_doi sets ``paper.doi`` only and keeps unrelated keys."""
    base_meta = {"license": {"scheme": "cc_by"}, "source_id": 7}
    stamped = with_paper_doi(base_meta, "10.1/x")
    paper = stamped["paper"]
    assert paper["doi"] == "10.1/x"
    assert "parent_doi" not in paper
    # Keys other than "paper" are carried over unchanged.
    assert stamped["license"]["scheme"] == "cc_by"
    assert stamped["source_id"] == 7
def test_with_parent_doi_child_shape_no_doi():
    """with_parent_doi sets ``paper.parent_doi`` and leaves ``paper.doi`` absent."""
    linked = with_parent_doi({"license": {"scheme": "proprietary"}}, "10.1/holder")
    paper = linked["paper"]
    assert paper["parent_doi"] == "10.1/holder"
    # A child holds no doi of its own (it stays outside the partial-unique index).
    assert "doi" not in paper
    assert linked["license"]["scheme"] == "proprietary"
def test_holder_child_mutually_exclusive():
    """Stamping a doi onto child metadata leaves no ``parent_doi`` behind."""
    as_child = with_parent_doi({}, "10.1/p")
    as_holder = with_paper_doi(as_child, "10.1/self")
    paper = as_holder["paper"]
    assert paper["doi"] == "10.1/self"
    assert "parent_doi" not in paper
def test_input_not_mutated():
    """with_parent_doi does not alter the ``paper.doi`` of the dict passed in."""
    caller_meta = {"paper": {"doi": "10.1/old"}}
    with_parent_doi(caller_meta, "10.1/new")
    # The original dict must be unchanged after the call.
    assert caller_meta["paper"]["doi"] == "10.1/old"
# ─── read_paper_doi: 인덱스 식의 조회측 거울 ───
def test_read_paper_doi():
    """read_paper_doi yields the normalized ``paper.doi``, or ``None`` if absent."""
    metas_with_doi = (
        {"paper": {"doi": "10.1/x"}},
        {"paper": {"doi": "https://doi.org/10.1/X"}},  # defensively re-normalized
    )
    for meta in metas_with_doi:
        assert read_paper_doi(meta) == "10.1/x"
    metas_without_doi = (
        {},
        None,
        {"paper": {"parent_doi": "10.1/p"}},  # a child has no doi
        {"paper": {}},
    )
    for meta in metas_without_doi:
        assert read_paper_doi(meta) is None
# ─── PR4: arXiv id 파싱 + arXiv DataCite DOI (교차소스 dedup 통일 키) ───
def test_parse_arxiv_id():
    """parse_arxiv_id returns the id without its version suffix, or ``None``."""
    id_by_text = {
        "Title arXiv:2606.10236v1 Announce Type: new Abstract": "2606.10236",
        "see arXiv:2601.02852 for details": "2601.02852",
        "arXiv:cond-mat/0703470v2": "cond-mat/0703470",
    }
    for text, arxiv_id in id_by_text.items():
        assert parse_arxiv_id(text) == arxiv_id
    for no_match in ("no arxiv here", None):
        assert parse_arxiv_id(no_match) is None
def test_arxiv_doi_canonical():
    """arxiv_doi builds the lower-case ``10.48550/arxiv.{id}`` DOI, or ``None``."""
    # Matches the canonical form observed from OpenAlex: 10.48550/arxiv.{id}
    # in lower case.
    canonical = "10.48550/arxiv.2606.10236"
    assert arxiv_doi("2606.10236") == canonical
    assert arxiv_doi(None) is None
    # The collector and reconcile use the same function, so they arrive at the
    # same paper.doi (which is what makes cross-source dedup work).
    parsed_id = parse_arxiv_id("x arXiv:2606.10236v1 y")
    assert arxiv_doi(parsed_id) == canonical
# ─── PR5: 구매 PDF 본문 DOI 파싱 (parent_doi 링크용, PDF 구조 무관) ───
def test_parse_doi_from_text():
    """parse_doi_from_text pulls a DOI out of free text, or returns ``None``."""
    doi_by_text = {
        "ref https://doi.org/10.1016/j.jlp.2024.105474 end": "10.1016/j.jlp.2024.105474",
        "DOI 10.1115/1.4045678. Next.": "10.1115/1.4045678",
    }
    for text, doi in doi_by_text.items():
        assert parse_doi_from_text(text) == doi
    for no_match in ("no doi here", None):
        assert parse_doi_from_text(no_match) is None