"""B-3 PR1 — pure unit tests for the paper DOI core (plan safety-library-b3-1).

holder.find_paper_holder (a DB lookup) is verified live during the PR2 arXiv
real collection — only pure functions are exercised here.
"""
import sys
from pathlib import Path

# Make the application package importable before the project import below.
sys.path.insert(0, str(Path(__file__).parent.parent / "app"))

from services.papers.doi import (  # noqa: E402
    arxiv_doi,
    normalize_doi,
    paper_doi_hash,
    parse_arxiv_id,
    parse_doi_from_text,
    read_paper_doi,
    with_paper_doi,
    with_parent_doi,
)


# ─── normalize_doi: one single function (store == lookup) ───


def test_normalize_strips_url_and_lowercases():
    """Resolver URLs and ``doi:`` prefixes are dropped; the result is lowercase."""
    cases = [
        ("https://doi.org/10.1585/PFR.15.2402039", "10.1585/pfr.15.2402039"),
        ("http://dx.doi.org/10.1115/1.4045678", "10.1115/1.4045678"),
        ("doi:10.1016/j.jlp.2020.104321", "10.1016/j.jlp.2020.104321"),
        ("DOI: 10.1234/ABC", "10.1234/abc"),
    ]
    for raw, expected in cases:
        assert normalize_doi(raw) == expected


def test_normalize_trims_whitespace_and_citation_noise():
    """Surrounding whitespace and a trailing ``.`` / ``;`` are removed."""
    cases = [
        (" https://doi.org/10.1234/abc ", "10.1234/abc"),
        ("10.1234/abc.", "10.1234/abc"),
        ("10.1234/abc;", "10.1234/abc"),
    ]
    for raw, expected in cases:
        assert normalize_doi(raw) == expected


def test_normalize_preserves_parens_in_doi():
    """Parentheses inside the DOI survive normalization."""
    # Parentheses can be part of a DOI, so they are preserved (over-stripping
    # = merging different papers = data corruption, which is more dangerous
    # than a near-duplicate).
    expected = "10.1016/s0010-8650(00)80003-2"
    assert normalize_doi("10.1016/s0010-8650(00)80003-2") == expected
    assert normalize_doi("https://doi.org/10.1016/S0010-8650(00)80003-2") == expected


def test_normalize_rejects_non_doi():
    """Inputs that are not DOIs normalize to ``None``."""
    rejected = [
        None,
        "",
        " ",
        "not-a-doi",
        "arXiv:2606.08108",  # an arXiv id is not a DOI
    ]
    for raw in rejected:
        assert normalize_doi(raw) is None


def test_normalize_is_idempotent_store_equals_lookup():
    """Every notation of one DOI collapses to one value; re-normalizing is a no-op."""
    # When the store side and the lookup side go through the same function,
    # notation differences collapse into a single value (the precondition
    # for dedup to hold).
    forms = [
        "https://doi.org/10.1/X",
        "doi:10.1/x",
        "10.1/X",
        " HTTPS://DOI.ORG/10.1/x ",
    ]
    collapsed = set(map(normalize_doi, forms))
    assert collapsed == {"10.1/x"}

    once = normalize_doi("https://doi.org/10.1/X")
    assert normalize_doi(once) == "10.1/x"  # idempotent


# ─── paper_doi_hash: key for the holder file_hash ───


def test_paper_doi_hash_deterministic_len32():
    """The hash is 32 characters long and stable across calls."""
    digest = paper_doi_hash("10.1234/abc")
    assert len(digest) == 32
    assert digest == paper_doi_hash("10.1234/abc")


def test_paper_doi_hash_distinct_per_doi():
    """Different DOIs produce different hashes."""
    hash_a = paper_doi_hash("10.1/a")
    hash_b = paper_doi_hash("10.1/b")
    assert hash_a != hash_b


# ─── 2-Document extract_meta contract
#     (holder ``doi`` / child ``parent_doi`` are mutually exclusive) ───


def test_with_paper_doi_holder_shape_and_merge_safe():
    """A holder gets ``paper.doi`` only, and unrelated keys are kept."""
    base = {"license": {"scheme": "cc_by"}, "source_id": 7}
    holder_meta = with_paper_doi(base, "10.1/x")
    assert holder_meta["paper"]["doi"] == "10.1/x"
    assert "parent_doi" not in holder_meta["paper"]
    assert holder_meta["license"]["scheme"] == "cc_by"  # other keys preserved
    assert holder_meta["source_id"] == 7


def test_with_parent_doi_child_shape_no_doi():
    """A child gets ``paper.parent_doi`` only, and unrelated keys are kept."""
    child_meta = with_parent_doi({"license": {"scheme": "proprietary"}}, "10.1/holder")
    assert child_meta["paper"]["parent_doi"] == "10.1/holder"
    # A child holds no doi (it stays outside the partial-unique index).
    assert "doi" not in child_meta["paper"]
    assert child_meta["license"]["scheme"] == "proprietary"


def test_holder_child_mutually_exclusive():
    """Promoting a child to a holder sets ``doi`` and drops ``parent_doi``."""
    promoted = with_paper_doi(with_parent_doi({}, "10.1/p"), "10.1/self")
    assert promoted["paper"]["doi"] == "10.1/self"
    assert "parent_doi" not in promoted["paper"]


def test_input_not_mutated():
    """``with_parent_doi`` leaves the dict passed in untouched."""
    original = {"paper": {"doi": "10.1/old"}}
    with_parent_doi(original, "10.1/new")
    assert original["paper"]["doi"] == "10.1/old"  # source dict unchanged


# ─── read_paper_doi: lookup-side mirror of the index expression ───


def test_read_paper_doi():
    """``paper.doi`` is read back normalized; anything else yields ``None``."""
    present = [
        ({"paper": {"doi": "10.1/x"}}, "10.1/x"),
        # defensive re-normalization
        ({"paper": {"doi": "https://doi.org/10.1/X"}}, "10.1/x"),
    ]
    for meta, expected in present:
        assert read_paper_doi(meta) == expected

    absent = [
        {},
        None,
        {"paper": {"parent_doi": "10.1/p"}},  # a child has no doi
        {"paper": {}},
    ]
    for meta in absent:
        assert read_paper_doi(meta) is None


# ─── PR4: arXiv id parsing + arXiv DataCite DOI
#     (unified key for cross-source dedup) ───


def test_parse_arxiv_id():
    """The arXiv id is extracted without its version suffix, else ``None``."""
    found = [
        ("Title arXiv:2606.10236v1 Announce Type: new Abstract", "2606.10236"),
        ("see arXiv:2601.02852 for details", "2601.02852"),
        ("arXiv:cond-mat/0703470v2", "cond-mat/0703470"),
    ]
    for text, expected in found:
        assert parse_arxiv_id(text) == expected

    for text in ("no arxiv here", None):
        assert parse_arxiv_id(text) is None


def test_arxiv_doi_canonical():
    """An arXiv id maps to ``10.48550/arxiv.{id}``; ``None`` maps to ``None``."""
    # Matches the canonical form observed in OpenAlex:
    # 10.48550/arxiv.{id} (lowercase).
    assert arxiv_doi("2606.10236") == "10.48550/arxiv.2606.10236"
    assert arxiv_doi(None) is None
    # Collector and reconcile go through the same function -> same paper.doi
    # (cross-source dedup holds).
    parsed_id = parse_arxiv_id("x arXiv:2606.10236v1 y")
    assert arxiv_doi(parsed_id) == "10.48550/arxiv.2606.10236"


# ─── PR5: DOI parsing from purchased-PDF body text
#     (for parent_doi linking, independent of PDF structure) ───


def test_parse_doi_from_text():
    """A DOI embedded in free text is extracted; text without one gives ``None``."""
    found = [
        ("ref https://doi.org/10.1016/j.jlp.2024.105474 end", "10.1016/j.jlp.2024.105474"),
        ("DOI 10.1115/1.4045678. Next.", "10.1115/1.4045678"),
    ]
    for text, expected in found:
        assert parse_doi_from_text(text) == expected

    for text in ("no doi here", None):
        assert parse_doi_from_text(text) is None