feat(papers): B-3 PR3 — OpenAlex 백본 수집기 (scaffold-first, signal-only, per-run cap)
plan safety-library-b3-1 PR3. 발견+dedup 글로벌 백본(JP/EU/US 색인+정본 DOI, 전문 안 줌). - scaffold-first: OPENALEX_API_KEY 부재 시 FeedError explicit-skip(silent fallback 0). 키=무료. - signal-only: inverted-index 초록 복원→색인(embed+chunk), summarize 0. PDF 절대 미fetch(oa_url=신호). - 관련성 사전필터=title_and_abstract.search 키워드 + per-run cap 60(임베드 firehose 차단, 적대리뷰 A major) + cursor 페이징 + from_publication_date 워터마크 증분. 초록 없는 thin 레코드 skip(재료 품질). - license: 명시 CC→redistribute true / OA·closed→false(restricted 부재=초록 RAG 사용가능, 비-CC 전문은 L-1 Phase-2). - DOI→paper.doi(holder, 교차소스 dedup) / 없으면 openalex_id. enabled=False 행+add_job(daily 07:45 KST)+CLI. 순수 파서/초록복원/license_meta fixture 단위 7 passed(OpenAlex 실응답: cc-by/cc-by-nc-nd/None·초록 유무). 라이브 검증 PASS (prod, running fastapi 무접촉): 키없음→explicit-skip / 키주입→3건 적재 (paper/NULL/ai_summary NULL/region INT, cc-by→redist true·unspecified→false, green/gold, 큐 embed3+chunk3·summarize 0, distinct openalex_id=total, 교차소스 DOI 4 distinct 4 중복 0). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+1
File diff suppressed because one or more lines are too long
@@ -0,0 +1,92 @@
|
||||
"""B-3 PR3 — OpenAlex 파서·초록복원·license 순수 단위 테스트 (plan safety-library-b3-1).
|
||||
|
||||
fixture = OpenAlex /works 실응답 박제(process safety/pressure vessel OA 5건 —
|
||||
cc-by/cc-by-nc-nd/license None, 초록 있음/없음). run()/적재(DB)는 PR3 라이브 검증.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "app"))
|
||||
|
||||
from workers.openalex_collector import ( # noqa: E402
|
||||
_reconstruct_abstract,
|
||||
build_filter,
|
||||
license_meta,
|
||||
parse_openalex_works,
|
||||
)
|
||||
|
||||
FIX = Path(__file__).parent / "fixtures" / "openalex_works_response.json"
|
||||
|
||||
|
||||
def _works():
    """Parse the recorded OpenAlex fixture for a test.

    Returns a ``(count, by_id, works)`` triple: the count value returned by
    ``parse_openalex_works``, the parsed works keyed by ``openalex_id``, and
    the parsed works exactly as the parser returned them. The paging cursor
    that the parser also returns is not needed by these tests.
    """
    payload = FIX.read_text(encoding="utf-8")
    total, _cursor, parsed = parse_openalex_works(payload)
    by_id = {work.openalex_id: work for work in parsed}
    return total, by_id, parsed
|
||||
|
||||
|
||||
# ─── 피드 레벨 ───
|
||||
|
||||
def test_count_and_results():
    """The fixture yields the expected total count and five short-form work IDs."""
    total, _, parsed = _works()
    assert total == 1111
    assert len(parsed) == 5
    # IDs must be the bare "W…" form, never a value containing "/".
    for work in parsed:
        assert work.openalex_id.startswith("W") and "/" not in work.openalex_id
|
||||
|
||||
|
||||
# ─── 초록 보유 + CC 라이선스 ───
|
||||
|
||||
def test_work_with_abstract_and_cc():
    """A work with an abstract and a ``cc-by`` license parses and is redistributable."""
    work = _works()[1]["W2910511816"]
    doi = work.doi
    # The DOI is expected in normalized form: bare "10." prefix, all lowercase.
    assert doi and doi.startswith("10.") and doi == doi.lower()
    # The abstract is restored from the inverted index, so it must be non-trivial.
    assert len(work.abstract) > 50
    assert work.oa_status == "diamond" and work.is_oa is True
    assert work.license == "cc-by"
    meta = license_meta(work.license, work.is_oa, work.source_name)
    assert meta["redistribute"] is True
|
||||
|
||||
|
||||
# ─── 초록 없는 thin 레코드(skip 대상) ───
|
||||
|
||||
def test_work_without_abstract():
    """A thin record with no abstract yields an empty abstract and is not redistributable."""
    works_by_id = _works()[1]
    work = works_by_id["W3107397139"]
    # No inverted index in the record -> empty abstract.
    assert work.abstract == ""
    meta = license_meta(work.license, work.is_oa, work.source_name)
    # License is None for this record -> not redistributable.
    assert meta["redistribute"] is False
|
||||
|
||||
|
||||
# ─── cc-by-nc-nd 도 CC 계열 → redistribute True ───
|
||||
|
||||
def test_cc_variant_redistribute():
    """``cc-by-nc-nd`` is treated as a CC-family license, so redistribute is True."""
    work = _works()[1]["W4391130399"]
    assert work.license == "cc-by-nc-nd"
    meta = license_meta(work.license, work.is_oa, work.source_name)
    assert meta["redistribute"] is True
|
||||
|
||||
|
||||
# ─── 초록 inverted-index 복원 순서 ───
|
||||
|
||||
def test_reconstruct_abstract_order():
    """Words are placed by their inverted-index positions; empty input gives ""."""
    inverted_index = {
        "Safety": [0],
        "of": [1, 4],  # a repeated word appears at every listed position
        "pressure": [2],
        "vessels": [3],
        "design": [5],
    }
    rebuilt = _reconstruct_abstract(inverted_index)
    assert rebuilt == "Safety of pressure vessels of design"
    # Both a missing index and an empty index produce an empty abstract.
    for empty_input in (None, {}):
        assert _reconstruct_abstract(empty_input) == ""
|
||||
|
||||
|
||||
# ─── license_meta 분기 ───
|
||||
|
||||
def test_license_meta_branches():
    """Cover the license_meta branches: explicit CC, OA without license, closed."""
    # An explicit CC-family license is redistributable.
    for cc_license in ("cc-by", "cc0"):
        assert license_meta(cc_license, True, "X")["redistribute"] is True

    # Open access but no license stated.
    open_unlicensed = license_meta(None, True, "X")
    assert open_unlicensed["redistribute"] is False
    assert open_unlicensed["scheme"] == "open-unspecified"

    # Not open access and no license stated.
    closed_access = license_meta(None, False, "X")
    assert closed_access["redistribute"] is False
    assert closed_access["scheme"] == "proprietary"
|
||||
|
||||
|
||||
# ─── 쿼리 빌더 ───
|
||||
|
||||
def test_build_filter():
    """The filter is the keyword search, plus a date clause when a date is given."""
    keyword_only = build_filter("process safety")
    assert keyword_only == "title_and_abstract.search:process safety"

    with_date = build_filter("process safety", "2026-06-01")
    assert with_date == (
        "title_and_abstract.search:process safety,from_publication_date:2026-06-01"
    )
|
||||
Reference in New Issue
Block a user