fdabca2a2f
plan safety-library-b3-1 PR6 (revised). 라이브 정찰: KoreaScience=깨끗한 API 없음(OAI 404)· J-STAGE=ToS bulk 금지, 그리고 Phase-1 메타는 OpenAlex 가 이미 전수 색인(한국안전학회지 1766건 실측) → 전용 스크래퍼 대신 검증된 OpenAlex 수집기에 도메인 저널 ISSN 시드 추가(전용 무료 전문 PDF=Phase-2 park). - _JOURNAL_ISSNS(OpenAlex sources 실측): 한국안전학회지 1738-3803·한국가스학회지 1226-8402· KSME A/B 1226-4873·1226-4881·KSME Intl 1226-4865·JP 고압 0917-639X. - _seeds() = ISSN 시드(cap 우선) + 키워드. build_issn_filter(primary_location.source.issn:). run() 루프 통합(종류별 필터, 워터마크 시드별). 적재/parser/cap/signal-only = PR3 재사용. 단위 8 passed(+ISSN 시드). 라이브 PASS: 키주입 run → 한국안전학회지 5건 적재(ISSN 우선 확인), running fastapi 무접촉. KoreaScience/J-STAGE 전용 fulltext 수집기 = Phase-2 강등(park). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
107 lines
3.8 KiB
Python
107 lines
3.8 KiB
Python
"""B-3 PR3 — OpenAlex 파서·초록복원·license 순수 단위 테스트 (plan safety-library-b3-1).
|
|
|
|
fixture = OpenAlex /works 실응답 박제(process safety/pressure vessel OA 5건 —
|
|
cc-by/cc-by-nc-nd/license None, 초록 있음/없음). run()/적재(DB)는 PR3 라이브 검증.
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Put the "app" directory that sits next to this file's parent directory at
# the front of sys.path so the `workers.openalex_collector` import resolves.
sys.path.insert(0, str(Path(__file__).parent.parent / "app"))
|
|
|
|
from workers.openalex_collector import ( # noqa: E402
|
|
_reconstruct_abstract,
|
|
_seeds,
|
|
build_filter,
|
|
build_issn_filter,
|
|
license_meta,
|
|
parse_openalex_works,
|
|
)
|
|
|
|
# Path to the recorded OpenAlex /works response that the parser tests read.
FIX = Path(__file__).parent / "fixtures" / "openalex_works_response.json"
|
|
|
|
|
|
def _works():
    """Parse the recorded fixture and return it in three convenient shapes.

    Returns:
        A ``(count, by_id, works)`` tuple: the count reported by
        ``parse_openalex_works``, a dict mapping each work's
        ``openalex_id`` to the work, and the works exactly as the parser
        returned them.
    """
    payload = FIX.read_text(encoding="utf-8")
    # The second value returned by the parser (a cursor) is unused here.
    total, _cursor, parsed = parse_openalex_works(payload)
    index = {work.openalex_id: work for work in parsed}
    return total, index, parsed
|
|
|
|
|
|
# ─── 피드 레벨 ───
|
|
|
|
def test_count_and_results():
    """Feed-level checks: reported count, number of works, and ID shape."""
    total, _, parsed = _works()
    assert total == 1111
    assert len(parsed) == 5
    # Every ID must start with "W" and contain no slash.
    for work in parsed:
        assert work.openalex_id.startswith("W")
        assert "/" not in work.openalex_id
|
|
|
|
|
|
# ─── 초록 보유 + CC 라이선스 ───
|
|
|
|
def test_work_with_abstract_and_cc():
    """A work with an abstract and a cc-by license is redistributable."""
    _, by_id, _ = _works()
    work = by_id["W2910511816"]

    # DOI is present, starts with "10." and is lowercase (normalize_doi).
    doi = work.doi
    assert doi
    assert doi.startswith("10.")
    assert doi == doi.lower()

    # The abstract restored from the inverted index is non-trivial in length.
    assert len(work.abstract) > 50

    assert work.oa_status == "diamond"
    assert work.is_oa is True
    assert work.license == "cc-by"

    meta = license_meta(work.license, work.is_oa, work.source_name)
    assert meta["redistribute"] is True
|
|
|
|
|
|
# ─── 초록 없는 thin 레코드(skip 대상) ───
|
|
|
|
def test_work_without_abstract():
    """A thin record (no abstract, no license) is not redistributable."""
    _, by_id, _ = _works()
    thin = by_id["W3107397139"]

    # No inverted index for this work, so the abstract is empty.
    assert thin.abstract == ""

    # A license of None means the work must not be redistributed.
    meta = license_meta(thin.license, thin.is_oa, thin.source_name)
    assert meta["redistribute"] is False
|
|
|
|
|
|
# ─── cc-by-nc-nd 도 CC 계열 → redistribute True ───
|
|
|
|
def test_cc_variant_redistribute():
    """cc-by-nc-nd is also treated as CC-family and redistributable."""
    _, by_id, _ = _works()
    work = by_id["W4391130399"]

    assert work.license == "cc-by-nc-nd"

    meta = license_meta(work.license, work.is_oa, work.source_name)
    assert meta["redistribute"] is True
|
|
|
|
|
|
# ─── 초록 inverted-index 복원 순서 ───
|
|
|
|
def test_reconstruct_abstract_order():
    """Words are emitted in position order; None or {} yields ""."""
    inverted_index = {
        "Safety": [0],
        "of": [1, 4],  # a repeated word occupies two positions
        "pressure": [2],
        "vessels": [3],
        "design": [5],
    }
    expected = "Safety of pressure vessels of design"
    assert _reconstruct_abstract(inverted_index) == expected

    for missing in (None, {}):
        assert _reconstruct_abstract(missing) == ""
|
|
|
|
|
|
# ─── license_meta 분기 ───
|
|
|
|
def test_license_meta_branches():
    """Cover the CC, open-without-license, and closed branches of license_meta."""
    for cc_license in ("cc-by", "cc0"):
        assert license_meta(cc_license, True, "X")["redistribute"] is True

    # Open access but no license given.
    open_unlicensed = license_meta(None, True, "X")
    assert open_unlicensed["redistribute"] is False
    assert open_unlicensed["scheme"] == "open-unspecified"

    # Not open access and no license given.
    closed_access = license_meta(None, False, "X")
    assert closed_access["redistribute"] is False
    assert closed_access["scheme"] == "proprietary"
|
|
|
|
|
|
# ─── 쿼리 빌더 ───
|
|
|
|
def test_build_filter():
    """Keyword filter string, with and without a from_publication_date clause."""
    keyword = "process safety"

    plain = build_filter(keyword)
    assert plain == "title_and_abstract.search:process safety"

    dated = build_filter(keyword, "2026-06-01")
    expected = "title_and_abstract.search:process safety,from_publication_date:2026-06-01"
    assert dated == expected
|
|
|
|
|
|
# ─── PR6: ISSN 소스 시드 (KR/JP 안전 저널 직접 커버) ───
|
|
|
|
def test_build_issn_filter_and_seeds():
    """ISSN filter strings, plus the ordering and content of the seed list."""
    issn = "1738-3803"

    plain = build_issn_filter(issn)
    assert plain == "primary_location.source.issn:1738-3803"

    dated = build_issn_filter(issn, "2026-01-01")
    expected = "primary_location.source.issn:1738-3803,from_publication_date:2026-01-01"
    assert dated == expected

    seeds = _seeds()

    # ISSN seeds come before keyword seeds so they get priority under the cap.
    kinds = [kind for _, _, kind in seeds]
    assert kinds[0] == "issn"

    # The Journal of the Korean Society of Safety must be seeded by ISSN.
    value_kind_pairs = [(value, kind) for _, value, kind in seeds]
    assert ("1738-3803", "issn") in value_kind_pairs
|