Patch summary (restored from a whitespace-collapsed copy; indentation and blank
context lines were re-derived from Python syntax and the hunk line counts, so
verify against the target tree before applying):

  * app/workers/openalex_collector.py: add the _JOURNAL_ISSNS (label, ISSN)
    seed table, build_issn_filter() and _seeds(); run() now iterates _seeds()
    (ISSN seeds first, then keywords), keys the watermark by ISSN or keyword,
    and labels failure messages with the seed label.
  * tests/test_openalex_collector_units.py: cover build_issn_filter() and the
    ISSN-first ordering of _seeds().

diff --git a/app/workers/openalex_collector.py b/app/workers/openalex_collector.py
index f7b6e35..a52c867 100644
--- a/app/workers/openalex_collector.py
+++ b/app/workers/openalex_collector.py
@@ -52,6 +52,18 @@ _KEYWORDS = (
     "fatigue life assessment",
 )
 
+# 도메인 직결 저널 ISSN 시드(OpenAlex sources 실측 확인) — 키워드 매칭 누락분까지 전수 커버.
+# KR 안전/가스/기계 + JP 고압. KR/JP 관심 = OpenAlex 깨끗한 API 로 직접(KoreaScience/J-STAGE 전용
+# 스크래퍼 불요 — Phase-1 메타는 OpenAlex 와 중복, 전용 수집기의 유니크 가치=무료 전문 PDF=Phase-2).
+_JOURNAL_ISSNS = (
+    ("한국안전학회지", "1738-3803"),
+    ("한국가스학회지", "1226-8402"),
+    ("대한기계학회논문집 A", "1226-4873"),
+    ("대한기계학회논문집 B", "1226-4881"),
+    ("KSME International J.", "1226-4865"),
+    ("Review of High Pressure Sci&Tech (JP)", "0917-639X"),
+)
+
 _RUN_CAP = 60  # 1회 run 신규 적재 상한(임베드 큐 보호). bulk 시 해제.
 _PER_PAGE = 50
 _MAX_PAGES_PER_KW = 4  # 키워드당 최대 페이지(증분이라 보통 1페이지에 워터마크 도달)
@@ -145,6 +157,20 @@ def build_filter(keyword: str, from_date: str | None = None) -> str:
     return f
 
 
+def build_issn_filter(issn: str, from_date: str | None = None) -> str:
+    f = f"primary_location.source.issn:{issn}"
+    if from_date:
+        f += f",from_publication_date:{from_date}"
+    return f
+
+
+def _seeds() -> list[tuple[str, str, str]]:
+    """수집 시드 = (라벨, 워터마크키, 종류). 도메인 저널 ISSN 우선(cap 우선권) → 키워드."""
+    s: list[tuple[str, str, str]] = [(label, issn, "issn") for label, issn in _JOURNAL_ISSNS]
+    s += [(kw, kw, "kw") for kw in _KEYWORDS]
+    return s
+
+
 # ───────────────────────── 적재 (DB — PR3 라이브 검증) ─────────────────────────
 
 def _build_paper_meta(source: NewsSource, w: OpenAlexWork) -> dict:
@@ -296,13 +322,14 @@ async def run(bulk: bool = False, limit: int = 0) -> None:
     async with httpx.AsyncClient(
         timeout=30.0, headers={"User-Agent": CRAWL_UA}, follow_redirects=True
     ) as client:
-        for keyword in _KEYWORDS:
+        for label, wm_key, kind in _seeds():
             if inserted >= run_cap:
                 break
             async with async_session() as session:
                 src = await session.get(NewsSource, source_id)
-            watermark = None if bulk else _watermark(src, keyword)
-            filter_str = build_filter(keyword, watermark)
+            watermark = None if bulk else _watermark(src, wm_key)
+            filter_str = (build_issn_filter(wm_key, watermark) if kind == "issn"
+                          else build_filter(wm_key, watermark))
             newest: str | None = None
             cursor = "*"
             max_pages = (10**6 if bulk else _MAX_PAGES_PER_KW)
@@ -334,10 +361,10 @@ async def run(bulk: bool = False, limit: int = 0) -> None:
                 if newest:
                     async with async_session() as session:
                         src = await session.get(NewsSource, source_id)
-                        _set_watermark(src, keyword, newest)
+                        _set_watermark(src, wm_key, newest)
                         await session.commit()
             except (httpx.HTTPError, FeedError, ValueError) as e:
-                msg = f"[{keyword}] {e or repr(e)}"
+                msg = f"[{label}] {e or repr(e)}"
                 logger.error(f"[openalex] {msg}")
                 failures.append(msg)
 
@@ -351,7 +378,7 @@ async def run(bulk: bool = False, limit: int = 0) -> None:
 
     deferred = "" if inserted < run_cap else f" (cap {run_cap} 도달 — 잔여 다음 run 이월)"
     logger.info(
-        f"[openalex] {len(_KEYWORDS)}개 키워드 스캔 {seen}건 → 신규 {inserted}건{deferred}"
+        f"[openalex] {len(_seeds())}개 시드(ISSN+키워드) 스캔 {seen}건 → 신규 {inserted}건{deferred}"
         + (f" / 실패 {len(failures)}건" if failures else "")
     )
 
diff --git a/tests/test_openalex_collector_units.py b/tests/test_openalex_collector_units.py
index 5e17479..afee7b0 100644
--- a/tests/test_openalex_collector_units.py
+++ b/tests/test_openalex_collector_units.py
@@ -11,7 +11,9 @@ sys.path.insert(0, str(Path(__file__).parent.parent / "app"))
 
 from workers.openalex_collector import (  # noqa: E402
     _reconstruct_abstract,
+    _seeds,
     build_filter,
+    build_issn_filter,
     license_meta,
     parse_openalex_works,
 )
@@ -90,3 +92,15 @@ def test_build_filter():
     assert build_filter("process safety") == "title_and_abstract.search:process safety"
     assert build_filter("process safety", "2026-06-01") == \
         "title_and_abstract.search:process safety,from_publication_date:2026-06-01"
+
+
+# ─── PR6: ISSN 소스 시드 (KR/JP 안전 저널 직접 커버) ───
+
+def test_build_issn_filter_and_seeds():
+    assert build_issn_filter("1738-3803") == "primary_location.source.issn:1738-3803"
+    assert build_issn_filter("1738-3803", "2026-01-01") == \
+        "primary_location.source.issn:1738-3803,from_publication_date:2026-01-01"
+    seeds = _seeds()
+    kinds = [k for _, _, k in seeds]
+    assert kinds[0] == "issn"  # ISSN 시드가 키워드보다 먼저(cap 우선권)
+    assert any(v == "1738-3803" and k == "issn" for _, v, k in seeds)  # 한국안전학회지 포함