From 1fbb341e28fa3ec65cddb19f262cad35c9d8febe Mon Sep 17 00:00:00 2001 From: Claude Code Date: Sat, 13 Jun 2026 22:30:36 +0000 Subject: [PATCH] =?UTF-8?q?feat(papers):=20B-3=20PR3=20=E2=80=94=20OpenAle?= =?UTF-8?q?x=20=EB=B0=B1=EB=B3=B8=20=EC=88=98=EC=A7=91=EA=B8=B0=20(scaffol?= =?UTF-8?q?d-first,=20signal-only,=20per-run=20cap)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit plan safety-library-b3-1 PR3. 발견+dedup 글로벌 백본(JP/EU/US 색인+정본 DOI, 전문 안 줌). - scaffold-first: OPENALEX_API_KEY 부재 시 FeedError explicit-skip(silent fallback 0). 키=무료. - signal-only: inverted-index 초록 복원→색인(embed+chunk), summarize 0. PDF 절대 미fetch(oa_url=신호). - 관련성 사전필터=title_and_abstract.search 키워드 + per-run cap 60(임베드 firehose 차단, 적대리뷰 A major) + cursor 페이징 + from_publication_date 워터마크 증분. 초록 없는 thin 레코드 skip(재료 품질). - license: 명시 CC→redistribute true / OA·closed→false(restricted 부재=초록 RAG 사용가능, 비-CC 전문은 L-1 Phase-2). - DOI→paper.doi(holder, 교차소스 dedup) / 없으면 openalex_id. enabled=False 행+add_job(daily 07:45 KST)+CLI. 순수 파서/초록복원/license_meta fixture 단위 7 passed(OpenAlex 실응답: cc-by/cc-by-nc-nd/None·초록 유무). 라이브 검증 PASS (prod, running fastapi 무접촉): 키없음→explicit-skip / 키주입→3건 적재 (paper/NULL/ai_summary NULL/region INT, cc-by→redist true·unspecified→false, green/gold, 큐 embed3+chunk3·summarize 0, distinct openalex_id=total, 교차소스 DOI 4 distinct 4 중복 0). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- app/main.py | 4 + app/workers/openalex_collector.py | 366 ++++++++++++++++++++ tests/fixtures/openalex_works_response.json | 1 + tests/test_openalex_collector_units.py | 92 +++++ 4 files changed, 463 insertions(+) create mode 100644 app/workers/openalex_collector.py create mode 100644 tests/fixtures/openalex_works_response.json create mode 100644 tests/test_openalex_collector_units.py diff --git a/app/main.py b/app/main.py index 85f3cde..ca85f58 100644 --- a/app/main.py +++ b/app/main.py @@ -57,6 +57,7 @@ async def lifespan(app: FastAPI): from workers.statute_collector import run as statute_run from workers.news_collector import run as news_collector_run from workers.arxiv_collector import run as arxiv_collector_run + from workers.openalex_collector import run as openalex_collector_run from workers.fulltext_worker import reconcile_unresolved as fulltext_reconcile_run from workers.kosha_collector import run as kosha_collector_run from workers.csb_collector import run as csb_collector_run @@ -151,6 +152,9 @@ async def lifespan(app: FastAPI): # B-3 PR2: arXiv 키워드 필터 수집기 (daily 07:30 KST — statute 07:00 직후 빈 슬롯). # signal-only 초록 색인, per-run cap 으로 임베드 큐 보호. keyless. scheduler.add_job(arxiv_collector_run, CronTrigger(hour=7, minute=30, timezone=KST), id="arxiv_collector") + # B-3 PR3: OpenAlex 백본 수집기 (daily 07:45 KST). scaffold-first(키 부재 explicit-skip), + # signal-only 초록 색인, per-run cap + cursor watermark. 키=OPENALEX_API_KEY(credentials.env). + scheduler.add_job(openalex_collector_run, CronTrigger(hour=7, minute=45, timezone=KST), id="openalex_collector") scheduler.start() # Phase 2.1 (async 구조): QueryAnalyzer prewarm. diff --git a/app/workers/openalex_collector.py b/app/workers/openalex_collector.py new file mode 100644 index 0000000..f7b6e35 --- /dev/null +++ b/app/workers/openalex_collector.py @@ -0,0 +1,366 @@ +"""OpenAlex 백본 수집기 — B-3 PR3 (plan safety-library-b3-1). 
+ +OpenAlex = 발견+dedup 글로벌 백본(JP/EU/US 논문 다 색인 + 정본 DOI). 전문은 안 줌(oa_url 포인터만). +- scaffold-first: OPENALEX_API_KEY 부재 시 FeedError(explicit-skip, silent fallback 금지). 키=무료. +- signal-only: 초록(inverted-index 복원)만 색인(embed+chunk), summarize 절대 미enqueue(맥미니 큐 무접촉). + PDF 는 절대 OpenAlex 경유로 안 받음(oa_url 은 링크/신호일 뿐). +- 관련성 사전필터 = title_and_abstract.search 키워드(서버측) + per-run insert cap(임베드 firehose 차단, + 적대리뷰 A major). cursor 페이징 + from_publication_date 워터마크로 증분. +- 초록 없는 thin 레코드(주로 비-OA 메타)는 skip — Phase-1 재료 품질 유지. +- DOI → paper.doi(holder, partial-unique 인덱스, 교차소스 dedup). 없으면 openalex id fallback. +- license: 명시 CC → redistribute=true / 그 외 OA·closed → false(restricted 부재 = 초록 RAG 사용 가능). +- enabled=False news_sources 행 + main.py CronTrigger(자체 폴링). list+filter 비용 미미($1/일 크레딧). +""" + +import asyncio +import hashlib +import json +import os +from dataclasses import dataclass +from datetime import date, datetime, timezone + +import httpx +from sqlalchemy import select + +from core.crawl_politeness import CRAWL_UA +from core.database import async_session +from core.utils import setup_logger +from models.document import Document +from models.news_source import NewsSource +from models.queue import enqueue_stage +from services.papers.doi import normalize_doi +from services.papers.holder import find_paper_holder +from workers.news_collector import ( + FeedError, + _get_or_create_health, + _record_failure, + _record_success, +) + +logger = setup_logger("openalex_collector") + +_API = "https://api.openalex.org/works" +_SOURCE_NAME = "OpenAlex 안전·공학 (keyword)" +_ENV_KEY = "OPENALEX_API_KEY" + +# 압력용기·공정안전·구조건전성 도메인 키워드(키워드별 1쿼리 = 관련성 사전필터). +_KEYWORDS = ( + "pressure vessel safety", + "process safety", + "structural integrity", + "fracture mechanics", + "fatigue life assessment", +) + +_RUN_CAP = 60 # 1회 run 신규 적재 상한(임베드 큐 보호). bulk 시 해제. 
+_PER_PAGE = 50 +_MAX_PAGES_PER_KW = 4 # 키워드당 최대 페이지(증분이라 보통 1페이지에 워터마크 도달) +_REQ_SLEEP = 1.0 # 페이지 간 polite 간격 +_MAX_RETRY = 4 +_BACKOFF_BASE = 5.0 + + +# ───────────────────────── 순수 파서 (fixture 단위 테스트 대상) ───────────────────────── + +@dataclass +class OpenAlexWork: + openalex_id: str # "W2910511816" + doi: str | None # normalize_doi 적용 + title: str + abstract: str # inverted-index 복원 (없으면 "") + publication_date: str | None + oa_status: str | None # closed/green/bronze/hybrid/gold/diamond + oa_url: str | None + is_oa: bool + license: str | None # cc-by / cc-by-nc-nd / None + source_name: str | None + primary_topic: str | None + work_type: str | None + + +def _clean(text): + return " ".join(text.split()).strip() if text else "" + + +def _reconstruct_abstract(inv: dict | None) -> str: + """abstract_inverted_index({word:[positions]}) → 평문 초록. 없으면 ''.""" + if not inv: + return "" + positions = [(pos, word) for word, idxs in inv.items() for pos in idxs] + positions.sort() + return " ".join(w for _, w in positions) + + +def license_meta(license_str: str | None, is_oa: bool, source_name: str | None) -> dict: + """extract_meta.license — 명시 CC/public-domain 만 redistribute=true. restricted 부재(초록 색인 자유). + + redistribute=false 라도 restricted 가 없으면 RAG 사용 가능(초록). 비-CC 전문의 RAG verbatim 차단은 + Phase-2 전문 승격 단계가 restricted=true 로 처리(L-1) — Phase-1(초록)은 무해. + """ + attribution = source_name or "OpenAlex" + if license_str and (license_str.startswith("cc") or license_str == "public-domain"): + return {"scheme": license_str, "redistribute": True, "attribution": attribution} + return { + "scheme": "open-unspecified" if is_oa else "proprietary", + "redistribute": False, + "attribution": attribution, + } + + +def parse_openalex_works(json_text: str) -> tuple[int, str | None, list[OpenAlexWork]]: + """OpenAlex /works 응답 → (count, next_cursor, [OpenAlexWork]). 
순수 함수.""" + d = json.loads(json_text) + meta = d.get("meta") or {} + count = meta.get("count") or 0 + next_cursor = meta.get("next_cursor") + works: list[OpenAlexWork] = [] + for w in d.get("results") or []: + oid = (w.get("id") or "").rstrip("/").rsplit("/", 1)[-1] + if not oid: + continue + oa = w.get("open_access") or {} + pl = w.get("primary_location") or {} + pt = w.get("primary_topic") or {} + works.append(OpenAlexWork( + openalex_id=oid, + doi=normalize_doi(w.get("doi")), + title=_clean(w.get("title")), + abstract=_reconstruct_abstract(w.get("abstract_inverted_index")), + publication_date=w.get("publication_date"), + oa_status=oa.get("oa_status"), + oa_url=oa.get("oa_url") or None, + is_oa=bool(oa.get("is_oa")), + license=pl.get("license"), + source_name=(pl.get("source") or {}).get("display_name"), + primary_topic=pt.get("display_name"), + work_type=w.get("type"), + )) + return count, next_cursor, works + + +def build_filter(keyword: str, from_date: str | None = None) -> str: + f = f"title_and_abstract.search:{keyword}" + if from_date: + f += f",from_publication_date:{from_date}" + return f + + +# ───────────────────────── 적재 (DB — PR3 라이브 검증) ───────────────────────── + +def _build_paper_meta(source: NewsSource, w: OpenAlexWork) -> dict: + paper: dict = {"openalex_id": w.openalex_id} + if w.doi: + paper["doi"] = w.doi # partial-unique 인덱스 진입(교차소스 dedup) + if w.oa_status: + paper["oa_status"] = w.oa_status + if w.oa_url: + paper["oa_url"] = w.oa_url # 링크/신호 — 자동 fetch 안 함 + if w.primary_topic: + paper["topic"] = w.primary_topic + meta: dict = { + "source_id": source.id, + "source_name": source.name, + "source_region": "INT", # OpenAlex = 글로벌. paper.jurisdiction 은 NULL 유지(A-2). + "paper": paper, + "license": license_meta(w.license, w.is_oa, w.source_name), + } + if w.publication_date: + meta["published_at"] = w.publication_date + return meta + + +async def _ingest_work(session, source: NewsSource, w: OpenAlexWork) -> bool: + """1건 적재. 반환 = 신규 여부. 
# ───────────────────────── Ingestion (DB — verified live in PR3) ─────────────────────────

def _build_paper_meta(source: NewsSource, w: OpenAlexWork) -> dict:
    """Assemble the ``extract_meta`` payload stored on the Document for work *w*."""
    paper: dict = {"openalex_id": w.openalex_id}
    if w.doi:
        paper["doi"] = w.doi  # enters the partial-unique index (cross-source dedup)
    if w.oa_status:
        paper["oa_status"] = w.oa_status
    if w.oa_url:
        paper["oa_url"] = w.oa_url  # link/signal only — never fetched automatically
    if w.primary_topic:
        paper["topic"] = w.primary_topic
    meta: dict = {
        "source_id": source.id,
        "source_name": source.name,
        "source_region": "INT",  # OpenAlex is global; paper.jurisdiction stays NULL (A-2)
        "paper": paper,
        "license": license_meta(w.license, w.is_oa, w.source_name),
    }
    if w.publication_date:
        meta["published_at"] = w.publication_date
    return meta


async def _ingest_work(session, source: NewsSource, w: OpenAlexWork) -> bool:
    """Store one work as a Document and enqueue it. Returns True only for a new row.

    Signal-only: the abstract is indexed through the "embed" and "chunk" stages
    and nothing else is enqueued. Returns False, without adding anything, when

    - the work has no abstract (thin record, mostly non-OA metadata);
    - a Document with the same OpenAlex-id hash already exists; or
    - the DOI already has a holder (another source ingested the paper first).

    The caller owns the transaction: this function only adds and flushes.
    """
    if not w.abstract:
        return False
    oid_hash = hashlib.sha256(f"openalex|{w.openalex_id}".encode()).hexdigest()[:32]
    existing = await session.execute(
        select(Document.id).where(Document.file_hash == oid_hash).limit(1)
    )
    # Compare against None explicitly so a falsy primary key still counts as a hit.
    if existing.scalars().first() is not None:
        return False
    if w.doi and await find_paper_holder(session, w.doi):
        return False  # cross-source dedup (e.g. arXiv already holds this DOI)

    pub_date = None
    if w.publication_date:
        try:
            pub_date = date.fromisoformat(w.publication_date)
        except ValueError:
            pub_date = None  # unparseable date: keep the record, drop the date
    body = w.abstract
    doc = Document(
        file_path=f"crawl/openalex/{w.openalex_id}",
        file_hash=oid_hash,
        file_format="article",
        file_size=len(body.encode()),
        file_type="note",
        title=w.title,
        extracted_text=f"{w.title}\n\n{body}",
        extracted_at=datetime.now(timezone.utc),
        extractor_version="openalex-signal",
        md_status="skipped",
        md_extraction_error="OpenAlex abstract: signal-only, markdown 비대상",
        source_channel="crawl",
        data_origin="external",
        edit_url=w.oa_url or f"https://openalex.org/{w.openalex_id}",
        review_status="approved",
        material_type="paper",
        jurisdiction=None,
        published_date=pub_date,
        extract_meta=_build_paper_meta(source, w),
    )
    session.add(doc)
    await session.flush()  # assigns doc.id, which enqueue_stage needs
    await enqueue_stage(session, doc.id, "embed")
    await enqueue_stage(session, doc.id, "chunk")
    return True


async def _get_or_create_source(session) -> NewsSource:
    """Return the NewsSource row named ``_SOURCE_NAME``, creating it (enabled=False) if absent."""
    result = await session.execute(
        select(NewsSource).where(NewsSource.name == _SOURCE_NAME)
    )
    source = result.scalars().first()
    if source is None:
        source = NewsSource(
            name=_SOURCE_NAME, feed_url=_API, feed_type="json",
            fetch_method="signal-only", fulltext_policy="none",
            source_channel="crawl", category="Engineering", language="en",
            country=None, material_type="paper",
            license_scheme="openalex", license_redistribute=False,
            enabled=False,
        )
        session.add(source)
        await session.flush()
    return source


def _api_key() -> str:
    """Return the OpenAlex API key from the environment.

    Raises:
        FeedError: the variable is unset or blank (explicit skip, no silent fallback).
    """
    key = os.getenv(_ENV_KEY, "").strip()
    if not key:
        raise FeedError(f"{_ENV_KEY} 미설정 — OpenAlex 수집 불가 (scaffold-first explicit-skip)")
    return key


def _watermark(source: NewsSource, keyword: str) -> str | None:
    """Return the stored watermark date for *keyword*, or None when there is none."""
    overrides = source.selector_override or {}
    # `or {}` also covers an explicit null under the key, matching _set_watermark.
    return (overrides.get("openalex_watermark") or {}).get(keyword)


def _set_watermark(source: NewsSource, keyword: str, value: str) -> None:
    """Store *value* as the watermark for *keyword* on ``source.selector_override``."""
    # Copy both levels and reassign the attribute instead of mutating in place.
    cfg = dict(source.selector_override or {})
    wm = dict(cfg.get("openalex_watermark") or {})
    wm[keyword] = value
    cfg["openalex_watermark"] = wm
    source.selector_override = cfg


async def _fetch(client: httpx.AsyncClient, key: str, filter_str: str, cursor: str) -> str:
    """GET one page of ``/works`` and return the response body text.

    HTTP 429 is retried up to ``_MAX_RETRY`` times with exponential backoff
    (``_BACKOFF_BASE * 2**attempt``).

    Raises:
        FeedError: a non-2xx status other than 429, or 429 on every attempt.
        httpx.HTTPError: transport-level failures raised by the client.
    """
    params = {
        "filter": filter_str, "per-page": _PER_PAGE, "cursor": cursor,
        "sort": "publication_date:desc", "api_key": key,
    }
    for attempt in range(_MAX_RETRY):
        resp = await client.get(_API, params=params)
        if resp.status_code == 429:
            # No point in backing off after the last attempt; the loop ends anyway.
            if attempt < _MAX_RETRY - 1:
                await asyncio.sleep(_BACKOFF_BASE * (2 ** attempt))
            continue
        if not 200 <= resp.status_code < 300:
            # Deliberately not resp.raise_for_status(): its error text carries the
            # request URL, and the URL carries api_key. The caller logs the
            # message and stores it in the health row, so the key would leak.
            raise FeedError(f"OpenAlex HTTP {resp.status_code}: {filter_str[:48]}")
        return resp.text
    raise FeedError(f"OpenAlex 429 재시도 초과: {filter_str[:48]}")
async def run(bulk: bool = False, limit: int = 0) -> None:
    """Daily entry point, called by the scheduler and by the CLI below.

    Args:
        bulk: ignore stored watermarks and lift the page budget (backfill).
            The insert cap becomes *limit*, or effectively unlimited when
            *limit* is 0.
        limit: upper bound on newly inserted documents. 0 means the default
            ``_RUN_CAP``; outside bulk mode it can only lower the cap.

    A missing API key is an explicit skip: it is logged, recorded as a health
    failure, and the function returns without fetching anything.

    Watermark handling: results arrive newest-first, so when the insert cap
    interrupts a keyword mid-scan the records not yet reached are *older* than
    the newest date seen. In that case the keyword's watermark is left
    untouched, so the next run covers the same window again (records already
    stored are skipped as duplicates and do not count against the cap).
    """
    now = datetime.now(timezone.utc)
    async with async_session() as session:
        source = await _get_or_create_source(session)
        # Read the id before committing so it does not depend on whether the
        # session expires attributes on commit (NOTE(review): configuration of
        # async_session is not visible here).
        source_id = source.id
        await session.commit()

    try:
        key = _api_key()
    except FeedError as e:
        logger.warning(f"[openalex] {e}")
        async with async_session() as session:
            health = await _get_or_create_health(session, source_id)
            _record_failure(health, str(e), now)
            await session.commit()
        return

    run_cap = (limit or 10**9) if bulk else (min(limit, _RUN_CAP) if limit else _RUN_CAP)
    inserted = 0
    seen = 0
    failures: list[str] = []

    async with httpx.AsyncClient(
        timeout=30.0, headers={"User-Agent": CRAWL_UA}, follow_redirects=True
    ) as client:
        for keyword in _KEYWORDS:
            if inserted >= run_cap:
                break  # remaining keywords keep their old watermark for the next run
            async with async_session() as session:
                src = await session.get(NewsSource, source_id)
                watermark = None if bulk else _watermark(src, keyword)
            filter_str = build_filter(keyword, watermark)
            newest: str | None = None
            cursor = "*"
            max_pages = 10**6 if bulk else _MAX_PAGES_PER_KW
            cap_hit = False  # True when the cap stopped this keyword mid-scan
            try:
                for _page in range(max_pages):
                    text = await _fetch(client, key, filter_str, cursor)
                    _count, next_cursor, works = parse_openalex_works(text)
                    if not works:
                        break
                    for w in works:
                        seen += 1
                        # ISO dates compare correctly as strings.
                        if w.publication_date and (newest is None or w.publication_date > newest):
                            newest = w.publication_date
                        # One short transaction per work: a skipped record rolls back alone.
                        async with async_session() as session:
                            src = await session.get(NewsSource, source_id)
                            if await _ingest_work(session, src, w):
                                inserted += 1
                                await session.commit()
                            else:
                                await session.rollback()
                        if inserted >= run_cap:
                            cap_hit = True
                            break
                    if cap_hit:
                        break
                    await asyncio.sleep(_REQ_SLEEP)
                    if not next_cursor:
                        break
                    cursor = next_cursor
                # Advance only when the cap did not cut the scan short; otherwise
                # the unprocessed (older) records would be filtered out for good.
                if newest and not cap_hit:
                    async with async_session() as session:
                        src = await session.get(NewsSource, source_id)
                        _set_watermark(src, keyword, newest)
                        await session.commit()
            except (httpx.HTTPError, FeedError, ValueError) as e:
                # Some httpx errors stringify to "", hence the repr fallback.
                msg = f"[{keyword}] {e or repr(e)}"
                logger.error(f"[openalex] {msg}")
                failures.append(msg)

    async with async_session() as session:
        health = await _get_or_create_health(session, source_id)
        # A run counts as failed only when something failed AND nothing was inserted.
        if failures and inserted == 0:
            _record_failure(health, "; ".join(failures)[:500], now)
        else:
            _record_success(health, inserted, False, now)
        await session.commit()

    deferred = "" if inserted < run_cap else f" (cap {run_cap} 도달 — 잔여 다음 run 이월)"
    logger.info(
        f"[openalex] {len(_KEYWORDS)}개 키워드 스캔 {seen}건 → 신규 {inserted}건{deferred}"
        + (f" / 실패 {len(failures)}건" if failures else "")
    )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="OpenAlex 안전·공학 키워드 백본 수집기")
    parser.add_argument("--bulk", action="store_true", help="cap 해제 + 깊은 cursor 페이징 백필")
    parser.add_argument("--limit", type=int, default=0, help="신규 적재 상한(0=기본 cap)")
    args = parser.parse_args()
    asyncio.run(run(bulk=args.bulk, limit=args.limit))
"/works?filter=open_access.is_oa:true,title_and_abstract.search:process safety pressure vessel&select=id,doi,title,abstract_inverted_index,publication_date,publication_year,type,open_access,primary_location,locations,primary_topic&per_page=5"}, "cost_usd": 0.001}, "results": [{"id": "https://openalex.org/W2910511816", "doi": "https://doi.org/10.1088/1757-899x/469/1/012009", "title": "A critical review and analysis of pressure vessel structures", "abstract_inverted_index": {"This": [0], "paper": [1], "provides": [2], "an": [3], "overview": [4], "of": [5, 28, 54, 89, 95, 107], "the": [6, 18, 29, 41, 47, 52, 65, 70, 84, 90, 93, 96, 102, 108, 119], "pressure": [7, 30, 77, 97, 121], "vessel,": [8, 42], "starting": [9], "with": [10, 51, 114], "its": [11], "background": [12], "and": [13, 25, 56, 87, 104], "a": [14], "brief": [15], "history.": [16], "Then,": [17], "geometry,": [19], "main": [20], "components,": [21], "classification,": [22], "applications,": [23], "materials": [24], "fabrication": [26], "process": [27], "vessel": [31, 98, 122], "are": [32, 112], "also": [33], "discussed.": [34], "When": [35], "designing": [36], "or": [37, 78], "performing": [38], "optimization": [39], "on": [40], "it": [43], "is": [44, 99], "crucial": [45], "for": [46, 118], "designers": [48], "to": [49, 58, 63], "familiar": [50], "types": [53], "failures": [55], "loadings,": [57], "select": [59], "appropriate": [60], "analytical": [61], "methods": [62], "analyse": [64], "vessel.": [66, 91], "As": [67], "well": [68], "as": [69, 74], "design": [71, 76, 94], "parameters": [72], "such": [73], "thickness,": [75], "allowable": [79], "stresses,": [80], "which": [81], "can": [82], "alter": [83], "performance,": [85], "efficiency": [86], "safety": [88], "Since": [92], "governed": [100], "by": [101], "codes": [103, 111], "standards,": [105], "some": [106], "commonly": [109], "used": [110], "presented,": [113], "more": [115], "details": [116], "included": [117], "ASME": [120], "code.": [123]}, 
"publication_date": "2019-01-16", "publication_year": 2019, "type": "review", "open_access": {"is_oa": true, "oa_status": "diamond", "oa_url": "https://doi.org/10.1088/1757-899x/469/1/012009", "any_repository_has_fulltext": false}, "primary_location": {"id": "doi:10.1088/1757-899x/469/1/012009", "is_oa": true, "landing_page_url": "https://doi.org/10.1088/1757-899x/469/1/012009", "pdf_url": null, "source": {"id": "https://openalex.org/S4210189194", "display_name": "IOP Conference Series Materials Science and Engineering", "issn_l": "1757-8981", "issn": ["1757-8981", "1757-899X"], "is_oa": true, "is_in_doaj": false, "is_core": false, "host_organization": "https://openalex.org/P4310320083", "host_organization_name": "IOP Publishing", "host_organization_lineage": ["https://openalex.org/P4310320083", "https://openalex.org/P4310311669"], "host_organization_lineage_names": ["IOP Publishing", "Institute of Physics"], "type": "conference"}, "license": "cc-by", "license_id": "https://openalex.org/licenses/cc-by", "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "IOP Conference Series: Materials Science and Engineering", "raw_type": "journal-article"}, "locations": [{"id": "doi:10.1088/1757-899x/469/1/012009", "is_oa": true, "landing_page_url": "https://doi.org/10.1088/1757-899x/469/1/012009", "pdf_url": null, "source": {"id": "https://openalex.org/S4210189194", "display_name": "IOP Conference Series Materials Science and Engineering", "issn_l": "1757-8981", "issn": ["1757-8981", "1757-899X"], "is_oa": true, "is_in_doaj": false, "is_core": false, "host_organization": "https://openalex.org/P4310320083", "host_organization_name": "IOP Publishing", "host_organization_lineage": ["https://openalex.org/P4310320083", "https://openalex.org/P4310311669"], "host_organization_lineage_names": ["IOP Publishing", "Institute of Physics"], "type": "conference"}, "license": "cc-by", "license_id": "https://openalex.org/licenses/cc-by", "version": 
"publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "IOP Conference Series: Materials Science and Engineering", "raw_type": "journal-article"}], "primary_topic": {"id": "https://openalex.org/T12581", "display_name": "Engineering Structural Analysis Methods", "score": 0.9983999729156494, "subfield": {"id": "https://openalex.org/subfields/2210", "display_name": "Mechanical Engineering"}, "field": {"id": "https://openalex.org/fields/22", "display_name": "Engineering"}, "domain": {"id": "https://openalex.org/domains/3", "display_name": "Physical Sciences"}}}, {"id": "https://openalex.org/W4407429739", "doi": "https://doi.org/10.1016/j.ijhydene.2025.02.108", "title": "A review of type IV composite overwrapped pressure vessels", "abstract_inverted_index": {"Type": [0, 22, 65, 128, 147, 166, 180], "IV": [1, 23, 66, 129, 148, 167, 181], "Composite": [2], "overwrapped": [3], "pressure": [4], "vessels": [5, 24], "(COPVs)": [6], "are": [7, 25], "commonly": [8], "used": [9], "in": [10, 41, 72, 96, 136, 152, 169, 177], "high-pressure": [11], "environments": [12], "such": [13], "as": [14], "aerospace,": [15], "automotive,": [16], "and": [17, 43, 60, 77, 88, 101, 124, 133, 144, 155, 171], "industrial": [18], "sectors.": [19], "The": [20], "COPV": [21], "characterized": [26], "by": [27, 34], "their": [28], "non-metallic": [29], "liner,": [30], "which": [31], "is": [32], "surrounded": [33], "composite": [35, 156], "materials,": [36, 57, 143], "leading": [37], "to": [38, 84], "significant": [39], "reductions": [40], "weight": [42], "enhanced": [44], "storage": [45], "capacity.": [46], "This": [47], "review": [48, 92, 115], "article": [49], "outlines": [50], "the": [51, 55, 81, 91, 108, 121, 137], "essential": [52], "aspects": [53], "of": [54, 111, 127, 141, 146, 162], "design,": [56, 142], "manufacturing": [58, 145], "processes,": [59], "performance": [61], "criteria": [62], "associated": [63], "with": [64], "COPVs.": [67, 112, 149], "It": [68], "also": 
[69], "explores": [70], "advancements": [71, 135], "resin": [73], "systems,": [74], "fiber": [75], "reinforcements,": [76], "coatings,": [78], "while": [79], "addressing": [80], "challenges": [82], "related": [83], "long-term": [85], "durability,": [86], "reliability,": [87], "safety.": [89, 159], "Furthermore,": [90], "emphasizes": [93], "recent": [94], "developments": [95], "testing": [97, 163], "protocols,": [98], "certification": [99], "standards,": [100], "emerging": [102], "research": [103], "trends": [104, 176], "focused": [105], "on": [106], "creating": [107], "next": [109], "generation": [110], "Overall,": [113], "this": [114], "offers": [116], "a": [117], "thorough": [118], "insight": [119], "into": [120], "current": [122], "capabilities": [123], "future": [125], "possibilities": [126], "COPVs,": [130], "promoting": [131], "innovation": [132], "further": [134], "field.": [138], "\u2022": [139, 150, 160, 174], "Review": [140], "Advances": [151], "liner": [153], "fabrication": [154], "overwraps": [157], "for": [158, 165, 183], "Evaluation": [161], "methods": [164], "COPVs": [168, 182], "aerospace": [170], "H\u2082": [172], "storage.": [173], "Future": [175], "cost-effective,": [178], "recyclable": [179], "hydrogen.": [184]}, "publication_date": "2025-02-12", "publication_year": 2025, "type": "review", "open_access": {"is_oa": true, "oa_status": "hybrid", "oa_url": "https://doi.org/10.1016/j.ijhydene.2025.02.108", "any_repository_has_fulltext": false}, "primary_location": {"id": "doi:10.1016/j.ijhydene.2025.02.108", "is_oa": true, "landing_page_url": "https://doi.org/10.1016/j.ijhydene.2025.02.108", "pdf_url": null, "source": {"id": "https://openalex.org/S48860480", "display_name": "International Journal of Hydrogen Energy", "issn_l": "0360-3199", "issn": ["0360-3199", "1879-3487"], "is_oa": false, "is_in_doaj": false, "is_core": true, "host_organization": "https://openalex.org/P4310320990", "host_organization_name": "Elsevier BV", 
"host_organization_lineage": ["https://openalex.org/P4310320990"], "host_organization_lineage_names": ["Elsevier BV"], "type": "journal"}, "license": "cc-by", "license_id": "https://openalex.org/licenses/cc-by", "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "International Journal of Hydrogen Energy", "raw_type": "journal-article"}, "locations": [{"id": "doi:10.1016/j.ijhydene.2025.02.108", "is_oa": true, "landing_page_url": "https://doi.org/10.1016/j.ijhydene.2025.02.108", "pdf_url": null, "source": {"id": "https://openalex.org/S48860480", "display_name": "International Journal of Hydrogen Energy", "issn_l": "0360-3199", "issn": ["0360-3199", "1879-3487"], "is_oa": false, "is_in_doaj": false, "is_core": true, "host_organization": "https://openalex.org/P4310320990", "host_organization_name": "Elsevier BV", "host_organization_lineage": ["https://openalex.org/P4310320990"], "host_organization_lineage_names": ["Elsevier BV"], "type": "journal"}, "license": "cc-by", "license_id": "https://openalex.org/licenses/cc-by", "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "International Journal of Hydrogen Energy", "raw_type": "journal-article"}], "primary_topic": {"id": "https://openalex.org/T10219", "display_name": "Mechanical Behavior of Composites", "score": 0.9961000084877014, "subfield": {"id": "https://openalex.org/subfields/2211", "display_name": "Mechanics of Materials"}, "field": {"id": "https://openalex.org/fields/22", "display_name": "Engineering"}, "domain": {"id": "https://openalex.org/domains/3", "display_name": "Physical Sciences"}}}, {"id": "https://openalex.org/W3107397139", "doi": "https://doi.org/10.1007/978-3-030-64823-7_34", "title": "A Digital Twin for Safety and Risk Management: A Prototype for a Hydrogen High-Pressure Vessel", "abstract_inverted_index": null, "publication_date": "2020-01-01", "publication_year": 2020, "type": "book-chapter", "open_access": 
{"is_oa": true, "oa_status": "green", "oa_url": "https://aaltodoc.aalto.fi/handle/123456789/109586", "any_repository_has_fulltext": true}, "primary_location": {"id": "doi:10.1007/978-3-030-64823-7_34", "is_oa": false, "landing_page_url": "https://doi.org/10.1007/978-3-030-64823-7_34", "pdf_url": null, "source": {"id": "https://openalex.org/S106296714", "display_name": "Lecture notes in computer science", "issn_l": "0302-9743", "issn": ["0302-9743", "1611-3349"], "is_oa": false, "is_in_doaj": false, "is_core": true, "host_organization": "https://openalex.org/P4310319900", "host_organization_name": "Springer Science+Business Media", "host_organization_lineage": ["https://openalex.org/P4310319900", "https://openalex.org/P4310319965"], "host_organization_lineage_names": ["Springer Science+Business Media", "Springer Nature"], "type": "book series"}, "license": null, "license_id": null, "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "Lecture Notes in Computer Science", "raw_type": "book-chapter"}, "locations": [{"id": "doi:10.1007/978-3-030-64823-7_34", "is_oa": false, "landing_page_url": "https://doi.org/10.1007/978-3-030-64823-7_34", "pdf_url": null, "source": {"id": "https://openalex.org/S106296714", "display_name": "Lecture notes in computer science", "issn_l": "0302-9743", "issn": ["0302-9743", "1611-3349"], "is_oa": false, "is_in_doaj": false, "is_core": true, "host_organization": "https://openalex.org/P4310319900", "host_organization_name": "Springer Science+Business Media", "host_organization_lineage": ["https://openalex.org/P4310319900", "https://openalex.org/P4310319965"], "host_organization_lineage_names": ["Springer Science+Business Media", "Springer Nature"], "type": "book series"}, "license": null, "license_id": null, "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "Lecture Notes in Computer Science", "raw_type": "book-chapter"}, {"id": 
"pmh:oai:aaltodoc.aalto.fi:123456789/109586", "is_oa": true, "landing_page_url": "https://research.aalto.fi/en/publications/57f0a92f-a7bc-46e0-b384-871ebfa22cb9", "pdf_url": "https://aaltodoc.aalto.fi/handle/123456789/109586", "source": {"id": "https://openalex.org/S4306401663", "display_name": "Aaltodoc (Aalto University)", "issn_l": null, "issn": null, "is_oa": false, "is_in_doaj": false, "is_core": false, "host_organization": "https://openalex.org/I9927081", "host_organization_name": "Aalto University", "host_organization_lineage": ["https://openalex.org/I9927081"], "host_organization_lineage_names": [], "type": "repository"}, "license": "other-oa", "license_id": "https://openalex.org/licenses/other-oa", "version": "submittedVersion", "is_accepted": false, "is_published": false, "raw_source_name": null, "raw_type": "acceptedVersion"}], "primary_topic": {"id": "https://openalex.org/T10763", "display_name": "Digital Transformation in Industry", "score": 0.9955000281333923, "subfield": {"id": "https://openalex.org/subfields/2209", "display_name": "Industrial and Manufacturing Engineering"}, "field": {"id": "https://openalex.org/fields/22", "display_name": "Engineering"}, "domain": {"id": "https://openalex.org/domains/3", "display_name": "Physical Sciences"}}}, {"id": "https://openalex.org/W4405974102", "doi": "https://doi.org/10.3390/pr13010074", "title": "Mapping the Knowledge Domain of Pressure Vessels and Piping Fields for Safety Research in Industrial Processes: A Bibliometric Analysis", "abstract_inverted_index": {"With": [0], "the": [1, 26, 32, 43, 82, 127, 142, 180, 193, 203, 215, 219, 226, 233, 240], "rapid": [2], "advancement": [3, 241], "of": [4, 34, 45, 111, 116, 145, 195, 223, 229, 235, 242], "modern": [5], "industries,": [6], "pressure": [7, 35, 146], "vessels": [8, 147], "and": [9, 21, 28, 37, 75, 90, 100, 104, 106, 119, 121, 124, 148, 155, 158, 162, 187, 192, 197, 200, 209, 221, 239, 255], "piping": [10, 38], "have": [11], "become": [12], 
"increasingly": [13], "integral": [14], "to": [15, 51, 67, 213], "sectors": [16], "such": [17, 96], "as": [18, 55, 97], "energy,": [19], "petrochemicals,": [20], "process": [22], "industries.": [23], "To": [24], "grasp": [25], "research": [27, 83, 139, 258], "application": [29, 194], "status": [30], "in": [31, 42, 58, 84, 131, 259], "field": [33, 86], "vessel": [36], "safety,": [39], "670": [40], "publications": [41], "Web": [44], "Science": [46, 99], "core": [47], "database": [48], "from": [49], "2008": [50], "2024": [52], "were": [53, 65], "taken": [54], "data": [56], "samples": [57], "this": [59, 85, 132, 260], "paper.": [60], "The": [61, 78, 108, 134, 169], "knowledge": [62], "mapping": [63], "tools": [64], "used": [66], "carry": [68], "out": [69], "co-occurrence": [70], "analysis,": [71, 157], "keyword": [72], "burst": [73], "detection,": [74], "co-citation": [76], "analysis.": [77], "results": [79], "show": [80], "that": [81], "presents": [87], "a": [88], "multidisciplinary": [89, 261], "cross-disciplinary": [91], "state,": [92], "involving": [93], "multiple": [94], "disciplines": [95], "Nuclear": [98], "Technology,": [101], "Engineering": [102, 123], "Mechanics,": [103], "Energy": [105], "Fuels.": [107], "\u201cInternational": [109, 114], "Journal": [110, 115], "Hydrogen": [112], "Energy\u201d,": [113], "Pressure": [117], "Vessels": [118], "Piping\u201d,": [120], "\u201cNuclear": [122], "Design\u201d": [125], "are": [126], "primary": [128], "publication": [129], "outlets": [130], "domain.": [133], "study": [135], "identifies": [136], "three": [137, 176], "major": [138], "hotspots:": [140], "(1)": [141, 178], "safety": [143, 181, 220, 253], "performance": [144], "piping,": [149], "(2)": [150, 189], "structural": [151], "integrity,": [152], "failure": [153], "mechanisms,": [154], "stress": [156], "(3)": [159, 201], "numerical": [160], "simulation": [161], "thermal\u2013hydraulic": [163], "analysis": [164], "under": [165], "various": [166], "operating": [167], 
"conditions.": [168], "current": [170, 216], "challenges": [171, 217], "can": [172], "be": [173], "summarized": [174], "into": [175], "aspects:": [177], "addressing": [179], "risks": [182], "brought": [183], "by": [184], "new": [185], "technologies": [186], "materials,": [188], "promoting": [190], "innovation": [191], "detection": [196, 228], "monitoring": [198], "technologies,": [199], "strengthening": [202], "building": [204], "capacity": [205], "for": [206, 251], "accident": [207], "prevention": [208], "emergency": [210], "management.": [211], "Specific": [212], "China,": [214], "include": [218], "management": [222], "aging": [224], "equipment,": [225], "effective": [227], "circumferential": [230], "weld": [231], "cracks,": [232], "refinement": [234], "risk": [236], "assessment": [237], "models,": [238], "smart": [243], "technology": [244], "applications.": [245], "These": [246], "findings": [247], "offer": [248], "valuable": [249], "insights": [250], "advancing": [252], "practices": [254], "guiding": [256], "future": [257], "field.": [262]}, "publication_date": "2025-01-01", "publication_year": 2025, "type": "article", "open_access": {"is_oa": true, "oa_status": "gold", "oa_url": "https://www.mdpi.com/2227-9717/13/1/74/pdf?version=1735722405", "any_repository_has_fulltext": false}, "primary_location": {"id": "doi:10.3390/pr13010074", "is_oa": true, "landing_page_url": "https://doi.org/10.3390/pr13010074", "pdf_url": "https://www.mdpi.com/2227-9717/13/1/74/pdf?version=1735722405", "source": {"id": "https://openalex.org/S4210201879", "display_name": "Processes", "issn_l": "2227-9717", "issn": ["2227-9717"], "is_oa": true, "is_in_doaj": false, "is_core": true, "host_organization": "https://openalex.org/P4310310987", "host_organization_name": "Multidisciplinary Digital Publishing Institute", "host_organization_lineage": ["https://openalex.org/P4310310987"], "host_organization_lineage_names": ["Multidisciplinary Digital Publishing Institute"], "type": "journal"}, 
"license": "cc-by", "license_id": "https://openalex.org/licenses/cc-by", "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "Processes", "raw_type": "journal-article"}, "locations": [{"id": "doi:10.3390/pr13010074", "is_oa": true, "landing_page_url": "https://doi.org/10.3390/pr13010074", "pdf_url": "https://www.mdpi.com/2227-9717/13/1/74/pdf?version=1735722405", "source": {"id": "https://openalex.org/S4210201879", "display_name": "Processes", "issn_l": "2227-9717", "issn": ["2227-9717"], "is_oa": true, "is_in_doaj": false, "is_core": true, "host_organization": "https://openalex.org/P4310310987", "host_organization_name": "Multidisciplinary Digital Publishing Institute", "host_organization_lineage": ["https://openalex.org/P4310310987"], "host_organization_lineage_names": ["Multidisciplinary Digital Publishing Institute"], "type": "journal"}, "license": "cc-by", "license_id": "https://openalex.org/licenses/cc-by", "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "Processes", "raw_type": "journal-article"}], "primary_topic": {"id": "https://openalex.org/T11357", "display_name": "Risk and Safety Analysis", "score": 0.9919000267982483, "subfield": {"id": "https://openalex.org/subfields/1804", "display_name": "Statistics, Probability and Uncertainty"}, "field": {"id": "https://openalex.org/fields/18", "display_name": "Decision Sciences"}, "domain": {"id": "https://openalex.org/domains/2", "display_name": "Social Sciences"}}}, {"id": "https://openalex.org/W4391130399", "doi": "https://doi.org/10.1016/j.ijhydene.2024.01.182", "title": "Development of machine learning based classifier for the pressure test result prediction of type IV composite overwrapped pressure vessels", "abstract_inverted_index": {"The": [0, 69, 88], "stringent": [1], "safety": [2, 47, 194], "regulations": [3], "of": [4, 29, 33, 67, 77, 155, 161, 175, 185], "type": [5], "IV": [6], "composite": [7], "overwrapped": [8], 
"pressure": [9, 85, 128, 149], "vessels": [10], "(COPVs)": [11], "for": [12, 60, 147, 180], "commercial": [13, 189], "vehicles": [14], "mandate": [15], "a": [16, 74, 176], "certification": [17], "process": [18], "involving": [19], "pressurization": [20], "up": [21], "to": [22, 42, 166, 192], "1050": [23], "bar,": [24], "with": [25, 152], "the": [26, 97, 122, 126, 144, 148, 172, 182], "critical": [27], "requirement": [28], "withstanding": [30], "burst": [31, 49, 83, 127], "pressures": [32], "1570": [34], "bar.": [35], "Analyzing": [36], "proof": [37], "test": [38, 89, 151], "data": [39, 80, 123], "is": [40], "crucial": [41], "enhance": [43], "and": [44, 64, 84, 107, 117, 133], "ensure": [45], "tank": [46], "regarding": [48], "pressure.": [50], "In": [51], "this": [52], "study,": [53], "we": [54], "developed": [55], "various": [56], "machine": [57, 177], "learning": [58, 178], "classifiers": [59, 70], "structure": [61], "health": [62], "monitoring": [63], "damage": [65, 183], "prediction": [66], "COPVs.": [68], "were": [71, 91, 102], "trained": [72, 108], "using": [73, 109], "substantial": [75], "amount": [76], "acoustic": [78], "emission": [79], "collected": [81, 124], "during": [82, 96], "cycling": [86, 150], "tests.": [87, 195], "results": [90], "employed": [92], "as": [93], "label": [94], "inputs": [95], "training": [98, 121], "process.": [99], "Statistical": [100], "features": [101], "extracted": [103], "per": [104], "time": [105], "unit": [106], "Naive": [110], "Bayes,": [111], "Logistic": [112], "Regression,": [113], "Decision": [114, 131], "Tree,": [115, 132], "XGBoost,": [116], "TabNet": [118, 141, 158], "models.": [119], "Upon": [120], "from": [125], "test,": [129], "TabNet,": [130], "XGBoost": [134], "achieved": [135], "classification": [136, 167], "accuracies": [137], "above": [138], "0.94.": [139], "Notably,": [140], "demonstrated": [142], "also": [143], "best": [145], "performance": [146], "an": [153], "accuracy": [154], "0.98.": [156], "Furthermore,": 
[157], "provided": [159], "visualizations": [160], "feature": [162], "sensitivity": [163], "in": [164, 188], "relation": [165], "results.": [168], "This": [169], "study": [170], "marks": [171], "first": [173], "development": [174], "classifier": [179], "predicting": [181], "state": [184], "COPV": [186], "tanks": [187], "applications": [190], "pertaining": [191], "required": [193]}, "publication_date": "2024-01-23", "publication_year": 2024, "type": "article", "open_access": {"is_oa": true, "oa_status": "hybrid", "oa_url": "https://doi.org/10.1016/j.ijhydene.2024.01.182", "any_repository_has_fulltext": false}, "primary_location": {"id": "doi:10.1016/j.ijhydene.2024.01.182", "is_oa": true, "landing_page_url": "https://doi.org/10.1016/j.ijhydene.2024.01.182", "pdf_url": null, "source": {"id": "https://openalex.org/S48860480", "display_name": "International Journal of Hydrogen Energy", "issn_l": "0360-3199", "issn": ["0360-3199", "1879-3487"], "is_oa": false, "is_in_doaj": false, "is_core": true, "host_organization": "https://openalex.org/P4310320990", "host_organization_name": "Elsevier BV", "host_organization_lineage": ["https://openalex.org/P4310320990"], "host_organization_lineage_names": ["Elsevier BV"], "type": "journal"}, "license": "cc-by-nc-nd", "license_id": "https://openalex.org/licenses/cc-by-nc-nd", "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "International Journal of Hydrogen Energy", "raw_type": "journal-article"}, "locations": [{"id": "doi:10.1016/j.ijhydene.2024.01.182", "is_oa": true, "landing_page_url": "https://doi.org/10.1016/j.ijhydene.2024.01.182", "pdf_url": null, "source": {"id": "https://openalex.org/S48860480", "display_name": "International Journal of Hydrogen Energy", "issn_l": "0360-3199", "issn": ["0360-3199", "1879-3487"], "is_oa": false, "is_in_doaj": false, "is_core": true, "host_organization": "https://openalex.org/P4310320990", "host_organization_name": "Elsevier BV", 
"host_organization_lineage": ["https://openalex.org/P4310320990"], "host_organization_lineage_names": ["Elsevier BV"], "type": "journal"}, "license": "cc-by-nc-nd", "license_id": "https://openalex.org/licenses/cc-by-nc-nd", "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "International Journal of Hydrogen Energy", "raw_type": "journal-article"}], "primary_topic": {"id": "https://openalex.org/T10219", "display_name": "Mechanical Behavior of Composites", "score": 0.9993000030517578, "subfield": {"id": "https://openalex.org/subfields/2211", "display_name": "Mechanics of Materials"}, "field": {"id": "https://openalex.org/fields/22", "display_name": "Engineering"}, "domain": {"id": "https://openalex.org/domains/3", "display_name": "Physical Sciences"}}}], "group_by": []} \ No newline at end of file diff --git a/tests/test_openalex_collector_units.py b/tests/test_openalex_collector_units.py new file mode 100644 index 0000000..5e17479 --- /dev/null +++ b/tests/test_openalex_collector_units.py @@ -0,0 +1,92 @@ +"""B-3 PR3 — OpenAlex 파서·초록복원·license 순수 단위 테스트 (plan safety-library-b3-1). + +fixture = OpenAlex /works 실응답 박제(process safety/pressure vessel OA 5건 — +cc-by/cc-by-nc-nd/license None, 초록 있음/없음). run()/적재(DB)는 PR3 라이브 검증. 
+""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "app")) + +from workers.openalex_collector import ( # noqa: E402 + _reconstruct_abstract, + build_filter, + license_meta, + parse_openalex_works, +) + +FIX = Path(__file__).parent / "fixtures" / "openalex_works_response.json" + + +def _works(): + count, cursor, works = parse_openalex_works(FIX.read_text(encoding="utf-8")) + return count, {w.openalex_id: w for w in works}, works + + +# ─── 피드 레벨 ─── + +def test_count_and_results(): + count, by_id, works = _works() + assert count == 1111 + assert len(works) == 5 + assert all(w.openalex_id.startswith("W") and "/" not in w.openalex_id for w in works) + + +# ─── 초록 보유 + CC 라이선스 ─── + +def test_work_with_abstract_and_cc(): + _, by_id, _ = _works() + w = by_id["W2910511816"] + assert w.doi and w.doi.startswith("10.") and w.doi == w.doi.lower() # normalize_doi + assert len(w.abstract) > 50 # inverted-index 복원 + assert w.oa_status == "diamond" and w.is_oa is True + assert w.license == "cc-by" + assert license_meta(w.license, w.is_oa, w.source_name)["redistribute"] is True + + +# ─── 초록 없는 thin 레코드(skip 대상) ─── + +def test_work_without_abstract(): + _, by_id, _ = _works() + w = by_id["W3107397139"] + assert w.abstract == "" # inverted-index 부재 → 빈 초록 + lm = license_meta(w.license, w.is_oa, w.source_name) + assert lm["redistribute"] is False # license None → 비배포 + + +# ─── cc-by-nc-nd 도 CC 계열 → redistribute True ─── + +def test_cc_variant_redistribute(): + _, by_id, _ = _works() + w = by_id["W4391130399"] + assert w.license == "cc-by-nc-nd" + assert license_meta(w.license, w.is_oa, w.source_name)["redistribute"] is True + + +# ─── 초록 inverted-index 복원 순서 ─── + +def test_reconstruct_abstract_order(): + inv = {"Safety": [0], "of": [1, 4], "pressure": [2], "vessels": [3], "design": [5]} + assert _reconstruct_abstract(inv) == "Safety of pressure vessels of design" + assert _reconstruct_abstract(None) == "" + assert 
_reconstruct_abstract({}) == "" + + +# ─── license_meta 분기 ─── + +def test_license_meta_branches(): + assert license_meta("cc-by", True, "X")["redistribute"] is True + assert license_meta("cc0", True, "X")["redistribute"] is True + none_oa = license_meta(None, True, "X") + assert none_oa["redistribute"] is False and none_oa["scheme"] == "open-unspecified" + closed = license_meta(None, False, "X") + assert closed["redistribute"] is False and closed["scheme"] == "proprietary" + + +# ─── 쿼리 빌더 ─── + +def test_build_filter(): + assert build_filter("process safety") == "title_and_abstract.search:process safety" + assert build_filter("process safety", "2026-06-01") == \ + "title_and_abstract.search:process safety,from_publication_date:2026-06-01"