diff --git a/app/main.py b/app/main.py index 6f6aa8d..edfa6f7 100644 --- a/app/main.py +++ b/app/main.py @@ -56,6 +56,9 @@ async def lifespan(app: FastAPI): from workers.mailplus_archive import run as mailplus_run from workers.statute_collector import run as statute_run from workers.news_collector import run as news_collector_run + from workers.arxiv_collector import run as arxiv_collector_run + from workers.openalex_collector import run as openalex_collector_run + from workers.paper_doi_reconcile import run as paper_doi_reconcile_run from workers.fulltext_worker import reconcile_unresolved as fulltext_reconcile_run from workers.kosha_collector import run as kosha_collector_run from workers.csb_collector import run as csb_collector_run @@ -139,6 +142,9 @@ async def lifespan(app: FastAPI): # plan ds-s1-backend-1 B-4: dedup 컬럼(duplicate_of/duplicate_count) 야간 절대 재계산. # soft-delete 잔여 드리프트 정리(멱등, 드리프트 없으면 no-op). cron 03:30 (다른 잡과 비충돌). scheduler.add_job(dedup_reconcile_run, CronTrigger(hour=3, minute=30, timezone=KST), id="dedup_reconcile") + # B-3 PR4: 레거시 paper 행 arXiv DataCite DOI 스탬프(재유입 차단). keyless·in-DB·enqueue 0. + # dedup_reconcile(03:30)·fulltext_reconcile(03:40) 와 별 worker·비충돌 슬롯. + scheduler.add_job(paper_doi_reconcile_run, CronTrigger(hour=3, minute=50, timezone=KST), id="paper_doi_reconcile") # crawl-24x7 C-2: KOSHA 재해사례 diff + GUIDE 점진 백필 (daily, 새벽 잡들과 비충돌 슬롯). scheduler.add_job(kosha_collector_run, CronTrigger(hour=6, minute=40, timezone=KST), id="kosha_collector") # 사이클 3 C-2 잔여: CSB sitemap lastmod diff (weekly 월, cap 40 + 워터마크 점진 백필). @@ -147,6 +153,12 @@ async def lifespan(app: FastAPI): scheduler.add_job(api_standards_run, CronTrigger(day=5, hour=7, minute=5, timezone=KST), id="api_standards_collector") # 사이클 3 C-2 잔여: CCPS Beacon 월간 PDF (playwright 익명 경유 — WAF 차단 시 health 로 가시화). 
scheduler.add_job(ccps_collector_run, CronTrigger(day=5, hour=7, minute=20, timezone=KST), id="ccps_collector") + # B-3 PR2: arXiv 키워드 필터 수집기 (daily 07:30 KST — statute 07:00 직후 빈 슬롯). + # signal-only 초록 색인, per-run cap 으로 임베드 큐 보호. keyless. + scheduler.add_job(arxiv_collector_run, CronTrigger(hour=7, minute=30, timezone=KST), id="arxiv_collector") + # B-3 PR3: OpenAlex 백본 수집기 (daily 07:45 KST). scaffold-first(키 부재 explicit-skip), + # signal-only 초록 색인, per-run cap + cursor watermark. 키=OPENALEX_API_KEY(credentials.env). + scheduler.add_job(openalex_collector_run, CronTrigger(hour=7, minute=45, timezone=KST), id="openalex_collector") scheduler.start() # Phase 2.1 (async 구조): QueryAnalyzer prewarm. diff --git a/app/services/papers/__init__.py b/app/services/papers/__init__.py new file mode 100644 index 0000000..a526dd6 --- /dev/null +++ b/app/services/papers/__init__.py @@ -0,0 +1,5 @@ +"""B-3 논문 수집 트랙 공유 모듈 (plan safety-library-b3-1). + +doi — DOI 정규화·dedup 키·2-Document(holder/parent_doi child) extract_meta 계약 (순수). +holder — 서지 holder 공유 dedup 조회 (DB). +""" diff --git a/app/services/papers/doi.py b/app/services/papers/doi.py new file mode 100644 index 0000000..8507927 --- /dev/null +++ b/app/services/papers/doi.py @@ -0,0 +1,141 @@ +"""B-3 논문 DOI 코어 — 정규화·dedup 키·2-Document(서지 holder / parent_doi child) 계약. + +plan safety-library-b3-1 PR1 (keyless·마이그 0). + +핵심 계약(모든 논문 수집기·reconcile·구매 PDF 스탬프가 공유): +- DOI 정규화는 이 단일 함수(normalize_doi) 경유 — **저장=조회 동일 함수** + (migration 351 주석 명시, news_collector._normalize_url 의 store=lookup 불변식 선례). + 같은 논문이 다른 표기(https://doi.org/ vs doi: vs 대문자)로 들어와도 한 holder 로 붕괴. +- dedup 키 = lower(extract_meta #>> '{paper,doi}') — 라이브 partial-unique 인덱스 + uq_documents_paper_doi(WHERE material_type='paper' AND ... IS NOT NULL)가 강제. +- 2-Document(R2-B1): paper.doi 는 **서지 Document 단일 보유**. OA/구매 전문 PDF 는 + doi 없이 paper.parent_doi 로 holder 링크(NULL doi 라 인덱스 밖 → 다중행 무충돌). + holder 와 child 는 doi/parent_doi 를 **상호 배타**로 가진다. 
+""" + +import hashlib +import re + +# 소문자화 후 비교하므로 전부 소문자 prefix. 긴 것부터(dx.doi.org 가 doi.org 보다 먼저). +_DOI_PREFIXES = ( + "https://dx.doi.org/", + "http://dx.doi.org/", + "https://doi.org/", + "http://doi.org/", + "dx.doi.org/", + "doi.org/", + "doi:", +) + + +def normalize_doi(raw: str | None) -> str | None: + """DOI 정규화 — 소문자 + URL/doi: prefix 제거 + 양끝 공백·잡음 제거. 단일 함수(저장=조회). + + 유효 DOI(10. 으로 시작)가 아니면 None. 저장측·조회측·dedup 키 생성이 모두 이 함수를 + 공유해야 dedup 이 성립한다(raw 를 그대로 저장하고 정규화로 조회하면 영구 미스). + """ + if not raw: + return None + s = raw.strip().lower() + for p in _DOI_PREFIXES: + if s.startswith(p): + s = s[len(p):] + break + s = s.strip() + # 인용문 끝 잡음(마침표/쉼표/세미콜론)만 제거. 괄호 '()' 는 DOI 일부일 수 있어 보존한다 + # (예: 10.1016/s0010-8650(00)80003-2) — 과삭제는 서로 다른 논문을 한 holder 로 병합하는 + # 데이터 손상이라 near-dup(과소삭제)보다 위험. API 소스(OpenAlex/arXiv)의 doi 는 이미 깨끗. + s = s.rstrip(".,;") + if not s.startswith("10."): + return None + return s + + +# arXiv id: 신형 'YYMM.NNNNN'(+vN) 또는 구형 'archive(.SUBJ)/NNNNNNN'. 'arXiv:' 접두 흡수. +_ARXIV_ID_RE = re.compile( + r"arxiv:\s*([a-z\-]+(?:\.[a-z]{2})?/\d{7}|\d{4}\.\d{4,5})(v\d+)?", re.IGNORECASE +) + + +def parse_arxiv_id(text: str | None) -> str | None: + """본문/제목에서 arXiv id(versionless) 추출. 없으면 None. 레거시 reconcile 의 입력.""" + if not text: + return None + m = _ARXIV_ID_RE.search(text) + return m.group(1) if m else None + + +def arxiv_doi(arxiv_id: str | None) -> str | None: + """arXiv DataCite DOI = 10.48550/arxiv.{id} (정규화). 저널 DOI 없는 프리프린트의 canonical + paper.doi 통일 키 — OpenAlex 가 프리프린트에 동일 DOI 부여(실측 확인). 모든 수집기·reconcile 가 + 같은 함수로 같은 DOI 를 써야 교차소스 dedup 이 성립.""" + if not arxiv_id: + return None + return normalize_doi(f"10.48550/arXiv.{arxiv_id}") + + +_DOI_IN_TEXT_RE = re.compile(r"10\.\d{4,9}/[^\s\"'<>]+", re.IGNORECASE) + + +def parse_doi_from_text(text: str | None) -> str | None: + """본문에서 첫 DOI 추출(정규화). 구매 PDF 의 paper.parent_doi 링크용(PDF 구조 무관 — 전체 스캔). + DOI 끝 구두점은 normalize_doi 가 정리. 
없으면 None.""" + if not text: + return None + m = _DOI_IN_TEXT_RE.search(text) + return normalize_doi(m.group(0)) if m else None + + +def paper_doi_hash(normalized_doi: str) -> str: + """서지 holder 의 Document.file_hash — sha256('paper|{doi}')[:32]. + + statute 의 'statute|{jur}|{native_id}|{version_key}' 다중부 키 선례를 따른다. + 인자는 normalize_doi() 출력(정규화 완료값)이어야 한다 — raw 를 넣으면 dedup 이 깨진다. + """ + if not normalized_doi: + raise ValueError("paper_doi_hash 는 정규화된 DOI 필요 (normalize_doi 먼저)") + return hashlib.sha256(f"paper|{normalized_doi}".encode()).hexdigest()[:32] + + +def read_paper_doi(extract_meta: dict | None) -> str | None: + """holder 의 정규화 DOI 읽기 — 인덱스 식 lower(extract_meta #>> '{paper,doi}') 의 조회측 거울. + + 방어적 재정규화(이미 정규화돼 저장되지만 레거시·외부 주입 대비). + """ + if not extract_meta: + return None + paper = extract_meta.get("paper") + if not isinstance(paper, dict): + return None + return normalize_doi(paper.get("doi")) + + +def with_paper_doi(extract_meta: dict | None, normalized_doi: str) -> dict: + """서지 holder 의 extract_meta 에 paper.doi 주입 (merge-safe, 타 키 보존). + + holder 전용 — parent_doi 는 제거(상호 배타). 반환값은 새 dict(입력 비변경). + """ + if not normalized_doi: + raise ValueError("with_paper_doi 는 정규화된 DOI 필요") + meta = dict(extract_meta or {}) + paper = dict(meta.get("paper") or {}) + paper["doi"] = normalized_doi + paper.pop("parent_doi", None) + meta["paper"] = paper + return meta + + +def with_parent_doi(extract_meta: dict | None, parent_normalized_doi: str) -> dict: + """child(OA/구매 전문 PDF)의 extract_meta 에 paper.parent_doi 주입 (merge-safe, 타 키 보존). + + child 는 paper.doi 를 갖지 않는다(NULL → partial-unique 인덱스 밖, 2-Document 무충돌). + 반환값은 새 dict(입력 비변경). 
+ """ + if not parent_normalized_doi: + raise ValueError("with_parent_doi 는 정규화된 DOI 필요") + meta = dict(extract_meta or {}) + paper = dict(meta.get("paper") or {}) + paper["parent_doi"] = parent_normalized_doi + paper.pop("doi", None) + meta["paper"] = paper + return meta diff --git a/app/services/papers/holder.py b/app/services/papers/holder.py new file mode 100644 index 0000000..2455dc5 --- /dev/null +++ b/app/services/papers/holder.py @@ -0,0 +1,38 @@ +"""B-3 논문 서지 holder 공유 dedup 조회. + +모든 논문 수집기(OpenAlex/arXiv/KoreaScience/J-STAGE)·reconcile·구매 PDF 스탬프가 +ingest 전 이 함수로 holder 존재를 확인한다(있으면 skip 또는 child 링크). + +- 조회 키 = lower(extract_meta #>> '{paper,doi}') == normalize_doi(...) — 라이브 partial-unique + 인덱스 uq_documents_paper_doi 와 동일 식(인덱스 사용). +- .scalars().first() — 교차게시·다중 landing-page 로 2행 이상 매칭 시 MultipleResultsFound + raise 방지(scalar_one_or_none 금지, 2026-06 BBC 수집 중단 선례 / news_collector 동일 규율). +- 서지 holder Document 의 **생성**은 각 수집기/스탬프 경로가 소유한다(초록 signal 문서 vs 구매 + 최소 holder 로 shape 가 다름). 이 모듈은 dedup 조회만 공유한다. + +DB 조회라 본 모듈은 PR2(arXiv 실수집)에서 라이브 검증한다 — PR1 단위 테스트 대상은 doi.py(순수). +""" + +from sqlalchemy import func, select + +from models.document import Document +from services.papers.doi import normalize_doi + +# 인덱스 식과 동일: lower(extract_meta #>> '{paper,doi}') +_DOI_EXPR = func.lower(Document.extract_meta[("paper", "doi")].astext) + + +async def find_paper_holder(session, raw_or_normalized_doi): + """정규화 DOI 로 서지 holder Document 조회. 없으면 None. + + 인자는 raw 든 정규화든 받아 normalize_doi 로 통일(저장=조회 동일 함수 보장). 
+ """ + doi = normalize_doi(raw_or_normalized_doi) + if not doi: + return None + result = await session.execute( + select(Document) + .where(Document.material_type == "paper", _DOI_EXPR == doi) + .limit(1) + ) + return result.scalars().first() diff --git a/app/workers/arxiv_collector.py b/app/workers/arxiv_collector.py new file mode 100644 index 0000000..386733f --- /dev/null +++ b/app/workers/arxiv_collector.py @@ -0,0 +1,370 @@ +"""arXiv 키워드 필터 수집기 — B-3 PR2 (plan safety-library-b3-1). + +bespoke arXiv API(Atom) 수집기. 카테고리 RSS 통째(firehose)가 아니라 +cat:{category} AND (abs:키워드 ...) 로 안전/신뢰성/압력용기 관련분만 좁혀 수집한다. + +- signal-only: 초록만 색인(embed+chunk), summarize 절대 미enqueue — 맥미니 Qwen 큐 무접촉. +- DOI 보유 → paper.doi(서지 holder, partial-unique 인덱스 진입). 없으면 versionless arXiv id 로 + dedup(향후 PR4 reconcile 가 DOI 백필). +- etiquette: 요청 간 ≥3s + HTTP 429 지수 백오프. 카테고리별 submittedDate 워터마크로 증분. +- per-run insert cap(_RUN_CAP) — 광역 수집이 GPU bge-m3 embed 큐를 범람시키지 않게(적대리뷰 A major). + 잔여는 silent-cap 금지(csb idiom): 누락 건수 로깅. +- keyless. enabled=False news_sources 행(6h 뉴스 사이클 비대상) + main.py CronTrigger(자체 폴링). +- arXiv API 는 https 필수(http=301). UA = CRAWL_UA. 
+""" + +import asyncio +import hashlib +import re +import xml.etree.ElementTree as ET +from dataclasses import dataclass, field +from datetime import datetime, timezone + +import httpx +from sqlalchemy import select + +from core.crawl_politeness import CRAWL_UA +from core.database import async_session +from core.utils import setup_logger +from models.document import Document +from models.news_source import NewsSource +from models.queue import enqueue_stage +from services.papers.doi import arxiv_doi, normalize_doi +from services.papers.holder import find_paper_holder +from workers.news_collector import ( + FeedError, + _get_or_create_health, + _record_failure, + _record_success, +) + +logger = setup_logger("arxiv_collector") + +_ARXIV_API = "https://export.arxiv.org/api/query" +_SOURCE_NAME = "arXiv 안전·공학 (keyword)" + +# 신규 카테고리만 — 기존 RSS 행(id 62 physics.app-ph, id 64 cond-mat.mtrl-sci)과 비중복. +_CATEGORIES = ( + "eess.SY", # systems & control + "physics.flu-dyn", # 유체 — 압력/유동 + "physics.comp-ph", # 전산물리 + "math.OC", # 최적화·제어 + "math.NA", # 수치해석 (FEM 등) + "stat.AP", # 응용통계 — 신뢰성 + "cs.CE", # computational engineering +) +# 압력용기·공정안전·구조건전성 도메인 키워드(abs: OR 게이트). 좁게 유지 = 관련성↑·볼륨↓ (튜너블). +_KEYWORDS = ( + "pressure vessel", + "process safety", + "structural integrity", + "fracture mechanics", + "fatigue life", + "corrosion", +) + +_RUN_CAP = 80 # 1회 run 신규 적재 상한(임베드 큐 보호). bulk 시 해제. 
+_PAGE_SIZE = 50 # max_results per request +_MAX_PAGES_PER_CAT = 4 # 카테고리당 최대 페이지(증분이라 보통 1페이지에 워터마크 도달) +_REQ_SLEEP = 3.0 # arXiv etiquette ≥3s +_MAX_RETRY = 4 +_BACKOFF_BASE = 5.0 + +_NS = { + "a": "http://www.w3.org/2005/Atom", + "arxiv": "http://arxiv.org/schemas/atom", + "opensearch": "http://a9.com/-/spec/opensearch/1.1/", +} +_ABS_ID_RE = re.compile(r"arxiv\.org/abs/(.+?)(v\d+)?$") +_WS_RE = re.compile(r"\s+") + + +# ───────────────────────── 순수 파서 (fixture 단위 테스트 대상) ───────────────────────── + +@dataclass +class ArxivEntry: + arxiv_id: str # versionless, 예: "1209.2405" + version: str | None # "v1" 또는 None + title: str + summary: str # 초록 + published: datetime | None + doi: str | None # normalize_doi 적용 + journal_ref: str | None + primary_category: str | None + categories: list = field(default_factory=list) + abs_url: str | None = None + pdf_url: str | None = None + + +def _clean(text: str | None) -> str: + return _WS_RE.sub(" ", text).strip() if text else "" + + +def _parse_id(raw_id: str | None) -> tuple[str | None, str | None]: + """'http://arxiv.org/abs/1209.2405v1' → ('1209.2405', 'v1'). versionless id 가 dedup 키.""" + m = _ABS_ID_RE.search((raw_id or "").strip()) + if not m: + return None, None + return m.group(1), m.group(2) + + +def _parse_dt(s: str | None) -> datetime | None: + if not s: + return None + try: + return datetime.fromisoformat(s.replace("Z", "+00:00")) + except ValueError: + return None + + +def build_search_query(category: str, keywords=_KEYWORDS) -> str: + """cat:{category} AND (abs:kw1 OR abs:"kw with space" ...). 공백 키워드는 따옴표 구절.""" + kw = " OR ".join(f'abs:"{k}"' if " " in k else f"abs:{k}" for k in keywords) + return f"cat:{category} AND ({kw})" + + +def parse_arxiv_feed(xml_text: str) -> tuple[int, list[ArxivEntry]]: + """arXiv Atom 응답 → (total_results, [ArxivEntry]). 
순수 함수.""" + root = ET.fromstring(xml_text) + raw_total = root.findtext("opensearch:totalResults", default="0", namespaces=_NS) + try: + total = int(raw_total) + except (TypeError, ValueError): + total = 0 + entries: list[ArxivEntry] = [] + for e in root.findall("a:entry", _NS): + aid, ver = _parse_id(e.findtext("a:id", namespaces=_NS)) + if not aid: + continue + prim = e.find("arxiv:primary_category", _NS) + abs_url = pdf_url = None + for ln in e.findall("a:link", _NS): + if ln.get("rel") == "alternate" and (ln.get("type") or "").startswith("text/html"): + abs_url = ln.get("href") + elif ln.get("title") == "pdf": + pdf_url = ln.get("href") + entries.append(ArxivEntry( + arxiv_id=aid, + version=ver, + title=_clean(e.findtext("a:title", namespaces=_NS)), + summary=_clean(e.findtext("a:summary", namespaces=_NS)), + published=_parse_dt(e.findtext("a:published", namespaces=_NS)), + doi=normalize_doi(e.findtext("arxiv:doi", namespaces=_NS)), + journal_ref=_clean(e.findtext("arxiv:journal_ref", namespaces=_NS)) or None, + primary_category=prim.get("term") if prim is not None else None, + categories=[c.get("term") for c in e.findall("a:category", _NS)], + abs_url=abs_url, + pdf_url=pdf_url, + )) + return total, entries + + +# ───────────────────────── 적재 (DB — PR2 라이브 검증) ───────────────────────── + +def _build_paper_meta(source: NewsSource, entry: ArxivEntry, doi: str | None) -> dict: + """extract_meta — license + source + paper 식별. 서지 holder 는 paper.doi(있으면) 보유.""" + paper: dict = {"arxiv_id": entry.arxiv_id} + if doi: + paper["doi"] = doi # partial-unique 인덱스 진입 (교차소스 dedup) + if entry.journal_ref: + paper["journal_ref"] = entry.journal_ref + if entry.primary_category: + paper["primary_category"] = entry.primary_category + meta: dict = { + "source_id": source.id, + "source_name": source.name, + "source_region": "INT", # arXiv = 국제 preprint. paper.jurisdiction 은 NULL 유지(A-2). + "paper": paper, + # arXiv 기본 라이선스 = 비배포(보수적). restricted 부재 → 초록은 RAG 사용 가능. 
+ # (명시 CC 검출은 OAI 인터페이스 필요 — Atom API 미포함, PR 후속/관찰.) + "license": {"scheme": "arxiv", "redistribute": False, "attribution": "arXiv"}, + } + if entry.published: + meta["published_at"] = entry.published.isoformat() + return meta + + +async def _ingest_entry(session, source: NewsSource, entry: ArxivEntry) -> bool: + """1건 적재. 반환 = 신규 여부. signal-only(embed+chunk, summarize 없음).""" + arxiv_hash = hashlib.sha256(f"arxiv|{entry.arxiv_id}".encode()).hexdigest()[:32] + # 재수집 dedup(arXiv id) — .first()(다중행 방어) + dup = await session.execute( + select(Document.id).where(Document.file_hash == arxiv_hash).limit(1) + ) + if dup.scalars().first(): + return False + # arXiv canonical DOI = 저널 DOI 또는 arXiv DataCite DOI(프리프린트도 paper.doi 보유 → PR3 와 dedup) + doi = entry.doi or arxiv_doi(entry.arxiv_id) + # 교차소스 dedup(DOI holder 이미 존재 — partial-unique 인덱스 백스톱 선제 회피) + if doi and await find_paper_holder(session, doi): + return False + + body = entry.summary or entry.title + doc = Document( + file_path=f"crawl/arxiv/{entry.arxiv_id}", + file_hash=arxiv_hash, + file_format="article", + file_size=len(body.encode()), + file_type="note", + title=entry.title, + extracted_text=f"{entry.title}\n\n{body}", + extracted_at=datetime.now(timezone.utc), + extractor_version="arxiv-api-signal", + md_status="skipped", + md_extraction_error="arXiv abstract: signal-only, markdown 비대상", + source_channel="crawl", + data_origin="external", + edit_url=entry.abs_url, + review_status="approved", + material_type="paper", + jurisdiction=None, # paper = NULL 불변(A-2). 지역은 extract_meta.paper.source_region. + published_date=entry.published.date() if entry.published else None, + extract_meta=_build_paper_meta(source, entry, doi), + ) + session.add(doc) + await session.flush() + # signal-only: 검색 색인만. summarize/fulltext 절대 enqueue 안 함(맥미니 큐 무접촉). 
async def _get_or_create_source(session) -> NewsSource:
    """Return this collector's NewsSource row, creating and flushing it on first use."""
    result = await session.execute(
        select(NewsSource).where(NewsSource.name == _SOURCE_NAME)
    )
    source = result.scalars().first()
    if source is None:
        source = NewsSource(
            name=_SOURCE_NAME, feed_url=_ARXIV_API, feed_type="atom",
            fetch_method="signal-only", fulltext_policy="none",
            source_channel="crawl", category="Engineering", language="en",
            country=None,  # paper -> jurisdiction NULL (country is not propagated)
            material_type="paper",
            license_scheme="arxiv", license_redistribute=False,
            enabled=False,  # not part of the 6h news cycle — this worker polls on its own
        )
        session.add(source)
        await session.flush()
    return source


def _watermark(source: NewsSource, category: str) -> datetime | None:
    """Read the stored per-category watermark from ``selector_override``, if any."""
    raw = (source.selector_override or {}).get("arxiv_watermark", {}).get(category)
    if not raw:
        return None
    return _parse_dt(raw)


def _set_watermark(source: NewsSource, category: str, value: datetime) -> None:
    """Store *value* (ISO string) as the watermark of *category*.

    The dict is rebuilt and reassigned rather than mutated in place so the
    change to the JSON column is detected.
    """
    cfg = dict(source.selector_override or {})
    wm = dict(cfg.get("arxiv_watermark") or {})
    wm[category] = value.isoformat()
    cfg["arxiv_watermark"] = wm
    source.selector_override = cfg


async def _fetch(client: httpx.AsyncClient, query: str, start: int) -> str:
    """Fetch one result page (newest submissions first) and return the body text.

    HTTP 429 is retried with exponential backoff, up to ``_MAX_RETRY`` attempts.
    No backoff sleep is spent after the final attempt, since nothing follows it.

    Raises:
        FeedError: when every attempt was answered with 429.
        httpx.HTTPStatusError: for any other non-success status.
    """
    params = {
        "search_query": query, "start": start, "max_results": _PAGE_SIZE,
        "sortBy": "submittedDate", "sortOrder": "descending",
    }
    for attempt in range(_MAX_RETRY):
        resp = await client.get(_ARXIV_API, params=params)
        if resp.status_code != 429:
            resp.raise_for_status()
            return resp.text
        if attempt < _MAX_RETRY - 1:
            await asyncio.sleep(_BACKOFF_BASE * (2 ** attempt))
    raise FeedError(f"arXiv 429 재시도 초과: {query[:48]}")


async def run(bulk: bool = False, limit: int = 0) -> None:
    """Daily scheduler entry point: collect new arXiv abstracts per category.

    Args:
        bulk: CLI only. Lifts the insert cap and the page budget, and ignores
            the watermark.
        limit: CLI only. Upper bound on new inserts (0 = default cap). Without
            ``bulk`` it can only lower the cap, never raise it.

    Each category is scanned newest-first until the stored watermark is
    reached, the results are exhausted, the page budget is used up, or the
    per-run insert cap is hit. The watermark is advanced only when the scan was
    NOT cut short by the cap. Otherwise the not-yet-ingested entries between
    the old watermark and the newest entry would fall behind the new watermark
    and never be collected, contradicting the "carried over to the next run"
    log line. After a cap cut the next run re-scans the category;
    already-stored entries are skipped by the ingest dedup.

    Failures of one category are logged and collected; they do not stop the
    remaining categories.
    """
    now = datetime.now(timezone.utc)
    async with async_session() as session:
        source = await _get_or_create_source(session)
        # Read the id before commit so this never depends on post-commit attribute reloading.
        source_id = source.id
        await session.commit()

    run_cap = (limit or 10**9) if bulk else (min(limit, _RUN_CAP) if limit else _RUN_CAP)
    inserted = 0
    seen = 0
    failures: list[str] = []

    async with httpx.AsyncClient(
        timeout=30.0, headers={"User-Agent": CRAWL_UA}, follow_redirects=True
    ) as client:
        for category in _CATEGORIES:
            if inserted >= run_cap:
                break
            query = build_search_query(category)
            async with async_session() as session:
                src = await session.get(NewsSource, source_id)
                watermark = _watermark(src, category)
            newest_seen: datetime | None = None
            cap_cut = False  # True when the cap ended this category's scan early
            max_pages = 10**6 if bulk else _MAX_PAGES_PER_CAT
            try:
                for page in range(max_pages):
                    if inserted >= run_cap:
                        cap_cut = True
                        break
                    xml_text = await _fetch(client, query, page * _PAGE_SIZE)
                    total, entries = parse_arxiv_feed(xml_text)
                    if not entries:
                        break
                    stop = False
                    for entry in entries:
                        seen += 1
                        published = entry.published
                        if published is not None:
                            newest_seen = (
                                published if newest_seen is None
                                else max(newest_seen, published)
                            )
                            # Incremental: at or below the watermark means the rest
                            # of this category was already seen by an earlier run.
                            if watermark and not bulk and published <= watermark:
                                stop = True
                                break
                        async with async_session() as session:
                            src = await session.get(NewsSource, source_id)
                            if await _ingest_entry(session, src, entry):
                                inserted += 1
                                await session.commit()
                            else:
                                await session.rollback()
                        if inserted >= run_cap:
                            cap_cut = True
                            break
                    await asyncio.sleep(_REQ_SLEEP)
                    if stop or cap_cut or (page + 1) * _PAGE_SIZE >= total:
                        break
                # Advance the watermark to the newest publication time seen, but
                # only when the cap did not leave unprocessed entries behind.
                if newest_seen and not cap_cut:
                    async with async_session() as session:
                        src = await session.get(NewsSource, source_id)
                        _set_watermark(src, category, newest_seen)
                        await session.commit()
            except (httpx.HTTPError, FeedError, ET.ParseError) as e:
                msg = f"[{category}] {e or repr(e)}"
                logger.error(f"[arxiv] {msg}")
                failures.append(msg)

    async with async_session() as session:
        health = await _get_or_create_health(session, source_id)
        # A run counts as failed only when something failed AND nothing was inserted.
        if failures and inserted == 0:
            _record_failure(health, "; ".join(failures)[:500], now)
        else:
            _record_success(health, inserted, False, now)
        await session.commit()

    deferred = "" if inserted < run_cap else f" (cap {run_cap} 도달 — 잔여는 다음 run 이월)"
    logger.info(
        f"[arxiv] {len(_CATEGORIES)}개 카테고리 스캔 {seen}건 → 신규 {inserted}건{deferred}"
        + (f" / 실패 {len(failures)}건" if failures else "")
    )


if __name__ == "__main__":
    # CLI = manual / backfill use only. --bulk lifts the cap and pages deep;
    # --limit N caps new inserts at N (handy for live verification).
    import argparse

    parser = argparse.ArgumentParser(description="arXiv 안전·공학 키워드 수집기")
    parser.add_argument("--bulk", action="store_true", help="cap 해제 + 깊은 페이징 백필")
    parser.add_argument("--limit", type=int, default=0, help="신규 적재 상한(0=기본 cap)")
    args = parser.parse_args()
    asyncio.run(run(bulk=args.bulk, limit=args.limit))
+""" + +import asyncio +import hashlib +import json +import os +from dataclasses import dataclass +from datetime import date, datetime, timezone + +import httpx +from sqlalchemy import select + +from core.crawl_politeness import CRAWL_UA +from core.database import async_session +from core.utils import setup_logger +from models.document import Document +from models.news_source import NewsSource +from models.queue import enqueue_stage +from services.papers.doi import normalize_doi +from services.papers.holder import find_paper_holder +from workers.news_collector import ( + FeedError, + _get_or_create_health, + _record_failure, + _record_success, +) + +logger = setup_logger("openalex_collector") + +_API = "https://api.openalex.org/works" +_SOURCE_NAME = "OpenAlex 안전·공학 (keyword)" +_ENV_KEY = "OPENALEX_API_KEY" + +# 압력용기·공정안전·구조건전성 도메인 키워드(키워드별 1쿼리 = 관련성 사전필터). +_KEYWORDS = ( + "pressure vessel safety", + "process safety", + "structural integrity", + "fracture mechanics", + "fatigue life assessment", +) + +# 도메인 직결 저널 ISSN 시드(OpenAlex sources 실측 확인) — 키워드 매칭 누락분까지 전수 커버. +# KR 안전/가스/기계 + JP 고압. KR/JP 관심 = OpenAlex 깨끗한 API 로 직접(KoreaScience/J-STAGE 전용 +# 스크래퍼 불요 — Phase-1 메타는 OpenAlex 와 중복, 전용 수집기의 유니크 가치=무료 전문 PDF=Phase-2). +_JOURNAL_ISSNS = ( + ("한국안전학회지", "1738-3803"), + ("한국가스학회지", "1226-8402"), + ("대한기계학회논문집 A", "1226-4873"), + ("대한기계학회논문집 B", "1226-4881"), + ("KSME International J.", "1226-4865"), + ("Review of High Pressure Sci&Tech (JP)", "0917-639X"), +) + +_RUN_CAP = 60 # 1회 run 신규 적재 상한(임베드 큐 보호). bulk 시 해제. 
+_PER_PAGE = 50 +_MAX_PAGES_PER_KW = 4 # 키워드당 최대 페이지(증분이라 보통 1페이지에 워터마크 도달) +_REQ_SLEEP = 1.0 # 페이지 간 polite 간격 +_MAX_RETRY = 4 +_BACKOFF_BASE = 5.0 + + +# ───────────────────────── 순수 파서 (fixture 단위 테스트 대상) ───────────────────────── + +@dataclass +class OpenAlexWork: + openalex_id: str # "W2910511816" + doi: str | None # normalize_doi 적용 + title: str + abstract: str # inverted-index 복원 (없으면 "") + publication_date: str | None + oa_status: str | None # closed/green/bronze/hybrid/gold/diamond + oa_url: str | None + is_oa: bool + license: str | None # cc-by / cc-by-nc-nd / None + source_name: str | None + primary_topic: str | None + work_type: str | None + + +def _clean(text): + return " ".join(text.split()).strip() if text else "" + + +def _reconstruct_abstract(inv: dict | None) -> str: + """abstract_inverted_index({word:[positions]}) → 평문 초록. 없으면 ''.""" + if not inv: + return "" + positions = [(pos, word) for word, idxs in inv.items() for pos in idxs] + positions.sort() + return " ".join(w for _, w in positions) + + +def license_meta(license_str: str | None, is_oa: bool, source_name: str | None) -> dict: + """extract_meta.license — 명시 CC/public-domain 만 redistribute=true. restricted 부재(초록 색인 자유). + + redistribute=false 라도 restricted 가 없으면 RAG 사용 가능(초록). 비-CC 전문의 RAG verbatim 차단은 + Phase-2 전문 승격 단계가 restricted=true 로 처리(L-1) — Phase-1(초록)은 무해. + """ + attribution = source_name or "OpenAlex" + if license_str and (license_str.startswith("cc") or license_str == "public-domain"): + return {"scheme": license_str, "redistribute": True, "attribution": attribution} + return { + "scheme": "open-unspecified" if is_oa else "proprietary", + "redistribute": False, + "attribution": attribution, + } + + +def parse_openalex_works(json_text: str) -> tuple[int, str | None, list[OpenAlexWork]]: + """OpenAlex /works 응답 → (count, next_cursor, [OpenAlexWork]). 
순수 함수.""" + d = json.loads(json_text) + meta = d.get("meta") or {} + count = meta.get("count") or 0 + next_cursor = meta.get("next_cursor") + works: list[OpenAlexWork] = [] + for w in d.get("results") or []: + oid = (w.get("id") or "").rstrip("/").rsplit("/", 1)[-1] + if not oid: + continue + oa = w.get("open_access") or {} + pl = w.get("primary_location") or {} + pt = w.get("primary_topic") or {} + works.append(OpenAlexWork( + openalex_id=oid, + doi=normalize_doi(w.get("doi")), + title=_clean(w.get("title")), + abstract=_reconstruct_abstract(w.get("abstract_inverted_index")), + publication_date=w.get("publication_date"), + oa_status=oa.get("oa_status"), + oa_url=oa.get("oa_url") or None, + is_oa=bool(oa.get("is_oa")), + license=pl.get("license"), + source_name=(pl.get("source") or {}).get("display_name"), + primary_topic=pt.get("display_name"), + work_type=w.get("type"), + )) + return count, next_cursor, works + + +def build_filter(keyword: str, from_date: str | None = None) -> str: + f = f"title_and_abstract.search:{keyword}" + if from_date: + f += f",from_publication_date:{from_date}" + return f + + +def build_issn_filter(issn: str, from_date: str | None = None) -> str: + f = f"primary_location.source.issn:{issn}" + if from_date: + f += f",from_publication_date:{from_date}" + return f + + +def _seeds() -> list[tuple[str, str, str]]: + """수집 시드 = (라벨, 워터마크키, 종류). 
도메인 저널 ISSN 우선(cap 우선권) → 키워드.""" + s: list[tuple[str, str, str]] = [(label, issn, "issn") for label, issn in _JOURNAL_ISSNS] + s += [(kw, kw, "kw") for kw in _KEYWORDS] + return s + + +# ───────────────────────── 적재 (DB — PR3 라이브 검증) ───────────────────────── + +def _build_paper_meta(source: NewsSource, w: OpenAlexWork) -> dict: + paper: dict = {"openalex_id": w.openalex_id} + if w.doi: + paper["doi"] = w.doi # partial-unique 인덱스 진입(교차소스 dedup) + if w.oa_status: + paper["oa_status"] = w.oa_status + if w.oa_url: + paper["oa_url"] = w.oa_url # 링크/신호 — 자동 fetch 안 함 + if w.primary_topic: + paper["topic"] = w.primary_topic + meta: dict = { + "source_id": source.id, + "source_name": source.name, + "source_region": "INT", # OpenAlex = 글로벌. paper.jurisdiction 은 NULL 유지(A-2). + "paper": paper, + "license": license_meta(w.license, w.is_oa, w.source_name), + } + if w.publication_date: + meta["published_at"] = w.publication_date + return meta + + +async def _ingest_work(session, source: NewsSource, w: OpenAlexWork) -> bool: + """1건 적재. 반환 = 신규 여부. signal-only. 
초록 없으면 skip(thin 레코드 배제).""" + if not w.abstract: + return False # 초록 없는 thin 레코드(주로 비-OA 메타) — Phase-1 재료 품질 유지 + oid_hash = hashlib.sha256(f"openalex|{w.openalex_id}".encode()).hexdigest()[:32] + dup = await session.execute( + select(Document.id).where(Document.file_hash == oid_hash).limit(1) + ) + if dup.scalars().first(): + return False + if w.doi and await find_paper_holder(session, w.doi): + return False # 교차소스 dedup(arXiv 등이 이미 holder 보유) + + pub_date = None + if w.publication_date: + try: + pub_date = date.fromisoformat(w.publication_date) + except ValueError: + pub_date = None + body = w.abstract + doc = Document( + file_path=f"crawl/openalex/{w.openalex_id}", + file_hash=oid_hash, + file_format="article", + file_size=len(body.encode()), + file_type="note", + title=w.title, + extracted_text=f"{w.title}\n\n{body}", + extracted_at=datetime.now(timezone.utc), + extractor_version="openalex-signal", + md_status="skipped", + md_extraction_error="OpenAlex abstract: signal-only, markdown 비대상", + source_channel="crawl", + data_origin="external", + edit_url=w.oa_url or f"https://openalex.org/{w.openalex_id}", + review_status="approved", + material_type="paper", + jurisdiction=None, + published_date=pub_date, + extract_meta=_build_paper_meta(source, w), + ) + session.add(doc) + await session.flush() + await enqueue_stage(session, doc.id, "embed") + await enqueue_stage(session, doc.id, "chunk") + return True + + +async def _get_or_create_source(session) -> NewsSource: + result = await session.execute( + select(NewsSource).where(NewsSource.name == _SOURCE_NAME) + ) + source = result.scalars().first() + if source is None: + source = NewsSource( + name=_SOURCE_NAME, feed_url=_API, feed_type="json", + fetch_method="signal-only", fulltext_policy="none", + source_channel="crawl", category="Engineering", language="en", + country=None, material_type="paper", + license_scheme="openalex", license_redistribute=False, + enabled=False, + ) + session.add(source) + await 
def _api_key() -> str:
    """Return the OpenAlex API key from the environment.

    Raises:
        FeedError: when the variable is unset or blank. Collection is skipped
            explicitly rather than silently falling back to keyless access.
    """
    key = os.getenv(_ENV_KEY, "").strip()
    if not key:
        raise FeedError(f"{_ENV_KEY} 미설정 — OpenAlex 수집 불가 (scaffold-first explicit-skip)")
    return key


def _watermark(source: NewsSource, keyword: str) -> str | None:
    """Read the stored watermark (ISO date string) of one seed, if any."""
    return (source.selector_override or {}).get("openalex_watermark", {}).get(keyword)


def _set_watermark(source: NewsSource, keyword: str, value: str) -> None:
    """Store *value* as the watermark of one seed.

    The dict is rebuilt and reassigned rather than mutated in place so the
    change to the JSON column is detected.
    """
    cfg = dict(source.selector_override or {})
    wm = dict(cfg.get("openalex_watermark") or {})
    wm[keyword] = value
    cfg["openalex_watermark"] = wm
    source.selector_override = cfg


async def _fetch(client: httpx.AsyncClient, key: str, filter_str: str, cursor: str) -> str:
    """Fetch one cursor page (newest publication date first) and return the body text.

    HTTP 429 is retried with exponential backoff, up to ``_MAX_RETRY`` attempts.
    No backoff sleep is spent after the final attempt, since nothing follows it.

    Raises:
        FeedError: when every attempt was answered with 429.
        httpx.HTTPStatusError: for any other non-success status.
    """
    params = {
        "filter": filter_str, "per-page": _PER_PAGE, "cursor": cursor,
        "sort": "publication_date:desc", "api_key": key,
    }
    for attempt in range(_MAX_RETRY):
        resp = await client.get(_API, params=params)
        if resp.status_code != 429:
            resp.raise_for_status()
            return resp.text
        if attempt < _MAX_RETRY - 1:
            await asyncio.sleep(_BACKOFF_BASE * (2 ** attempt))
    raise FeedError(f"OpenAlex 429 재시도 초과: {filter_str[:48]}")


async def run(bulk: bool = False, limit: int = 0) -> None:
    """Daily scheduler entry point: collect new OpenAlex abstracts per seed.

    Args:
        bulk: CLI only. Lifts the insert cap and the page budget, and ignores
            the watermark.
        limit: CLI only. Upper bound on new inserts (0 = default cap). Without
            ``bulk`` it can only lower the cap, never raise it.

    A missing API key is an explicit skip: it is logged, recorded as a health
    failure, and the function returns.

    Watermark handling differs from a naive "newest date seen" in two ways:

    * It is not advanced when the insert cap cut a seed's scan short.
      Otherwise the remaining older works would fall before the new
      ``from_publication_date`` and never be collected, contradicting the
      "carried over to the next run" log line.
    * Publication dates later than today (UTC) are ignored for the watermark,
      so a future-dated record cannot move the start date past works that are
      not published yet.
      NOTE(review): assumes ``publication_date`` is an ISO ``YYYY-MM-DD``
      string, so string comparison equals date comparison — confirm against
      the API.

    Failures of one seed are logged and collected; they do not stop the
    remaining seeds.
    """
    now = datetime.now(timezone.utc)
    today = now.date().isoformat()
    async with async_session() as session:
        source = await _get_or_create_source(session)
        # Read the id before commit so this never depends on post-commit attribute reloading.
        source_id = source.id
        await session.commit()

    try:
        key = _api_key()
    except FeedError as e:
        logger.warning(f"[openalex] {e}")
        async with async_session() as session:
            health = await _get_or_create_health(session, source_id)
            _record_failure(health, str(e), now)
            await session.commit()
        return

    run_cap = (limit or 10**9) if bulk else (min(limit, _RUN_CAP) if limit else _RUN_CAP)
    inserted = 0
    seen = 0
    failures: list[str] = []

    async with httpx.AsyncClient(
        timeout=30.0, headers={"User-Agent": CRAWL_UA}, follow_redirects=True
    ) as client:
        for label, wm_key, kind in _seeds():
            if inserted >= run_cap:
                break
            async with async_session() as session:
                src = await session.get(NewsSource, source_id)
                watermark = None if bulk else _watermark(src, wm_key)
            if kind == "issn":
                filter_str = build_issn_filter(wm_key, watermark)
            else:
                filter_str = build_filter(wm_key, watermark)
            newest: str | None = None
            cap_cut = False  # True when the cap ended this seed's scan early
            cursor = "*"
            max_pages = 10**6 if bulk else _MAX_PAGES_PER_KW
            try:
                for _page in range(max_pages):
                    if inserted >= run_cap:
                        cap_cut = True
                        break
                    text = await _fetch(client, key, filter_str, cursor)
                    _count, next_cursor, works = parse_openalex_works(text)
                    if not works:
                        break
                    for w in works:
                        seen += 1
                        pub = w.publication_date
                        if pub and pub <= today and (newest is None or pub > newest):
                            newest = pub
                        async with async_session() as session:
                            src = await session.get(NewsSource, source_id)
                            if await _ingest_work(session, src, w):
                                inserted += 1
                                await session.commit()
                            else:
                                await session.rollback()
                        if inserted >= run_cap:
                            cap_cut = True
                            break
                    await asyncio.sleep(_REQ_SLEEP)
                    if cap_cut or not next_cursor:
                        break
                    cursor = next_cursor
                # Advance the watermark only when the cap left nothing unprocessed.
                if newest and not cap_cut:
                    async with async_session() as session:
                        src = await session.get(NewsSource, source_id)
                        _set_watermark(src, wm_key, newest)
                        await session.commit()
            except (httpx.HTTPError, FeedError, ValueError) as e:
                msg = f"[{label}] {e or repr(e)}"
                logger.error(f"[openalex] {msg}")
                failures.append(msg)

    async with async_session() as session:
        health = await _get_or_create_health(session, source_id)
        # A run counts as failed only when something failed AND nothing was inserted.
        if failures and inserted == 0:
            _record_failure(health, "; ".join(failures)[:500], now)
        else:
            _record_success(health, inserted, False, now)
        await session.commit()

    deferred = "" if inserted < run_cap else f" (cap {run_cap} 도달 — 잔여 다음 run 이월)"
    logger.info(
        f"[openalex] {len(_seeds())}개 시드(ISSN+키워드) 스캔 {seen}건 → 신규 {inserted}건{deferred}"
        + (f" / 실패 {len(failures)}건" if failures else "")
    )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="OpenAlex 안전·공학 키워드 백본 수집기")
    parser.add_argument("--bulk", action="store_true", help="cap 해제 + 깊은 cursor 페이징 백필")
    parser.add_argument("--limit", type=int, default=0, help="신규 적재 상한(0=기본 cap)")
    args = parser.parse_args()
    asyncio.run(run(bulk=args.bulk, limit=args.limit))
+""" + +import asyncio + +from sqlalchemy import select + +from core.database import async_session +from core.utils import setup_logger +from models.document import Document +from services.papers.doi import ( + arxiv_doi, + parse_arxiv_id, + parse_doi_from_text, + with_paper_doi, + with_parent_doi, +) +from services.papers.holder import find_paper_holder + +logger = setup_logger("paper_doi_reconcile") + +_DOI_TEXT = Document.extract_meta[("paper", "doi")].astext +_PARENT_DOI_TEXT = Document.extract_meta[("paper", "parent_doi")].astext + + +def _is_restricted(meta: dict) -> bool: + return (meta.get("license") or {}).get("restricted") in (True, "true") + + +async def run(limit: int = 0) -> None: + """paper.doi/parent_doi 없는 paper 행 reconcile(멱등). limit=0 = 전건.""" + stamped = marked_dup = skipped_no_arxiv = 0 + linked_purchased = skipped_purchased_no_doi = 0 + async with async_session() as session: + q = ( + select(Document) + .where( + Document.material_type == "paper", + _DOI_TEXT.is_(None), + _PARENT_DOI_TEXT.is_(None), + ) + .order_by(Document.id) + ) + if limit: + q = q.limit(limit) + rows = (await session.execute(q)).scalars().all() + + for row in rows: + meta = dict(row.extract_meta or {}) + paper = dict(meta.get("paper") or {}) + + # PR5: 구매 PDF(restricted) = child → 본문 DOI 파싱 → parent_doi 링크 + if _is_restricted(meta): + doi = parse_doi_from_text(row.extracted_text) + if not doi: + skipped_purchased_no_doi += 1 + continue + row.extract_meta = with_parent_doi(meta, doi) + linked_purchased += 1 + continue + + # PR4: 레거시 arXiv 초록(holder) = arXiv DataCite DOI 스탬프 + arxiv_id = paper.get("arxiv_id") or parse_arxiv_id(row.extracted_text) + doi = arxiv_doi(arxiv_id) + if not doi: + skipped_no_arxiv += 1 + continue + paper["arxiv_id"] = arxiv_id + meta["paper"] = paper + holder = await find_paper_holder(session, doi) + if holder is not None and holder.id != row.id: + row.extract_meta = with_parent_doi(meta, doi) # 선재 중복 → child 마킹 + marked_dup += 1 + else: + 
row.extract_meta = with_paper_doi(meta, doi) # holder 스탬프, 인덱스 진입 + stamped += 1 + # 콘텐츠 무변경 → enqueue 없음(summarize/embed/chunk 0) + await session.commit() + + logger.info( + f"[paper_doi_reconcile] {len(rows)}행 → arXiv 스탬프 {stamped} · 선재중복 {marked_dup} · " + f"arXiv id 없음 skip {skipped_no_arxiv} / 구매PDF parent_doi 링크 {linked_purchased} · " + f"구매PDF DOI 없음 skip {skipped_purchased_no_doi}" + ) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="paper DOI reconcile (arXiv 레거시 + 구매 PDF, keyless)") + parser.add_argument("--limit", type=int, default=0, help="처리 상한(0=전건)") + args = parser.parse_args() + asyncio.run(run(limit=args.limit)) diff --git a/tests/fixtures/arxiv_search_pressure_vessel.xml b/tests/fixtures/arxiv_search_pressure_vessel.xml new file mode 100644 index 0000000..28269b8 --- /dev/null +++ b/tests/fixtures/arxiv_search_pressure_vessel.xml @@ -0,0 +1,383 @@ + + + https://arxiv.org/api/m9A/71G4hH6NGyarIQjqA3n6Zzk + arXiv Query: search_query=abs:"pressure vessel"&id_list=&start=0&max_results=10 + 2026-06-13T21:57:59Z + + 10 + 89 + 0 + + http://arxiv.org/abs/1209.2405v1 + A Survey of Pressure Vessel Code Compliance for Superconducting RF Cryomodules + 2012-09-11T19:34:46Z + + + Superconducting radio frequency (SRF) cavities made from niobium and cooled with liquid helium are becoming key components of many particle accelerators. The helium vessels surrounding the RF cavities, portions of the niobium cavities themselves, and also possibly the vacuum vessels containing these assemblies, generally fall under the scope of local and national pressure vessel codes. In the U.S., Department of Energy rules require national laboratories to follow national consensus pressure vessel standards or to show "a level of safety greater than or equal to" that of the applicable standard. Thus, while used for its superconducting properties, niobium ends up being treated as a low-temperature pressure vessel material. 
Niobium material is not a code listed material and therefore requires the designer to understand the mechanical properties for material used in each pressure vessel fabrication; compliance with pressure vessel codes therefore becomes a problem. This report summarizes the approaches that various institutions have taken in order to bring superconducting RF cryomodules into compliance with pressure vessel codes. + + 2012-09-11T19:34:46Z + 7 pp + + + Thomas Peterson + Fermilab + + + Arkadiy Klebaner + Fermilab + + + Tom Nicol + Fermilab + + + Jay Theilacker + Fermilab + + + Hitoshi Hayano + KEK, Tsukuba + + + Eiji Kako + KEK, Tsukuba + + + Hirotaka Nakai + KEK, Tsukuba + + + Akira Yamamoto + KEK, Tsukuba + + + Kay Jensch + DESY + + + Axel Matheisen + DESY + + + John Mammosser + Jefferson Lab + + 10.1063/1.4707088 + + + + http://arxiv.org/abs/2003.02057v1 + Investigation of Unit-1 Nuclear Reactor of the Fukushima Daiichi by Cosmic Muon Radiography + 2020-03-03T03:21:53Z + + + We have investigated the status of the nuclear fuel assemblies in Unit-1 reactor of the Fukushima Daiichi Nuclear Power plant by the method called Cosmic Muon Radiography. In this study, muon tracking detectors were placed outside of the reactor building. We succeeded in identifying the inner structure of the reactor complex such as the reactor containment vessel, pressure vessel, and other structures of the reactor building, through the concrete wall of the reactor building. We found that a large amount of fuel assemblies was missing in the original fuel loading zone inside the pressure vessel. It can be naturally interpreted that most of the nuclear fuel was melt and dropped down to the bottom of the pressure vessel or even below. 
+ + + 2020-03-03T03:21:53Z + 14 pages, 17 figures + + + Hirofumi Fujii + High Energy Accelerator Research Organization + + + Kazuhiko Hara + University of Tsukuba + + + Kohei Hayashi + High Energy Accelerator Research Organization + + + Hidekazu Kakuno + Tokyo Metropolitan University + + + Hideyo Kodama + High Energy Accelerator Research Organization + + + Kanetada Nagamine + High Energy Accelerator Research Organization + + + Kotaro Sato + High Energy Accelerator Research Organization + + + Shin-Hong Kim + University of Tsukuba + + + Atsuto Suzuki + High Energy Accelerator Research Organization + + + Takayuki Sumiyoshi + Tokyo Metropolitan University + + + Kazuki Takahashi + University of Tsukuba + + + Fumihiko Takasaki + High Energy Accelerator Research Organization + + + Shuji Tanaka + High Energy Accelerator Research Organization + + + Satoru Yamashita + University of Tokyo + + + + http://arxiv.org/abs/1609.07515v1 + Low Background Stainless Steel for the Pressure Vessel in the PandaX-II Dark Matter Experiment + 2016-09-21T10:33:04Z + + + We report on the custom produced low radiation background stainless steel and the welding rod for the PandaX experiment, one of the deep underground experiments to search for dark matter and neutrinoless double beta decay using xenon. The anthropogenic 60 Co concentration in these samples is at the range of 1 mBq/kg or lower. We also discuss the radioactivity of nuclear-grade stainless steel from TISCO which has a similar background rate. The PandaX-II pressure vessel was thus fabricated using the stainless steel from CISRI and TISCO. Based on the analysis of the radioactivity data, we also made discussions on potential candidate for low background metal materials for future pressure vessel development. 
+ + + 2016-09-21T10:33:04Z + + + Tao Zhang + + + Changbo Fu + + + Xiangdong Ji + + + Jianglai Liu + + + Xiang Liu + + + Xuming Wang + + + Chunfa Yao + + + Xunhua Yuan + + 10.1088/1748-0221/11/09/T09004 + + + + http://arxiv.org/abs/2308.09786v1 + Mechanical design of the optical modules intended for IceCube-Gen2 + 2023-08-18T19:20:09Z + + + IceCube-Gen2 is an expansion of the IceCube neutrino observatory at the South Pole that aims to increase the sensitivity to high-energy neutrinos by an order of magnitude. To this end, about 10,000 new optical modules will be installed, instrumenting a fiducial volume of about 8 km^3. Two newly developed optical module types increase current sensitivity per module by a factor of three by integrating 16 and 18 newly developed four-inch PMTs in specially designed 12.5-inch diameter pressure vessels. Both designs use conical silicone gel pads to optically couple the PMTs to the pressure vessel to increase photon collection efficiency. The outside portion of gel pads are pre-cast onto each PMT prior to integration, while the interiors are filled and cast after the PMT assemblies are installed in the pressure vessel via a pushing mechanism. This paper presents both the mechanical design, as well as the performance of prototype modules at high pressure (70 MPa) and low temperature (-40 degree Celsius), characteristic of the environment inside the South Pole ice. + + + 2023-08-18T19:20:09Z + Presented at the 38th International Cosmic Ray Conference (ICRC2023). See arXiv:2307.13048 for all IceCube-Gen2 contributions + + + Yuya Makino + for the IceCube-Gen2 Collaboration + + + + http://arxiv.org/abs/0804.0261v1 + Circulation in Blowdown Flows + 2008-04-01T22:22:32Z + + + The blowdown of high pressure gas in a pressure vessel produces rapid adiabatic cooling of the gas remaining in the vessel. 
The gas near the wall is warmed by conduction from the wall, producing radial temperature and density gradients that affect the flow, the mass efflux rate and the thermodynamic states of both the outflowing and the contained gas. The resulting buoyancy-driven flow circulates gas through the vessel and reduces, but does not eliminate, these gradients. The purpose of this note is to estimate when blowdown cooling is rapid enough that the gas in the pressure vessel is neither isothermal nor isopycnic, though it remains isobaric. I define a dimensionless number, the buoyancy circulation number BC, that parametrizes these effects. + + 2008-04-01T22:22:32Z + 5 pp., no figures + + J. Pressure Vessel Tech. 131, 034501 (2009) + + J. I. Katz + + + + http://arxiv.org/abs/1204.0234v1 + Substantiation of Thermodynamic Criteria of Explosion Safety in Process of Severe Accidents in Pressure Vessel Reactors + 2012-03-27T11:21:14Z + + + The paper represents original development of thermodynamic criteria of occurrence conditions of steam-gas explosions in the process of severe accidents. The received results can be used for modelling of processes of severe accidents in pressure vessel reactors. + + 2012-03-27T11:21:14Z + 5 pages, 1 figure + + + V. I. Skalozubov + + + V. N. Vashchenko + + + S. S. Jarovoj + + + V. Yu. Kochnyeva + + + + http://arxiv.org/abs/2511.11485v1 + Data-efficient U-Net for Segmentation of Carbide Microstructures in SEM Images of Steel Alloys + 2025-11-14T17:01:02Z + + + Understanding reactor-pressure-vessel steel microstructure is crucial for predicting mechanical properties, as carbide precipitates both strengthen the alloy and can initiate cracks. In scanning electron microscopy images, gray-value overlap between carbides and matrix makes simple thresholding ineffective. We present a data-efficient segmentation pipeline using a lightweight U-Net (30.7~M parameters) trained on just \textbf{10 annotated scanning electron microscopy images}. 
Despite limited data, our model achieves a \textbf{Dice-Sørensen coefficient of 0.98}, significantly outperforming the state-of-the-art in the field of metallurgy (classical image analysis: 0.85), while reducing annotation effort by one order of magnitude compared to the state-of-the-art data efficient segmentation model. This approach enables rapid, automated carbide quantification for alloy design and generalizes to other steel types, demonstrating the potential of data-efficient deep learning in reactor-pressure-vessel steel analysis. + + + 2025-11-14T17:01:02Z + + Machine Learning and the Physical Sciences Workshop @ NeurIPS 2025 https://openreview.net/forum?id=xYY5pn4f8N + + Alinda Ezgi Gerçek + + + Till Korten + + + Paul Chekhonin + + + Maleeha Hassan + + + Peter Steinbach + + + + http://arxiv.org/abs/2511.09689v1 + An ASME-Compliant Helium-4 Evaporation Refrigerator for the SpinQuest Experiment + 2025-11-12T19:45:47Z + + + This paper presents the design, safety basis, and commissioning results of a 1 K liquid helium-4 (4He) evaporation refrigerator developed for the Fermilab SpinQuest Experiment (E1039). The system represents the first high power helium evaporation refrigerator operated in a fixed target scattering experiment at Fermilab and was engineered to comply with the Fermilab ES\&H Manual (FESHM) requirements governing pressure vessels, piping, cryogenic systems, and vacuum vessels. The design is mapped to ASME B31.3 (Process Piping) and the ASME Boiler and Pressure Vessel Code (BPVC) for pressure boundary integrity and overpressure protection, with documented compliance to FESHM Chapters 5031 (Pressure Vessels), 5031.1 (Piping Systems), and 5033 (Vacuum Vessels). This work documents the methodology used to reach compliance and approval for the 4He evaporation refrigerator at Fermilab which the field lacks. 
Design considerations specific to the high radiation target-cave environment including remotely located instrumentation approximately 20 m from the cryostat are summarized, together with the relief-system sizing methodology used to accommodate transient heat loads from dynamic nuclear polarization microwaves and the high-intensity proton beam. Commissioning data from July 2024 confirms that the system satisfies all thermal performance and safety objectives. + + 2025-11-12T19:45:47Z + For IEEE Transactions in Nuclear Physics, 11 pages, 14 figures + + + Jordan D. Roberts + + + Vibodha Bandara + + + Kenichi Nakano + + + Dustin Keller + + + + http://arxiv.org/abs/1507.04072v1 + High-Voltage Terminal Test of Test Stand for 1-MV Electrostatic Accelerator + 2015-07-15T02:41:11Z + + + The Korea Multipurpose Accelerator Complex (KOMAC) has been developing a 300-kV test stand for a 1-MV electrostatic accelerator ion source. The ion source and accelerating tube will be installed in a high-pressure vessel. The ion source in the high-pressure vessel is required to have a high reliability. The test stand has been proposed and developed to confirm the stable operating conditions of the ion source. The ion source will be tested at the test stand to verify the long-time operating conditions. The test stand comprises a 300-kV high-voltage terminal, a battery for the ion-source power, a 60-Hz inverter, 200-MHz RF power, a 5-kV extraction power supply, a 300-kV accelerating tube, and a vacuum system. The results of the 300-kV high-voltage terminal tests are presented in this paper. + + 2015-07-15T02:41:11Z + International Conference on Accelerators and Beam Utilization (ICABU2014) + + Yong-Sub Cho KNS (2014); W. 
Sima IEEE (2004) 480-483; LA-UR-87-126 (1987); Jeong-tae Kim KNS (2014) + + Sae-Hoon Park + + + Yu-Seok Kim + + 10.3938/jkps + + + + http://arxiv.org/abs/2005.05585v1 + Investigation of the Status of Unit 2 Nuclear Reactor of the Fukushima Daiichi by the Cosmic Muon Radiography + 2020-05-12T07:26:37Z + + + We have investigated the status of the nuclear debris in the Unit-2 Nuclear Reactor of the Fukushima Daiichi Nuclear Power plant by the method called Cosmic Muon Radiography. In this measurement, the muon detector was placed outside of the reactor building as was the case of the measurement for the Unit-1 Reactor. Compared to the previous measurements, the detector was down-sized, which made us possible to locate it closer to the reactor and to investigate especially the lower part of the fuel loading zone. We identified the inner structures of the reactor such as the containment vessel, pressure vessel and other objects through the thick concrete wall of the reactor building. Furthermore, the observation showed existence of heavy material at the bottom of the pressure vessel, which can be interpreted as the debris of melted nuclear fuel dropped from the loading zone. 
+ + 2020-05-12T07:26:37Z + 11 figures and 2 tables + + + Hirofumi Fujii + + + Kazuhiko Hara + + + Shugo Hashimoto + + + Kohei Hayashi + + + Hidekazu Kakuno + + + Hideyo Kodama + + + Gi Meiki + + + Masato Mizokami + + + Shinya Mizokami + + + Kanetada Nagamine + + + Kotaro Sato + + + Shunsuke Sekita + + + Hiroshi Shirai + + + Shin-Hong Kim + + + Takayuki Sumiyoshi + + + Atsuto Suzuki + + + Yoshihisa Takada + + + Kazuki Takahashi + + + Yu Takahashi + + + Fumihiko Takasaki + + + Daichi Yamada + + + Satoru Yamashita + + + diff --git a/tests/fixtures/openalex_works_response.json b/tests/fixtures/openalex_works_response.json new file mode 100644 index 0000000..d4666fc --- /dev/null +++ b/tests/fixtures/openalex_works_response.json @@ -0,0 +1 @@ +{"meta": {"count": 1111, "db_response_time_ms": 66, "page": 1, "per_page": 5, "groups_count": null, "x_query": {"oql": "works where it's open access\n and title/abstract contains (process safety pressure vessel)\nreturn id, DOI, title, abstract inverted index, date, year, type, open access, primary location, locations, primary topic", "oqo": {"get_rows": "works", "filter_rows": [{"column_id": "open_access.is_oa", "value": true}, {"column_id": "title_and_abstract.search", "value": "process safety pressure vessel", "operator": "contains"}], "select": ["id", "doi", "title", "abstract_inverted_index", "publication_date", "publication_year", "type", "open_access", "primary_location", "locations", "primary_topic"], "per_page": 5}, "url": "/works?filter=open_access.is_oa:true,title_and_abstract.search:process safety pressure vessel&select=id,doi,title,abstract_inverted_index,publication_date,publication_year,type,open_access,primary_location,locations,primary_topic&per_page=5"}, "cost_usd": 0.001}, "results": [{"id": "https://openalex.org/W2910511816", "doi": "https://doi.org/10.1088/1757-899x/469/1/012009", "title": "A critical review and analysis of pressure vessel structures", "abstract_inverted_index": {"This": [0], "paper": [1], 
"provides": [2], "an": [3], "overview": [4], "of": [5, 28, 54, 89, 95, 107], "the": [6, 18, 29, 41, 47, 52, 65, 70, 84, 90, 93, 96, 102, 108, 119], "pressure": [7, 30, 77, 97, 121], "vessel,": [8, 42], "starting": [9], "with": [10, 51, 114], "its": [11], "background": [12], "and": [13, 25, 56, 87, 104], "a": [14], "brief": [15], "history.": [16], "Then,": [17], "geometry,": [19], "main": [20], "components,": [21], "classification,": [22], "applications,": [23], "materials": [24], "fabrication": [26], "process": [27], "vessel": [31, 98, 122], "are": [32, 112], "also": [33], "discussed.": [34], "When": [35], "designing": [36], "or": [37, 78], "performing": [38], "optimization": [39], "on": [40], "it": [43], "is": [44, 99], "crucial": [45], "for": [46, 118], "designers": [48], "to": [49, 58, 63], "familiar": [50], "types": [53], "failures": [55], "loadings,": [57], "select": [59], "appropriate": [60], "analytical": [61], "methods": [62], "analyse": [64], "vessel.": [66, 91], "As": [67], "well": [68], "as": [69, 74], "design": [71, 76, 94], "parameters": [72], "such": [73], "thickness,": [75], "allowable": [79], "stresses,": [80], "which": [81], "can": [82], "alter": [83], "performance,": [85], "efficiency": [86], "safety": [88], "Since": [92], "governed": [100], "by": [101], "codes": [103, 111], "standards,": [105], "some": [106], "commonly": [109], "used": [110], "presented,": [113], "more": [115], "details": [116], "included": [117], "ASME": [120], "code.": [123]}, "publication_date": "2019-01-16", "publication_year": 2019, "type": "review", "open_access": {"is_oa": true, "oa_status": "diamond", "oa_url": "https://doi.org/10.1088/1757-899x/469/1/012009", "any_repository_has_fulltext": false}, "primary_location": {"id": "doi:10.1088/1757-899x/469/1/012009", "is_oa": true, "landing_page_url": "https://doi.org/10.1088/1757-899x/469/1/012009", "pdf_url": null, "source": {"id": "https://openalex.org/S4210189194", "display_name": "IOP Conference Series Materials Science 
and Engineering", "issn_l": "1757-8981", "issn": ["1757-8981", "1757-899X"], "is_oa": true, "is_in_doaj": false, "is_core": false, "host_organization": "https://openalex.org/P4310320083", "host_organization_name": "IOP Publishing", "host_organization_lineage": ["https://openalex.org/P4310320083", "https://openalex.org/P4310311669"], "host_organization_lineage_names": ["IOP Publishing", "Institute of Physics"], "type": "conference"}, "license": "cc-by", "license_id": "https://openalex.org/licenses/cc-by", "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "IOP Conference Series: Materials Science and Engineering", "raw_type": "journal-article"}, "locations": [{"id": "doi:10.1088/1757-899x/469/1/012009", "is_oa": true, "landing_page_url": "https://doi.org/10.1088/1757-899x/469/1/012009", "pdf_url": null, "source": {"id": "https://openalex.org/S4210189194", "display_name": "IOP Conference Series Materials Science and Engineering", "issn_l": "1757-8981", "issn": ["1757-8981", "1757-899X"], "is_oa": true, "is_in_doaj": false, "is_core": false, "host_organization": "https://openalex.org/P4310320083", "host_organization_name": "IOP Publishing", "host_organization_lineage": ["https://openalex.org/P4310320083", "https://openalex.org/P4310311669"], "host_organization_lineage_names": ["IOP Publishing", "Institute of Physics"], "type": "conference"}, "license": "cc-by", "license_id": "https://openalex.org/licenses/cc-by", "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "IOP Conference Series: Materials Science and Engineering", "raw_type": "journal-article"}], "primary_topic": {"id": "https://openalex.org/T12581", "display_name": "Engineering Structural Analysis Methods", "score": 0.9983999729156494, "subfield": {"id": "https://openalex.org/subfields/2210", "display_name": "Mechanical Engineering"}, "field": {"id": "https://openalex.org/fields/22", "display_name": "Engineering"}, "domain": 
{"id": "https://openalex.org/domains/3", "display_name": "Physical Sciences"}}}, {"id": "https://openalex.org/W4407429739", "doi": "https://doi.org/10.1016/j.ijhydene.2025.02.108", "title": "A review of type IV composite overwrapped pressure vessels", "abstract_inverted_index": {"Type": [0, 22, 65, 128, 147, 166, 180], "IV": [1, 23, 66, 129, 148, 167, 181], "Composite": [2], "overwrapped": [3], "pressure": [4], "vessels": [5, 24], "(COPVs)": [6], "are": [7, 25], "commonly": [8], "used": [9], "in": [10, 41, 72, 96, 136, 152, 169, 177], "high-pressure": [11], "environments": [12], "such": [13], "as": [14], "aerospace,": [15], "automotive,": [16], "and": [17, 43, 60, 77, 88, 101, 124, 133, 144, 155, 171], "industrial": [18], "sectors.": [19], "The": [20], "COPV": [21], "characterized": [26], "by": [27, 34], "their": [28], "non-metallic": [29], "liner,": [30], "which": [31], "is": [32], "surrounded": [33], "composite": [35, 156], "materials,": [36, 57, 143], "leading": [37], "to": [38, 84], "significant": [39], "reductions": [40], "weight": [42], "enhanced": [44], "storage": [45], "capacity.": [46], "This": [47], "review": [48, 92, 115], "article": [49], "outlines": [50], "the": [51, 55, 81, 91, 108, 121, 137], "essential": [52], "aspects": [53], "of": [54, 111, 127, 141, 146, 162], "design,": [56, 142], "manufacturing": [58, 145], "processes,": [59], "performance": [61], "criteria": [62], "associated": [63], "with": [64], "COPVs.": [67, 112, 149], "It": [68], "also": [69], "explores": [70], "advancements": [71, 135], "resin": [73], "systems,": [74], "fiber": [75], "reinforcements,": [76], "coatings,": [78], "while": [79], "addressing": [80], "challenges": [82], "related": [83], "long-term": [85], "durability,": [86], "reliability,": [87], "safety.": [89, 159], "Furthermore,": [90], "emphasizes": [93], "recent": [94], "developments": [95], "testing": [97, 163], "protocols,": [98], "certification": [99], "standards,": [100], "emerging": [102], "research": [103], 
"trends": [104, 176], "focused": [105], "on": [106], "creating": [107], "next": [109], "generation": [110], "Overall,": [113], "this": [114], "offers": [116], "a": [117], "thorough": [118], "insight": [119], "into": [120], "current": [122], "capabilities": [123], "future": [125], "possibilities": [126], "COPVs,": [130], "promoting": [131], "innovation": [132], "further": [134], "field.": [138], "\u2022": [139, 150, 160, 174], "Review": [140], "Advances": [151], "liner": [153], "fabrication": [154], "overwraps": [157], "for": [158, 165, 183], "Evaluation": [161], "methods": [164], "COPVs": [168, 182], "aerospace": [170], "H\u2082": [172], "storage.": [173], "Future": [175], "cost-effective,": [178], "recyclable": [179], "hydrogen.": [184]}, "publication_date": "2025-02-12", "publication_year": 2025, "type": "review", "open_access": {"is_oa": true, "oa_status": "hybrid", "oa_url": "https://doi.org/10.1016/j.ijhydene.2025.02.108", "any_repository_has_fulltext": false}, "primary_location": {"id": "doi:10.1016/j.ijhydene.2025.02.108", "is_oa": true, "landing_page_url": "https://doi.org/10.1016/j.ijhydene.2025.02.108", "pdf_url": null, "source": {"id": "https://openalex.org/S48860480", "display_name": "International Journal of Hydrogen Energy", "issn_l": "0360-3199", "issn": ["0360-3199", "1879-3487"], "is_oa": false, "is_in_doaj": false, "is_core": true, "host_organization": "https://openalex.org/P4310320990", "host_organization_name": "Elsevier BV", "host_organization_lineage": ["https://openalex.org/P4310320990"], "host_organization_lineage_names": ["Elsevier BV"], "type": "journal"}, "license": "cc-by", "license_id": "https://openalex.org/licenses/cc-by", "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "International Journal of Hydrogen Energy", "raw_type": "journal-article"}, "locations": [{"id": "doi:10.1016/j.ijhydene.2025.02.108", "is_oa": true, "landing_page_url": "https://doi.org/10.1016/j.ijhydene.2025.02.108", 
"pdf_url": null, "source": {"id": "https://openalex.org/S48860480", "display_name": "International Journal of Hydrogen Energy", "issn_l": "0360-3199", "issn": ["0360-3199", "1879-3487"], "is_oa": false, "is_in_doaj": false, "is_core": true, "host_organization": "https://openalex.org/P4310320990", "host_organization_name": "Elsevier BV", "host_organization_lineage": ["https://openalex.org/P4310320990"], "host_organization_lineage_names": ["Elsevier BV"], "type": "journal"}, "license": "cc-by", "license_id": "https://openalex.org/licenses/cc-by", "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "International Journal of Hydrogen Energy", "raw_type": "journal-article"}], "primary_topic": {"id": "https://openalex.org/T10219", "display_name": "Mechanical Behavior of Composites", "score": 0.9961000084877014, "subfield": {"id": "https://openalex.org/subfields/2211", "display_name": "Mechanics of Materials"}, "field": {"id": "https://openalex.org/fields/22", "display_name": "Engineering"}, "domain": {"id": "https://openalex.org/domains/3", "display_name": "Physical Sciences"}}}, {"id": "https://openalex.org/W3107397139", "doi": "https://doi.org/10.1007/978-3-030-64823-7_34", "title": "A Digital Twin for Safety and Risk Management: A Prototype for a Hydrogen High-Pressure Vessel", "abstract_inverted_index": null, "publication_date": "2020-01-01", "publication_year": 2020, "type": "book-chapter", "open_access": {"is_oa": true, "oa_status": "green", "oa_url": "https://aaltodoc.aalto.fi/handle/123456789/109586", "any_repository_has_fulltext": true}, "primary_location": {"id": "doi:10.1007/978-3-030-64823-7_34", "is_oa": false, "landing_page_url": "https://doi.org/10.1007/978-3-030-64823-7_34", "pdf_url": null, "source": {"id": "https://openalex.org/S106296714", "display_name": "Lecture notes in computer science", "issn_l": "0302-9743", "issn": ["0302-9743", "1611-3349"], "is_oa": false, "is_in_doaj": false, "is_core": true, 
"host_organization": "https://openalex.org/P4310319900", "host_organization_name": "Springer Science+Business Media", "host_organization_lineage": ["https://openalex.org/P4310319900", "https://openalex.org/P4310319965"], "host_organization_lineage_names": ["Springer Science+Business Media", "Springer Nature"], "type": "book series"}, "license": null, "license_id": null, "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "Lecture Notes in Computer Science", "raw_type": "book-chapter"}, "locations": [{"id": "doi:10.1007/978-3-030-64823-7_34", "is_oa": false, "landing_page_url": "https://doi.org/10.1007/978-3-030-64823-7_34", "pdf_url": null, "source": {"id": "https://openalex.org/S106296714", "display_name": "Lecture notes in computer science", "issn_l": "0302-9743", "issn": ["0302-9743", "1611-3349"], "is_oa": false, "is_in_doaj": false, "is_core": true, "host_organization": "https://openalex.org/P4310319900", "host_organization_name": "Springer Science+Business Media", "host_organization_lineage": ["https://openalex.org/P4310319900", "https://openalex.org/P4310319965"], "host_organization_lineage_names": ["Springer Science+Business Media", "Springer Nature"], "type": "book series"}, "license": null, "license_id": null, "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "Lecture Notes in Computer Science", "raw_type": "book-chapter"}, {"id": "pmh:oai:aaltodoc.aalto.fi:123456789/109586", "is_oa": true, "landing_page_url": "https://research.aalto.fi/en/publications/57f0a92f-a7bc-46e0-b384-871ebfa22cb9", "pdf_url": "https://aaltodoc.aalto.fi/handle/123456789/109586", "source": {"id": "https://openalex.org/S4306401663", "display_name": "Aaltodoc (Aalto University)", "issn_l": null, "issn": null, "is_oa": false, "is_in_doaj": false, "is_core": false, "host_organization": "https://openalex.org/I9927081", "host_organization_name": "Aalto University", "host_organization_lineage": 
["https://openalex.org/I9927081"], "host_organization_lineage_names": [], "type": "repository"}, "license": "other-oa", "license_id": "https://openalex.org/licenses/other-oa", "version": "submittedVersion", "is_accepted": false, "is_published": false, "raw_source_name": null, "raw_type": "acceptedVersion"}], "primary_topic": {"id": "https://openalex.org/T10763", "display_name": "Digital Transformation in Industry", "score": 0.9955000281333923, "subfield": {"id": "https://openalex.org/subfields/2209", "display_name": "Industrial and Manufacturing Engineering"}, "field": {"id": "https://openalex.org/fields/22", "display_name": "Engineering"}, "domain": {"id": "https://openalex.org/domains/3", "display_name": "Physical Sciences"}}}, {"id": "https://openalex.org/W4405974102", "doi": "https://doi.org/10.3390/pr13010074", "title": "Mapping the Knowledge Domain of Pressure Vessels and Piping Fields for Safety Research in Industrial Processes: A Bibliometric Analysis", "abstract_inverted_index": {"With": [0], "the": [1, 26, 32, 43, 82, 127, 142, 180, 193, 203, 215, 219, 226, 233, 240], "rapid": [2], "advancement": [3, 241], "of": [4, 34, 45, 111, 116, 145, 195, 223, 229, 235, 242], "modern": [5], "industries,": [6], "pressure": [7, 35, 146], "vessels": [8, 147], "and": [9, 21, 28, 37, 75, 90, 100, 104, 106, 119, 121, 124, 148, 155, 158, 162, 187, 192, 197, 200, 209, 221, 239, 255], "piping": [10, 38], "have": [11], "become": [12], "increasingly": [13], "integral": [14], "to": [15, 51, 67, 213], "sectors": [16], "such": [17, 96], "as": [18, 55, 97], "energy,": [19], "petrochemicals,": [20], "process": [22], "industries.": [23], "To": [24], "grasp": [25], "research": [27, 83, 139, 258], "application": [29, 194], "status": [30], "in": [31, 42, 58, 84, 131, 259], "field": [33, 86], "vessel": [36], "safety,": [39], "670": [40], "publications": [41], "Web": [44], "Science": [46, 99], "core": [47], "database": [48], "from": [49], "2008": [50], "2024": [52], "were": [53, 65], 
"taken": [54], "data": [56], "samples": [57], "this": [59, 85, 132, 260], "paper.": [60], "The": [61, 78, 108, 134, 169], "knowledge": [62], "mapping": [63], "tools": [64], "used": [66], "carry": [68], "out": [69], "co-occurrence": [70], "analysis,": [71, 157], "keyword": [72], "burst": [73], "detection,": [74], "co-citation": [76], "analysis.": [77], "results": [79], "show": [80], "that": [81], "presents": [87], "a": [88], "multidisciplinary": [89, 261], "cross-disciplinary": [91], "state,": [92], "involving": [93], "multiple": [94], "disciplines": [95], "Nuclear": [98], "Technology,": [101], "Engineering": [102, 123], "Mechanics,": [103], "Energy": [105], "Fuels.": [107], "\u201cInternational": [109, 114], "Journal": [110, 115], "Hydrogen": [112], "Energy\u201d,": [113], "Pressure": [117], "Vessels": [118], "Piping\u201d,": [120], "\u201cNuclear": [122], "Design\u201d": [125], "are": [126], "primary": [128], "publication": [129], "outlets": [130], "domain.": [133], "study": [135], "identifies": [136], "three": [137, 176], "major": [138], "hotspots:": [140], "(1)": [141, 178], "safety": [143, 181, 220, 253], "performance": [144], "piping,": [149], "(2)": [150, 189], "structural": [151], "integrity,": [152], "failure": [153], "mechanisms,": [154], "stress": [156], "(3)": [159, 201], "numerical": [160], "simulation": [161], "thermal\u2013hydraulic": [163], "analysis": [164], "under": [165], "various": [166], "operating": [167], "conditions.": [168], "current": [170, 216], "challenges": [171, 217], "can": [172], "be": [173], "summarized": [174], "into": [175], "aspects:": [177], "addressing": [179], "risks": [182], "brought": [183], "by": [184], "new": [185], "technologies": [186], "materials,": [188], "promoting": [190], "innovation": [191], "detection": [196, 228], "monitoring": [198], "technologies,": [199], "strengthening": [202], "building": [204], "capacity": [205], "for": [206, 251], "accident": [207], "prevention": [208], "emergency": [210], "management.": 
[211], "Specific": [212], "China,": [214], "include": [218], "management": [222], "aging": [224], "equipment,": [225], "effective": [227], "circumferential": [230], "weld": [231], "cracks,": [232], "refinement": [234], "risk": [236], "assessment": [237], "models,": [238], "smart": [243], "technology": [244], "applications.": [245], "These": [246], "findings": [247], "offer": [248], "valuable": [249], "insights": [250], "advancing": [252], "practices": [254], "guiding": [256], "future": [257], "field.": [262]}, "publication_date": "2025-01-01", "publication_year": 2025, "type": "article", "open_access": {"is_oa": true, "oa_status": "gold", "oa_url": "https://www.mdpi.com/2227-9717/13/1/74/pdf?version=1735722405", "any_repository_has_fulltext": false}, "primary_location": {"id": "doi:10.3390/pr13010074", "is_oa": true, "landing_page_url": "https://doi.org/10.3390/pr13010074", "pdf_url": "https://www.mdpi.com/2227-9717/13/1/74/pdf?version=1735722405", "source": {"id": "https://openalex.org/S4210201879", "display_name": "Processes", "issn_l": "2227-9717", "issn": ["2227-9717"], "is_oa": true, "is_in_doaj": false, "is_core": true, "host_organization": "https://openalex.org/P4310310987", "host_organization_name": "Multidisciplinary Digital Publishing Institute", "host_organization_lineage": ["https://openalex.org/P4310310987"], "host_organization_lineage_names": ["Multidisciplinary Digital Publishing Institute"], "type": "journal"}, "license": "cc-by", "license_id": "https://openalex.org/licenses/cc-by", "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "Processes", "raw_type": "journal-article"}, "locations": [{"id": "doi:10.3390/pr13010074", "is_oa": true, "landing_page_url": "https://doi.org/10.3390/pr13010074", "pdf_url": "https://www.mdpi.com/2227-9717/13/1/74/pdf?version=1735722405", "source": {"id": "https://openalex.org/S4210201879", "display_name": "Processes", "issn_l": "2227-9717", "issn": ["2227-9717"], "is_oa": 
true, "is_in_doaj": false, "is_core": true, "host_organization": "https://openalex.org/P4310310987", "host_organization_name": "Multidisciplinary Digital Publishing Institute", "host_organization_lineage": ["https://openalex.org/P4310310987"], "host_organization_lineage_names": ["Multidisciplinary Digital Publishing Institute"], "type": "journal"}, "license": "cc-by", "license_id": "https://openalex.org/licenses/cc-by", "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "Processes", "raw_type": "journal-article"}], "primary_topic": {"id": "https://openalex.org/T11357", "display_name": "Risk and Safety Analysis", "score": 0.9919000267982483, "subfield": {"id": "https://openalex.org/subfields/1804", "display_name": "Statistics, Probability and Uncertainty"}, "field": {"id": "https://openalex.org/fields/18", "display_name": "Decision Sciences"}, "domain": {"id": "https://openalex.org/domains/2", "display_name": "Social Sciences"}}}, {"id": "https://openalex.org/W4391130399", "doi": "https://doi.org/10.1016/j.ijhydene.2024.01.182", "title": "Development of machine learning based classifier for the pressure test result prediction of type IV composite overwrapped pressure vessels", "abstract_inverted_index": {"The": [0, 69, 88], "stringent": [1], "safety": [2, 47, 194], "regulations": [3], "of": [4, 29, 33, 67, 77, 155, 161, 175, 185], "type": [5], "IV": [6], "composite": [7], "overwrapped": [8], "pressure": [9, 85, 128, 149], "vessels": [10], "(COPVs)": [11], "for": [12, 60, 147, 180], "commercial": [13, 189], "vehicles": [14], "mandate": [15], "a": [16, 74, 176], "certification": [17], "process": [18], "involving": [19], "pressurization": [20], "up": [21], "to": [22, 42, 166, 192], "1050": [23], "bar,": [24], "with": [25, 152], "the": [26, 97, 122, 126, 144, 148, 172, 182], "critical": [27], "requirement": [28], "withstanding": [30], "burst": [31, 49, 83, 127], "pressures": [32], "1570": [34], "bar.": [35], "Analyzing": [36], 
"proof": [37], "test": [38, 89, 151], "data": [39, 80, 123], "is": [40], "crucial": [41], "enhance": [43], "and": [44, 64, 84, 107, 117, 133], "ensure": [45], "tank": [46], "regarding": [48], "pressure.": [50], "In": [51], "this": [52], "study,": [53], "we": [54], "developed": [55], "various": [56], "machine": [57, 177], "learning": [58, 178], "classifiers": [59, 70], "structure": [61], "health": [62], "monitoring": [63], "damage": [65, 183], "prediction": [66], "COPVs.": [68], "were": [71, 91, 102], "trained": [72, 108], "using": [73, 109], "substantial": [75], "amount": [76], "acoustic": [78], "emission": [79], "collected": [81, 124], "during": [82, 96], "cycling": [86, 150], "tests.": [87, 195], "results": [90], "employed": [92], "as": [93], "label": [94], "inputs": [95], "training": [98, 121], "process.": [99], "Statistical": [100], "features": [101], "extracted": [103], "per": [104], "time": [105], "unit": [106], "Naive": [110], "Bayes,": [111], "Logistic": [112], "Regression,": [113], "Decision": [114, 131], "Tree,": [115, 132], "XGBoost,": [116], "TabNet": [118, 141, 158], "models.": [119], "Upon": [120], "from": [125], "test,": [129], "TabNet,": [130], "XGBoost": [134], "achieved": [135], "classification": [136, 167], "accuracies": [137], "above": [138], "0.94.": [139], "Notably,": [140], "demonstrated": [142], "also": [143], "best": [145], "performance": [146], "an": [153], "accuracy": [154], "0.98.": [156], "Furthermore,": [157], "provided": [159], "visualizations": [160], "feature": [162], "sensitivity": [163], "in": [164, 188], "relation": [165], "results.": [168], "This": [169], "study": [170], "marks": [171], "first": [173], "development": [174], "classifier": [179], "predicting": [181], "state": [184], "COPV": [186], "tanks": [187], "applications": [190], "pertaining": [191], "required": [193]}, "publication_date": "2024-01-23", "publication_year": 2024, "type": "article", "open_access": {"is_oa": true, "oa_status": "hybrid", "oa_url": 
"https://doi.org/10.1016/j.ijhydene.2024.01.182", "any_repository_has_fulltext": false}, "primary_location": {"id": "doi:10.1016/j.ijhydene.2024.01.182", "is_oa": true, "landing_page_url": "https://doi.org/10.1016/j.ijhydene.2024.01.182", "pdf_url": null, "source": {"id": "https://openalex.org/S48860480", "display_name": "International Journal of Hydrogen Energy", "issn_l": "0360-3199", "issn": ["0360-3199", "1879-3487"], "is_oa": false, "is_in_doaj": false, "is_core": true, "host_organization": "https://openalex.org/P4310320990", "host_organization_name": "Elsevier BV", "host_organization_lineage": ["https://openalex.org/P4310320990"], "host_organization_lineage_names": ["Elsevier BV"], "type": "journal"}, "license": "cc-by-nc-nd", "license_id": "https://openalex.org/licenses/cc-by-nc-nd", "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "International Journal of Hydrogen Energy", "raw_type": "journal-article"}, "locations": [{"id": "doi:10.1016/j.ijhydene.2024.01.182", "is_oa": true, "landing_page_url": "https://doi.org/10.1016/j.ijhydene.2024.01.182", "pdf_url": null, "source": {"id": "https://openalex.org/S48860480", "display_name": "International Journal of Hydrogen Energy", "issn_l": "0360-3199", "issn": ["0360-3199", "1879-3487"], "is_oa": false, "is_in_doaj": false, "is_core": true, "host_organization": "https://openalex.org/P4310320990", "host_organization_name": "Elsevier BV", "host_organization_lineage": ["https://openalex.org/P4310320990"], "host_organization_lineage_names": ["Elsevier BV"], "type": "journal"}, "license": "cc-by-nc-nd", "license_id": "https://openalex.org/licenses/cc-by-nc-nd", "version": "publishedVersion", "is_accepted": true, "is_published": true, "raw_source_name": "International Journal of Hydrogen Energy", "raw_type": "journal-article"}], "primary_topic": {"id": "https://openalex.org/T10219", "display_name": "Mechanical Behavior of Composites", "score": 0.9993000030517578, "subfield": 
{"id": "https://openalex.org/subfields/2211", "display_name": "Mechanics of Materials"}, "field": {"id": "https://openalex.org/fields/22", "display_name": "Engineering"}, "domain": {"id": "https://openalex.org/domains/3", "display_name": "Physical Sciences"}}}], "group_by": []} \ No newline at end of file diff --git a/tests/test_arxiv_collector_units.py b/tests/test_arxiv_collector_units.py new file mode 100644 index 0000000..d6a1930 --- /dev/null +++ b/tests/test_arxiv_collector_units.py @@ -0,0 +1,75 @@ +"""B-3 PR2 — arXiv 파서·쿼리빌더 순수 단위 테스트 (plan safety-library-b3-1). + +fixture = arXiv API 실응답 박제(abs:"pressure vessel" relevance 10건 — +DOI 보유 / journal_ref 만 보유 / 둘 다 없음 3경로 포함). run()/적재(DB)는 PR2 라이브 검증. +""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "app")) + +from workers.arxiv_collector import ( # noqa: E402 + build_search_query, + parse_arxiv_feed, +) + +FIX = Path(__file__).parent / "fixtures" / "arxiv_search_pressure_vessel.xml" + + +def _entries(): + total, entries = parse_arxiv_feed(FIX.read_text(encoding="utf-8")) + return total, {e.arxiv_id: e for e in entries}, entries + + +# ─── 피드 레벨 ─── + +def test_feed_total_and_count(): + total, by_id, entries = _entries() + assert total == 89 # fixture totalResults (페이징 재료) + assert len(entries) == 10 + + +def test_versionless_ids(): + _, by_id, entries = _entries() + # arxiv_id 는 versionless (버전 접미는 .version 으로 분리) + assert all("/" not in e.arxiv_id for e in entries) + assert "1209.2405" in by_id and by_id["1209.2405"].version == "v1" + + +# ─── DOI 보유 entry ─── + +def test_entry_with_doi(): + _, by_id, _ = _entries() + e = by_id["1209.2405"] + assert e.doi == "10.1063/1.4707088" # normalize_doi 적용(소문자·정규화) + assert e.journal_ref is None + assert e.primary_category == "physics.acc-ph" + assert e.title.startswith("A Survey of Pressure Vessel") + assert len(e.summary) > 200 # 초록 본문 + assert e.published is not None + assert e.abs_url and "/abs/" in e.abs_url + 
assert e.pdf_url and "pdf" in e.pdf_url + + +# ─── journal_ref 만 (DOI 없음) — 압력용기 저널 출판분 ─── + +def test_entry_journal_ref_without_doi(): + _, by_id, _ = _entries() + e = by_id["0804.0261"] + assert e.doi is None + assert e.journal_ref and "Pressure Vessel" in e.journal_ref + + +# ─── 둘 다 없음(최근 preprint) 경로도 존재 ─── + +def test_entry_neither_doi_nor_journal_ref_exists(): + _, _, entries = _entries() + assert any(e.doi is None and e.journal_ref is None for e in entries) + + +# ─── 쿼리 빌더 ─── + +def test_build_search_query(): + q = build_search_query("eess.SY", ["pressure vessel", "safety"]) + assert q == 'cat:eess.SY AND (abs:"pressure vessel" OR abs:safety)' diff --git a/tests/test_openalex_collector_units.py b/tests/test_openalex_collector_units.py new file mode 100644 index 0000000..afee7b0 --- /dev/null +++ b/tests/test_openalex_collector_units.py @@ -0,0 +1,106 @@ +"""B-3 PR3 — OpenAlex 파서·초록복원·license 순수 단위 테스트 (plan safety-library-b3-1). + +fixture = OpenAlex /works 실응답 박제(process safety/pressure vessel OA 5건 — +cc-by/cc-by-nc-nd/license None, 초록 있음/없음). run()/적재(DB)는 PR3 라이브 검증. 
+""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "app")) + +from workers.openalex_collector import ( # noqa: E402 + _reconstruct_abstract, + _seeds, + build_filter, + build_issn_filter, + license_meta, + parse_openalex_works, +) + +FIX = Path(__file__).parent / "fixtures" / "openalex_works_response.json" + + +def _works(): + count, cursor, works = parse_openalex_works(FIX.read_text(encoding="utf-8")) + return count, {w.openalex_id: w for w in works}, works + + +# ─── 피드 레벨 ─── + +def test_count_and_results(): + count, by_id, works = _works() + assert count == 1111 + assert len(works) == 5 + assert all(w.openalex_id.startswith("W") and "/" not in w.openalex_id for w in works) + + +# ─── 초록 보유 + CC 라이선스 ─── + +def test_work_with_abstract_and_cc(): + _, by_id, _ = _works() + w = by_id["W2910511816"] + assert w.doi and w.doi.startswith("10.") and w.doi == w.doi.lower() # normalize_doi + assert len(w.abstract) > 50 # inverted-index 복원 + assert w.oa_status == "diamond" and w.is_oa is True + assert w.license == "cc-by" + assert license_meta(w.license, w.is_oa, w.source_name)["redistribute"] is True + + +# ─── 초록 없는 thin 레코드(skip 대상) ─── + +def test_work_without_abstract(): + _, by_id, _ = _works() + w = by_id["W3107397139"] + assert w.abstract == "" # inverted-index 부재 → 빈 초록 + lm = license_meta(w.license, w.is_oa, w.source_name) + assert lm["redistribute"] is False # license None → 비배포 + + +# ─── cc-by-nc-nd 도 CC 계열 → redistribute True ─── + +def test_cc_variant_redistribute(): + _, by_id, _ = _works() + w = by_id["W4391130399"] + assert w.license == "cc-by-nc-nd" + assert license_meta(w.license, w.is_oa, w.source_name)["redistribute"] is True + + +# ─── 초록 inverted-index 복원 순서 ─── + +def test_reconstruct_abstract_order(): + inv = {"Safety": [0], "of": [1, 4], "pressure": [2], "vessels": [3], "design": [5]} + assert _reconstruct_abstract(inv) == "Safety of pressure vessels of design" + assert 
_reconstruct_abstract(None) == "" + assert _reconstruct_abstract({}) == "" + + +# ─── license_meta 분기 ─── + +def test_license_meta_branches(): + assert license_meta("cc-by", True, "X")["redistribute"] is True + assert license_meta("cc0", True, "X")["redistribute"] is True + none_oa = license_meta(None, True, "X") + assert none_oa["redistribute"] is False and none_oa["scheme"] == "open-unspecified" + closed = license_meta(None, False, "X") + assert closed["redistribute"] is False and closed["scheme"] == "proprietary" + + +# ─── 쿼리 빌더 ─── + +def test_build_filter(): + assert build_filter("process safety") == "title_and_abstract.search:process safety" + assert build_filter("process safety", "2026-06-01") == \ + "title_and_abstract.search:process safety,from_publication_date:2026-06-01" + + +# ─── PR6: ISSN 소스 시드 (KR/JP 안전 저널 직접 커버) ─── + +def test_build_issn_filter_and_seeds(): + assert build_issn_filter("1738-3803") == "primary_location.source.issn:1738-3803" + assert build_issn_filter("1738-3803", "2026-01-01") == \ + "primary_location.source.issn:1738-3803,from_publication_date:2026-01-01" + seeds = _seeds() + kinds = [k for _, _, k in seeds] + assert kinds[0] == "issn" # ISSN 시드가 키워드보다 먼저(cap 우선권) + assert any(v == "1738-3803" and k == "issn" for _, v, k in seeds) # 한국안전학회지 포함 diff --git a/tests/test_paper_doi_units.py b/tests/test_paper_doi_units.py new file mode 100644 index 0000000..d356881 --- /dev/null +++ b/tests/test_paper_doi_units.py @@ -0,0 +1,141 @@ +"""B-3 PR1 — 논문 DOI 코어 순수 단위 테스트 (plan safety-library-b3-1). + +holder.find_paper_holder(DB 조회)는 PR2 arXiv 실수집 시 라이브 검증 — 여기선 순수 함수만. 
+""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "app")) + +from services.papers.doi import ( # noqa: E402 + arxiv_doi, + normalize_doi, + paper_doi_hash, + parse_arxiv_id, + parse_doi_from_text, + read_paper_doi, + with_paper_doi, + with_parent_doi, +) + + +# ─── normalize_doi: 단일 함수(저장=조회) ─── + +def test_normalize_strips_url_and_lowercases(): + assert normalize_doi("https://doi.org/10.1585/PFR.15.2402039") == "10.1585/pfr.15.2402039" + assert normalize_doi("http://dx.doi.org/10.1115/1.4045678") == "10.1115/1.4045678" + assert normalize_doi("doi:10.1016/j.jlp.2020.104321") == "10.1016/j.jlp.2020.104321" + assert normalize_doi("DOI: 10.1234/ABC") == "10.1234/abc" + + +def test_normalize_trims_whitespace_and_citation_noise(): + assert normalize_doi(" https://doi.org/10.1234/abc ") == "10.1234/abc" + assert normalize_doi("10.1234/abc.") == "10.1234/abc" + assert normalize_doi("10.1234/abc;") == "10.1234/abc" + + +def test_normalize_preserves_parens_in_doi(): + # 괄호는 DOI 일부일 수 있어 보존 (과삭제 = 다른 논문 병합 = 데이터 손상, near-dup 보다 위험) + assert normalize_doi("10.1016/s0010-8650(00)80003-2") == "10.1016/s0010-8650(00)80003-2" + assert normalize_doi("https://doi.org/10.1016/S0010-8650(00)80003-2") == "10.1016/s0010-8650(00)80003-2" + + +def test_normalize_rejects_non_doi(): + assert normalize_doi(None) is None + assert normalize_doi("") is None + assert normalize_doi(" ") is None + assert normalize_doi("not-a-doi") is None + assert normalize_doi("arXiv:2606.08108") is None # arXiv id 는 DOI 아님 + + +def test_normalize_is_idempotent_store_equals_lookup(): + # 저장측·조회측이 같은 함수를 거치면 표기 차이가 한 값으로 붕괴 (dedup 성립 조건) + forms = [ + "https://doi.org/10.1/X", + "doi:10.1/x", + "10.1/X", + " HTTPS://DOI.ORG/10.1/x ", + ] + assert {normalize_doi(f) for f in forms} == {"10.1/x"} + assert normalize_doi(normalize_doi("https://doi.org/10.1/X")) == "10.1/x" # 멱등 + + +# ─── paper_doi_hash: holder file_hash 키 ─── + +def 
test_paper_doi_hash_deterministic_len32(): + h = paper_doi_hash("10.1234/abc") + assert len(h) == 32 + assert h == paper_doi_hash("10.1234/abc") + + +def test_paper_doi_hash_distinct_per_doi(): + assert paper_doi_hash("10.1/a") != paper_doi_hash("10.1/b") + + +# ─── 2-Document extract_meta 계약 (holder doi / child parent_doi 상호 배타) ─── + +def test_with_paper_doi_holder_shape_and_merge_safe(): + meta = with_paper_doi({"license": {"scheme": "cc_by"}, "source_id": 7}, "10.1/x") + assert meta["paper"]["doi"] == "10.1/x" + assert "parent_doi" not in meta["paper"] + assert meta["license"]["scheme"] == "cc_by" # 타 키 보존 + assert meta["source_id"] == 7 + + +def test_with_parent_doi_child_shape_no_doi(): + meta = with_parent_doi({"license": {"scheme": "proprietary"}}, "10.1/holder") + assert meta["paper"]["parent_doi"] == "10.1/holder" + assert "doi" not in meta["paper"] # child 는 doi 미보유 (partial-unique 인덱스 밖) + assert meta["license"]["scheme"] == "proprietary" + + +def test_holder_child_mutually_exclusive(): + child = with_parent_doi({}, "10.1/p") + promoted = with_paper_doi(child, "10.1/self") + assert promoted["paper"]["doi"] == "10.1/self" + assert "parent_doi" not in promoted["paper"] + + +def test_input_not_mutated(): + src = {"paper": {"doi": "10.1/old"}} + with_parent_doi(src, "10.1/new") + assert src["paper"]["doi"] == "10.1/old" # 원본 dict 불변 + + +# ─── read_paper_doi: 인덱스 식의 조회측 거울 ─── + +def test_read_paper_doi(): + assert read_paper_doi({"paper": {"doi": "10.1/x"}}) == "10.1/x" + assert read_paper_doi({"paper": {"doi": "https://doi.org/10.1/X"}}) == "10.1/x" # 방어적 재정규화 + assert read_paper_doi({}) is None + assert read_paper_doi(None) is None + assert read_paper_doi({"paper": {"parent_doi": "10.1/p"}}) is None # child 는 doi 없음 + assert read_paper_doi({"paper": {}}) is None + + +# ─── PR4: arXiv id 파싱 + arXiv DataCite DOI (교차소스 dedup 통일 키) ─── + +def test_parse_arxiv_id(): + assert parse_arxiv_id("Title arXiv:2606.10236v1 Announce Type: new Abstract") == 
"2606.10236" + assert parse_arxiv_id("see arXiv:2601.02852 for details") == "2601.02852" + assert parse_arxiv_id("arXiv:cond-mat/0703470v2") == "cond-mat/0703470" + assert parse_arxiv_id("no arxiv here") is None + assert parse_arxiv_id(None) is None + + +def test_arxiv_doi_canonical(): + # OpenAlex canonical 실측 일치: 10.48550/arxiv.{id} (소문자) + assert arxiv_doi("2606.10236") == "10.48550/arxiv.2606.10236" + assert arxiv_doi(None) is None + # 수집기·reconcile 가 같은 함수 → 같은 paper.doi (교차소스 dedup 성립) + assert arxiv_doi(parse_arxiv_id("x arXiv:2606.10236v1 y")) == "10.48550/arxiv.2606.10236" + + +# ─── PR5: 구매 PDF 본문 DOI 파싱 (parent_doi 링크용, PDF 구조 무관) ─── + +def test_parse_doi_from_text(): + assert parse_doi_from_text("ref https://doi.org/10.1016/j.jlp.2024.105474 end") == "10.1016/j.jlp.2024.105474" + assert parse_doi_from_text("DOI 10.1115/1.4045678. Next.") == "10.1115/1.4045678" + assert parse_doi_from_text("no doi here") is None + assert parse_doi_from_text(None) is None