feat(papers): B-3 PR3 — OpenAlex 백본 수집기 (scaffold-first, signal-only, per-run cap)

plan safety-library-b3-1 PR3. 발견+dedup 글로벌 백본(JP/EU/US 색인+정본 DOI, 전문 안 줌).
- scaffold-first: OPENALEX_API_KEY 부재 시 FeedError explicit-skip(silent fallback 0). 키=무료.
- signal-only: inverted-index 초록 복원→색인(embed+chunk), summarize 0. PDF 절대 미fetch(oa_url=신호).
- 관련성 사전필터=title_and_abstract.search 키워드 + per-run cap 60(임베드 firehose 차단, 적대리뷰 A major)
  + cursor 페이징 + from_publication_date 워터마크 증분. 초록 없는 thin 레코드 skip(재료 품질).
- license: 명시 CC→redistribute true / OA·closed→false(restricted 부재=초록 RAG 사용가능, 비-CC 전문은 L-1 Phase-2).
- DOI→paper.doi(holder, 교차소스 dedup) / 없으면 openalex_id. enabled=False 행+add_job(daily 07:45 KST)+CLI.

순수 파서/초록복원/license_meta fixture 단위 7 passed(OpenAlex 실응답: cc-by/cc-by-nc-nd/None·초록 유무).
라이브 검증 PASS (prod, running fastapi 무접촉): 키없음→explicit-skip / 키주입→3건 적재
(paper/NULL/ai_summary NULL/region INT, cc-by→redist true·unspecified→false, green/gold,
큐 embed3+chunk3·summarize 0, distinct openalex_id=total, 교차소스 DOI 4 distinct 4 중복 0).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude Code
2026-06-13 22:30:36 +00:00
parent 6167e03625
commit 1fbb341e28
4 changed files with 463 additions and 0 deletions
+4
View File
@@ -57,6 +57,7 @@ async def lifespan(app: FastAPI):
from workers.statute_collector import run as statute_run from workers.statute_collector import run as statute_run
from workers.news_collector import run as news_collector_run from workers.news_collector import run as news_collector_run
from workers.arxiv_collector import run as arxiv_collector_run from workers.arxiv_collector import run as arxiv_collector_run
from workers.openalex_collector import run as openalex_collector_run
from workers.fulltext_worker import reconcile_unresolved as fulltext_reconcile_run from workers.fulltext_worker import reconcile_unresolved as fulltext_reconcile_run
from workers.kosha_collector import run as kosha_collector_run from workers.kosha_collector import run as kosha_collector_run
from workers.csb_collector import run as csb_collector_run from workers.csb_collector import run as csb_collector_run
@@ -151,6 +152,9 @@ async def lifespan(app: FastAPI):
# B-3 PR2: arXiv 키워드 필터 수집기 (daily 07:30 KST — statute 07:00 직후 빈 슬롯). # B-3 PR2: arXiv 키워드 필터 수집기 (daily 07:30 KST — statute 07:00 직후 빈 슬롯).
# signal-only 초록 색인, per-run cap 으로 임베드 큐 보호. keyless. # signal-only 초록 색인, per-run cap 으로 임베드 큐 보호. keyless.
scheduler.add_job(arxiv_collector_run, CronTrigger(hour=7, minute=30, timezone=KST), id="arxiv_collector") scheduler.add_job(arxiv_collector_run, CronTrigger(hour=7, minute=30, timezone=KST), id="arxiv_collector")
# B-3 PR3: OpenAlex 백본 수집기 (daily 07:45 KST). scaffold-first(키 부재 explicit-skip),
# signal-only 초록 색인, per-run cap + cursor watermark. 키=OPENALEX_API_KEY(credentials.env).
scheduler.add_job(openalex_collector_run, CronTrigger(hour=7, minute=45, timezone=KST), id="openalex_collector")
scheduler.start() scheduler.start()
# Phase 2.1 (async 구조): QueryAnalyzer prewarm. # Phase 2.1 (async 구조): QueryAnalyzer prewarm.
+366
View File
@@ -0,0 +1,366 @@
"""OpenAlex 백본 수집기 — B-3 PR3 (plan safety-library-b3-1).
OpenAlex = 발견+dedup 글로벌 백본(JP/EU/US 논문 다 색인 + 정본 DOI). 전문은 안 줌(oa_url 포인터만).
- scaffold-first: OPENALEX_API_KEY 부재 시 FeedError(explicit-skip, silent fallback 금지). 키=무료.
- signal-only: 초록(inverted-index 복원)만 색인(embed+chunk), summarize 절대 미enqueue(맥미니 큐 무접촉).
PDF 는 절대 OpenAlex 경유로 안 받음(oa_url 은 링크/신호일 뿐).
- 관련성 사전필터 = title_and_abstract.search 키워드(서버측) + per-run insert cap(임베드 firehose 차단,
적대리뷰 A major). cursor 페이징 + from_publication_date 워터마크로 증분.
- 초록 없는 thin 레코드(주로 비-OA 메타)는 skip — Phase-1 재료 품질 유지.
- DOI → paper.doi(holder, partial-unique 인덱스, 교차소스 dedup). 없으면 openalex id fallback.
- license: 명시 CC → redistribute=true / 그 외 OA·closed → false(restricted 부재 = 초록 RAG 사용 가능).
- enabled=False news_sources 행 + main.py CronTrigger(자체 폴링). list+filter 비용 미미($1/일 크레딧).
"""
import asyncio
import hashlib
import json
import os
from dataclasses import dataclass
from datetime import date, datetime, timezone
import httpx
from sqlalchemy import select
from core.crawl_politeness import CRAWL_UA
from core.database import async_session
from core.utils import setup_logger
from models.document import Document
from models.news_source import NewsSource
from models.queue import enqueue_stage
from services.papers.doi import normalize_doi
from services.papers.holder import find_paper_holder
from workers.news_collector import (
FeedError,
_get_or_create_health,
_record_failure,
_record_success,
)
logger = setup_logger("openalex_collector")
# Endpoint queried by _fetch and stored as the feed_url of the source row.
_API = "https://api.openalex.org/works"
# Name used to look up / create the NewsSource row for this collector.
_SOURCE_NAME = "OpenAlex 안전·공학 (keyword)"
# Environment variable that must hold the OpenAlex API key.
_ENV_KEY = "OPENALEX_API_KEY"
# Domain keywords (pressure vessels, process safety, structural integrity).
# One query is issued per keyword; the keyword search acts as the relevance pre-filter.
_KEYWORDS = (
    "pressure vessel safety",
    "process safety",
    "structural integrity",
    "fracture mechanics",
    "fatigue life assessment",
)
_RUN_CAP = 60  # max new inserts per run (protects the embed queue); lifted in bulk mode
_PER_PAGE = 50  # value sent as the "per-page" request parameter
_MAX_PAGES_PER_KW = 4  # max pages fetched per keyword in non-bulk runs
_REQ_SLEEP = 1.0  # polite delay (seconds) between page requests
_MAX_RETRY = 4  # attempts per request when the API answers 429
_BACKOFF_BASE = 5.0  # base delay (seconds) for the exponential 429 backoff
# ───────────────────────── 순수 파서 (fixture 단위 테스트 대상) ─────────────────────────
@dataclass
class OpenAlexWork:
    """One parsed record from an OpenAlex /works response (see parse_openalex_works)."""

    openalex_id: str  # short id, e.g. "W2910511816" (last path segment of the work's id)
    doi: str | None  # result of normalize_doi on the raw "doi" field
    title: str
    abstract: str  # rebuilt from abstract_inverted_index ("" when absent)
    publication_date: str | None
    oa_status: str | None  # closed/green/bronze/hybrid/gold/diamond
    oa_url: str | None
    is_oa: bool
    license: str | None  # cc-by / cc-by-nc-nd / None
    source_name: str | None  # display_name of primary_location.source
    primary_topic: str | None  # display_name of primary_topic
    work_type: str | None  # raw "type" field
def _clean(text):
    """Collapse every whitespace run in *text* to one space; falsy input gives ''."""
    if not text:
        return ""
    # str.split() with no argument already drops leading/trailing whitespace.
    return " ".join(text.split())
def _reconstruct_abstract(inv: dict | None) -> str:
"""abstract_inverted_index({word:[positions]}) → 평문 초록. 없으면 ''."""
if not inv:
return ""
positions = [(pos, word) for word, idxs in inv.items() for pos in idxs]
positions.sort()
return " ".join(w for _, w in positions)
def license_meta(license_str: str | None, is_oa: bool, source_name: str | None) -> dict:
"""extract_meta.license — 명시 CC/public-domain 만 redistribute=true. restricted 부재(초록 색인 자유).
redistribute=false 라도 restricted 가 없으면 RAG 사용 가능(초록). 비-CC 전문의 RAG verbatim 차단은
Phase-2 전문 승격 단계가 restricted=true 로 처리(L-1) — Phase-1(초록)은 무해.
"""
attribution = source_name or "OpenAlex"
if license_str and (license_str.startswith("cc") or license_str == "public-domain"):
return {"scheme": license_str, "redistribute": True, "attribution": attribution}
return {
"scheme": "open-unspecified" if is_oa else "proprietary",
"redistribute": False,
"attribution": attribution,
}
def parse_openalex_works(json_text: str) -> tuple[int, str | None, list[OpenAlexWork]]:
    """Parse an OpenAlex /works response body. Pure function (no I/O).

    Args:
        json_text: Raw JSON text of the response.

    Returns:
        (count, next_cursor, works): meta.count (0 when missing), meta.next_cursor
        (may be None), and one OpenAlexWork per result that has a usable id.

    Raises:
        ValueError: json.loads rejects the text (JSONDecodeError is a subclass).
    """
    payload = json.loads(json_text)
    meta = payload.get("meta") or {}
    parsed: list[OpenAlexWork] = []
    for raw in payload.get("results") or []:
        # Keep only the last path segment of the id; results without an id are dropped.
        work_id = (raw.get("id") or "").rstrip("/").rsplit("/", 1)[-1]
        if work_id:
            access = raw.get("open_access") or {}
            location = raw.get("primary_location") or {}
            topic = raw.get("primary_topic") or {}
            venue = location.get("source") or {}
            parsed.append(
                OpenAlexWork(
                    openalex_id=work_id,
                    doi=normalize_doi(raw.get("doi")),
                    title=_clean(raw.get("title")),
                    abstract=_reconstruct_abstract(raw.get("abstract_inverted_index")),
                    publication_date=raw.get("publication_date"),
                    oa_status=access.get("oa_status"),
                    oa_url=access.get("oa_url") or None,
                    is_oa=bool(access.get("is_oa")),
                    license=location.get("license"),
                    source_name=venue.get("display_name"),
                    primary_topic=topic.get("display_name"),
                    work_type=raw.get("type"),
                )
            )
    return meta.get("count") or 0, meta.get("next_cursor"), parsed
def build_filter(keyword: str, from_date: str | None = None) -> str:
f = f"title_and_abstract.search:{keyword}"
if from_date:
f += f",from_publication_date:{from_date}"
return f
# ───────────────────────── 적재 (DB — PR3 라이브 검증) ─────────────────────────
def _build_paper_meta(source: NewsSource, w: OpenAlexWork) -> dict:
    """Assemble the extract_meta dict stored on the Document for one work.

    The "paper" sub-dict always carries openalex_id; doi, oa_status, oa_url and
    topic are added only when the work has a truthy value for them.
    published_at is added only when the work has a publication date.
    """
    # oa_url is recorded as a link/signal only; nothing in this module fetches it.
    optional_fields = (
        ("doi", w.doi),
        ("oa_status", w.oa_status),
        ("oa_url", w.oa_url),
        ("topic", w.primary_topic),
    )
    paper: dict = {"openalex_id": w.openalex_id}
    paper.update({name: value for name, value in optional_fields if value})

    meta: dict = {
        "source_id": source.id,
        "source_name": source.name,
        "source_region": "INT",  # fixed region tag for every OpenAlex record
        "paper": paper,
        "license": license_meta(w.license, w.is_oa, w.source_name),
    }
    if w.publication_date:
        meta["published_at"] = w.publication_date
    return meta
async def _ingest_work(session, source: NewsSource, w: OpenAlexWork) -> bool:
    """Store one work as a Document and enqueue its indexing stages.

    Returns True only when a new row was added. Returns False, adding nothing,
    when the work has no abstract, when a Document with the same OpenAlex-derived
    hash exists, or when find_paper_holder reports an existing holder for its DOI.
    Only the "embed" and "chunk" stages are enqueued.
    """
    abstract = w.abstract
    if not abstract:
        return False  # thin record without an abstract

    # Stable per-work key: first 32 hex chars of sha256("openalex|<id>").
    digest = hashlib.sha256(f"openalex|{w.openalex_id}".encode()).hexdigest()[:32]
    lookup = select(Document.id).where(Document.file_hash == digest).limit(1)
    already = await session.execute(lookup)
    if already.scalars().first():
        return False

    if w.doi and await find_paper_holder(session, w.doi):
        return False  # cross-source dedup: another document already holds this DOI

    published = None
    if w.publication_date:
        try:
            published = date.fromisoformat(w.publication_date)
        except ValueError:
            pass  # unparsable date, leave published_date unset

    landing_url = w.oa_url or f"https://openalex.org/{w.openalex_id}"
    doc = Document(
        file_path=f"crawl/openalex/{w.openalex_id}",
        file_hash=digest,
        file_format="article",
        file_size=len(abstract.encode()),
        file_type="note",
        title=w.title,
        extracted_text=f"{w.title}\n\n{abstract}",
        extracted_at=datetime.now(timezone.utc),
        extractor_version="openalex-signal",
        md_status="skipped",
        md_extraction_error="OpenAlex abstract: signal-only, markdown 비대상",
        source_channel="crawl",
        data_origin="external",
        edit_url=landing_url,
        review_status="approved",
        material_type="paper",
        jurisdiction=None,
        published_date=published,
        extract_meta=_build_paper_meta(source, w),
    )
    session.add(doc)
    await session.flush()  # flush before enqueueing so doc.id can be passed on
    for stage in ("embed", "chunk"):
        await enqueue_stage(session, doc.id, stage)
    return True
async def _get_or_create_source(session) -> NewsSource:
    """Return the NewsSource row named _SOURCE_NAME, creating it if absent.

    A newly created row is added and flushed but not committed; it is created
    with enabled=False.
    """
    stmt = select(NewsSource).where(NewsSource.name == _SOURCE_NAME)
    existing = (await session.execute(stmt)).scalars().first()
    if existing is not None:
        return existing

    created = NewsSource(
        name=_SOURCE_NAME,
        feed_url=_API,
        feed_type="json",
        fetch_method="signal-only",
        fulltext_policy="none",
        source_channel="crawl",
        category="Engineering",
        language="en",
        country=None,
        material_type="paper",
        license_scheme="openalex",
        license_redistribute=False,
        enabled=False,
    )
    session.add(created)
    await session.flush()
    return created
def _api_key() -> str:
    """Read the OpenAlex API key from the environment.

    Raises:
        FeedError: The variable is unset or blank, so the caller can record an
            explicit skip instead of falling back silently.
    """
    value = os.getenv(_ENV_KEY, "").strip()
    if value:
        return value
    raise FeedError(f"{_ENV_KEY} 미설정 — OpenAlex 수집 불가 (scaffold-first explicit-skip)")
def _watermark(source: NewsSource, keyword: str) -> str | None:
    """Return the stored publication-date watermark for *keyword*, or None.

    Watermarks live under the "openalex_watermark" key of
    source.selector_override. Both the override and the nested mapping may be
    missing or null; either case yields None instead of raising, matching the
    null-tolerant read in _set_watermark.
    """
    overrides = source.selector_override or {}
    marks = overrides.get("openalex_watermark") or {}
    return marks.get(keyword)
def _set_watermark(source: NewsSource, keyword: str, value: str) -> None:
    """Record *value* as the watermark for *keyword* on the source row.

    A fresh dict is assigned to source.selector_override (the existing dicts
    are copied, never mutated in place); other override keys and other
    keywords' watermarks are carried over unchanged.
    """
    current = dict(source.selector_override or {})
    marks = {**(current.get("openalex_watermark") or {}), keyword: value}
    source.selector_override = {**current, "openalex_watermark": marks}
async def _fetch(client: httpx.AsyncClient, key: str, filter_str: str, cursor: str) -> str:
    """GET one page of /works results and return the raw response text.

    Args:
        client: Shared HTTP client.
        key: OpenAlex API key, sent as the ``api_key`` query parameter.
        filter_str: Value for the ``filter`` parameter (see build_filter).
        cursor: Paging cursor ("*" for the first page).

    Raises:
        FeedError: The API kept answering 429 for all _MAX_RETRY attempts, or
            answered with any other error status.
        httpx.HTTPError: Transport-level failures from the client propagate.
    """
    params = {
        "filter": filter_str, "per-page": _PER_PAGE, "cursor": cursor,
        "sort": "publication_date:desc", "api_key": key,
    }
    for attempt in range(_MAX_RETRY):
        resp = await client.get(_API, params=params)
        if resp.status_code == 429:
            # Exponential backoff, but only when another attempt will follow;
            # sleeping after the final attempt just delays the failure.
            if attempt < _MAX_RETRY - 1:
                await asyncio.sleep(_BACKOFF_BASE * (2 ** attempt))
            continue
        try:
            resp.raise_for_status()
        except httpx.HTTPStatusError as exc:
            # The httpx message embeds the full request URL, which carries the
            # api_key query parameter. The caller logs and persists error text,
            # so re-raise with the status code only; `from None` keeps the
            # URL-bearing exception out of the chained traceback.
            raise FeedError(
                f"OpenAlex HTTP {exc.response.status_code}: {filter_str[:48]}"
            ) from None
        return resp.text
    raise FeedError(f"OpenAlex 429 재시도 초과: {filter_str[:48]}")
async def run(bulk: bool = False, limit: int = 0) -> None:
    """Collector entry point (scheduler job and CLI).

    For each keyword in _KEYWORDS, pages through OpenAlex results (newest
    first), ingests works via _ingest_work until the run cap is reached, then
    stores a per-keyword publication-date watermark for the next incremental
    run. The outcome is recorded on the source's health row.

    Args:
        bulk: Ignore stored watermarks, lift the page limit, and use *limit*
            (or effectively no cap) as the insert cap.
        limit: Max new inserts. 0 means the default; in non-bulk runs it is
            clamped to _RUN_CAP.

    A missing API key is an explicit skip: a failure is recorded on the health
    row and the function returns without fetching.
    """
    now = datetime.now(timezone.utc)
    today_iso = now.date().isoformat()
    async with async_session() as session:
        source = await _get_or_create_source(session)
        await session.commit()
        source_id = source.id

    try:
        key = _api_key()
    except FeedError as e:
        logger.warning(f"[openalex] {e}")
        async with async_session() as session:
            health = await _get_or_create_health(session, source_id)
            _record_failure(health, str(e), now)
            await session.commit()
        return

    run_cap = (limit or 10**9) if bulk else (min(limit, _RUN_CAP) if limit else _RUN_CAP)
    inserted = 0
    seen = 0
    failures: list[str] = []

    async with httpx.AsyncClient(
        timeout=30.0, headers={"User-Agent": CRAWL_UA}, follow_redirects=True
    ) as client:
        for keyword in _KEYWORDS:
            if inserted >= run_cap:
                break
            async with async_session() as session:
                src = await session.get(NewsSource, source_id)
                watermark = None if bulk else _watermark(src, keyword)
            filter_str = build_filter(keyword, watermark)
            newest: str | None = None
            # True when the run cap stopped this keyword while results remained.
            cap_truncated = False
            cursor = "*"
            max_pages = (10**6 if bulk else _MAX_PAGES_PER_KW)
            try:
                for _page in range(max_pages):
                    if inserted >= run_cap:
                        break
                    text = await _fetch(client, key, filter_str, cursor)
                    _count, next_cursor, works = parse_openalex_works(text)
                    if not works:
                        break
                    for idx, w in enumerate(works, start=1):
                        seen += 1
                        if w.publication_date and (newest is None or w.publication_date > newest):
                            newest = w.publication_date
                        # One session/transaction per work, so a failure on one
                        # record cannot undo earlier inserts.
                        async with async_session() as session:
                            src = await session.get(NewsSource, source_id)
                            if await _ingest_work(session, src, w):
                                inserted += 1
                                await session.commit()
                            else:
                                await session.rollback()
                        if inserted >= run_cap:
                            cap_truncated = idx < len(works) or bool(next_cursor)
                            break
                    await asyncio.sleep(_REQ_SLEEP)
                    if not next_cursor:
                        break
                    cursor = next_cursor
                # Results arrive newest-first, so when the cap cuts a keyword
                # short the unprocessed remainder is OLDER than `newest`.
                # Advancing an existing watermark then would filter that
                # remainder out of every later run, contradicting the
                # "carried over to next run" log message. Keep the old watermark
                # in that case; already-ingested works are skipped by the hash
                # dedup. With no prior watermark (first run / bulk) the
                # watermark is still set, so daily runs do not backfill.
                hold_watermark = cap_truncated and watermark is not None
                if newest and not hold_watermark:
                    # Clamp to today: a future-dated record would otherwise
                    # push the from_publication_date filter past works that
                    # are published in the meantime.
                    async with async_session() as session:
                        src = await session.get(NewsSource, source_id)
                        _set_watermark(src, keyword, min(newest, today_iso))
                        await session.commit()
            except (httpx.HTTPError, FeedError, ValueError) as e:
                # Exception objects are always truthy, so test the message
                # text; some transport errors carry an empty message.
                msg = f"[{keyword}] {str(e) or repr(e)}"
                logger.error(f"[openalex] {msg}")
                failures.append(msg)

    async with async_session() as session:
        health = await _get_or_create_health(session, source_id)
        # Only a run that produced nothing AND saw errors counts as a failure.
        if failures and inserted == 0:
            _record_failure(health, "; ".join(failures)[:500], now)
        else:
            _record_success(health, inserted, False, now)
        await session.commit()

    deferred = "" if inserted < run_cap else f" (cap {run_cap} 도달 — 잔여 다음 run 이월)"
    logger.info(
        f"[openalex] {len(_KEYWORDS)}개 키워드 스캔 {seen}건 → 신규 {inserted}{deferred}"
        + (f" / 실패 {len(failures)}" if failures else "")
    )
if __name__ == "__main__":
    # CLI entry: same coroutine as the scheduler job, with optional backfill flags.
    import argparse

    cli = argparse.ArgumentParser(description="OpenAlex 안전·공학 키워드 백본 수집기")
    cli.add_argument("--bulk", action="store_true", help="cap 해제 + 깊은 cursor 페이징 백필")
    cli.add_argument("--limit", type=int, default=0, help="신규 적재 상한(0=기본 cap)")
    opts = cli.parse_args()
    asyncio.run(run(bulk=opts.bulk, limit=opts.limit))
File diff suppressed because one or more lines are too long
+92
View File
@@ -0,0 +1,92 @@
"""B-3 PR3 — OpenAlex 파서·초록복원·license 순수 단위 테스트 (plan safety-library-b3-1).
fixture = OpenAlex /works 실응답 박제(process safety/pressure vessel OA 5건 —
cc-by/cc-by-nc-nd/license None, 초록 있음/없음). run()/적재(DB)는 PR3 라이브 검증.
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent / "app"))
from workers.openalex_collector import ( # noqa: E402
_reconstruct_abstract,
build_filter,
license_meta,
parse_openalex_works,
)
FIX = Path(__file__).parent / "fixtures" / "openalex_works_response.json"
def _works():
    """Parse the recorded fixture and return (count, works keyed by id, works list)."""
    total, _cursor, parsed = parse_openalex_works(FIX.read_text(encoding="utf-8"))
    index = {}
    for item in parsed:
        index[item.openalex_id] = item
    return total, index, parsed
# ─── 피드 레벨 ───
def test_count_and_results():
    """Feed level: meta.count, number of results, and short-form ids."""
    total, _index, parsed = _works()
    assert total == 1111
    assert len(parsed) == 5
    for item in parsed:
        # ids are reduced to the bare "W..." token with no URL path left
        assert item.openalex_id.startswith("W") and "/" not in item.openalex_id
# ─── 초록 보유 + CC 라이선스 ───
def test_work_with_abstract_and_cc():
    """A work with an abstract and a cc-by license parses fully and is redistributable."""
    index = _works()[1]
    work = index["W2910511816"]
    # DOI went through normalize_doi: bare "10." form, lowercase
    assert work.doi and work.doi.startswith("10.") and work.doi == work.doi.lower()
    # abstract was rebuilt from the inverted index
    assert len(work.abstract) > 50
    assert work.oa_status == "diamond" and work.is_oa is True
    assert work.license == "cc-by"
    meta = license_meta(work.license, work.is_oa, work.source_name)
    assert meta["redistribute"] is True
# ─── 초록 없는 thin 레코드(skip 대상) ───
def test_work_without_abstract():
    """A thin record (no inverted index) yields an empty abstract and no redistribution."""
    index = _works()[1]
    work = index["W3107397139"]
    assert work.abstract == ""  # missing inverted index gives an empty abstract
    meta = license_meta(work.license, work.is_oa, work.source_name)
    assert meta["redistribute"] is False  # license None means not redistributable
# ─── cc-by-nc-nd 도 CC 계열 → redistribute True ───
def test_cc_variant_redistribute():
    """cc-by-nc-nd is treated as CC-family, so redistribute is True."""
    index = _works()[1]
    work = index["W4391130399"]
    assert work.license == "cc-by-nc-nd"
    meta = license_meta(work.license, work.is_oa, work.source_name)
    assert meta["redistribute"] is True
# ─── 초록 inverted-index 복원 순서 ───
def test_reconstruct_abstract_order():
    """Words come back in position order, repeated words included; empty input gives ''."""
    inverted = {"Safety": [0], "of": [1, 4], "pressure": [2], "vessels": [3], "design": [5]}
    rebuilt = _reconstruct_abstract(inverted)
    assert rebuilt == "Safety of pressure vessels of design"
    for empty in (None, {}):
        assert _reconstruct_abstract(empty) == ""
# ─── license_meta 분기 ───
def test_license_meta_branches():
    """Each license_meta branch: CC-family, OA without license, closed without license."""
    for cc_scheme in ("cc-by", "cc0"):
        assert license_meta(cc_scheme, True, "X")["redistribute"] is True

    open_no_license = license_meta(None, True, "X")
    assert open_no_license["redistribute"] is False and open_no_license["scheme"] == "open-unspecified"

    closed_no_license = license_meta(None, False, "X")
    assert closed_no_license["redistribute"] is False and closed_no_license["scheme"] == "proprietary"
# ─── 쿼리 빌더 ───
def test_build_filter():
    """The filter string with and without the from_publication_date clause."""
    keyword_only = build_filter("process safety")
    assert keyword_only == "title_and_abstract.search:process safety"

    with_date = build_filter("process safety", "2026-06-01")
    expected = "title_and_abstract.search:process safety,from_publication_date:2026-06-01"
    assert with_date == expected