fdabca2a2f
plan safety-library-b3-1 PR6 (revised). 라이브 정찰: KoreaScience=깨끗한 API 없음(OAI 404)· J-STAGE=ToS bulk 금지, 그리고 Phase-1 메타는 OpenAlex 가 이미 전수 색인(한국안전학회지 1766건 실측) → 전용 스크래퍼 대신 검증된 OpenAlex 수집기에 도메인 저널 ISSN 시드 추가(전용 무료 전문 PDF=Phase-2 park). - _JOURNAL_ISSNS(OpenAlex sources 실측): 한국안전학회지 1738-3803·한국가스학회지 1226-8402· KSME A/B 1226-4873·1226-4881·KSME Intl 1226-4865·JP 고압 0917-639X. - _seeds() = ISSN 시드(cap 우선) + 키워드. build_issn_filter(primary_location.source.issn:). run() 루프 통합(종류별 필터, 워터마크 시드별). 적재/parser/cap/signal-only = PR3 재사용. 단위 8 passed(+ISSN 시드). 라이브 PASS: 키주입 run → 한국안전학회지 5건 적재(ISSN 우선 확인), running fastapi 무접촉. KoreaScience/J-STAGE 전용 fulltext 수집기 = Phase-2 강등(park). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
394 lines
16 KiB
Python
394 lines
16 KiB
Python
"""OpenAlex 백본 수집기 — B-3 PR3 (plan safety-library-b3-1).
|
|
|
|
OpenAlex = 발견+dedup 글로벌 백본(JP/EU/US 논문 다 색인 + 정본 DOI). 전문은 안 줌(oa_url 포인터만).
|
|
- scaffold-first: OPENALEX_API_KEY 부재 시 FeedError(explicit-skip, silent fallback 금지). 키=무료.
|
|
- signal-only: 초록(inverted-index 복원)만 색인(embed+chunk), summarize 절대 미enqueue(맥미니 큐 무접촉).
|
|
PDF 는 절대 OpenAlex 경유로 안 받음(oa_url 은 링크/신호일 뿐).
|
|
- 관련성 사전필터 = title_and_abstract.search 키워드(서버측) + per-run insert cap(임베드 firehose 차단,
|
|
적대리뷰 A major). cursor 페이징 + from_publication_date 워터마크로 증분.
|
|
- 초록 없는 thin 레코드(주로 비-OA 메타)는 skip — Phase-1 재료 품질 유지.
|
|
- DOI → paper.doi(holder, partial-unique 인덱스, 교차소스 dedup). 없으면 openalex id fallback.
|
|
- license: 명시 CC → redistribute=true / 그 외 OA·closed → false(restricted 부재 = 초록 RAG 사용 가능).
|
|
- enabled=False news_sources 행 + main.py CronTrigger(자체 폴링). list+filter 비용 미미($1/일 크레딧).
|
|
"""
|
|
|
|
import asyncio
|
|
import hashlib
|
|
import json
|
|
import os
|
|
from dataclasses import dataclass
|
|
from datetime import date, datetime, timezone
|
|
|
|
import httpx
|
|
from sqlalchemy import select
|
|
|
|
from core.crawl_politeness import CRAWL_UA
|
|
from core.database import async_session
|
|
from core.utils import setup_logger
|
|
from models.document import Document
|
|
from models.news_source import NewsSource
|
|
from models.queue import enqueue_stage
|
|
from services.papers.doi import normalize_doi
|
|
from services.papers.holder import find_paper_holder
|
|
from workers.news_collector import (
|
|
FeedError,
|
|
_get_or_create_health,
|
|
_record_failure,
|
|
_record_success,
|
|
)
|
|
|
|
logger = setup_logger("openalex_collector")
|
|
|
|
# Endpoint, news_sources row name and the environment variable holding the key.
_API = "https://api.openalex.org/works"
_SOURCE_NAME = "OpenAlex 안전·공학 (keyword)"
_ENV_KEY = "OPENALEX_API_KEY"

# Domain keywords (pressure vessels, process safety, structural integrity).
# Each keyword becomes one query and doubles as the relevance pre-filter.
_KEYWORDS = (
    "pressure vessel safety",
    "process safety",
    "structural integrity",
    "fracture mechanics",
    "fatigue life assessment",
)

# (label, ISSN) seeds for journals that sit squarely in the domain, so works
# the keyword search misses are still covered: Korean safety / gas / mechanical
# engineering journals plus one Japanese high-pressure journal.
_JOURNAL_ISSNS = (
    ("한국안전학회지", "1738-3803"),
    ("한국가스학회지", "1226-8402"),
    ("대한기계학회논문집 A", "1226-4873"),
    ("대한기계학회논문집 B", "1226-4881"),
    ("KSME International J.", "1226-4865"),
    ("Review of High Pressure Sci&Tech (JP)", "0917-639X"),
)

_RUN_CAP = 60  # max new inserts per run (protects the embed queue); lifted in bulk mode
_PER_PAGE = 50  # results requested per API page
_MAX_PAGES_PER_KW = 4  # max pages fetched per seed in a non-bulk run
_REQ_SLEEP = 1.0  # polite pause between pages, in seconds
_MAX_RETRY = 4  # attempts per request when the API answers 429
_BACKOFF_BASE = 5.0  # first 429 back-off in seconds; doubles on every attempt
|
|
|
|
|
|
# ───────────────────────── 순수 파서 (fixture 단위 테스트 대상) ─────────────────────────
|
|
|
|
@dataclass
|
|
class OpenAlexWork:
|
|
openalex_id: str # "W2910511816"
|
|
doi: str | None # normalize_doi 적용
|
|
title: str
|
|
abstract: str # inverted-index 복원 (없으면 "")
|
|
publication_date: str | None
|
|
oa_status: str | None # closed/green/bronze/hybrid/gold/diamond
|
|
oa_url: str | None
|
|
is_oa: bool
|
|
license: str | None # cc-by / cc-by-nc-nd / None
|
|
source_name: str | None
|
|
primary_topic: str | None
|
|
work_type: str | None
|
|
|
|
|
|
def _clean(text):
    """Collapse every whitespace run in *text* to one space; falsy input gives ''."""
    if not text:
        return ""
    return " ".join(text.split()).strip()
|
|
|
|
|
|
def _reconstruct_abstract(inv: dict | None) -> str:
|
|
"""abstract_inverted_index({word:[positions]}) → 평문 초록. 없으면 ''."""
|
|
if not inv:
|
|
return ""
|
|
positions = [(pos, word) for word, idxs in inv.items() for pos in idxs]
|
|
positions.sort()
|
|
return " ".join(w for _, w in positions)
|
|
|
|
|
|
def license_meta(license_str: str | None, is_oa: bool, source_name: str | None) -> dict:
|
|
"""extract_meta.license — 명시 CC/public-domain 만 redistribute=true. restricted 부재(초록 색인 자유).
|
|
|
|
redistribute=false 라도 restricted 가 없으면 RAG 사용 가능(초록). 비-CC 전문의 RAG verbatim 차단은
|
|
Phase-2 전문 승격 단계가 restricted=true 로 처리(L-1) — Phase-1(초록)은 무해.
|
|
"""
|
|
attribution = source_name or "OpenAlex"
|
|
if license_str and (license_str.startswith("cc") or license_str == "public-domain"):
|
|
return {"scheme": license_str, "redistribute": True, "attribution": attribution}
|
|
return {
|
|
"scheme": "open-unspecified" if is_oa else "proprietary",
|
|
"redistribute": False,
|
|
"attribution": attribution,
|
|
}
|
|
|
|
|
|
def parse_openalex_works(json_text: str) -> tuple[int, str | None, list[OpenAlexWork]]:
    """Parse an OpenAlex ``/works`` response body.

    Returns ``(count, next_cursor, works)`` where *count* and *next_cursor*
    come from the response's ``meta`` object (0 / None when absent) and
    *works* holds one ``OpenAlexWork`` per result. Results without an ``id``
    are dropped. Pure function: no I/O beyond JSON decoding.
    """
    payload = json.loads(json_text)
    meta = payload.get("meta") or {}
    parsed: list[OpenAlexWork] = []

    for raw in payload.get("results") or []:
        # The id is a URL; keep only its last path segment.
        work_id = (raw.get("id") or "").rstrip("/").rsplit("/", 1)[-1]
        if not work_id:
            continue

        access = raw.get("open_access") or {}
        location = raw.get("primary_location") or {}
        venue = location.get("source") or {}
        topic = raw.get("primary_topic") or {}

        parsed.append(
            OpenAlexWork(
                openalex_id=work_id,
                doi=normalize_doi(raw.get("doi")),
                title=_clean(raw.get("title")),
                abstract=_reconstruct_abstract(raw.get("abstract_inverted_index")),
                publication_date=raw.get("publication_date"),
                oa_status=access.get("oa_status"),
                oa_url=access.get("oa_url") or None,
                is_oa=bool(access.get("is_oa")),
                license=location.get("license"),
                source_name=venue.get("display_name"),
                primary_topic=topic.get("display_name"),
                work_type=raw.get("type"),
            )
        )

    return meta.get("count") or 0, meta.get("next_cursor"), parsed
|
|
|
|
|
|
def build_filter(keyword: str, from_date: str | None = None) -> str:
|
|
f = f"title_and_abstract.search:{keyword}"
|
|
if from_date:
|
|
f += f",from_publication_date:{from_date}"
|
|
return f
|
|
|
|
|
|
def build_issn_filter(issn: str, from_date: str | None = None) -> str:
|
|
f = f"primary_location.source.issn:{issn}"
|
|
if from_date:
|
|
f += f",from_publication_date:{from_date}"
|
|
return f
|
|
|
|
|
|
def _seeds() -> list[tuple[str, str, str]]:
    """Return the collection seeds as ``(label, watermark_key, kind)`` tuples.

    Journal ISSN seeds (kind "issn") come first so they are served before the
    per-run cap is used up; keyword seeds (kind "kw") follow, using the
    keyword itself as both label and watermark key.
    """
    journal_seeds = [(label, issn, "issn") for label, issn in _JOURNAL_ISSNS]
    keyword_seeds = [(kw, kw, "kw") for kw in _KEYWORDS]
    return journal_seeds + keyword_seeds
|
|
|
|
|
|
# ───────────────────────── 적재 (DB — PR3 라이브 검증) ─────────────────────────
|
|
|
|
def _build_paper_meta(source: NewsSource, w: OpenAlexWork) -> dict:
    """Assemble the ``extract_meta`` dict stored on the document for work *w*.

    The nested ``paper`` dict always carries ``openalex_id``; ``doi``,
    ``oa_status``, ``oa_url`` and ``topic`` are added only when non-empty.
    ``published_at`` is added only when the work has a publication date.
    """
    optional_fields = (
        ("doi", w.doi),  # feeds the DOI-based cross-source dedup
        ("oa_status", w.oa_status),
        ("oa_url", w.oa_url),  # kept as a link/signal only, never fetched here
        ("topic", w.primary_topic),
    )
    paper: dict = {"openalex_id": w.openalex_id}
    paper.update({name: value for name, value in optional_fields if value})

    meta: dict = {
        "source_id": source.id,
        "source_name": source.name,
        "source_region": "INT",  # OpenAlex is treated as a global source
        "paper": paper,
        "license": license_meta(w.license, w.is_oa, w.source_name),
    }
    if w.publication_date:
        meta["published_at"] = w.publication_date
    return meta
|
|
|
|
|
|
async def _ingest_work(session, source: NewsSource, w: OpenAlexWork) -> bool:
    """Store one work as a ``Document`` and queue it for indexing.

    Returns True when a new document was added and False when the work was
    skipped: it has no abstract, a document with the same OpenAlex-derived
    hash already exists, or another paper already holds its DOI.

    Only the "embed" and "chunk" stages are enqueued. The caller owns the
    transaction; this function flushes but never commits.
    """
    abstract = w.abstract
    if not abstract:
        return False  # thin record without an abstract

    # A stable 32-hex digest of the OpenAlex id serves as the dedup key.
    digest = hashlib.sha256(f"openalex|{w.openalex_id}".encode()).hexdigest()[:32]
    same_hash = await session.execute(
        select(Document.id).where(Document.file_hash == digest).limit(1)
    )
    if same_hash.scalars().first():
        return False
    if w.doi and await find_paper_holder(session, w.doi):
        return False  # the DOI is already held by a document from another source

    published = None
    if w.publication_date:
        try:
            published = date.fromisoformat(w.publication_date)
        except ValueError:
            pass  # unparsable date: leave published_date empty

    landing_url = w.oa_url or f"https://openalex.org/{w.openalex_id}"
    full_text = f"{w.title}\n\n{abstract}"

    doc = Document(
        file_path=f"crawl/openalex/{w.openalex_id}",
        file_hash=digest,
        file_format="article",
        file_size=len(abstract.encode()),
        file_type="note",
        title=w.title,
        extracted_text=full_text,
        extracted_at=datetime.now(timezone.utc),
        extractor_version="openalex-signal",
        md_status="skipped",
        md_extraction_error="OpenAlex abstract: signal-only, markdown 비대상",
        source_channel="crawl",
        data_origin="external",
        edit_url=landing_url,
        review_status="approved",
        material_type="paper",
        jurisdiction=None,
        published_date=published,
        extract_meta=_build_paper_meta(source, w),
    )
    session.add(doc)
    await session.flush()  # populates doc.id before the stages are enqueued
    for stage in ("embed", "chunk"):
        await enqueue_stage(session, doc.id, stage)
    return True
|
|
|
|
|
|
async def _get_or_create_source(session) -> NewsSource:
    """Return the ``NewsSource`` row named ``_SOURCE_NAME``, creating it if absent.

    A newly created row is added and flushed but not committed, and is created
    with ``enabled=False``.
    """
    lookup = select(NewsSource).where(NewsSource.name == _SOURCE_NAME)
    existing = (await session.execute(lookup)).scalars().first()
    if existing is not None:
        return existing

    created = NewsSource(
        name=_SOURCE_NAME,
        feed_url=_API,
        feed_type="json",
        fetch_method="signal-only",
        fulltext_policy="none",
        source_channel="crawl",
        category="Engineering",
        language="en",
        country=None,
        material_type="paper",
        license_scheme="openalex",
        license_redistribute=False,
        enabled=False,
    )
    session.add(created)
    await session.flush()
    return created
|
|
|
|
|
|
def _api_key() -> str:
    """Return the OpenAlex API key read from the ``_ENV_KEY`` environment variable.

    Raises:
        FeedError: the variable is unset or contains only whitespace.
    """
    if key := os.getenv(_ENV_KEY, "").strip():
        return key
    raise FeedError(f"{_ENV_KEY} 미설정 — OpenAlex 수집 불가 (scaffold-first explicit-skip)")
|
|
|
|
|
|
def _watermark(source: NewsSource, keyword: str) -> str | None:
    """Return the stored watermark date for *keyword*, or None when there is none.

    Watermarks live under ``source.selector_override["openalex_watermark"]``.
    Both the override dict and the watermark mapping may be missing or None;
    either case yields None instead of raising, matching how
    ``_set_watermark`` reads the same structure.
    """
    overrides = source.selector_override or {}
    # `or {}` also covers a key that is present with a null value.
    marks = overrides.get("openalex_watermark") or {}
    return marks.get(keyword)
|
|
|
|
|
|
def _set_watermark(source: NewsSource, keyword: str, value: str) -> None:
    """Record *value* as the watermark for *keyword* on *source*.

    The override dict and its watermark mapping are copied rather than mutated,
    and the copy is assigned back to ``source.selector_override``, so the
    attribute receives a new object.
    """
    overrides = dict(source.selector_override or {})
    previous = overrides.get("openalex_watermark") or {}
    overrides["openalex_watermark"] = {**previous, keyword: value}
    source.selector_override = overrides
|
|
|
|
|
|
async def _fetch(client: httpx.AsyncClient, key: str, filter_str: str, cursor: str) -> str:
    """Fetch one page of ``/works`` results and return the raw response body.

    Results are requested newest-first, ``_PER_PAGE`` per page, at *cursor*.
    A 429 response is retried up to ``_MAX_RETRY`` attempts with exponential
    back-off starting at ``_BACKOFF_BASE`` seconds.

    Raises:
        FeedError: every attempt was answered with 429, or the API returned
            another error status.
        httpx.HTTPError: transport-level failures propagate unchanged.
    """
    params = {
        "filter": filter_str, "per-page": _PER_PAGE, "cursor": cursor,
        "sort": "publication_date:desc", "api_key": key,
    }
    for attempt in range(_MAX_RETRY):
        resp = await client.get(_API, params=params)
        if resp.status_code == 429:
            # No point sleeping after the final attempt: nothing follows it.
            if attempt < _MAX_RETRY - 1:
                await asyncio.sleep(_BACKOFF_BASE * (2 ** attempt))
            continue
        try:
            resp.raise_for_status()
        except httpx.HTTPStatusError as exc:
            # httpx puts the full request URL, including the api_key query
            # parameter, into this exception's message. Re-raise a sanitized
            # error so the key cannot reach logs or stored failure text.
            raise FeedError(
                f"OpenAlex HTTP {exc.response.status_code}: {filter_str[:48]}"
            ) from None
        return resp.text
    raise FeedError(f"OpenAlex 429 재시도 초과: {filter_str[:48]}")
|
|
|
|
|
|
async def run(bulk: bool = False, limit: int = 0) -> None:
    """Scheduler entry point: collect new OpenAlex works for every seed.

    For each seed from ``_seeds()`` the matching filter is queried page by
    page, every returned work is offered to ``_ingest_work`` in its own
    session, and the seed's watermark is then moved to the newest publication
    date seen. The outcome is recorded on the source's health row.

    Args:
        bulk: ignore stored watermarks, lift the page limit, and lift the
            insert cap unless *limit* is given.
        limit: maximum number of new inserts; 0 means the default. Outside
            bulk mode it is clamped to ``_RUN_CAP``.

    A missing API key is not raised: it is logged, recorded as a health
    failure, and the run returns. Per-seed HTTP, feed and parse errors are
    collected and do not stop the remaining seeds.
    """
    now = datetime.now(timezone.utc)
    async with async_session() as session:
        source = await _get_or_create_source(session)
        await session.commit()
        source_id = source.id

    try:
        key = _api_key()
    except FeedError as e:
        logger.warning(f"[openalex] {e}")
        async with async_session() as session:
            health = await _get_or_create_health(session, source_id)
            _record_failure(health, str(e), now)
            await session.commit()
        return

    run_cap = (limit or 10**9) if bulk else (min(limit, _RUN_CAP) if limit else _RUN_CAP)
    inserted = 0
    seen = 0
    failures: list[str] = []
    seeds = _seeds()

    async with httpx.AsyncClient(
        timeout=30.0, headers={"User-Agent": CRAWL_UA}, follow_redirects=True
    ) as client:
        for label, wm_key, kind in seeds:
            if inserted >= run_cap:
                break
            async with async_session() as session:
                src = await session.get(NewsSource, source_id)
                watermark = None if bulk else _watermark(src, wm_key)
            filter_str = (build_issn_filter(wm_key, watermark) if kind == "issn"
                          else build_filter(wm_key, watermark))
            newest: str | None = None
            cap_truncated = False  # True when the cap stopped this seed early
            cursor = "*"
            max_pages = (10**6 if bulk else _MAX_PAGES_PER_KW)
            try:
                for _page in range(max_pages):
                    if inserted >= run_cap:
                        break
                    text = await _fetch(client, key, filter_str, cursor)
                    _count, next_cursor, works = parse_openalex_works(text)
                    if not works:
                        break
                    for w in works:
                        seen += 1
                        if w.publication_date and (newest is None or w.publication_date > newest):
                            newest = w.publication_date
                        # One session per work keeps each insert in its own transaction.
                        async with async_session() as session:
                            src = await session.get(NewsSource, source_id)
                            if await _ingest_work(session, src, w):
                                inserted += 1
                                await session.commit()
                            else:
                                await session.rollback()
                        if inserted >= run_cap:
                            cap_truncated = True
                            break
                    await asyncio.sleep(_REQ_SLEEP)
                    if not next_cursor:
                        break
                    cursor = next_cursor
                # Results arrive newest-first. If the cap cut this seed short,
                # moving the watermark to `newest` would hide the older works
                # not yet ingested from every later run. Leave it unchanged so
                # the remainder really carries over; already-stored works are
                # skipped by the dedup checks on the next run.
                if newest and not cap_truncated:
                    async with async_session() as session:
                        src = await session.get(NewsSource, source_id)
                        _set_watermark(src, wm_key, newest)
                        await session.commit()
            except (httpx.HTTPError, FeedError, ValueError) as e:
                msg = f"[{label}] {e or repr(e)}"
                logger.error(f"[openalex] {msg}")
                failures.append(msg)

    async with async_session() as session:
        health = await _get_or_create_health(session, source_id)
        # The run counts as failed only when nothing at all was inserted.
        if failures and inserted == 0:
            _record_failure(health, "; ".join(failures)[:500], now)
        else:
            _record_success(health, inserted, False, now)
        await session.commit()

    deferred = "" if inserted < run_cap else f" (cap {run_cap} 도달 — 잔여 다음 run 이월)"
    logger.info(
        f"[openalex] {len(seeds)}개 시드(ISSN+키워드) 스캔 {seen}건 → 신규 {inserted}건{deferred}"
        + (f" / 실패 {len(failures)}건" if failures else "")
    )
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry: parse the two flags and hand them straight to run().
    from argparse import ArgumentParser

    cli = ArgumentParser(description="OpenAlex 안전·공학 키워드 백본 수집기")
    cli.add_argument("--bulk", action="store_true", help="cap 해제 + 깊은 cursor 페이징 백필")
    cli.add_argument("--limit", type=int, default=0, help="신규 적재 상한(0=기본 cap)")
    opts = cli.parse_args()
    asyncio.run(run(bulk=opts.bulk, limit=opts.limit))
|