diff --git a/app/core/crawl_politeness.py b/app/core/crawl_politeness.py index 9a36a33..017381b 100644 --- a/app/core/crawl_politeness.py +++ b/app/core/crawl_politeness.py @@ -12,6 +12,7 @@ SSRF 차단은 core.url_validator.validate_feed_url 재사용 (redirect target """ import asyncio +import base64 import random import time import urllib.robotparser @@ -206,11 +207,13 @@ async def fetch_page( # ── B-3 구독 세션 fetch (Playwright 격리 컨테이너 경유) ────────────────────── -async def fetch_page_via_browser(url: str, profile: str) -> tuple[str, str]: - """인증 페이지 1건 — playwright-fetcher 에 위임, politeness 는 사람 속도(30~60s). +async def fetch_page_via_browser(url: str, profile: str | None) -> tuple[str, str]: + """브라우저 페이지 1건 — playwright-fetcher 에 위임, politeness 는 사람 속도(30~60s). - (html_text, final_url) 반환. robots 미적용 — 구독 계약 기반 개인 보관 fetch 로 - 공개 크롤러 규약 대상이 아님 (대신 사람 속도 + 동시 1 + 야간 저빈도가 보호 장치). + profile=None = 익명 컨텍스트 (사이클 3 — 평문 httpx 를 UA 무관 403 하는 공개 + 사이트의 WAF 우회 전용, CCPS aiche.org 실측). 값 = B-3 구독 세션. + (html_text, final_url) 반환. robots 미적용 — 구독 fetch 는 사용자 행위 성격, + 익명 WAF 우회는 월간 1~2회 저빈도 + 사람 속도가 보호 장치. 예외 어휘는 fetch_page 와 동일 (호출측 분기 재사용). 
_MAX_DOWNLOAD_BYTES = 60 * 1024 * 1024  # kept equal to the fetcher's MAX_DOWNLOAD_BYTES


async def download_via_browser(
    url: str, *, referer: str | None = None, profile: str | None = None
) -> tuple[bytes, str]:
    """Download one binary (PDF) by delegating to the fetcher's ``/download`` endpoint.

    Args:
        url: Target URL. Validated with ``validate_feed_url`` before any request.
        referer: Optional listing page the fetcher visits first to obtain WAF
            challenge cookies (CCPS Beacon pattern). Because the fetcher requests
            it as well, it goes through the same URL validation as ``url``.
        profile: Optional session profile name forwarded to the fetcher; omitted
            from the payload when falsy.

    Returns:
        ``(content, content_type)`` - the decoded body bytes and the content type
        reported by the fetcher (empty string when absent).

    Raises:
        CrawlSkip: URL/referer validation failed, the inner status is a 4xx other
            than 403/429, or the decoded body exceeds ``_MAX_DOWNLOAD_BYTES``.
        CrawlBlocked: the fetcher answered 503, or the inner status is 403/429.
        CrawlFetchError: timeout/connection error, a non-200 fetcher status, any
            other inner status, or a malformed fetcher response.

    The exception vocabulary matches ``fetch_page`` so callers can reuse their
    existing branching.
    """
    try:
        validate_feed_url(url)
        if referer:
            # The fetcher navigates to the referer too, so it must not bypass
            # the SSRF guard applied to the main URL.
            validate_feed_url(referer)
    except ValueError as e:
        raise CrawlSkip(f"URL 검증 실패: {e}") from e

    payload: dict = {"url": url}
    if referer:
        payload["referer"] = referer
    if profile:
        payload["profile"] = profile

    domain = _domain_of(url)
    async with _get_lock(domain):
        await _respect_domain_rate(domain, _AUTH_DELAY_MIN, _AUTH_DELAY_MAX)
        try:
            async with httpx.AsyncClient(timeout=_FETCHER_TIMEOUT) as client:
                resp = await client.post(f"{_FETCHER_URL}/download", json=payload)
        except httpx.TimeoutException as e:
            raise CrawlFetchError(f"browser download timeout: {url}") from e
        except httpx.HTTPError as e:
            raise CrawlFetchError(f"playwright-fetcher 연결 오류: {e}") from e
        finally:
            # Stamp the attempt even on failure so the next request to this
            # domain still waits out the delay.
            _domain_last_request[domain] = time.monotonic()

    if resp.status_code == 503:
        raise CrawlBlocked(f"세션 프로필 부재: {profile}")
    if resp.status_code != 200:
        raise CrawlFetchError(f"playwright-fetcher {resp.status_code}: {url}")

    # A non-JSON / non-object body or a non-numeric status must surface as a
    # Crawl* error, not as a raw ValueError/TypeError/AttributeError.
    try:
        data = resp.json()
        inner = int(data.get("status", 0))
    except (ValueError, TypeError, AttributeError) as e:
        raise CrawlFetchError(
            f"malformed fetcher response (browser download): {url}"
        ) from e

    if inner in (403, 429):
        raise CrawlBlocked(f"{inner} (browser download): {url}")
    if 400 <= inner < 500:
        raise CrawlSkip(f"{inner} (browser download): {url}")
    if inner != 200:
        raise CrawlFetchError(f"{inner} (browser download): {url}")

    try:
        # binascii.Error (bad padding) is a ValueError subclass; a null
        # body_b64 raises TypeError.
        content = base64.b64decode(data.get("body_b64", ""))
    except (ValueError, TypeError) as e:
        raise CrawlFetchError(
            f"malformed fetcher body (browser download): {url}"
        ) from e
    if len(content) > _MAX_DOWNLOAD_BYTES:
        raise CrawlSkip(f"크기 초과 (browser download): {url}")
    return content, data.get("content_type", "")
from workers.api_standards_collector import run as api_standards_run + from workers.ccps_collector import run as ccps_collector_run from workers.queue_consumer import consume_queue, consume_markdown_queue from workers.study_queue_consumer import consume_study_queue from workers.study_session_queue_consumer import consume_study_session_queue @@ -131,6 +134,12 @@ async def lifespan(app: FastAPI): scheduler.add_job(dedup_reconcile_run, CronTrigger(hour=3, minute=30, timezone=KST), id="dedup_reconcile") # crawl-24x7 C-2: KOSHA 재해사례 diff + GUIDE 점진 백필 (daily, 새벽 잡들과 비충돌 슬롯). scheduler.add_job(kosha_collector_run, CronTrigger(hour=6, minute=40, timezone=KST), id="kosha_collector") + # 사이클 3 C-2 잔여: CSB sitemap lastmod diff (weekly 월, cap 40 + 워터마크 점진 백필). + scheduler.add_job(csb_collector_run, CronTrigger(day_of_week="mon", hour=6, minute=50, timezone=KST), id="csb_collector") + # 사이클 3 C-4: API 표준 공지 목록 diff (monthly — 월 1~2건 공지 페이스). + scheduler.add_job(api_standards_run, CronTrigger(day=5, hour=7, minute=5, timezone=KST), id="api_standards_collector") + # 사이클 3 C-2 잔여: CCPS Beacon 월간 PDF (playwright 익명 경유 — WAF 차단 시 health 로 가시화). + scheduler.add_job(ccps_collector_run, CronTrigger(day=5, hour=7, minute=20, timezone=KST), id="ccps_collector") scheduler.start() # Phase 2.1 (async 구조): QueryAnalyzer prewarm. diff --git a/app/workers/api_standards_collector.py b/app/workers/api_standards_collector.py new file mode 100644 index 0000000..ec1061e --- /dev/null +++ b/app/workers/api_standards_collector.py @@ -0,0 +1,250 @@ +"""C-4 ① API 표준 공지(Important Standards Announcements) 수집 워커 (사이클 3). + +RSS 없음. 실측(2026-06-11) 결과 '페이지 diff' 가 아니라 공지별 상세 URL 이 있는 +목록 페이지(10건/페이지, ?page=N&pageSize=10 페이지네이션 ~12+) — 목록 링크 파싱 +→ 신규 상세 페이지만 ingest 가 정확하고 dedup 도 자연스럽다 (rss+page 패턴의 HTML 판). +510/570/653 개정 공지가 업무 직결 — 표준 본문은 유료라 공지만 수집 (카드 C-4). + +스케줄 = monthly (main.py 5일 07:05 KST) — 최근 2페이지 diff (월 1~2건 공지 페이스). 
+초기 일괄: docker exec hyungi_document_server-fastapi-1 \ + python -m workers.api_standards_collector --bulk # 전 페이지 (~120건, politeness ~30분) + +멱등: edit_url(정규화)+file_hash dedup — 재실행 = 신규분만. +""" + +import argparse +import asyncio +import hashlib +import re +from datetime import datetime, timezone + +from sqlalchemy import select + +from core.crawl_politeness import ( + CrawlBlocked, + CrawlFetchError, + CrawlSkip, + fetch_page, +) +from core.database import async_session +from core.utils import setup_logger +from models.document import Document +from models.news_source import NewsSource +from models.queue import enqueue_stage +from workers.fulltext_worker import ( + _WEB_MIN_BODY_LEN, + _extract_body, + _raw_html_path, + _save_raw_html, + _strip_article_footer, +) +from workers.news_collector import ( + _get_or_create_health, + _normalize_url, + _record_failure, + _record_success, +) +from workers.static_corpus_ingest import _page_title + +logger = setup_logger("api_standards") + +_BASE = "https://www.api.org" +_LISTING_PATH = "/products-and-services/standards/important-standards-announcements" +_LISTING_URL = f"{_BASE}{_LISTING_PATH}" +_SOURCE_NAME = "API 표준 공지" + +_SCHEDULED_PAGES = 2 # monthly diff 범위 (20건 — 월 1~2건 페이스에 충분한 겹침) +_BULK_MAX_PAGES = 15 # 실측 12페이지 + 여유. 빈 페이지에서 조기 종료. 
+ +_DETAIL_RE = re.compile( + r'href="(' + re.escape(_LISTING_PATH) + r'/[^"?#]+)"' +) +_DATE_RE = re.compile( + r"(January|February|March|April|May|June|July|August|September|October" + r"|November|December)\s+(\d{1,2}),?\s+(\d{4})" +) +_MONTHS = {m: i for i, m in enumerate( + ["January", "February", "March", "April", "May", "June", "July", + "August", "September", "October", "November", "December"], start=1)} + + +def _parse_listing(html_text: str) -> list[str]: + """상세 공지 절대 URL — 순서 보존 dedup (페이지네이션 링크는 ?가 패턴에서 배제).""" + seen: set[str] = set() + out: list[str] = [] + for m in _DETAIL_RE.finditer(html_text): + url = f"{_BASE}{m.group(1)}" + if url not in seen: + seen.add(url) + out.append(url) + return out + + +def _parse_pub_date(text: str) -> datetime | None: + """본문 첫 'Month DD, YYYY' — 공지 게시일 관행. 실패 = None (색인은 채널 게이트로 무조건).""" + m = _DATE_RE.search(text) + if not m: + return None + try: + return datetime(int(m.group(3)), _MONTHS[m.group(1)], int(m.group(2)), + tzinfo=timezone.utc) + except ValueError: + return None + + +async def _get_or_create_source(session) -> NewsSource: + result = await session.execute( + select(NewsSource).where(NewsSource.name == _SOURCE_NAME) + ) + source = result.scalars().first() + if source is None: + source = NewsSource( + name=_SOURCE_NAME, feed_url=_LISTING_URL, feed_type="rss", + fetch_method="page", fulltext_policy="none", + source_channel="crawl", category="Engineering", language="en", country="US", + enabled=False, # 6h 뉴스 사이클 비대상 — 본 워커가 monthly 폴링 + ) + session.add(source) + await session.flush() + return source + + +async def _ingest_detail(session, source: NewsSource, url: str) -> str: + """공지 1건. 
반환: 'ok' / 'dup' / 'skip'.""" + normalized_url = _normalize_url(url) + ann_hash = hashlib.sha256(f"api-ann|{normalized_url}".encode()).hexdigest()[:32] + existing = await session.execute( + select(Document).where( + (Document.file_hash == ann_hash) + | (Document.edit_url.in_([normalized_url, url])) + ).limit(1) + ) + if existing.scalars().first(): + return "dup" + + try: + html_text, final_url = await fetch_page(url) + except (CrawlBlocked, CrawlSkip, CrawlFetchError) as e: + logger.warning(f"[api-std] fetch 실패 skip: {url} — {type(e).__name__}: {e}") + return "skip" + + body, engine, engine_ver = _extract_body(html_text) + if not engine: + logger.warning(f"[api-std] 추출 실패 skip (< {_WEB_MIN_BODY_LEN}자): {url}") + return "skip" + clean_body = _strip_article_footer(body.replace("\x00", "")) + if len(clean_body) < _WEB_MIN_BODY_LEN: + return "skip" + + now = datetime.now(timezone.utc) + raw_path = _raw_html_path(source.id, ann_hash, now) + raw_saved = True + try: + _save_raw_html(raw_path, html_text) + except OSError as e: + raw_saved = False + logger.error(f"[api-std] 원본 보존 실패 (ingest 는 진행): {e}") + + pub_dt = _parse_pub_date(clean_body) + title = _page_title(html_text, fallback=url.rsplit("/", 1)[-1][:90]) + title = re.sub(r"\s*\|\s*API\s*$", "", title).strip() or title + + doc = Document( + file_path=f"crawl/{_SOURCE_NAME}/{ann_hash}", + file_hash=ann_hash, + file_format="article", + file_size=0, + file_type="note", + title=title, + extracted_text=f"{title}\n\n{clean_body}", + extracted_at=now, + extractor_version=f"listing+page@{engine}", + md_content=clean_body, + md_status="success", + md_extraction_engine=engine, + md_extraction_engine_version=engine_ver, + md_format_version="1.0", + md_generated_at=now, + md_source_hash=hashlib.sha256(html_text.encode("utf-8", errors="replace")).hexdigest(), + md_content_hash=hashlib.sha256(clean_body.encode("utf-8")).hexdigest(), + content_origin="extracted", + source_channel="crawl", + data_origin="external", + 
edit_url=normalized_url, + review_status="approved", + ai_domain="Engineering", + ai_sub_group=_SOURCE_NAME, + ai_tags=["Engineering/API 표준 공지"], + extract_meta={ + "source_id": source.id, + "source_name": _SOURCE_NAME, + "published_at": pub_dt.isoformat() if pub_dt else None, + "fulltext": { + "status": "api_announcement", + "engine": engine, + "final_url": final_url, + "raw_html_path": str(raw_path) if raw_saved else None, + "body_chars": len(clean_body), + "resolved_at": now.isoformat(), + }, + }, + ) + doc.file_size = len(doc.extracted_text.encode()) + session.add(doc) + await session.flush() + await enqueue_stage(session, doc.id, "summarize") + await enqueue_stage(session, doc.id, "embed") + await enqueue_stage(session, doc.id, "chunk") + logger.info(f"[api-std] ingest {len(clean_body)}자 ({engine}): {title[:60]}") + return "ok" + + +async def run(bulk: bool = False) -> None: + """monthly 진입점 (스케줄러) — bulk 는 CLI 전용 (전 페이지 일괄).""" + now = datetime.now(timezone.utc) + async with async_session() as session: + source = await _get_or_create_source(session) + await session.commit() + source_id = source.id + + max_pages = _BULK_MAX_PAGES if bulk else _SCHEDULED_PAGES + counts = {"ok": 0, "dup": 0, "skip": 0} + try: + for page in range(1, max_pages + 1): + listing_url = ( + _LISTING_URL if page == 1 + else f"{_LISTING_URL}?page={page}&pageSize=10" + ) + html_text, _ = await fetch_page(listing_url) + detail_urls = _parse_listing(html_text) + if not detail_urls: + break # 빈 페이지 = 끝 (bulk 조기 종료) + for url in detail_urls: + async with async_session() as session: + src = await session.get(NewsSource, source_id) + status = await _ingest_detail(session, src, url) + await session.commit() + counts[status] += 1 + logger.info(f"[api-std] 목록 p{page}: 누적 {counts}") + except (CrawlBlocked, CrawlSkip, CrawlFetchError) as e: + logger.error(f"[api-std] 목록 수집 실패: {e}") + async with async_session() as session: + health = await _get_or_create_health(session, source_id) + 
_record_failure(health, str(e) or repr(e), now) + await session.commit() + return + + async with async_session() as session: + health = await _get_or_create_health(session, source_id) + _record_success(health, counts["ok"], False, now) + src = await session.get(NewsSource, source_id) + src.last_fetched_at = now + await session.commit() + logger.info(f"[api-std] 완료: {counts}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="API 표준 공지 수집") + parser.add_argument("--bulk", action="store_true", help="전 페이지 일괄 (초기 백필)") + args = parser.parse_args() + asyncio.run(run(bulk=args.bulk)) diff --git a/app/workers/ccps_collector.py b/app/workers/ccps_collector.py new file mode 100644 index 0000000..900d100 --- /dev/null +++ b/app/workers/ccps_collector.py @@ -0,0 +1,185 @@ +"""C-2 잔여 ② CCPS Process Safety Beacon 수집 워커 (사이클 3). + +월간 1페이지 PDF + 한국어 번역판 — RAG 청크로 이상적 크기 (카드 C-2). +aiche.org 는 평문 httpx 를 UA 무관 403 (2026-06-11 실측: Archiver UA·브라우저 UA 모두) +→ playwright-fetcher 익명 컨텍스트 경유 (B-3 인프라 재사용): + 목록 페이지 브라우저 fetch → beacon PDF 링크 파싱 → referer 쿠키 승계 다운로드. + +알려진 리스크: WAF 가 헤드리스 자체를 차단하면 _CHALLENGE_MARKERS → CrawlBlocked +→ health 실패 기록 후 종료 (르몽드 B-3 PARK 선례 — 그 경우 대안 = 이메일 구독 +.eml 트랙 결합, [[feedback_antibot_headless_subscription_wall]]). + +스케줄 = monthly (main.py 5일 07:20 KST). 월간 1건 페이스라 diff 는 file_path dedup 으로 충분. 
+수동: docker exec hyungi_document_server-fastapi-1 python -m workers.ccps_collector +""" + +import asyncio +import hashlib +import re +from datetime import datetime, timezone +from pathlib import Path +from urllib.parse import urljoin, urlparse + +from sqlalchemy import select + +from core.config import settings +from core.crawl_politeness import ( + CrawlBlocked, + CrawlFetchError, + CrawlSkip, + download_via_browser, + fetch_page_via_browser, +) +from core.database import async_session +from core.utils import setup_logger +from models.document import Document +from models.news_source import NewsSource +from models.queue import enqueue_stage +from workers.kosha_collector import _safe_filename +from workers.news_collector import ( + _get_or_create_health, + _record_failure, + _record_success, +) + +logger = setup_logger("ccps_collector") + +_BEACON_URL = "https://www.aiche.org/ccps/resources/process-safety-beacon" +_SOURCE_NAME = "CCPS Process Safety Beacon" +_MAX_PDFS_PER_RUN = 10 # 월간 1~2건(영/한) 페이스 — 페이지 구조 오판 시 폭주 방지 + + +def _beacon_pdf_links(html_text: str, base_url: str) -> list[str]: + """beacon 관련 PDF 링크 — href/앵커텍스트에 'beacon' 포함만 (보수적). + + 필터에 안 걸린 PDF 가 있으면 호출측이 로그로 가시화 (첫 실측에서 패턴 보정용). 
+ """ + seen: set[str] = set() + out: list[str] = [] + for m in re.finditer( + r']*href="([^"]+\.pdf(?:\?[^"]*)?)"[^>]*>(.*?)', + html_text, re.I | re.S, + ): + href, text = m.group(1), re.sub(r"<[^>]+>", " ", m.group(2)) + if "beacon" not in href.lower() and "beacon" not in text.lower(): + continue + absolute = urljoin(base_url, href) + path = urlparse(absolute).path + if path not in seen: + seen.add(path) + out.append(absolute) + return out + + +def _all_pdf_hrefs(html_text: str) -> list[str]: + return sorted({m.group(1) for m in re.finditer(r'href="([^"]+\.pdf(?:\?[^"]*)?)"', html_text, re.I)}) + + +async def _get_or_create_source(session) -> NewsSource: + result = await session.execute( + select(NewsSource).where(NewsSource.name == _SOURCE_NAME) + ) + source = result.scalars().first() + if source is None: + source = NewsSource( + name=_SOURCE_NAME, feed_url=_BEACON_URL, feed_type="rss", + fetch_method="page", fulltext_policy="none", + source_channel="crawl", category="Safety", language="en", country="US", + enabled=False, # 6h 뉴스 사이클 비대상 — 본 워커가 monthly 폴링 + ) + session.add(source) + await session.flush() + return source + + +async def _ingest_pdf(session, pdf_url: str) -> bool: + """Beacon PDF 1건 → NAS 저장 + Document + extract enqueue. 
반환 = 신규 여부.""" + fname = _safe_filename(Path(urlparse(pdf_url).path).name) + rel_path = f"crawl_raw/ccps_beacon/{fname}" + existing = await session.execute( + select(Document).where(Document.file_path == rel_path).limit(1) + ) + if existing.scalars().first(): + return False + + content, content_type = await download_via_browser(pdf_url, referer=_BEACON_URL) + if "pdf" not in content_type.lower() and not content.startswith(b"%PDF"): + raise CrawlSkip(f"PDF 아님 (content-type={content_type[:60]}): {pdf_url}") + + dest = Path(settings.nas_mount_path) / rel_path + dest.parent.mkdir(parents=True, exist_ok=True) + dest.write_bytes(content) + + doc = Document( + file_path=rel_path, + file_hash=hashlib.sha256(content).hexdigest(), + file_format="pdf", + file_size=len(content), + file_type="immutable", + title=fname.rsplit(".", 1)[0].replace("_", " ").replace("-", " "), + source_channel="crawl", + data_origin="external", + import_source="ccps_beacon", + edit_url=pdf_url, + ai_tags=["Safety/CCPS Beacon"], + extract_meta={"ccps": {"kind": "beacon_pdf"}}, + ) + session.add(doc) + await session.flush() + await enqueue_stage(session, doc.id, "extract") + logger.info(f"[ccps] Beacon ingest: {rel_path} ({len(content)} bytes)") + return True + + +async def run() -> None: + """monthly 진입점 — 실패는 health 기록 (circuit 가 A-8 패널 가시화).""" + now = datetime.now(timezone.utc) + async with async_session() as session: + source = await _get_or_create_source(session) + await session.commit() + source_id = source.id + + try: + html_text, final_url = await fetch_page_via_browser(_BEACON_URL, profile=None) + links = _beacon_pdf_links(html_text, final_url) + if not links: + others = _all_pdf_hrefs(html_text) + # 필터 0건 = 페이지 구조/명명 변경 가능성 — 발견 PDF 를 가시화해 보정 단서 제공 + raise CrawlFetchError( + f"beacon PDF 0건 (전체 PDF {len(others)}건: {others[:5]})" + ) + + new_count = 0 + for pdf_url in links[:_MAX_PDFS_PER_RUN]: + async with async_session() as session: + try: + if await _ingest_pdf(session, pdf_url): + 
new_count += 1 + await session.commit() + except (CrawlBlocked, CrawlSkip, CrawlFetchError) as e: + await session.rollback() + logger.warning(f"[ccps] PDF 실패 skip ({pdf_url}): {e}") + if len(links) > _MAX_PDFS_PER_RUN: + logger.warning( + f"[ccps] PDF {len(links)}건 중 {_MAX_PDFS_PER_RUN}건만 처리 " + f"(월간 1~2건 가정 초과 — 페이지 구조 확인 필요)" + ) + + async with async_session() as session: + health = await _get_or_create_health(session, source_id) + _record_success(health, new_count, False, now) + src = await session.get(NewsSource, source_id) + src.last_fetched_at = now + await session.commit() + logger.info(f"[ccps] 완료: 신규 {new_count}건 (링크 {len(links)}건)") + except (CrawlBlocked, CrawlSkip, CrawlFetchError) as e: + # CrawlBlocked = WAF 헤드리스 차단 신호 — 연속되면 circuit open (PARK 판단 근거) + logger.error(f"[ccps] 수집 실패: {type(e).__name__}: {e}") + async with async_session() as session: + health = await _get_or_create_health(session, source_id) + _record_failure(health, str(e) or repr(e), now) + await session.commit() + + +if __name__ == "__main__": + asyncio.run(run()) diff --git a/app/workers/csb_collector.py b/app/workers/csb_collector.py new file mode 100644 index 0000000..59d4f65 --- /dev/null +++ b/app/workers/csb_collector.py @@ -0,0 +1,390 @@ +"""C-2 잔여 ① US CSB sitemap diff 수집 워커 (plan crawl-24x7-1, 사이클 3). + +RSS 폐지 → sitemap.xml lastmod diff 폴링이 정석 (정부 사이트라 lastmod 양호 — +2026-06-11 실측 1,307 URL, 조사 보고서 페이지는 루트 슬러그). 페이지 본문(4-tier +≥200자 게이트) + 보고서 PDF(/assets/, recommendation 상태요약 제외) → +기존 extract 파이프라인(marker/kordoc) 재사용. + +스케줄 = weekly (main.py 월 06:50 KST): + 워터마크(selector_override.sitemap_watermark — B-3 probe 설정과 같은 JSONB 슬롯) + 이후 lastmod 만, 오래된 것부터 cap(40페이지/회). 워터마크는 처리분까지만 전진 + = 잔량 자동 점진 백필 (KOSHA GUIDE cap 패턴). cap 미처리 잔량은 매회 로그 + (silent cap 금지). diff 건수 > sanity(300) = sitemap 부패/lastmod 남발 의심 가시 경고. 
+ +초기 일괄 (cap 해제, politeness 로 수 시간 — docker exec -d, 진행 중 같은 서비스 +재배포 금지 [[feedback_docker_exec_orphan_kill]] 자매 함정): + docker exec hyungi_document_server-fastapi-1 \ + python -m workers.csb_collector --limit 3 # 검증용 + docker exec -d hyungi_document_server-fastapi-1 \ + python -m workers.csb_collector --bulk # 전체 + +멱등: 페이지 = edit_url(정규화)+file_hash dedup (first-wins — lastmod 갱신 페이지의 +본문 재적재는 안 함, 갱신의 실체인 신규 PDF 는 개별 dedup 으로 적재됨). +PDF = file_path dedup. 워터마크 경계는 >= 재조회 — 경계 페이지 1회 재fetch 후 +dedup 이 잡는다 (lastmod 실측 distinct 라 누적 재fetch 없음). +""" + +import argparse +import asyncio +import hashlib +import random +import re +from datetime import datetime, timezone +from pathlib import Path +from urllib.parse import urljoin, urlparse + +import httpx +from sqlalchemy import select + +from core.config import settings +from core.crawl_politeness import ( + CRAWL_UA, + CrawlBlocked, + CrawlFetchError, + CrawlSkip, + fetch_page, +) +from core.database import async_session +from core.utils import setup_logger +from models.document import Document +from models.news_source import NewsSource +from models.queue import enqueue_stage +from workers.fulltext_worker import ( + _WEB_MIN_BODY_LEN, + _extract_body, + _raw_html_path, + _save_raw_html, + _strip_article_footer, +) +from workers.kosha_collector import _safe_filename +from workers.news_collector import ( + FeedError, + _get_or_create_health, + _normalize_url, + _record_failure, + _record_success, +) +from workers.static_corpus_ingest import _page_title + +logger = setup_logger("csb_collector") + +_SITEMAP_URL = "https://www.csb.gov/sitemap.xml" +_SOURCE_NAME = "US CSB 사고조사보고서" + +_RUN_PAGE_CAP = 40 # weekly 1회 처리 상한 — 잔량은 워터마크 미전진으로 자동 이월 +_DIFF_SANITY = 300 # 주간 diff 가 이를 넘으면 sitemap lastmod 남발/부패 의심 (카드 C-2) +_MAX_PDF_BYTES = 50 * 1024 * 1024 +_PDF_DELAY = (2.0, 5.0) # 같은 도메인 연속 PDF 다운로드 간격 (kosha _DOWNLOAD_DELAY 동률) + +# 텍스트 코퍼스 무가치/관리성 섹션 — 첫 path segment 기준 (조사 보고서·뉴스 릴리스는 +# 루트 슬러그라 영향 없음. 
/news/·/investigations/ 는 목록 페이지뿐이라 제외). +_SKIP_FIRST_SEGMENT = { + "videos", "photos", "events", "members", "disclaimers", "media-room", + "about-the-csb", "about-us", "foia", "news", "investigations", + "site-map", "subscribe", "unsubscribe", "optout", "test", + "privacy-policy", "vulnerability-disclosure-policy", "en-espanol", + "newsletter", "recom-stats", "500.aspx", "documents", "records-details", +} + + +def _parse_sitemap(xml_text: str) -> list[tuple[str, datetime]]: + """(url, lastmod) 목록 — lastmod 없는/파싱불가 항목은 제외 (diff 축이 없음).""" + out: list[tuple[str, datetime]] = [] + for m in re.finditer( + r"\s*([^<]+)\s*([^<]+)", xml_text + ): + try: + lastmod = datetime.fromisoformat(m.group(2).strip()) + except ValueError: + continue + if lastmod.tzinfo is None: + lastmod = lastmod.replace(tzinfo=timezone.utc) + out.append((m.group(1).strip(), lastmod)) + return out + + +def _should_skip(url: str) -> bool: + path = urlparse(url).path.strip("/") + if not path: + return True # 홈 + return path.split("/", 1)[0].lower() in _SKIP_FIRST_SEGMENT + + +def _pdf_links(html_text: str, base_url: str) -> list[str]: + """페이지 내 보고서 PDF — /assets/recommendation/(상태변경 요약 다수)은 제외. + + cache-buster 쿼리(?17346)는 다운로드 URL 에는 유지, dedup/파일명은 path 기준. 
async def _download_pdf(url: str, dest: Path) -> int:
    """Download one PDF to *dest* and return its size in bytes.

    Sleeps a random interval from ``_PDF_DELAY`` first; consecutive downloads
    are spaced out on the assumption that callers run sequentially.

    The body is streamed and the transfer is aborted as soon as it exceeds
    ``_MAX_PDF_BYTES``, so an oversized file is never fully buffered. Nothing
    is written to *dest* unless the whole download succeeded.

    Args:
        url: PDF URL (redirects are followed).
        dest: Destination path; parent directories are created as needed.

    Returns:
        Number of bytes written.

    Raises:
        FeedError: non-200 response, body larger than ``_MAX_PDF_BYTES``, or
            any httpx timeout/transport error. Network failures are converted
            so that callers catching ``FeedError`` skip just this PDF instead
            of aborting the whole run.
    """
    await asyncio.sleep(random.uniform(*_PDF_DELAY))
    chunks: list[bytes] = []
    size = 0
    try:
        async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
            async with client.stream(
                "GET", url, headers={"User-Agent": CRAWL_UA}
            ) as resp:
                if resp.status_code != 200:
                    raise FeedError(f"PDF 다운로드 {resp.status_code}: {url}")
                async for chunk in resp.aiter_bytes():
                    size += len(chunk)
                    if size > _MAX_PDF_BYTES:
                        raise FeedError(
                            f"PDF 크기 초과 (> {_MAX_PDF_BYTES} bytes): {url}"
                        )
                    chunks.append(chunk)
    except httpx.HTTPError as e:
        raise FeedError(f"PDF 다운로드 오류 ({type(e).__name__}): {url}") from e
    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_bytes(b"".join(chunks))
    return size
+ cfg["sitemap_watermark"] = value.isoformat() + source.selector_override = cfg + + +async def _ingest_pdf(session, page_slug: str, pdf_url: str) -> bool: + """PDF 1건 → NAS 저장 + Document + extract enqueue. 반환 = 신규 여부.""" + fname = _safe_filename(Path(urlparse(pdf_url).path).name) + rel_path = f"crawl_raw/csb/{page_slug}/{fname}" + existing = await session.execute( + select(Document).where(Document.file_path == rel_path).limit(1) + ) + if existing.scalars().first(): + return False + + dest = Path(settings.nas_mount_path) / rel_path + size = await _download_pdf(pdf_url, dest) + doc = Document( + file_path=rel_path, + file_hash=hashlib.sha256(dest.read_bytes()).hexdigest(), + file_format="pdf", + file_size=size, + file_type="immutable", + title=fname.rsplit(".", 1)[0].replace("_", " "), + source_channel="crawl", + data_origin="external", + import_source="csb_sitemap", + edit_url=pdf_url, + ai_tags=["Safety/CSB/보고서"], + extract_meta={"csb": {"page_slug": page_slug, "kind": "report_pdf"}}, + ) + session.add(doc) + await session.flush() + await enqueue_stage(session, doc.id, "extract") + logger.info(f"[csb] PDF ingest: {rel_path} ({size} bytes)") + return True + + +async def _ingest_url(session, source: NewsSource, url: str, lastmod: datetime) -> dict: + """변경 URL 1건: 페이지 fetch → PDF 전수 스캔(개별 dedup) + 본문 신규면 적재. + + 페이지 재방문(lastmod 갱신)에서도 PDF 스캔은 항상 수행 — 갱신의 실체 + (최종 보고서 추가 등)가 PDF 로 오는 경우가 핵심 가치다. 
+ """ + counts = {"page": 0, "pdf": 0, "skip": 0} + try: + html_text, final_url = await fetch_page(url) + except (CrawlBlocked, CrawlSkip, CrawlFetchError) as e: + logger.warning(f"[csb] fetch 실패 skip: {url} — {type(e).__name__}: {e}") + counts["skip"] = 1 + return counts + + page_slug = _safe_filename(urlparse(url).path.strip("/").split("/")[-1] or "root") + + for pdf_url in _pdf_links(html_text, final_url): + try: + if await _ingest_pdf(session, page_slug, pdf_url): + counts["pdf"] += 1 + except FeedError as e: + logger.warning(f"[csb] PDF 실패 skip ({pdf_url}): {e}") + + # 페이지 본문 — first-wins (이미 있으면 본문 재적재 없음) + normalized_url = _normalize_url(url) + page_hash = hashlib.sha256(f"csb-page|{normalized_url}".encode()).hexdigest()[:32] + existing = await session.execute( + select(Document).where( + (Document.file_hash == page_hash) + | (Document.edit_url.in_([normalized_url, url])) + ).limit(1) + ) + if existing.scalars().first(): + return counts + + body, engine, engine_ver = _extract_body(html_text) + if not engine: + logger.info(f"[csb] 본문 부족 — 페이지 비적재 (PDF 만): {url}") + return counts + clean_body = _strip_article_footer(body.replace("\x00", "")) + if len(clean_body) < _WEB_MIN_BODY_LEN: + return counts + + now = datetime.now(timezone.utc) + raw_path = _raw_html_path(source.id, page_hash, now) + raw_saved = True + try: + _save_raw_html(raw_path, html_text) + except OSError as e: + raw_saved = False + logger.error(f"[csb] 원본 보존 실패 (ingest 는 진행): {e}") + + title = _page_title(html_text, fallback=page_slug.replace("-", " ")[:90]) + doc = Document( + file_path=f"crawl/{_SOURCE_NAME}/{page_hash}", + file_hash=page_hash, + file_format="article", + file_size=0, + file_type="note", + title=title, + extracted_text=f"{title}\n\n{clean_body}", + extracted_at=now, + extractor_version=f"sitemap+page@{engine}", + md_content=clean_body, + md_status="success", + md_extraction_engine=engine, + md_extraction_engine_version=engine_ver, + md_format_version="1.0", + 
md_generated_at=now, + md_source_hash=hashlib.sha256(html_text.encode("utf-8", errors="replace")).hexdigest(), + md_content_hash=hashlib.sha256(clean_body.encode("utf-8")).hexdigest(), + content_origin="extracted", + source_channel="crawl", + data_origin="external", + edit_url=normalized_url, + review_status="approved", + ai_domain="Safety", + ai_sub_group=_SOURCE_NAME, + ai_tags=["Safety/CSB"], + extract_meta={ + "source_id": source.id, + "source_name": _SOURCE_NAME, + "published_at": lastmod.isoformat(), + "fulltext": { + "status": "csb_sitemap", + "engine": engine, + "final_url": final_url, + "raw_html_path": str(raw_path) if raw_saved else None, + "body_chars": len(clean_body), + "resolved_at": now.isoformat(), + }, + }, + ) + doc.file_size = len(doc.extracted_text.encode()) + session.add(doc) + await session.flush() + await enqueue_stage(session, doc.id, "summarize") + await enqueue_stage(session, doc.id, "embed") + await enqueue_stage(session, doc.id, "chunk") + counts["page"] = 1 + logger.info(f"[csb] page ingest {len(clean_body)}자 ({engine}): {title[:60]}") + return counts + + +async def run(bulk: bool = False, limit: int = 0) -> None: + """weekly 진입점 (스케줄러) — bulk/limit 은 CLI 전용.""" + now = datetime.now(timezone.utc) + async with async_session() as session: + source = await _get_or_create_source(session) + await session.commit() + source_id = source.id + watermark = _watermark(source) + + try: + xml_text, _ = await fetch_page( + _SITEMAP_URL, content_types=("text/xml", "application/xml", "text/html") + ) + entries = _parse_sitemap(xml_text) + if not entries: + raise FeedError("sitemap 파싱 0건 — 포맷 변경/부패 의심") + except (CrawlBlocked, CrawlSkip, CrawlFetchError, FeedError) as e: + logger.error(f"[csb] sitemap 수집 실패: {e}") + async with async_session() as session: + health = await _get_or_create_health(session, source_id) + _record_failure(health, str(e) or repr(e), now) + await session.commit() + return + + changed = sorted( + ( + (url, lastmod) for url, lastmod 
in entries + if not _should_skip(url) and (watermark is None or lastmod >= watermark) + ), + key=lambda pair: pair[1], + ) + if watermark is not None and len(changed) > _DIFF_SANITY: + logger.error( + f"[csb] diff {len(changed)}건 > sanity {_DIFF_SANITY} — " + f"sitemap lastmod 남발/부패 의심 (cap 처리는 계속, 관찰 필요)" + ) + + cap = len(changed) if bulk else _RUN_PAGE_CAP + if limit: + cap = min(cap, limit) + todo, deferred = changed[:cap], max(len(changed) - cap, 0) + logger.info( + f"[csb] sitemap {len(entries)}건 중 변경 {len(changed)}건, 처리 {len(todo)}건" + + (f" (잔여 {deferred}건 — 워터마크 미전진으로 자동 이월)" if deferred else "") + ) + + totals = {"page": 0, "pdf": 0, "skip": 0} + for i, (url, lastmod) in enumerate(todo, 1): + async with async_session() as session: + src = await session.get(NewsSource, source_id) + counts = await _ingest_url(session, src, url, lastmod) + _set_watermark(src, lastmod) + await session.commit() + for k in totals: + totals[k] += counts[k] + if i % 10 == 0: + logger.info(f"[csb] 진행 {i}/{len(todo)} {totals}") + + async with async_session() as session: + health = await _get_or_create_health(session, source_id) + _record_success(health, totals["page"] + totals["pdf"], False, now) + src = await session.get(NewsSource, source_id) + src.last_fetched_at = now + await session.commit() + logger.info(f"[csb] 완료: {totals} (변경 {len(changed)}건 중 {len(todo)}건 처리)") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="CSB sitemap diff 수집") + parser.add_argument("--bulk", action="store_true", help="cap 해제 — 초기 일괄") + parser.add_argument("--limit", type=int, default=0, help="처리 상한 (검증용)") + args = parser.parse_args() + asyncio.run(run(bulk=args.bulk, limit=args.limit)) diff --git a/app/workers/news_collector.py b/app/workers/news_collector.py index 69f7649..36f6cc0 100644 --- a/app/workers/news_collector.py +++ b/app/workers/news_collector.py @@ -271,6 +271,15 @@ async def _enqueue_processing(session, doc: Document, source: NewsSource, pub_dt 
fulltext_worker 가 승격(또는 격하) 확정 후 enqueue (RSS 요약 선요약 → 풀텍스트 도착 시 summarize_worker 의 '이미 요약 있음 skip' 에 막히는 순서 함정 회피). """ + if source.fetch_method == "signal-only": + # B-4: 시그널 = 검색 색인만 (embed/chunk). fulltext/summarize 절대 enqueue 안 함 — + # 레지스트리가 fulltext_policy='page' 로 잘못 설정돼도 페이지 fetch 0 (방어 우선). + # 요약 LLM 스킵 = 맥미니 부하 0. 다이제스트/브리핑은 ai_summary IS NULL 문서를 + # 처음부터 제외(services/digest/loader.py)하므로 시그널 문서가 자연 배제된다. + if source.source_channel == "crawl" or (datetime.now(timezone.utc) - pub_dt).days <= 30: + await enqueue_stage(session, doc.id, "embed") + await enqueue_stage(session, doc.id, "chunk") + return if source.fulltext_policy == "page" and doc.edit_url: await enqueue_stage(session, doc.id, "fulltext") return @@ -286,6 +295,28 @@ async def _enqueue_processing(session, doc: Document, source: NewsSource, pub_dt await enqueue_stage(session, doc.id, "chunk") +def _entry_body(source: NewsSource, entry, summary: str) -> tuple[str, str]: + """(body, extractor_version) — 정책별 본문 선택, 순수 함수 (shape 테스트 대상). + + signal-only: 피드 요약이 곧 본문 — 절단 없음 (arXiv 초록 1.3~1.6K자 보존, + 1000자 cap 적용 시 초록 꼬리 유실). 페이지 fetch 는 어떤 경우에도 없음 (B-4). + feed-full: 피드 본문이 전문인 소스만 신뢰 (truncate·광고 삽입이 흔해 일반 + 소스의 summary/content:encoded 를 전문으로 오인 저장 금지 — A-6). 
+ """ + if source.fetch_method == "signal-only": + body = _clean_html( + entry.get("summary", "") or entry.get("description", ""), max_len=None + ) + return (body or summary), "rss-signal" + if source.fulltext_policy == "feed-full": + content_list = entry.get("content") or [] + raw_body = content_list[0].get("value", "") if content_list else "" + full_body = _clean_html(raw_body or entry.get("summary", ""), max_len=None) + if len(full_body) > len(summary): + return full_body, "rss-feed-full" + return summary, "rss" + + def _build_extract_meta(source: NewsSource, pub_dt: datetime) -> dict: """fulltext_worker / 패널이 쓰는 출처 메타 (documents 에 source FK 가 없어 여기 기록).""" return { @@ -415,17 +446,8 @@ async def _fetch_rss(session, source: NewsSource) -> tuple[int, str]: if not summary: summary = title - # A-6: feed-full 소스만 피드 본문을 전문으로 신뢰 (truncate·광고 삽입이 흔해 - # 일반 소스의 summary/content:encoded 를 전문으로 오인 저장 금지) - body = summary - is_feed_full = False - if source.fulltext_policy == "feed-full": - content_list = entry.get("content") or [] - raw_body = content_list[0].get("value", "") if content_list else "" - full_body = _clean_html(raw_body or entry.get("summary", ""), max_len=None) - if len(full_body) > len(summary): - body = full_body - is_feed_full = True + # 정책별 본문 선택 — signal-only(무절단 요약) / feed-full(피드 전문) / 기본(요약) + body, extractor_version = _entry_body(source, entry, summary) link = entry.get("link", "") @@ -469,7 +491,7 @@ async def _fetch_rss(session, source: NewsSource) -> tuple[int, str]: title=title, extracted_text=f"{title}\n\n{body}", extracted_at=datetime.now(timezone.utc), - extractor_version="rss-feed-full" if is_feed_full else "rss", + extractor_version=extractor_version, # article = 텍스트 네이티브(본문=extracted_text). markdown 단계 미enqueue 라 # 기본값 'pending' 이면 영구 비수렴 → backlog 지표 오염 + md_status_pending partial # 인덱스 비대. 생성 시점에 terminal 'skipped' 로 명시(변환 비대상). 
diff --git a/migrations/327_seed_crawl_cycle3_sources.sql b/migrations/327_seed_crawl_cycle3_sources.sql new file mode 100644 index 0000000..c0046c5 --- /dev/null +++ b/migrations/327_seed_crawl_cycle3_sources.sql @@ -0,0 +1,32 @@ +-- crawl-24x7 사이클 3 소스 seed (B-4 시그널 + C-4 공학 지속수집) — 2026-06-11 전 URL live 검증. +-- 326 선례: WHERE NOT EXISTS idempotent, 기존 행 보존, 신규만 insert (단일 statement). +-- fetch_method='signal-only' (B-4): 헤드라인+요약만 인제스트, 페이지 fetch 0, +-- summarize 스킵(검색 색인만 — embed/chunk). 다이제스트는 ai_summary NULL 제외라 자연 배제. +-- Bloomberg = anti-bot 최강이라 본문 수집 비권고 → 시그널 전용. 피드에 비디오 혼재 실측 → skip-video. +-- Economist = 실측 200 (Archiver UA 는 feed-reader 로 취급됨 — 브라우저 UA 만 403). 구독 없음 = 시그널. +-- Nikkei Asia = RSS 1.0(RDF) 실측 — feedparser 가 네이티브 정규화 (title/link 만, 요약·날짜 없음 +-- = 제목 시그널). 코드 분기 불요 (tests/test_crawl_cycle3_shapes.py fixture 회귀로 박제). +-- arXiv/ASME = 초록이 곧 본문 (C-4 2단: 초록 색인 먼저, 선별 전문은 Phase 3) → signal-only 재사용. +-- IEEE Spectrum = 피드 description 이 전문 (7.9~14K자 실측) → feed-full. 카테고리 필터 = topic 피드. 
+INSERT INTO news_sources + (name, country, language, feed_type, feed_url, category, enabled, + fetch_method, fulltext_policy, source_channel, parser_quirk) +SELECT v.name, v.country, v.language, v.feed_type, v.feed_url, v.category, v.enabled, + v.fetch_method, v.fulltext_policy, v.source_channel::source_channel, v.parser_quirk +FROM (VALUES + -- B-4: 시그널 전용 (news 채널 — 헤드라인 시그널) + ('Bloomberg Markets', 'US', 'en', 'rss', 'https://feeds.bloomberg.com/markets/news.rss', 'Economy', true, 'signal-only', 'none', 'news', 'skip-video'), + ('Bloomberg Technology', 'US', 'en', 'rss', 'https://feeds.bloomberg.com/technology/news.rss', 'Technology', true, 'signal-only', 'none', 'news', 'skip-video'), + ('Economist Latest', 'GB', 'en', 'rss', 'https://www.economist.com/latest/rss.xml', 'International', true, 'signal-only', 'none', 'news', NULL), + ('Nikkei Asia', 'JP', 'en', 'rss', 'https://asia.nikkei.com/rss/feed/nar', 'International', true, 'signal-only', 'none', 'news', NULL), + -- C-4: 공학 지속수집 (crawl 채널 — 도메인 재료. API 공지/CSB/CCPS 는 전용 워커가 runtime 등록) + ('ASME J. 
Pressure Vessel Technology', 'US', 'en', 'rss', 'https://asmedigitalcollection.asme.org/rss/site_1000037/LatestOpenIssueArticles_1000020.xml', 'Engineering', true, 'signal-only', 'none', 'crawl', NULL), + ('arXiv cond-mat.mtrl-sci', 'US', 'en', 'rss', 'https://rss.arxiv.org/rss/cond-mat.mtrl-sci', 'Engineering', true, 'signal-only', 'none', 'crawl', NULL), + ('arXiv physics.app-ph', 'US', 'en', 'rss', 'https://rss.arxiv.org/rss/physics.app-ph', 'Engineering', true, 'signal-only', 'none', 'crawl', NULL), + ('IEEE Spectrum Energy', 'US', 'en', 'rss', 'https://spectrum.ieee.org/feeds/topic/energy.rss', 'Engineering', true, 'rss', 'feed-full', 'crawl', NULL), + ('IEEE Spectrum Robotics', 'US', 'en', 'rss', 'https://spectrum.ieee.org/feeds/topic/robotics.rss', 'Engineering', true, 'rss', 'feed-full', 'crawl', NULL) +) AS v(name, country, language, feed_type, feed_url, category, enabled, + fetch_method, fulltext_policy, source_channel, parser_quirk) +WHERE NOT EXISTS ( + SELECT 1 FROM news_sources ns WHERE ns.name = v.name +); diff --git a/services/playwright-fetcher/server.py b/services/playwright-fetcher/server.py index 78d2e4a..78a85ce 100644 --- a/services/playwright-fetcher/server.py +++ b/services/playwright-fetcher/server.py @@ -1,4 +1,4 @@ -"""B-3 구독 세션 Playwright fetcher (plan crawl-24x7-1). +"""B-3 구독 세션 Playwright fetcher (plan crawl-24x7-1) + 익명 브라우저 fetch/다운로드 (사이클 3). storage_state JSON(쿠키+localStorage 스냅샷) 기반 인증 페이지 fetch + 내용 기반 probe. - 동시 1 인스턴스 (글로벌 세마포어) — 계정 보호 + 사람 속도는 호출측 politeness 가 담당. @@ -7,9 +7,15 @@ storage_state JSON(쿠키+localStorage 스냅샷) 기반 인증 페이지 fetch 부재 = 503 profile_missing (silent fallback 없음 — 호출측이 degrade). - 시간 기반 만료 판정 금지 — probe 는 알려진 유료 기사에서 본문 길이 + 페이월 마커 부재 검증 (만료 후 200 '페이월 안내문'이 본문으로 저장되는 silent corruption 차단). + +사이클 3 증축 (C-2 CCPS Beacon — aiche.org 가 평문 httpx 를 UA 무관 403): +- /fetch profile 생략 = 익명 컨텍스트 (storage_state 없음, 공개 페이지의 WAF 우회 전용). 
+- /download = referer 페이지를 먼저 방문(WAF 쿠키 획득) 후 같은 컨텍스트의 + request.get 으로 바이너리(PDF) 다운로드 — base64 반환, 60MB cap. """ import asyncio +import base64 import logging from pathlib import Path @@ -23,6 +29,7 @@ logger = logging.getLogger("playwright-fetcher") AUTH_DIR = Path("/auth") NAV_TIMEOUT_MS = 45_000 SETTLE_MS = 1_500 # domcontentloaded 후 lazy 본문 settle 대기 +MAX_DOWNLOAD_BYTES = 60 * 1024 * 1024 app = FastAPI(title="playwright-fetcher") _browser_slot = asyncio.Semaphore(1) # 동시 1 인스턴스 (B-3 ① persistent 제약과 동일 규율) @@ -30,7 +37,8 @@ _browser_slot = asyncio.Semaphore(1) # 동시 1 인스턴스 (B-3 ① persisten class FetchReq(BaseModel): url: str - profile: str = Field(pattern=r"^[a-z0-9_-]{1,50}$") + # None = 익명 컨텍스트 (공개 페이지 WAF 우회 — CCPS). 값 = B-3 구독 세션. + profile: str | None = Field(default=None, pattern=r"^[a-z0-9_-]{1,50}$") class ProbeReq(BaseModel): @@ -40,6 +48,13 @@ class ProbeReq(BaseModel): paywall_markers: list[str] = [] +class DownloadReq(BaseModel): + url: str + # referer 페이지를 먼저 방문해 WAF 챌린지 쿠키를 컨텍스트에 적재 후 다운로드 + referer: str | None = None + profile: str | None = Field(default=None, pattern=r"^[a-z0-9_-]{1,50}$") + + def _state_path(profile: str) -> Path: p = AUTH_DIR / f"{profile}.json" if not p.is_file(): @@ -47,16 +62,23 @@ def _state_path(profile: str) -> Path: return p -async def _browse(url: str, state: Path) -> tuple[str, str, str]: +def _context_kwargs(state: Path | None) -> dict: + kwargs = {"viewport": {"width": 1366, "height": 900}} + if state is not None: + # B-3 르몽드 세션 회귀 방지 — 기존 인증 fetch 의 locale 그대로 + kwargs["storage_state"] = str(state) + kwargs["locale"] = "fr-FR" + else: + kwargs["locale"] = "en-US" + return kwargs + + +async def _browse(url: str, state: Path | None) -> tuple[str, str, str]: """(html, final_url, visible_text). 
요청당 브라우저 — 종료를 finally 로 보장.""" async with async_playwright() as pw: browser = await pw.chromium.launch(headless=True) try: - context = await browser.new_context( - storage_state=str(state), - viewport={"width": 1366, "height": 900}, - locale="fr-FR", - ) + context = await browser.new_context(**_context_kwargs(state)) page = await context.new_page() await page.goto(url, wait_until="domcontentloaded", timeout=NAV_TIMEOUT_MS) await page.wait_for_timeout(SETTLE_MS) @@ -76,17 +98,53 @@ def health(): @app.post("/fetch") async def fetch(req: FetchReq): - state = _state_path(req.profile) + state = _state_path(req.profile) if req.profile else None async with _browser_slot: try: html, final_url, _ = await _browse(req.url, state) except PlaywrightError as e: logger.warning("fetch 실패 %s: %s", req.url, e) raise HTTPException(502, detail={"error_reason": "browse_failed", "message": str(e)[:300]}) - logger.info("fetch ok profile=%s %s (%d bytes)", req.profile, req.url, len(html)) + logger.info("fetch ok profile=%s %s (%d bytes)", req.profile or "-", req.url, len(html)) return {"html": html, "final_url": final_url} +@app.post("/download") +async def download(req: DownloadReq): + """바이너리(PDF 등) 다운로드 — referer 방문으로 WAF 쿠키 획득 후 같은 컨텍스트로 GET. + + 응답의 status/content_type 판정은 호출측(crawl_politeness) 책임 — 여기서는 + 전송 계층 오류만 502 로 구분 (silent fallback 없음). 
+ """ + state = _state_path(req.profile) if req.profile else None + async with _browser_slot: + try: + async with async_playwright() as pw: + browser = await pw.chromium.launch(headless=True) + try: + context = await browser.new_context(**_context_kwargs(state)) + if req.referer: + page = await context.new_page() + await page.goto(req.referer, wait_until="domcontentloaded", + timeout=NAV_TIMEOUT_MS) + await page.wait_for_timeout(SETTLE_MS) + resp = await context.request.get(req.url, timeout=NAV_TIMEOUT_MS) + body = await resp.body() + finally: + await browser.close() + except PlaywrightError as e: + logger.warning("download 실패 %s: %s", req.url, e) + raise HTTPException(502, detail={"error_reason": "download_failed", "message": str(e)[:300]}) + if len(body) > MAX_DOWNLOAD_BYTES: + raise HTTPException(502, detail={"error_reason": "too_large", "bytes": len(body)}) + logger.info("download status=%d %s (%d bytes)", resp.status, req.url, len(body)) + return { + "status": resp.status, + "content_type": resp.headers.get("content-type", ""), + "body_b64": base64.b64encode(body).decode(), + } + + @app.post("/probe") async def probe(req: ProbeReq): """내용 기반 세션 probe — ok=False 사유를 명시 반환 (호출측이 health 에 기록).""" diff --git a/tests/fixtures/api_standards_announcements_listing.html b/tests/fixtures/api_standards_announcements_listing.html new file mode 100644 index 0000000..8c1d672 --- /dev/null +++ b/tests/fixtures/api_standards_announcements_listing.html @@ -0,0 +1,848 @@ + + + + + +American Petroleum Institute | API | Standards News Highlights + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + + + +
+ +
+
+ + +
+
+ + + + + + + + + + +
+
+
+

Standards News Highlights

+
+
+
+ + +
+
+ + + +
+
+ +
+
+ +
+
+
+
+
+

API Announces 47th Edition of Foundational Line Pipe Standard

+ +

WASHINGTON, June 2, 2026 — The American Petroleum Institute (API) today announced the publication of the 47th edition of API Specification 5L (API Spec 5L), Line Pipe. Originally published in 1924 as API’s first standard, API Spec 5L has supported the safe and reliable manufacture of steel line pipe used to transport oil and gas for more than a century. The 47th edition includes important new requirements across more than 15 topic areas, including high-frequency weld (HFW) pipe quality and pipe used in CO2 transport.

+

More »

+
+
+

API Specification 5L, Line Pipe

+ +

API has published the 47th edition of American Petroleum Institute Specification 5L (API Spec 5L), Line Pipe. Originally introduced as API’s first standard in 1924, the updated edition includes new requirements across more than 15 topic areas to support the safe and reliable manufacture of steel line pipe used in energy transportation, including CO2 transport.  

+

More »

+
+
+

API Recommended Practice 1192 (RP 1192), Transportation of Carbon Dioxide by Pipeline

+ +

API has published Recommended Practice 1192 (RP 1192), Transportation of Carbon Dioxide by Pipeline.This first edition standard provides performance requirements for the safe and reliable transport of carbon dioxide (CO2) by pipeline. It also addresses the design, construction, operation, and management of CO2 pipelines

+

More »

+
+
+

API Strengthens Requirements for Steel Casing and Tubing

+ +

WASHINGTON, May 5, 2025 — The American Petroleum Institute (API) is pleased to announce the publication of an Addendum to the 11th edition of the API 5CT, Casing and Tubing. The update strengthens the requirements for the manufacture of steel casing and tubing used in oil and gas drilling and production operations, enhancing safety, environmental protection and operational integrity.

+

More »

+
+
+

Addendum to API RP 1183 for Improved Dent Screening

+ +

In December 2020, American Petroleum Institute (API) published Recommended Practice 1183, First Edition (RP 1183), Assessment and Management of Dents in Pipelines. Since being issued, RP 1183 has been applied by pipeline operating companies and engineering consultants providing services to the energy pipeline industry to evaluate dents and deformations on pipeline systems and to support mitigation and repair decisions. RP 1183 includes various screening tools to estimate the remaining fatigue life of a dent in a pipeline. 

+

More »

+
+
+

New API Report Highlights Broader Global Adoption of API Standards

+ +

February 4, 2025 – The American Petroleum Institute (API) today released a new report, 2025 API StandardsInternational Usage Report, detailing the growing international influence of API standards. The report identifies where governments and standards bodies reference API standards in policies, national and international standards, and technical regulations, highlighting the paramount role of API standards in advancing safety, sustainability, and efficiency across the global natural gas and oil industry.

+

More »

+
+
+

API Enhances 3D Printing Guidelines with Updated Additive Manufacturing Standard

+ +

The American Petroleum Institute (API) is pleased to announce the release of the second edition of API Standard 20S, Qualification of Metal Additive Manufacturing Processes and Components Production Control for Use in the Petroleum and Natural Gas Industries. This update strengthens the industry’s ability to effectively deploy additive manufacturing (AM), or 3D printing, improving efficiency, supply chain resilience and sustainability across oil and natural gas operations.

+

More »

+
+
+

Addendum 1 to API RP 2001, 10th Edition: Fire Protection in Refineries

+ +

API has published Addendum 1 to API Recommended Practice 2001, 10th Edition - "Fire Protection in Refineries."
+
+This addendum strengthens existing fire safety measures by introducing new protocols for pre-planning and incident response in refineries.

+

More »

+
+
+

Addendum to the 11th edition of the API 5CT, Casing and Tubing

+ +

The American Petroleum Institute (API) is pleased to announce the publication of an Addendum to the 11th edition of the API 5CT, Casing and Tubing. The update strengthens the requirements for the manufacture of steel casing and tubing used in oil and gas drilling and production operations, enhancing safety, environmental protection and operational integrity.

+

More »

+
+
+

API Updates Fire Protection Standard for Refineries

+ +

October 24, 2024 – The American Petroleum Institute (API) today announced the publication of Addendum 1 to API RP 2001, 10th Edition, “Fire Protection in Refineries.” This addendum strengthens existing fire safety measures by introducing new protocols for pre-planning and incident response in refineries.

+
+

More »

+
+
+
+ +
+
+ +
+
+
+
+ + +
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+ +
+ + + + + + \ No newline at end of file diff --git a/tests/fixtures/arxiv_appph_rss.xml b/tests/fixtures/arxiv_appph_rss.xml new file mode 100644 index 0000000..5e892b6 --- /dev/null +++ b/tests/fixtures/arxiv_appph_rss.xml @@ -0,0 +1,196 @@ + + + + physics.app-ph updates on arXiv.org + http://rss.arxiv.org/rss/physics.app-ph + physics.app-ph updates on the arXiv.org e-print archive. + + http://www.rssboard.org/rss-specification + en-us + Wed, 10 Jun 2026 04:00:28 +0000 + rss-help@arxiv.org + Wed, 10 Jun 2026 00:00:00 -0400 + + Sunday + Saturday + + + Limits of Trap-assisted Photomultiplication Gain + https://arxiv.org/abs/2606.10236 + arXiv:2606.10236v1 Announce Type: new +Abstract: Photodiodes based on trap-assisted current injection can exhibit internal photomultiplication with apparent quantum efficiencies far exceeding unity, raising the question of whether such gain fundamentally enhances detector sensitivity. We employ a minimal analytical framework based on a single gain-active trapped state coupling photogenerated carriers to contact injection. The gain is intrinsically self-limiting: the injection process that amplifies the current simultaneously accelerates relaxation of the gain-enabling state, producing an inherently nonlinear, operating-point-dependent response. The form of this nonlinearity is not universal -- once the trap level is generalized to an energetic distribution and recombination is allowed to be bimolecular, the same mechanism yields superlinear, linear, or strongly sublinear responses. A single chord gain is therefore not a meaningful device descriptor, and chord-gain comparisons across the literature conflate devices in different regimes. Treating trap occupancy and injection as coupled stochastic processes, we show that internal gain introduces a strictly non-negative fluctuation penalty from the dissipative dynamics that sustain the gain state. 
A local, small-signal detectivity exhibits a finite optimum yet cannot exceed the intrinsic thermodynamic limit of the underlying unity-gain photodiode. Gain is thus equivalent to driven stochastic amplification: it can suppress downstream readout noise, but cannot reduce the fundamental noise floor set by the primary photodetection process. + oai:arXiv.org:2606.10236v1 + physics.app-ph + Wed, 10 Jun 2026 00:00:00 -0400 + new + http://creativecommons.org/licenses/by/4.0/ + Ardalan Armin + + + Filamentary Transport and Thermoelectric Effects in Mushroom Phase Change Memory Cells + https://arxiv.org/abs/2606.10262 + arXiv:2606.10262v1 Announce Type: new +Abstract: We performed a 2D finite-element electrothermal computational study of thermoelectric effects and filamentary electronic transport in Ge$_2$Sb$_2$Te$_5$ mushroom phase change memory cells during Reset and Set operations, accounting for spatial activation energy variations in amorphous Ge$_2$Sb$_2$Te$_5$ and phase-change dynamics. Reset operations with current going from the top electrode to the narrow 4 nm bottom electrode require $\sim$3x less energy and power, and $\sim$2x lower current to achieve the same Reset resistance, compared to the opposite polarity, due to thermoelectric effects. Filamentary conduction, electrical breakdown, thermal runaway, and local crystallization of amorphous Ge$_2$Sb$_2$Te$_5$ depend on current polarity and thermal boundary conditions, and determine the location, shape, and volume of the programming region, which may be significantly smaller than the semi-cylindrical mushroom region. The programming volume does not scale with contact dimensions larger than 10 nm. Larger contact areas introduce increased device-to-device and cycle-to-cycle variability due to filamentary conduction but are expected to lead to higher reliability and endurance. 
+ oai:arXiv.org:2606.10262v1 + physics.app-ph + cond-mat.mtrl-sci + Wed, 10 Jun 2026 00:00:00 -0400 + new + http://creativecommons.org/licenses/by/4.0/ + Md Samzid Bin Hafiz, Helena Silva, Ali Gokirmak + + + Fast-Neutron Irradiation Effect in Heteroepitaxial $\beta$-Ga$_2$O$_3$ Schottky Diodes Fabricated on Low-Cost Sapphire Substrates + https://arxiv.org/abs/2606.10269 + arXiv:2606.10269v1 Announce Type: new +Abstract: In this work, we investigate the response of Ni/$\beta$-Ga$_2$O$_3$ Schottky barrier diodes fabricated on c-plane sapphire to fast-neutron irradiation up to a fluence of $1\times10^{15}$ n$\cdot$cm$^{-2}$. The LPCVD-grown heteroepitaxial structure consists of an unintentionally doped buffer, an n$^{+}$ contact layer, and an n-type drift layer, with mesa isolation realized by plasma-free Ga-assisted LPCVD etching. Prior to irradiation, the devices exhibit a turn-on voltage of 1.20 V, specific on-resistance of 8.43 m$\Omega\cdot$cm$^2$, ideality factor of 1.32, and Schottky barrier height of 1.29 eV. Following irradiation, the devices remain operational, although the forward current decreases, the turn-on voltage increases to 2.40 V, and the barrier height increases to 1.34 eV. Capacitance-voltage measurements reveal a $\sim$50% reduction in net donor concentration, corresponding to a carrier-removal rate of $\sim$105 cm$^{-1}$. Temperature-dependent measurements from 25 to 250 $^\circ$C confirm that thermionic emission remains the dominant transport mechanism and show significant suppression of reverse leakage current after irradiation. The breakdown voltage increases from 101 to 135 V, consistent with neutron-induced donor compensation. TCAD simulations show a more uniform electric-field distribution and reduced field crowding at the Schottky edge after irradiation. 
These results provide insight into neutron-induced donor compensation in heteroepitaxial $\beta$-Ga$_2$O$_3$ and demonstrate the ability of LPCVD-grown $\beta$-Ga$_2$O$_3$ Schottky diodes on sapphire to maintain stable operation under high-fluence neutron environments relevant to space and nuclear electronics. + oai:arXiv.org:2606.10269v1 + physics.app-ph + Wed, 10 Jun 2026 00:00:00 -0400 + new + http://arxiv.org/licenses/nonexclusive-distrib/1.0/ + Saleh Ahmed Khan, Ahmed Ibreljic, Sourav Sarker, Stephen Margiotta, Anhar Bhuiyan + + + Virtual-Array Operational Modal Analysis of Rolling Tires Using a Single Tire Cavity Accelerometer + https://arxiv.org/abs/2606.10437 + arXiv:2606.10437v1 Announce Type: new +Abstract: The dynamics of rolling tires significantly influence the low-frequency (0-500 Hz) structure-borne noise within vehicles. Accurately characterizing these dynamics under realistic operating conditions remains challenging. Current state-of-the-art methods, primarily relying on Laser Doppler Vibrometers (LDV), are complex to implement, time-intensive, and generally limited to smooth tires in laboratory environments due to issues with speckle formation on treaded surfaces. This study introduces an innovative strategy for Operational Modal Analysis (OMA) of a rolling tire using a single wireless Tire Cavity Accelerometer (TCA) together with two optical sensors. The methodology leverages the non-integer ratio between the tire and drum diameters in a test rig to create a virtual sensor array. By utilizing optical sensors to time-stamp the cleat impact (on the drum) precisely and the TCA position (on the tire), the vibration responses from multiple revolutions are clustered according to the TCA's circumferential position at the moment of impact. This effectively synthesizes responses from an array of virtual sensors distributed around the tire circumference using data from a single test run. 
The clustered signals are conditioned using order tracking to remove periodic components arising from contact patch deformation. Both Frequency Domain Decomposition (FDD) and Covariance-based Stochastic Subspace Identification (SSI-Cov) were employed for modal identification. The SSI-Cov method proved more robust, successfully identifying 11 circumferential modes up to 240 Hz. The proposed approach offers a significantly more efficient, cost-effective method for characterizing rolling tire dynamics, which is readily applicable to treaded tires and adaptable for on-road testing. + oai:arXiv.org:2606.10437v1 + physics.app-ph + physics.data-an + Wed, 10 Jun 2026 00:00:00 -0400 + new + http://arxiv.org/licenses/nonexclusive-distrib/1.0/ + Pradosh Pritam Dash, Ricardo Burdisso, Pablo A Tarazaga + + + Finite-temperature Fe K-edge X-ray absorption simulations reveal local structural dynamics of an iron(II) photosensitizer in solution and the crystalline phase + https://arxiv.org/abs/2606.10221 + arXiv:2606.10221v1 Announce Type: cross +Abstract: Interpreting metal K-edge spectra of flexible photosensitizers requires a structural model that separates electronic signatures from thermal motion, solvent disorder, and crystal-packing effects. We combine Fe K-edge X-ray absorption measurements with second-generation Car--Parrinello ab initio molecular dynamics and all-electron Gaussian and augmented-plane-wave simulations for an iron(II) N-heterocyclic carbene photosensitizer in acetonitrile solution and in the crystalline phase. Ensemble-averaged spectra reproduce the main near-edge features in both environments and preserve the experimentally observed similarity of the first Fe coordination shell upon dissolution. 
Comparison with radial distributions extracted from extended fine-structure measurements validates the Fe--N and Fe--C coordination shells sampled by the trajectories, while element-resolved pair distributions explain why higher-shell experimental contrast is rapidly lost. The same dynamical ensembles reveal a broad out-of-plane distribution of the terpyridine nitrogen atom and a nearly octahedral distribution of the Fe-centered coordination planes. The results show that finite-temperature X-ray absorption simulations can provide a compact structural-dynamics picture of molecular transition metal photosensitizers by linking local spectra, solvent-phase ligand motion, and medium-range structural disorder within one trajectory-based description. + oai:arXiv.org:2606.10221v1 + cond-mat.mtrl-sci + physics.app-ph + physics.chem-ph + physics.comp-ph + Wed, 10 Jun 2026 00:00:00 -0400 + cross + http://arxiv.org/licenses/nonexclusive-distrib/1.0/ + Patrick M\"uller, Lorena Fritsch, Matthias Bauer, Thomas D. K\"uhne + + + Multi-channel Optical Vision Model + https://arxiv.org/abs/2606.10253 + arXiv:2606.10253v1 Announce Type: cross +Abstract: Spatial multiplexing is one of the natural strengths of optics, yet in optical neural networks, it is often used mainly as parallel throughput. Here, we show that spatial multiplexing in an optical neural network can be used not only to process multiple inputs in parallel, but also to define a trainable representational coordinate of the model. In three implemented scenarios, parallel-input processing, class-code readout and channel-mixed feature interaction, spatial channels act as independent learners, structured code dimensions, and interacting feature groups. 
The programmable free-space optical processor is trained through an online physical-forward/surrogate-backward scheme, where measured optical outputs define the forward pass while a differentiable surrogate estimates gradients and is continually fine-tuned during training from newly acquired optical data. We demonstrate these channel roles in image classification and regression tasks using multi-layer architectures with more than one million trainable optical phase parameters. We further implement a hybrid optical-electronic vision-language model, in which the optical neural network provides visual tokens to a digital transformer decoder for controlled image-captioning tasks. These results establish spatially multiplexed optical channels as a programmable feature and readout space for hybrid optical vision models. + oai:arXiv.org:2606.10253v1 + physics.optics + physics.app-ph + Wed, 10 Jun 2026 00:00:00 -0400 + cross + http://arxiv.org/licenses/nonexclusive-distrib/1.0/ + Ali Momeni, Guillaume Noetinger, Tim Tuuva, Romain Fleury + + + Spontaneous translation of charged droplets during evaporation on dry surfaces + https://arxiv.org/abs/2606.10755 + arXiv:2606.10755v1 Announce Type: cross +Abstract: Evaporating sessile droplets are usually treated as capillary objects, but droplets generated by routine handling can carry tens to hundreds of picocoulombs of electric charge. Here we combine Faraday-cup charge measurements with optical imaging to determine how such charge evolves as water droplets evaporate on dry polymer substrates. A zero-time protocol shows that a reproducible initial charge is preserved on poly(methylpentene) (PMP), whereas PDMS, SOCAL-coated surfaces, and polystyrene either exchange, dissipate, or inject charge on contact. On PMP, ensemble-resolved measurements reveal two regimes: the charge remains nearly constant during early evaporation and then decreases abruptly once the droplet reaches a small-volume state. 
This charge collapse coincides with spontaneous lateral translation rather than jetting or breakup. A Rayleigh-normalized analysis, including a spherical-cap stress correction and measured contact-angle retention scale, shows that motion occurs only after evaporation drives the droplet into a high electro-pinning state. High-speed imaging and kinematic analysis support a picture in which the subsequent motion is governed by repeated contact-line depinning and re-pinning: the total distance traveled is strongly affected by dry-surface pinning, whereas the peak translational velocity serves as a more robust indicator of the discharge strength. These results identify a dry-substrate mode of evaporation-driven electrostatic relaxation, distinct from Coulomb fission on lubricated surfaces, in which substrate electrostatic passivity enables charge retention, droplet geometry selects the instability onset, and whole-droplet translation provides the charge-release pathway. + oai:arXiv.org:2606.10755v1 + cond-mat.soft + physics.app-ph + Wed, 10 Jun 2026 00:00:00 -0400 + cross + http://creativecommons.org/licenses/by-nc-nd/4.0/ + Riming Xu, Yanbo Li, Jiawen Zhang, Jin Wang, Yikai Li + + + Programmable Integrated Magnonic Meshes + https://arxiv.org/abs/2605.00290 + arXiv:2605.00290v2 Announce Type: replace +Abstract: Integrated circuits are a cornerstone of modern information technology, and analog wave-based architectures could enable fast and efficient processing beyond conventional charge electronics. In magnonics, spin waves provide a highly tunable, compact and energy-efficient medium for on-chip microwave signal transport and processing. However, progress has been limited to isolated elements or short devices, severely limiting the overall functional complexity and scalability. 
Here we realize the key elements of universal magnonic circuitry, using a single-step direct laser writing process in yttrium iron garnet, and monolithically cascade them in multi-stage programmable devices and networks. Using magneto-optical Kerr effect microscopy, we show efficient spin-wave propagation and preserved phase coherence in waveguide structures for hundreds of wavelengths. In coupled waveguides, we observe complete and periodic power transfer over several coupling lengths, and in phase shifters we achieve arbitrary, tunable phase delays. By cascading these elements, we realize programmable splitters, frequency demultiplexers, and phase-controlled 2x2 routers, where output power and relative phase can be programmed on demand via external fields. Finally, we realize programmable magnonic interferometric meshes for on-chip radio-frequency signal routing, with up to six magnonic inputs and outputs and seven cascaded stages, without the need for intermediate amplification. These direct-write cascaded networks bridge a long-standing gap in magnonic scalability, offering a viable pathway toward integrated, large-scale architectures for both classical and quantum processing. + oai:arXiv.org:2605.00290v2 + physics.app-ph + cond-mat.mtrl-sci + Wed, 10 Jun 2026 00:00:00 -0400 + replace + http://creativecommons.org/licenses/by/4.0/ + Piero Florio, Matteo Vitali, Valerio Levati, Rasheed M. 
Ishola, Luca Ciaccarini Mavilla, Nora Lecis, Carsten Dubs, Riccardo Bertacco, Marco Madami, Silvia Tacchi, Daniela Petti, Edoardo Albisetti + + + Interpretable deep convolutional model for nonlinear multivariate time series in complex systems + https://arxiv.org/abs/2501.04339 + arXiv:2501.04339v2 Announce Type: replace-cross +Abstract: We introduce the Deep Convolutional Interpreter for Time Series (DCIts), a deep-learning architecture for nonlinear multivariate time series that provides sample-specific, locally interpretable descriptions of the underlying interaction structure. Unlike standard black-box forecasters, DCIts learns a time- and lag-dependent transition tensor explicitly factorized into two components: a Focuser, which selects relevant source series and time lags via a sparse masking mechanism, and a Modeler, which assigns signed coefficients to these selected interactions. This decomposition yields a local lag-adjacency structure and signed source-lag contributions for every forecast instance, enabling direct inspection of effective connectivity; when higher-order branches are activated, the same framework yields order-resolved elementwise polynomial contributions. Architecturally, DCIts uses a diverse bank of convolutional filters to capture temporal and cross-variable dependencies, which are mapped through a bottleneck network to the transition tensor. On controlled benchmark datasets with a known interaction structure, we demonstrate that DCIts achieves competitive forecasting error relative to a strong interpretable baseline while recovering stable, signed, lag-resolved interaction patterns. The framework thus prioritizes intrinsic interpretability, using forecasting accuracy as a faithfulness constraint rather than the sole objective. 
+ oai:arXiv.org:2501.04339v2 + stat.ML + cs.LG + physics.app-ph + Wed, 10 Jun 2026 00:00:00 -0400 + replace-cross + http://arxiv.org/licenses/nonexclusive-distrib/1.0/ + 10.1063/5.0325209 + Chaos 36, 063116 (2026) + Domjan Baric, Davor Horvatic + + + Probing laser-driven surface and subsurface dynamics via grazing-incidence XFEL scattering and diffraction + https://arxiv.org/abs/2509.12015 + arXiv:2509.12015v2 Announce Type: replace-cross +Abstract: We demonstrate a grazing-incidence x-ray platform that simultaneously records time-resolved grazing-incidence small-angle x-ray scattering (GISAXS) and grazing-incidence x-ray diffraction (GID) from a femtosecond laser-irradiated gold film above the melting threshold, with picosecond resolution at an x-ray free-electron laser (XFEL). By tuning the x-ray incidence angle, the probe depth is set to tens of nanometers, enabling depth-selective sensitivity to near-surface dynamics. GISAXS resolves ultrafast changes in surface nanomorphology (correlation length, roughness), while GID quantifies subsurface lattice compression, grain orientation, melting, and recrystallization. The approach overcomes photon-flux limitations of synchrotron grazing-incidence geometries and provides stringent, time-resolved benchmarks for complex theoretical models of ultrafast laser-matter interaction and warm dense matter. Looking ahead, the same depth-selective methodology is well suited to inertial confinement fusion (ICF): it can visualize buried-interface perturbations and interfacial thermal resistance on micron to sub-micron scales that affect instability seeding and burn propagation. + oai:arXiv.org:2509.12015v2 + physics.optics + physics.app-ph + physics.ins-det + physics.plasm-ph + Wed, 10 Jun 2026 00:00:00 -0400 + replace-cross + http://arxiv.org/licenses/nonexclusive-distrib/1.0/ + 10.1107/S2052252526001727 + IUCrJ Vol.13, Pages 249-259 (2026) + Lisa Randolph, \"Ozg\"ul \"Ozt\"urk, Dmitriy Ksenzov, Lingen Huang, Thomas Kluge, S. V. 
Rahul, Victorien Bouffetier, Carsten Baehtz, Mohammadreza Banjafar, Erik Brambrink, Fabien Brieuc, Byoung Ick Cho, Sebastian G\"ode, Tobias Held, Hauke H\"oppner, Gerhard Jakob, Mathias Kl\"aui, Zuzana Kon\^opkov\'a, Changhoo Lee, Gyusang Lee, Mikako Makita, Mikhail Mishchenko, Mianzhen Mo, Pascal D. Ndione, Michael Paulus, Alexander Pelka, Franziska Paschke-Bruehl, Thomas R. Preston, Baerbel Rethfeld, Christian R\"odel, Michal \v{S}m\'id, Ling Wang, Sebastian T. Weber, Lennart Wollenweber, Jan-Patrick Schwinkendorf, Christian Gutt, Motoaki Nakatsutsumi + + + Real-space imaging reveals symmetry-selected nonlinear energy routing in a mechanical resonator + https://arxiv.org/abs/2605.01469 + arXiv:2605.01469v2 Announce Type: replace-cross +Abstract: Nonlinear energy exchange between vibrational modes underlies phenomena ranging from internal resonance and wave mixing to frequency-comb generation, yet modal interactions are typically inferred from spectra rather than directly observed in space. Here, we image nonlinear modal energy routing in a nearly mirror-symmetric microelectromechanical resonator using phase-locked multi-harmonic stroboscopic interferometry. By reconstructing the spatial eigenmode content of individual harmonics, we show that harmonics generated by a driven mode can be carried by distinct spatial eigenmodes, directly resolving spatial pathways of nonlinear energy transfer. Our measurements further reveal that this modal routing persists away from integer frequency matching: in the off-resonant regime, generated harmonic components are dominated by eigenmodes sharing the driven mode's mirror parity, whereas spectrally closer opposite-parity modes remain strongly suppressed. A nonlinear modal framework based on geometric nonlinearity shows that the relevant cubic coupling coefficients factorize into symmetry-dependent modal-overlap integrals, identifying mirror parity as the selection rule for nonlinear modal interaction. 
This work identifies spatial symmetry as a design parameter for nonlinear energy routing and provides a route to symmetry-engineered control of energy flow in multimode nonlinear wave systems. + oai:arXiv.org:2605.01469v2 + physics.optics + physics.app-ph + Wed, 10 Jun 2026 00:00:00 -0400 + replace-cross + http://creativecommons.org/licenses/by-nc-nd/4.0/ + Ya Zhang, Yuko Terasawa, Qian Liu, Shumpei Takenaka, Hua Li, Yutao Xu, Xueyong Wei, Kazuhiko Hirakawa + + + Designing single-layer PDMS devices for micron to millimeter-scale deformations + https://arxiv.org/abs/2605.17402 + arXiv:2605.17402v2 Announce Type: replace-cross +Abstract: The elasticity of PDMS has played a central role in advancing important microfluidic technologies, ranging from early valves to sophisticated organ-on-a-chip systems. However, most deformable microfluidic devices are based on geometries that require complex multi-layer PDMS architectures and include thin membranes, leading to difficult microfabrication and poor stability. Recently, Jain, Belkadi et al. (Biofabrication 16.3 (2024): 035010) introduced a single-layer PDMS device in which a wide and long microfluidic channel was deformed by pressurizing two adjacent air chambers. While they demonstrated how the channel ceiling deformation can be leveraged to compress biological materials, it remains unknown how the device geometry influences this deformation. Here, a systematic numerical study is performed on 14,336 variants of this device, through which the height of the PDMS layer is identified as the main feature that determines the ceiling deformation. Three modes of channel deformation are identified as the geometry are varied: a U shape with a central minimum, a W shape with two minima and a central maximum, or an inverse U shape with an upward-bulging single maximum. 
The numerical results are validated in experiments that reproduce the three modes for the predicted geometries and demonstrate vertical ceiling deformations ranging from a few microns to the millimeter scale. The generality of this approach is demonstrated for two example applications: A fully closing single-layer microfluidic valve and an optical lens of controllable anisotropic magnification. This work leverages the rapid prototyping enabled by 3D printing or micro-milling to open new perspectives in microfluidic actuation. + oai:arXiv.org:2605.17402v2 + physics.flu-dyn + physics.app-ph + Wed, 10 Jun 2026 00:00:00 -0400 + replace-cross + http://creativecommons.org/licenses/by/4.0/ + Leon V. Gebhard, Alexandre S. Avaro, Gabriel Amselem, Charles N. Baroud + + + Metasurfaces for neutral-atom trapping + https://arxiv.org/abs/2605.30498 + arXiv:2605.30498v2 Announce Type: replace-cross +Abstract: Trapped neutral atoms are one of the leading platforms for quantum information technologies, in particular for quantum computing, but scaling them to array sizes needed for utility-scale quantum computing is a major engineering challenge. Here we review optical metasurfaces as an enabling technology that provides fine control over the phase, amplitude, and polarization of light, with pixel counts far exceeding what is available with spatial light modulators (SLMs) and other active devices. The large pixel counts have recently led to demonstrations of arrays of optical tweezers with hundreds of thousands of sites and arrays of optical bottle-beams with complex three-dimensional trapping profiles. The flexibility and scalability of optical metasurfaces provides a route towards miniaturized, integrated, and highly scalable atomic experiments and instruments. 
+ oai:arXiv.org:2605.30498v2 + physics.optics + physics.app-ph + physics.atom-ph + quant-ph + Wed, 10 Jun 2026 00:00:00 -0400 + replace-cross + http://arxiv.org/licenses/nonexclusive-distrib/1.0/ + Chengyu Fang, Minjeong Kim, Mark Saffman, Jennifer T. Choy, Mikhail Kats + + + diff --git a/tests/fixtures/asme_jpvt_openissues_rss.xml b/tests/fixtures/asme_jpvt_openissues_rss.xml new file mode 100644 index 0000000..6afc1c8 --- /dev/null +++ b/tests/fixtures/asme_jpvt_openissues_rss.xml @@ -0,0 +1,37 @@ + + + + Journal of Pressure Vessel Technology Open Issues + https://asmedigitalcollection.asme.org/pressurevesseltech + + + en-us + Mon, 11 May 2026 00:00:00 GMT + Tue, 12 May 2026 00:00:37 GMT + Silverchair + ASMEDigitalCollection@asme.org + ASMEDigitalCollection@asme.org + + Research on Low-Temperature Mechanical Properties and Fracture Behavior of 09MnNiDR Steel Based on Small Punch Test + https://asmedigitalcollection.asme.org/pressurevesseltech/article/148/5/051504/1232699/Research-on-Low-Temperature-Mechanical-Properties + Mon, 11 May 2026 00:00:00 GMT + <span class="paragraphSection"><div class="boxTitle">Abstract</div>To develop a microdamage evaluation method applicable to in-service equipment under low-temperature conditions, this study systematically investigates the mechanical properties and fracture behavior of 09MnNiDR cryogenic steel over a broad temperature range from room temperature to −196 °C. The small punch test (SPT) technique is employed, supplemented by electron backscatter diffraction (EBSD) and scanning electron microscopy (SEM) for micromechanism analysis. Results indicate that under cryogenic conditions, dislocation slip is suppressed, leading to a more uniform distribution of plastic strain. Concurrently, the deformation process at low temperatures refines the grains within the plastic zone through mechanisms such as mechanical subdivision. 
As temperature decreases, the material strength increases linearly, exhibiting a significant cryogenic strengthening effect. The fracture mode transitions from ductile to brittle, with a ductile-to-brittle transition zone identified near −150 °C. An empirical formula based on SPT deformation energy is proposed to predict yield and true tensile strength, with prediction errors below 6%. By introducing a normalized energy parameter, an empirical correlation model is established between the SPT ductile-to-brittle transition temperature (DBTT) and the standard Charpy impact transition temperature. This study presents a viable methodology for safety assessment of in-service cryogenic pressure vessels through minimally invasive testing and performance prediction.</span> + 148 + 5 + 051504 + 10.1115/1.4071740 + https://asmedigitalcollection.asme.org/pressurevesseltech/article/148/5/051504/1232699/Research-on-Low-Temperature-Mechanical-Properties + + + Improved Oxidation, Carburization Resistance and Creep Strength of Ethylene Pyrolysis Furnace Tubes at 1100 °C Through Aluminum and Tungsten Alloying + https://asmedigitalcollection.asme.org/pressurevesseltech/article/148/4/041701/1232556/Improved-Oxidation-Carburization-Resistance-and + Mon, 11 May 2026 00:00:00 GMT + <span class="paragraphSection"><div class="boxTitle">Abstract</div>The oxidation resistance, carburization resistance, and mechanical properties of ethylene pyrolysis furnace tube alloys modified by Al/Al-W alloying were comparatively investigated with conventional alloys using various microstructural characterization techniques and mechanical property testing methods. The Al-alloyed 29Cr44Ni4AlNb+microalloy (MA) exhibits superior oxidation and carburization resistance compared to conventional 25Cr35NiNb+MA and 35Cr45NiNb+MA alloys; however, its creep rupture life was significantly reduced. 
Further addition of W enhanced the solid solution strengthening effect, thereby improving high-temperature tensile properties and mitigating the detrimental impact of Al on creep performance. The creep rupture life of the Al/W-modified 27Cr44Ni5W3Al+MA alloy reached levels comparable to those of conventional alloys while retaining the beneficial effects of Al in improving oxidation and carburization resistance. Through alloying strategies, this study successfully achieved a balance between corrosion resistance and mechanical properties in ethylene pyrolysis furnace tube alloys, enabling them to withstand their harsh service conditions effectively.</span> + 148 + 4 + 041701 + 10.1115/1.4071682 + https://asmedigitalcollection.asme.org/pressurevesseltech/article/148/4/041701/1232556/Improved-Oxidation-Carburization-Resistance-and + + + \ No newline at end of file diff --git a/tests/fixtures/bloomberg_markets_rss.xml b/tests/fixtures/bloomberg_markets_rss.xml new file mode 100644 index 0000000..226cfda --- /dev/null +++ b/tests/fixtures/bloomberg_markets_rss.xml @@ -0,0 +1,8 @@ +<![CDATA[Bloomberg Markets]]>https://bloomberg.com/markets/https://www.bloomberg.com/feeds/static/images/bloomberg_logo_blue.pngBloomberg Marketshttps://bloomberg.com/markets/RSS for NodeWed, 10 Jun 2026 21:53:34 GMT<![CDATA[Global Citizen CEO on First-Ever FIFA Halftime Show]]>https://www.bloomberg.com/news/videos/2026-06-10/global-citizen-ceo-on-first-ever-fifa-halftime-show-videohttps://www.bloomberg.com/news/videos/2026-06-10/global-citizen-ceo-on-first-ever-fifa-halftime-show-videoWed, 10 Jun 2026 21:29:10 GMT<![CDATA[Retired Gen. 
Kimmitt: Hormuz, Lebanon Are ‘Diversions']]>https://www.bloomberg.com/news/videos/2026-06-10/retired-gen-kimmitt-hormuz-lebanon-are-diversions-videohttps://www.bloomberg.com/news/videos/2026-06-10/retired-gen-kimmitt-hormuz-lebanon-are-diversions-videoWed, 10 Jun 2026 21:19:00 GMT<![CDATA[US Treasury Eases Legal Restrictions Across Venezuela Licenses]]>https://www.bloomberg.com/news/articles/2026-06-10/us-treasury-eases-legal-restrictions-across-venezuela-licenseshttps://www.bloomberg.com/news/articles/2026-06-10/us-treasury-eases-legal-restrictions-across-venezuela-licensesWed, 10 Jun 2026 21:17:56 GMT<![CDATA[Chevron Among Drillers to Feed Argentina Shale NGL Venture]]>https://www.bloomberg.com/news/articles/2026-06-10/chevron-among-drillers-to-feed-key-argentina-shale-ngl-venturehttps://www.bloomberg.com/news/articles/2026-06-10/chevron-among-drillers-to-feed-key-argentina-shale-ngl-ventureWed, 10 Jun 2026 19:54:37 GMT<![CDATA[Stock Investors Eye Deep Run for Japan at Football World Cup]]>https://www.bloomberg.com/news/articles/2026-06-10/stock-investors-eye-deep-run-for-japan-at-football-world-cuphttps://www.bloomberg.com/news/articles/2026-06-10/stock-investors-eye-deep-run-for-japan-at-football-world-cupWed, 10 Jun 2026 21:00:00 GMT<![CDATA[Tech Stocks Sink as Oil Jumps on US-Iran Jitters: Markets Wrap]]>https://www.bloomberg.com/news/articles/2026-06-09/stock-market-today-dow-s-p-live-updateshttps://www.bloomberg.com/news/articles/2026-06-09/stock-market-today-dow-s-p-live-updatesTue, 09 Jun 2026 22:09:15 GMT<![CDATA[Bond Traders Bet on a Fed Hike in 2026 Even After Soft Core CPI]]>https://www.bloomberg.com/news/articles/2026-06-10/bond-traders-keep-bets-on-a-fed-hike-this-year-after-cpi-reporthttps://www.bloomberg.com/news/articles/2026-06-10/bond-traders-keep-bets-on-a-fed-hike-this-year-after-cpi-reportWed, 10 Jun 2026 12:51:58 GMT<![CDATA[Cancer Drugmaker Parabilis Surges 58% in US Trading 
Debut]]>https://www.bloomberg.com/news/articles/2026-06-10/drugmaker-parabilis-soars-67-after-745-million-ipo-placementhttps://www.bloomberg.com/news/articles/2026-06-10/drugmaker-parabilis-soars-67-after-745-million-ipo-placementWed, 10 Jun 2026 17:10:06 GMT<![CDATA[Citi Says Investors Growing More Selective on Data Center Bonds]]>https://www.bloomberg.com/news/articles/2026-06-10/citi-says-investors-growing-more-selective-on-data-center-bondshttps://www.bloomberg.com/news/articles/2026-06-10/citi-says-investors-growing-more-selective-on-data-center-bondsWed, 10 Jun 2026 20:14:41 GMT<![CDATA[Man Accused of $37 Billion LA Fire Is Innocent, His Lawyer Says]]>https://www.bloomberg.com/news/articles/2026-06-10/man-accused-of-igniting-massive-los-angeles-fire-goes-on-trialhttps://www.bloomberg.com/news/articles/2026-06-10/man-accused-of-igniting-massive-los-angeles-fire-goes-on-trialWed, 10 Jun 2026 13:45:00 GMT<![CDATA[Latham’s Kelly Forecasts Private Equity-Fueled Second-Half Surge]]>https://www.bloomberg.com/news/articles/2026-06-10/latham-s-kelly-forecasts-private-equity-fueled-second-half-surgehttps://www.bloomberg.com/news/articles/2026-06-10/latham-s-kelly-forecasts-private-equity-fueled-second-half-surgeWed, 10 Jun 2026 20:10:45 GMT<![CDATA[An Evening With Bloomberg Weekend]]>https://www.bloomberg.com/news/videos/2026-06-10/an-evening-with-bloomberg-weekend-videohttps://www.bloomberg.com/news/videos/2026-06-10/an-evening-with-bloomberg-weekend-videoWed, 10 Jun 2026 19:46:01 GMT<![CDATA[Why Nike Keeps Stumbling]]>https://www.bloomberg.com/news/videos/2026-06-10/why-nike-keeps-stumbling-videohttps://www.bloomberg.com/news/videos/2026-06-10/why-nike-keeps-stumbling-videoWed, 10 Jun 2026 20:00:06 GMT<![CDATA[SpaceX Tells Investors It Has Lined Up Blue-Chip Credit 
Ratings]]>https://www.bloomberg.com/news/articles/2026-06-10/spacex-touts-investment-grade-ratings-for-75-billion-ipohttps://www.bloomberg.com/news/articles/2026-06-10/spacex-touts-investment-grade-ratings-for-75-billion-ipoWed, 10 Jun 2026 19:59:28 GMT<![CDATA[Major US Banks Face Federal Probe Over Debanking Allegations]]>https://www.bloomberg.com/news/articles/2026-06-10/major-us-banks-face-federal-probe-over-debanking-allegationshttps://www.bloomberg.com/news/articles/2026-06-10/major-us-banks-face-federal-probe-over-debanking-allegationsWed, 10 Jun 2026 19:04:47 GMT<![CDATA[The Golden Age of IPOs is Here: IPOX's Schuster]]>https://www.bloomberg.com/news/videos/2026-06-10/the-golden-age-of-ipos-is-here-ipox-s-schuster-videohttps://www.bloomberg.com/news/videos/2026-06-10/the-golden-age-of-ipos-is-here-ipox-s-schuster-videoWed, 10 Jun 2026 19:47:53 GMT<![CDATA[SpaceX Price Tag is 'Very Steep': Renaissance's Kennedy]]>https://www.bloomberg.com/news/videos/2026-06-10/spacex-price-tag-is-very-steep-renaissance-s-kennedy-videohttps://www.bloomberg.com/news/videos/2026-06-10/spacex-price-tag-is-very-steep-renaissance-s-kennedy-videoWed, 10 Jun 2026 19:33:25 GMT<![CDATA[UK’s Reeves to Lower Tax Burden for Wealthy US Expats]]>https://www.bloomberg.com/news/articles/2026-06-10/uk-s-reeves-to-lower-tax-burden-for-wealthy-us-expatshttps://www.bloomberg.com/news/articles/2026-06-10/uk-s-reeves-to-lower-tax-burden-for-wealthy-us-expatsWed, 10 Jun 2026 17:59:35 GMT<![CDATA[Capitalism Is a Fact of Life, Starritt Says]]>https://www.bloomberg.com/news/videos/2026-06-10/capitalism-is-a-fact-of-life-starritt-says-videohttps://www.bloomberg.com/news/videos/2026-06-10/capitalism-is-a-fact-of-life-starritt-says-videoWed, 10 Jun 2026 19:26:55 GMT<![CDATA[CIBC to Offer SpaceX Access Through Depositary 
Receipt]]>https://www.bloomberg.com/news/articles/2026-06-10/cibc-to-offer-spacex-access-through-canadian-depositary-receipthttps://www.bloomberg.com/news/articles/2026-06-10/cibc-to-offer-spacex-access-through-canadian-depositary-receiptWed, 10 Jun 2026 12:54:12 GMT<![CDATA[Why Starritt Wrote 'Drayton and Mackenzie']]>https://www.bloomberg.com/news/videos/2026-06-10/why-starritt-wrote-drayton-and-mackenzie-videohttps://www.bloomberg.com/news/videos/2026-06-10/why-starritt-wrote-drayton-and-mackenzie-videoWed, 10 Jun 2026 19:14:48 GMT<![CDATA[Oil Jumps as Trump Threatens to ‘Hit Iran Hard’ Imminently]]>https://www.bloomberg.com/news/articles/2026-06-09/latest-oil-market-news-and-analysis-for-june-10https://www.bloomberg.com/news/articles/2026-06-09/latest-oil-market-news-and-analysis-for-june-10Tue, 09 Jun 2026 22:03:42 GMT<![CDATA[Ipsos Poll Shows Majority of Adults Would Rejoin EU]]>https://www.bloomberg.com/news/videos/2026-06-10/ipsos-poll-shows-majority-of-adults-would-rejoin-eu-videohttps://www.bloomberg.com/news/videos/2026-06-10/ipsos-poll-shows-majority-of-adults-would-rejoin-eu-videoWed, 10 Jun 2026 19:06:53 GMT<![CDATA[Metals Drop on Renewed Middle East Tensions, US Rate Outlook]]>https://www.bloomberg.com/news/articles/2026-06-10/aluminum-falls-to-one-month-low-on-iran-tension-us-rate-outlookhttps://www.bloomberg.com/news/articles/2026-06-10/aluminum-falls-to-one-month-low-on-iran-tension-us-rate-outlookWed, 10 Jun 2026 03:00:51 GMT<![CDATA[How Will the UK and EU Get Along in 2036?]]>https://www.bloomberg.com/news/videos/2026-06-10/how-will-the-uk-and-eu-get-along-in-2036-videohttps://www.bloomberg.com/news/videos/2026-06-10/how-will-the-uk-and-eu-get-along-in-2036-videoWed, 10 Jun 2026 18:57:03 GMT<![CDATA[SpaceX IPO Draws Billions in Orders From Middle Eastern 
Funds]]>https://www.bloomberg.com/news/videos/2026-06-10/spacex-ipo-draws-billions-in-orders-from-middle-east-videohttps://www.bloomberg.com/news/videos/2026-06-10/spacex-ipo-draws-billions-in-orders-from-middle-east-videoWed, 10 Jun 2026 18:49:07 GMT<![CDATA[SpaceX IPO Whips Musk Fans Into Frenzy: ‘The More, the Better’]]>https://www.bloomberg.com/news/articles/2026-06-10/musk-stock-fans-say-the-more-the-better-in-spacex-ipo-frenzyhttps://www.bloomberg.com/news/articles/2026-06-10/musk-stock-fans-say-the-more-the-better-in-spacex-ipo-frenzyWed, 10 Jun 2026 18:43:08 GMT<![CDATA[Investment Committee's Message: Don't Overfocus on SpaceX IPO]]>https://www.bloomberg.com/news/videos/2026-06-10/investment-committee-don-t-overfocus-on-spacex-ipo-videohttps://www.bloomberg.com/news/videos/2026-06-10/investment-committee-don-t-overfocus-on-spacex-ipo-videoWed, 10 Jun 2026 18:39:43 GMT<![CDATA[Bitcoin Selloff Leaves Half of All Supply Trading at a Loss]]>https://www.bloomberg.com/news/articles/2026-06-10/bitcoin-selloff-leaves-half-of-all-supply-trading-at-a-losshttps://www.bloomberg.com/news/articles/2026-06-10/bitcoin-selloff-leaves-half-of-all-supply-trading-at-a-lossWed, 10 Jun 2026 11:05:47 GMT<![CDATA[Ashley’s Frasers Seeks to Buy Rest of Boss for $2.3 Billion]]>https://www.bloomberg.com/news/articles/2026-06-10/ashley-s-frasers-offers-to-buy-hugo-boss-in-3-1-billion-dealhttps://www.bloomberg.com/news/articles/2026-06-10/ashley-s-frasers-offers-to-buy-hugo-boss-in-3-1-billion-dealWed, 10 Jun 2026 16:45:02 GMT \ No newline at end of file diff --git a/tests/fixtures/csb_investigation_page_excerpt.html b/tests/fixtures/csb_investigation_page_excerpt.html new file mode 100644 index 0000000..e6a8a99 --- /dev/null +++ b/tests/fixtures/csb_investigation_page_excerpt.html @@ -0,0 +1,11 @@ +Givaudan Sense Colour Explosion + +Appendix C – Reactivity Testing Results and Analysis +Fatal Runaway Reaction and Explosion at Givaudan Sense Colour / D.D. 
Williamson +Givaudan Explosion Investigation Update + Recommendation Status Change Summary + Recommendation Status Change Summary + Recommendation Status Change Summary + Recommendation Status Change Summary + \ No newline at end of file diff --git a/tests/fixtures/csb_sitemap_sample.xml b/tests/fixtures/csb_sitemap_sample.xml new file mode 100644 index 0000000..3117a1d --- /dev/null +++ b/tests/fixtures/csb_sitemap_sample.xml @@ -0,0 +1 @@ +https://www.csb.gov/recommendations/preventive-maintenance/2022-06-02T17:17:27-06:00weekly0.5https://www.csb.gov/site-map/2017-05-05T23:59:28-06:00weekly0.5https://www.csb.gov/recommendations/preventive-maintenance-investigations/2018-04-27T14:32:25-06:00weekly0.5https://www.csb.gov/investigations/data-quality-/2025-07-28T13:37:44-06:00weekly0.5https://www.csb.gov/recommendations/preventive-maintenances/2022-06-02T17:19:06-06:00weekly0.5https://www.csb.gov/videos/video-feedback-form/2017-05-04T18:17:43-06:00weekly0.5https://www.csb.gov/investigations/2017-05-08T16:06:42-06:00weekly0.5https://www.csb.gov/investigations/completed-investigations/2017-05-30T19:02:58-06:00weekly0.5https://www.csb.gov/investigations/current-investigations/2020-10-19T15:06:55-06:00weekly0.5https://www.csb.gov/videos/2017-03-09T13:38:53-06:00weekly0.5https://www.csb.gov/videos/take-more-action-to-prevent-dust-explosions/2013-05-17T16:46:08-06:00weekly0.5https://www.csb.gov/videos/protect-public-employees-from-workplace-accidents/2013-05-17T16:46:34-06:00weekly0.5 \ No newline at end of file diff --git a/tests/fixtures/economist_latest_rss.xml b/tests/fixtures/economist_latest_rss.xml new file mode 100644 index 0000000..2e0e337 --- /dev/null +++ b/tests/fixtures/economist_latest_rss.xml @@ -0,0 +1,71 @@ + + + + + <![CDATA[Latest Updates]]> + + + + + https://www.economist.com/latest + Wed, 10 Jun 2026 21:11:56 +0000 + Wed, 10 Jun 2026 21:11:56 +0000 + + + + <![CDATA[Syria is an unexpected beneficiary of the Gulf war]]> + + + + + 
https://www.economist.com/middle-east-and-africa/2026/06/10/syria-is-an-unexpected-beneficiary-of-the-gulf-war + 5737613e-c6cd-4cf0-b7da-fbfb52872f63 + Wed, 10 Jun 2026 19:26:42 +0000 + + + + <![CDATA[How to win the World Cup]]> + + + + + https://www.economist.com/international/2026/06/10/how-to-win-the-world-cup + 1019df1e-5c1e-4784-ae0c-31741c176e41 + Wed, 10 Jun 2026 19:07:01 +0000 + + + + <![CDATA[American capitalism is run by millionaires, not billionaires]]> + + + + + https://www.economist.com/business/2026/06/10/american-capitalism-is-run-by-millionaires-not-billionaires + dbbcb101-a7de-472b-a62c-d969ab033b90 + Wed, 10 Jun 2026 19:01:31 +0000 + + + + <![CDATA[New techniques can predict and prevent lung cancer ]]> + + + + + https://www.economist.com/science-and-technology/2026/06/10/new-techniques-can-predict-and-prevent-lung-cancer + dbc7231c-6c7c-42fb-8930-bb099e1d3015 + Wed, 10 Jun 2026 18:48:35 +0000 + + + + <![CDATA[The World Cup has always been beset by scandal and strife]]> + + + + + https://www.economist.com/international/2026/06/10/the-world-cup-has-always-been-beset-by-scandal-and-strife + f2213e72-3531-4894-a33f-47bce2fea4e9 + Wed, 10 Jun 2026 18:25:19 +0000 + + + + diff --git a/tests/fixtures/ieee_spectrum_energy_rss.xml b/tests/fixtures/ieee_spectrum_energy_rss.xml new file mode 100644 index 0000000..f4b4204 --- /dev/null +++ b/tests/fixtures/ieee_spectrum_energy_rss.xml @@ -0,0 +1,4 @@ + +IEEE Spectrumhttps://spectrum.ieee.org/IEEE Spectrumen-usMon, 08 Jun 2026 14:13:03 -0000https://spectrum.ieee.org/media-library/eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpbWFnZSI6Imh0dHBzOi8vYXNzZXRzLnJibC5tcy8yNjg4NDUyMC9vcmlnaW4ucG5nIiwiZXhwaXJlc19hdCI6MTgyNjE0MzQzOX0.N7fHdky-KEYicEarB5Y-YGrry7baoW61oxUszI23GV4/image.png?width=210https://spectrum.ieee.org/IEEE SpectrumFusion Startup’s Commercial Reactor Design Gets a Big Boosthttps://spectrum.ieee.org/fusion-reactor-tokamak-cfs-arc

Nuclear fusion reactors offer the hope of vast, clean energy from the same process that powers stars. But despite decades of research, a fusion reactor that can supply practical amounts of power has proven elusive. Now startup Commonwealth Fusion Systems has revealed in depth what it says is the most complex aspect of the reactor it is constructing—the way the reactor controls the plasma responsible for generating power.

The company says its findings support its vision—a reactor that can generate 1.1 gigawatts of fusion power and deliver 400 megawatts of net electricity to the grid. “That can power about 280,000 average American homes for a year, all using an amount of fuel you could deliver in a pickup truck,” says Brandon Sorbom, cofounder and chief science officer of Commonwealth Fusion Systems (CFS) in Devens, Mass.

The ARC (affordable, robust, compact) fusion reactor that CFS is developing is a tokamak. This is essentially a doughnut-shaped bottle that magnetically traps plasma at pressures and temperatures high enough to force atomic nuclei to fuse. A fraction of the mass of these atoms gets converted into energy. “We’re basically creating a miniature star,” Sorbom says.

High-Temperature Superconductor Magnets

The key innovation of the ARC reactor is the use of high-temperature superconductor (HTSC) magnets instead of typical superconducting magnets, which require frigid temperatures near absolute zero to work. Although HTSCs still require temperatures in the range of about 20 to 77 kelvins (about -250 to -200 °C), the relative warmth in which they operate means they require dramatically less cooling equipment. This makes ARC significantly more compact and simpler than previous fusion reactor designs, such as the International Thermonuclear Experimental Reactor (ITER).

The fusion reactions generate neutrons, whose energy heats a continuously flowing loop of molten salt around the reactor’s magnetic bottle. This blanket of molten salt then heats a fluid to drive a turbine that generates electricity.

CFS researchers collaborated with scientists at MIT, Columbia, the Max Planck Institute for Plasma Physics and other institutions around the world to describe the scientific underpinnings of the ARC reactor. They detailed their research in five peer-reviewed studies published today in the Journal of Plasma Physics.

“We demonstrate that the ARC power plant has a solid foundation in physics,” Sorbom says. “The papers confirm that when we build the ARC fusion power plant, it will work.”

Roughly two-thirds of the 58 authors of the studies come from outside CFS. “These papers are not just the stamp of our validation, but that of the global fusion-science community,” Sorbom says. “And then they underwent peer review from more institutions for independent checks to make sure all our calculations were correct.”

Managing Plasma Disruptions in Tokamaks

The new studies detail how ARC will deal with a major challenge all fusion reactors face. Plasma disruptions occur when instabilities within the plasma flow lead it to spiral out of control and make contact with the reactor wall. These can not only inflict a great deal of damage—the plasma is 150 million °C and carries 12 million amperes of electrical current—but also extinguish the plasma.

“Plasma physics is really hard,” Sorbom says. “It’s the most complicated part of the machine.”

In the new studies, the researchers describe methods for limiting the impacts of such disruptions, such as rapidly injecting massive amounts of gas into ARC as a cushion to keep the plasma from damaging the reactor. But they also have designed ARC to withstand one disruption per day and to restart the plasma within a minute without interrupting power output, Sorbom says.

“We designed ARC considering that even on the wrong side of all the uncertainties we still face, ARC will still work.” —Brandon Sorbom, Commonwealth Fusion Systems

“Even if the plasma is off, the molten salt doesn’t decrease dramatically in temperature immediately,” Sorbom says. The salt can therefore continue to supply heat for electricity generation until fusion restarts.

ARC will use deuterium and tritium, two hydrogen isotopes, as its fuel. Ultimately, ARC will breed more tritium for future use, as neutrons from the plasma striking the molten salt will transmute some of the lithium within the salt to the rare hydrogen isotope. The tritium can then serve as fuel for the reactor, or help seed other power plants, “enabling the rapid scaling of this technology,” Sorbom says.

ARC Fusion Reactor Lifetime and Maintenance

The projected lifetime of ARC is 25 to 30 years. Its longevity depends on how long the superconducting magnets can survive damage from neutrons escaping the salt blanket. If the researchers want a fusion plant with a longer life, “we can make it slightly larger to put in more shielding between the blanket and the magnets,” Sorbom says.

The new studies explain that the reactor’s plasma fuel is held within a vacuum vessel that erodes over time. “It lasts somewhere between one to two years before it has to be replaced,” Sorbom says.

CFS has designed the vacuum vessel to be swapped out as quickly as possible. The reactor can be opened up and the salt blanket drained away so the company can cut up an old vacuum vessel and install a new one.

ARC will have to shut down during such times, but Sorbom notes other kinds of power plants often experience outages every few years for routine maintenance as well. The startup hopes ARC will have short maintenance cycles, “a couple of months at most,” he says. The company is now collaborating with a grid operator to plan around such maintenance.

Sorbom adds that between replacements, research and development could design better vacuum vessels. “Every time we replace it, we can upgrade it,” he says. “The first may last one year. The next year, two years. Then after that, 2.5 years.”

All in all, these new studies suggest ARC is going to work, Sorbom says. “We designed ARC considering that even on the wrong side of all the uncertainties we still face, ARC will still work.”

Currently the startup is building a smaller prototype of ARC called Sparc. “Sparc is now more than 75 percent complete,” Sorbom says. The company aims for Sparc to generate its first plasma in 2027, and aims to build ARC at a site in Virginia by the early 2030s.

As thorough as the new studies are, the ARC reactor is still evolving, Sorbom adds. “We will be able to use what we learn from Sparc to make final design tweaks on ARC.”

]]>
Thu, 04 Jun 2026 14:46:59 +0000https://spectrum.ieee.org/fusion-reactor-tokamak-cfs-arcFusion-powerTokamakFusion-reactorClimate-changeClimate-techCharles Q. Choi
What It Takes for Future-Ready Power Distributionhttps://spectrum.ieee.org/distribution-grid-modernization

This sponsored article is brought to you by Black & Veatch.

The biggest challenge facing utilities today isn’t what it seems. It’s not demand, even as load growth accelerates. It’s not extreme weather, even as “major events” become routine. It’s not cybersecurity, even as connections expand across the grid.


Man in gray blazer and blue shirt posed against a plain white background.

The real challenge is this: Distribution systems were designed for a different reality.

Long gone are the days of predictable demand, one-way power flow and isolated disruptions. At Black & Veatch, we see that leading utilities are no longer debating whether to modernize. They’re deciding how quickly they can do it, and how to do it at scale.

Across grid modernization programs globally, three truths consistently emerge. They define what it takes to prepare the distribution system for what’s next:

1. Outage response is not a resilience strategy

Resilience is being redefined in real time. A strategy centered on mobilizing crews and restoring service as quickly as possible is reactive, and increasingly insufficient.

Resilience has to shift upstream into integrated system design. That starts with hardening. Stronger poles, undergrounding and structural upgrades all have a role, particularly in high-risk corridors. We’re also seeing meaningful gains from how the network is configured and how quickly it can respond without waiting on manual intervention.

This is where distribution automation programs can change outcomes. Strategically placed reclosers, automated switches and fault indicators help contain disruptions before they spread. When combined with feeder reconfiguration and updated protection strategies, distribution automation investments allow utilities to set more aggressive recovery targets and achieve measurable reductions in outage duration and customer impact.

2. Future-readiness depends on DERs at scale

Forecasting is less and less reliable. Only 19 percent of utilities report strong confidence in their ability to predict future load growth, according to the Black & Veatch 2025 Electric Report. Distributed Energy Resources (DERs) like solar, storage, EVs and behind-the-meter generation are exciting solutions, but they fundamentally change how the system operates. Power is no longer just delivered. It’s injected, stored and redirected in ways the system was never designed to manage.

At scale, these challenges show up quickly — particularly on feeders where distributed generation is approaching or exceeding hosting capacity. Protection coordination becomes more difficult when fault current comes from multiple directions. Voltage becomes less predictable as generation fluctuates throughout the day. And planning models must now account for highly variable, location-specific behavior.

Distribution modernization is fundamentally changing how the system is designed and operated so it can absorb disruption, manage bi-directional flows and respond in real time.

Adapting to bi-directional power flow requires more than incremental updates. Leading utilities are responding by building flexibility into the system, moving beyond static assumptions toward dynamic hosting capacity and interconnection studies; planning that incorporates DER, EV adoption and localized load growth; and infrastructure aligned with the communications and control needed to manage it.

3. The edge must be intelligent, visible and secure

As system stress and complexity increase, utilities need far greater visibility and control over the network. Historically, utilities relied on customer calls, Supervisory Control and Data Acquisition (SCADA) at the substation level and field crews to understand what was happening on the system. That model doesn’t hold up. You can’t effectively manage a system you can’t see. Plus, the most critical events are increasingly happening beyond the substation — on feeders, laterals, and at the edge where DER and customer behavior are interacting with the grid.

Grid-edge technologies have become essential. Sensors, Advanced Metering Infrastructure (AMI) and automated switching provide the raw data and control needed to move from reactive to proactive operations. In more advanced deployments, utilities are creating centralized control environments that allow operators to see and manage the distribution system in near real time. That capability is enabled by:

  • Advanced communications networks to form the backbone of real-time grid visibility
  • Distribution Management System (DMS) and Outage Management System (OMS) to enable faster, more coordinated system response
  • Analytics, AI and machine learning to improve situational awareness, anticipate system conditions, and support operational decision-making

The same connectivity that enables this real-time visibility and control also introduces new vulnerabilities, blurring the line between physical and cyber risk. Yet many utilities manage the two kinds of risk separately: Only 22 percent have unified teams in place, even as threats continue to rise, including a 50 percent increase in substation attacks and growing exposure to malware and ransomware, according to the Black & Veatch 2025 Electric Report. Cybersecurity and resilient network design must be embedded into the architecture from the outset—not layered on after the fact.

See what bolder vision looks like

Distribution modernization is fundamentally changing how the system is designed and operated so it can absorb disruption, manage bi-directional flows and respond in real time.

To learn about a successful program, check out Georgia Power’s recent grid modernization program. Black & Veatch partnered with the utility on large-scale infrastructure upgrades. The results? Outages are down 76 percent, restoration times have improved by more than 80 percent and communities across Georgia are powered by a grid built to meet the future head-on.

When Hurricane Helene — the most destructive storm in Georgia Power’s history — hit the state, the utility deployed a rapid response team that utilized its “smart grid” and restored power to more than 1 million customers within days.

A grid built to meet the future head-on—that’s the result of bolder vision.]]>
Wed, 03 Jun 2026 11:00:01 +0000https://spectrum.ieee.org/distribution-grid-modernizationDistributed-energy-resourcesGrid-resiliencePower-gridGrid-modernizationNick Lehnert
\ No newline at end of file diff --git a/tests/fixtures/nikkei_asia_nar_rdf.xml b/tests/fixtures/nikkei_asia_nar_rdf.xml new file mode 100644 index 0000000..f0145ed --- /dev/null +++ b/tests/fixtures/nikkei_asia_nar_rdf.xml @@ -0,0 +1,262 @@ + + + + Nikkei Asia + https://asia.nikkei.com/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <![CDATA[Tokyo Disneyland 'magic' in doubt as operator's stock falls]]> + https://asia.nikkei.com/business/media-entertainment/tokyo-disneyland-magic-in-doubt-as-operator-s-stock-falls + + + <![CDATA[SK Hynix to triple wafer capacity by 2034: Chairman Chey]]> + https://asia.nikkei.com/business/tech/semiconductors/sk-hynix-to-triple-wafer-capacity-by-2034-chairman-chey + + + <![CDATA[Analysis: Kim Jong Un emerges as winner in summit with Xi Jinping]]> + https://asia.nikkei.com/editor-s-picks/china-up-close/analysis-kim-jong-un-emerges-as-winner-in-summit-with-xi-jinping + + + <![CDATA[NTT sets sights on Nvidia, AI race with $500m optical network fund]]> + https://asia.nikkei.com/business/technology/ntt-sets-sights-on-nvidia-ai-race-with-500m-optical-network-fund + + + <![CDATA[Japan to help content industry sue over copyright infringement abroad]]> + https://asia.nikkei.com/politics/japan-to-help-content-industry-sue-over-copyright-infringement-abroad + + + <![CDATA[Anthropic plugs Claude AI in Japan for automated software development]]> + https://asia.nikkei.com/business/technology/artificial-intelligence/anthropic-plugs-claude-ai-in-japan-for-automated-software-development + + + <![CDATA[US tungsten scrap exports to Japan soar on Chinese curbs]]> + https://asia.nikkei.com/spotlight/supply-chain/us-tungsten-scrap-exports-to-japan-soar-on-chinese-curbs + + + <![CDATA[TDK to buy US maker of AI data center cooling components for up to $400m]]> + https://asia.nikkei.com/business/business-deals/tdk-to-buy-us-maker-of-ai-data-center-cooling-components-for-up-to-400m + + + 
<![CDATA[Nippon Life's private credit assets reach $4.6bn]]> + https://asia.nikkei.com/business/insurance/nippon-life-s-private-credit-assets-reach-4.6bn + + + <![CDATA[US firms see China as essential despite rising economic and political risks]]> + https://asia.nikkei.com/business/business-trends/us-firms-see-china-as-essential-despite-rising-economic-and-political-risks + + + <![CDATA[Manslaughter, other charges filed over Hong Kong's Wang Fuk Court fire]]> + https://asia.nikkei.com/economy/fires/manslaughter-other-charges-filed-over-hong-kong-s-wang-fuk-court-fire + + + <![CDATA[ADB and peers need to 'anchor' international order: President Kanda says]]> + https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/adb-and-peers-need-to-anchor-international-order-president-kanda-says + + + <![CDATA[Asia faces risks of economic spillover from Iran and AI disinformation]]> + https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/asia-faces-risks-of-economic-spillover-from-iran-and-ai-disinformation + + + <![CDATA[Swire dangles Cathay shares in $600m convertible bond issuance]]> + https://asia.nikkei.com/business/companies/swire-dangles-cathay-shares-in-600m-convertible-bond-issuance + + + <![CDATA[Shin-Etsu to set up rare-earth smelter in Japan to ease reliance on China]]> + https://asia.nikkei.com/business/materials/shin-etsu-to-set-up-rare-earth-smelter-in-japan-to-ease-reliance-on-china + + + <![CDATA[Setting sea border with Japan vital: Philippine foreign secretary]]> + https://asia.nikkei.com/editor-s-picks/interview/setting-sea-border-with-japan-vital-philippine-foreign-secretary + + + <![CDATA[Strong dollar rally weighs heavier on struggling Asian countries]]> + https://asia.nikkei.com/business/markets/strong-dollar-rally-weighs-heavier-on-struggling-asian-countries + + + <![CDATA[Thailand's shrimp industry hit by Malaysia's import ban]]> + 
https://asia.nikkei.com/business/fisheries/thailand-s-shrimp-industry-hit-by-malaysia-s-import-ban + + + <![CDATA[Applied Materials opens $500m manufacturing campus in Singapore]]> + https://asia.nikkei.com/business/technology/applied-materials-opens-500m-manufacturing-campus-in-singapore + + + <![CDATA[Malaysia's Anwar warns against global powers weaponizing trade]]> + https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/malaysia-s-anwar-warns-against-global-powers-weaponizing-trade + + + <![CDATA[Apple and Nvidia supplier Foxconn invests in Vietnam solar, wind power]]> + https://asia.nikkei.com/business/energy/apple-and-nvidia-supplier-foxconn-invests-in-vietnam-solar-wind-power + + + <![CDATA[World Cup tests Asia's appetite for costly broadcast rights]]> + https://asia.nikkei.com/spotlight/sports/world-cup-tests-asia-s-appetite-for-costly-broadcast-rights + + + <![CDATA[Brookfield bets on its Japan business to top Hong Kong and Singapore]]> + https://asia.nikkei.com/business/finance/brookfield-bets-on-its-japan-business-to-top-hong-kong-and-singapore + + + <![CDATA[Tencent raises $4.6bn in dual dollar, yuan bond issuances]]> + https://asia.nikkei.com/business/technology/tencent-raises-4.6bn-in-dual-dollar-yuan-bond-issuances + + + <![CDATA[Ferrari Luce EV highlights European struggle to lure back China's superrich]]> + https://asia.nikkei.com/business/automobiles/electric-vehicles/ferrari-luce-ev-highlights-european-struggle-to-lure-back-china-s-superrich + + + <![CDATA[Indian families scale back on gold for weddings as prices hover near highs]]> + https://asia.nikkei.com/business/markets/commodities/indian-families-scale-back-on-gold-for-weddings-as-prices-hover-near-highs + + + <![CDATA[Japan's new defense document to name China the biggest concern]]> + https://asia.nikkei.com/politics/defense/japan-s-new-defense-document-to-name-china-the-biggest-concern + + + <![CDATA[Japan to bolster IP protections for prized new fruit, vegetable 
varieties]]> + https://asia.nikkei.com/business/food-beverage/japan-to-bolster-ip-protections-for-prized-new-fruit-vegetable-varieties + + + <![CDATA[Japan's JGC bets on carbon-feeding bacteria to create bioplastics]]> + https://asia.nikkei.com/spotlight/environment/climate-change/japan-s-jgc-bets-on-carbon-feeding-bacteria-to-create-bioplastics + + + <![CDATA[Rural Japan hopes to charm domestic travelers priced out of overseas trips]]> + https://asia.nikkei.com/business/travel-leisure/rural-japan-hopes-to-charm-domestic-travelers-priced-out-of-overseas-trips + + + <![CDATA[GM partners with Peak Energy for sodium-ion battery storage]]> + https://asia.nikkei.com/business/energy/gm-partners-with-peak-energy-for-sodium-ion-battery-storage + + + <![CDATA[Japan seeks bigger role in Asia's subsea cables as AI rewires demand]]> + https://asia.nikkei.com/business/technology/tech-asia/japan-seeks-bigger-role-in-asia-s-subsea-cables-as-ai-rewires-demand + + + <![CDATA[Why Japan's Takaichi has stepped back from BOJ rate hike debate]]> + https://asia.nikkei.com/spotlight/comment/why-japan-s-takaichi-has-stepped-back-from-boj-rate-hike-debate + + + <![CDATA[Japan flying car startup SkyDrive aims for the skies in 2028]]> + https://asia.nikkei.com/business/transportation/japan-flying-car-startup-skydrive-aims-for-the-skies-in-2028 + + + <![CDATA[Hanwha Qcells kicks off first fully onshore US solar supply chain]]> + https://asia.nikkei.com/business/technology/hanwha-qcells-kicks-off-first-fully-onshore-us-solar-supply-chain + + + <![CDATA[Japan's Mitsubishi HC, Canada's Brookfield to buy European wind, solar farms in AI play]]> + https://asia.nikkei.com/business/energy/japan-s-mitsubishi-hc-canada-s-brookfield-to-buy-european-wind-solar-farms-in-ai-play2 + + + <![CDATA[G7 plans first joint statement for protecting minors on social media]]> + https://asia.nikkei.com/business/technology/g7-plans-first-joint-statement-for-protecting-minors-on-social-media + + + <![CDATA[Toyota backs 
Japan self-driving startup Tier IV in development push]]> + https://asia.nikkei.com/business/automobiles/toyota-backs-japan-self-driving-startup-tier-iv-in-development-push + + + <![CDATA[JPMorgan Chase emerges as SoftBank Group's top lender, surpassing Mizuho]]> + https://asia.nikkei.com/business/softbank/jpmorgan-chase-emerges-as-softbank-group-s-top-lender-surpassing-mizuho + + + <![CDATA[Malaysia to promise Japan maximum possible LNG, naphtha]]> + https://asia.nikkei.com/business/energy/malaysia-to-promise-japan-maximum-possible-lng-naphtha + + + <![CDATA[In Focus: Mindanao reels from another deadly earthquake]]> + https://asia.nikkei.com/photos/in-focus-mindanao-reels-from-another-deadly-earthquake + + + <![CDATA[Pentagon blacklists Alibaba, BYD and Baidu over alleged military ties]]> + https://asia.nikkei.com/politics/international-relations/us-china-tensions/pentagon-blacklists-alibaba-byd-and-baidu-over-alleged-military-ties + + + <![CDATA[Bank of Japan set to hike key interest rate to 1%]]> + https://asia.nikkei.com/economy/bank-of-japan/bank-of-japan-set-to-hike-key-interest-rate-to-1 + + + <![CDATA[Semiconductor advances a 'must' for data centers, says Tokyo Electron boss]]> + https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/semiconductor-advances-a-must-for-data-centers-says-tokyo-electron-boss + + + <![CDATA[Xi shores up China's sway in Pyongyang, wary of North Korea-Russia ties]]> + https://asia.nikkei.com/politics/international-relations/xi-shores-up-china-s-sway-in-pyongyang-wary-of-north-korea-russia-ties + + + <![CDATA[South Korea election: Yoon's legacy partially survives progressive victory]]> + https://asia.nikkei.com/opinion/south-korea-election-yoon-s-legacy-partially-survives-progressive-victory + + + <![CDATA[Chinese entrepreneur's e-truck startup Windrose faces unpaid wage claims]]> + 
https://asia.nikkei.com/business/automobiles/electric-vehicles/chinese-entrepreneur-s-e-truck-startup-windrose-faces-unpaid-wage-claims + + + <![CDATA[Bank Indonesia raises rates 0.25% at emergency meeting to defend rupiah]]> + https://asia.nikkei.com/economy/bank-indonesia-raises-rates-0.25-at-emergency-meeting-to-defend-rupiah + + + <![CDATA[Japan ex-PM Kishida calls for deeper energy ties with South Korea]]> + https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/japan-ex-pm-kishida-calls-for-deeper-energy-ties-with-south-korea + + + <![CDATA[Binance eyes Asian stock trading as Bitcoin slumps]]> + https://asia.nikkei.com/business/markets/equities/binance-eyes-asian-stock-trading-as-bitcoin-slumps + + diff --git a/tests/test_crawl_cycle3_shapes.py b/tests/test_crawl_cycle3_shapes.py new file mode 100644 index 0000000..5d2455a --- /dev/null +++ b/tests/test_crawl_cycle3_shapes.py @@ -0,0 +1,281 @@ +"""crawl-24x7 사이클 3 — 순수 함수/형태 회귀 테스트 (DB 불요). + +B-4 signal-only(본문 무절단 + enqueue 가드) + C-4 피드 shape + CSB sitemap diff 파서 ++ API 공지 목록 파서 + CCPS beacon 링크 파서 + B-5 (Nikkei RDF = feedparser 네이티브, +코드 분기 불요 박제). + +fixture = 2026-06-11 live 박제 (tests/fixtures/, [[feedback_external_api_fixture_first]]). +economist/ieee 는 repo 크기 사유로 item 수만 trim (헤더/푸터/item 구조 byte-faithful). 
+""" + +import re +from datetime import datetime, timedelta, timezone +from pathlib import Path +from types import SimpleNamespace + +import feedparser +import pytest + +from workers import news_collector +from workers.api_standards_collector import _parse_listing, _parse_pub_date +from workers.ccps_collector import _beacon_pdf_links +from workers.csb_collector import _parse_sitemap, _pdf_links, _should_skip +from workers.news_collector import _clean_html, _entry_body + +FIXTURES = Path(__file__).parent / "fixtures" + + +def _feed(name: str): + return feedparser.parse((FIXTURES / name).read_text(encoding="utf-8")) + + +def _source(**kw): + return SimpleNamespace( + fetch_method=kw.get("fetch_method", "rss"), + fulltext_policy=kw.get("fulltext_policy", "none"), + source_channel=kw.get("source_channel", "news"), + ) + + +# ── B-4: 본문 선택 정책 ─────────────────────────────────────────────────────── + +class TestEntryBodyPolicy: + def test_signal_only_preserves_full_abstract(self): + """arXiv 초록 1.6K자 — 기본 1000자 cap 을 적용하면 꼬리 유실.""" + entry = _feed("arxiv_appph_rss.xml").entries[0] + summary = _clean_html(entry.get("summary", "")) # 기본 경로 = 1000자 절단 + body, ver = _entry_body(_source(fetch_method="signal-only"), entry, summary) + assert ver == "rss-signal" + assert len(body) > 1000 >= len(summary) + assert "Abstract" in body + + def test_feed_full_promotes_ieee_description(self): + entry = _feed("ieee_spectrum_energy_rss.xml").entries[0] + summary = _clean_html(entry.get("summary", "")) + body, ver = _entry_body(_source(fulltext_policy="feed-full"), entry, summary) + assert ver == "rss-feed-full" + assert len(body) > 1000 + + def test_default_source_keeps_capped_summary(self): + entry = _feed("arxiv_appph_rss.xml").entries[0] + summary = _clean_html(entry.get("summary", "")) + body, ver = _entry_body(_source(), entry, summary) + assert ver == "rss" + assert body == summary + + def test_signal_only_title_fallback_when_feed_has_no_summary(self): + """Nikkei RDF = description 
없음 — summary 인자(=title 폴백)로 격하.""" + entry = _feed("nikkei_asia_nar_rdf.xml").entries[0] + body, ver = _entry_body( + _source(fetch_method="signal-only"), entry, entry.get("title", "") + ) + assert ver == "rss-signal" + assert body == entry.get("title", "") != "" + + +# ── B-4: enqueue 가드 (signal-only = fulltext/summarize 절대 금지) ──────────── + +class TestSignalOnlyEnqueueGuard: + @staticmethod + def _patch(monkeypatch): + calls = [] + + async def fake_enqueue(session, doc_id, stage): + calls.append(stage) + + monkeypatch.setattr(news_collector, "enqueue_stage", fake_enqueue) + return calls + + @pytest.mark.asyncio + async def test_signal_only_overrides_misconfigured_page_policy(self, monkeypatch): + """레지스트리가 fulltext_policy='page' 로 잘못 설정돼도 페이지 fetch 0 (방어).""" + calls = self._patch(monkeypatch) + doc = SimpleNamespace(id=1, edit_url="https://x/a") + src = _source(fetch_method="signal-only", fulltext_policy="page") + await news_collector._enqueue_processing( + None, doc, src, datetime.now(timezone.utc) + ) + assert calls == ["embed", "chunk"] # fulltext/summarize 부재 + + @pytest.mark.asyncio + async def test_signal_only_news_respects_30day_gate(self, monkeypatch): + calls = self._patch(monkeypatch) + doc = SimpleNamespace(id=1, edit_url="https://x/a") + old = datetime.now(timezone.utc) - timedelta(days=40) + await news_collector._enqueue_processing( + None, doc, _source(fetch_method="signal-only"), old + ) + assert calls == [] + + @pytest.mark.asyncio + async def test_signal_only_crawl_channel_indexes_regardless_of_age(self, monkeypatch): + calls = self._patch(monkeypatch) + doc = SimpleNamespace(id=1, edit_url="https://x/a") + old = datetime.now(timezone.utc) - timedelta(days=400) + src = _source(fetch_method="signal-only", source_channel="crawl") + await news_collector._enqueue_processing(None, doc, src, old) + assert calls == ["embed", "chunk"] + + +# ── C-4 / B-4 피드 shape (시드 전 live 박제) ───────────────────────────────── + +class TestNikkeiRdfNativeParsing: + 
"""B-5 'rdf' quirk = 코드 분기 불요 실측 — feedparser 가 RSS 1.0 을 정규화.""" + + def test_rss10_entries_have_title_and_link(self): + f = _feed("nikkei_asia_nar_rdf.xml") + assert f.version == "rss10" + assert not f.bozo and len(f.entries) >= 10 + for e in f.entries: + assert e.get("title", "").strip() + assert e.get("link", "").startswith("https://asia.nikkei.com/") + + def test_no_summary_no_dates_means_title_signal(self): + e = _feed("nikkei_asia_nar_rdf.xml").entries[0] + assert not e.get("summary", "") + assert not e.get("published_parsed") and not e.get("updated_parsed") + + +class TestBloombergFixture: + def test_video_items_mixed_in_feed(self): + """비디오 항목 혼재 실측 → seed parser_quirk='skip-video' 의 근거.""" + links = [e.get("link", "") for e in _feed("bloomberg_markets_rss.xml").entries] + video_pat = re.compile(r"/videos?/") # news_collector skip-video 와 동일 패턴 + assert any(video_pat.search(u) for u in links) + assert any("/news/articles/" in u and not video_pat.search(u) for u in links) + + def test_articles_have_signal_grade_summary(self): + f = _feed("bloomberg_markets_rss.xml") + assert any(len(e.get("summary", "")) >= 100 for e in f.entries) + + +class TestAsmeJpvtFixture: + def test_journal_identity_and_abstract(self): + f = _feed("asme_jpvt_openissues_rss.xml") + assert "Pressure Vessel Technology" in f.feed.get("title", "") + assert f.entries + for e in f.entries: + assert len(e.get("summary", "")) >= 200 # 초록 = 본문 + + +class TestArxivFixture: + def test_abs_links_are_stable_dedup_keys(self): + """replace/cross 재공지는 같은 /abs/ URL — edit_url dedup 이 자연 차단.""" + f = _feed("arxiv_appph_rss.xml") + assert f.entries + for e in f.entries: + assert re.match(r"https://arxiv\.org/abs/\d", e.get("link", "")) + + def test_announce_type_in_summary(self): + e = _feed("arxiv_appph_rss.xml").entries[0] + assert "Announce Type:" in e.get("summary", "") + + +class TestEconomistFixture: + def test_oneline_signal_summaries(self): + f = _feed("economist_latest_rss.xml") + assert 
f.entries + for e in f.entries: + assert e.get("title", "").strip() + assert e.get("link", "").startswith("https://www.economist.com/") + + +# ── CSB sitemap diff 파서 ──────────────────────────────────────────────────── + +class TestCsbSitemapParsing: + def test_parse_pairs_with_tz_aware_lastmod(self): + xml = (FIXTURES / "csb_sitemap_sample.xml").read_text(encoding="utf-8") + pairs = _parse_sitemap(xml) + assert pairs + for url, lastmod in pairs: + assert url.startswith("https://www.csb.gov/") + assert lastmod.tzinfo is not None + + def test_skip_sections_vs_root_slugs(self): + assert _should_skip("https://www.csb.gov/videos/some-video/") + assert _should_skip("https://www.csb.gov/investigations/completed-investigations/") + assert _should_skip("https://www.csb.gov/site-map/") + assert _should_skip("https://www.csb.gov/") # 홈 + # 조사 보고서/뉴스 릴리스 = 루트 슬러그 — 수집 대상 + assert not _should_skip("https://www.csb.gov/givaudan-sense-colour-explosion-/") + assert not _should_skip("https://www.csb.gov/recommendations/preventive-maintenance/") + + def test_watermark_diff_orders_oldest_first(self): + xml = (FIXTURES / "csb_sitemap_sample.xml").read_text(encoding="utf-8") + pairs = [p for p in _parse_sitemap(xml) if not _should_skip(p[0])] + watermark = min(lm for _, lm in pairs) + changed = sorted( + ((u, lm) for u, lm in pairs if lm >= watermark), key=lambda p: p[1] + ) + assert changed == sorted(changed, key=lambda p: p[1]) + assert len(changed) == len(pairs) # >= 경계 포함 + + +class TestCsbPdfLinks: + HTML = (FIXTURES / "csb_investigation_page_excerpt.html").read_text(encoding="utf-8") + BASE = "https://www.csb.gov/givaudan-sense-colour-explosion-/" + + def test_report_pdfs_kept_with_cachebuster_query(self): + links = _pdf_links(self.HTML, self.BASE) + assert any("Givaudan_Investigation_Report_Publication.pdf" in u for u in links) + # cache-buster 쿼리는 다운로드 URL 에 유지 (정규화는 파일명/dedup 축에서만) + assert any("?" 
in u for u in links) + for u in links: + assert u.startswith("https://www.csb.gov/") + + def test_recommendation_status_summaries_excluded(self): + links = _pdf_links(self.HTML, self.BASE) + assert links + assert not any("/assets/recommendation/" in u for u in links) + + def test_dedup_by_path(self): + html = ( + 'a' + 'b' + 'c' + ) + links = _pdf_links(html, "https://www.csb.gov/page/") + assert len(links) == 1 # 같은 path 1회 + 외부 호스트 제외 + assert links[0].startswith("https://www.csb.gov/assets/1/6/r.pdf") + + +# ── API 표준 공지 목록 파서 ────────────────────────────────────────────────── + +class TestApiListingParsing: + HTML = (FIXTURES / "api_standards_announcements_listing.html").read_text( + encoding="utf-8", errors="replace" + ) + + def test_ten_unique_detail_links_per_page(self): + urls = _parse_listing(self.HTML) + assert len(urls) == 10 + assert len(set(urls)) == 10 + for u in urls: + assert u.startswith( + "https://www.api.org/products-and-services/standards/" + "important-standards-announcements/" + ) + assert "?" not in u # 페이지네이션 링크(?page=) 미혼입 + + def test_pub_date_parse(self): + dt = _parse_pub_date("Published June 4, 2026 — API announces ...") + assert dt == datetime(2026, 6, 4, tzinfo=timezone.utc) + assert _parse_pub_date("no date here") is None + assert _parse_pub_date("February 31, 2026") is None # 달력 불가 = None + + +# ── CCPS beacon 링크 파서 ──────────────────────────────────────────────────── + +class TestCcpsBeaconLinks: + def test_beacon_filter_and_relative_resolve(self): + html = ( + 'June' + 'Korean' + 'brochure' + 'Process Safety Beacon June' + ) + links = _beacon_pdf_links(html, "https://www.aiche.org/ccps/resources/process-safety-beacon") + assert "https://www.aiche.org/sites/default/files/2026-06/Beacon-June-2026.pdf" in links + assert any("beacon_korean" in u for u in links) + assert any(u.endswith("/monthly.pdf") for u in links) # 앵커 텍스트 매칭 + assert not any("other-brochure" in u for u in links)