8583465c58
- B-4 fetch_method='signal-only': 페이지 fetch 0 + summarize 스킵(검색 색인만, 맥미니 부하 0) + 본문 무절단(_entry_body — arXiv 초록 1.6K 보존). 다이제스트는 ai_summary NULL 제외 규칙으로 자연 배제. 레지스트리 오설정(page) 방어 가드. - 시드 9 소스 (전 URL 2026-06-11 live 검증): Bloomberg Markets/Technology(skip-video, 비디오 혼재 실측)·Economist Latest·Nikkei Asia(RDF — feedparser 네이티브, 분기 불요 fixture 박제)·ASME JPVT(site_1000037 실측 매핑)·arXiv 2종·IEEE Spectrum 2종(feed-full, 피드 description 이 전문 7.9~14K자 실측). - csb_collector: sitemap lastmod diff (weekly 월 06:50) — 워터마크(selector_override) + cap 40/회 점진 백필 + diff sanity 300 + 보고서 PDF(/assets/, recommendation 제외) → extract 파이프라인. 초기 일괄 = CLI --bulk. - api_standards_collector: 공지 목록 링크 파싱(실측 — 페이지 diff 아님, 상세 URL 10건/페이지) → 신규 상세만 ingest (monthly 5일 07:05). 초기 백필 = CLI --bulk. - ccps_collector: aiche.org 평문 403(UA 무관 실측) → playwright-fetcher 익명 컨텍스트 + referer 쿠키 승계 /download(base64) 신설로 월간 Beacon PDF (monthly 5일 07:20). 헤드리스 차단 시 CrawlBlocked → health 가시화 (르몽드 PARK 선례). - B-5 잔여: rdf/feed-reader-UA = 코드 분기 불요 실측 박제 (Economist 는 Archiver UA 200). table-strip/gn-redirect 는 해당 소스 미진입 — 백로그 유지. - 테스트 24건 신규 (fixture 9건 live 박제, economist/ieee 는 item trim) — 39 passed. - 마이그 327 단일 statement (PKM 트랙과 번호 경합 주의 — 327 본 트랙 선점). Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
282 lines
12 KiB
Python
282 lines
12 KiB
Python
"""crawl-24x7 cycle 3 — pure-function / shape regression tests (no DB required).

Covers B-4 signal-only (untruncated body + enqueue guard), C-4 feed shapes, the CSB
sitemap diff parser, the API announcement listing parser, the CCPS beacon link parser,
and B-5 (Nikkei RDF is handled natively by feedparser — pinned here to show that no
code branch is needed).

Fixtures are live snapshots captured on 2026-06-11 (tests/fixtures/,
[[feedback_external_api_fixture_first]]).
economist/ieee are trimmed in item count only, for repo-size reasons (header/footer/item
structure is byte-faithful).
"""
|
|
|
|
import re
|
|
from datetime import datetime, timedelta, timezone
|
|
from pathlib import Path
|
|
from types import SimpleNamespace
|
|
|
|
import feedparser
|
|
import pytest
|
|
|
|
from workers import news_collector
|
|
from workers.api_standards_collector import _parse_listing, _parse_pub_date
|
|
from workers.ccps_collector import _beacon_pdf_links
|
|
from workers.csb_collector import _parse_sitemap, _pdf_links, _should_skip
|
|
from workers.news_collector import _clean_html, _entry_body
|
|
|
|
# Directory holding the captured feed/HTML fixtures, located next to this module.
FIXTURES = Path(__file__).parent.joinpath("fixtures")
|
|
|
|
|
|
def _feed(name: str):
    """Read the fixture file *name* under ``FIXTURES`` as UTF-8 and parse it with feedparser."""
    fixture_path = FIXTURES / name
    raw_feed = fixture_path.read_text(encoding="utf-8")
    return feedparser.parse(raw_feed)
|
|
|
|
|
|
def _source(**kw):
    """Build a stand-in source object exposing the three attributes these tests set.

    Recognised keyword arguments override the defaults below; any other keyword
    argument is ignored.
    """
    defaults = {
        "fetch_method": "rss",
        "fulltext_policy": "none",
        "source_channel": "news",
    }
    attributes = {field: kw.get(field, fallback) for field, fallback in defaults.items()}
    return SimpleNamespace(**attributes)
|
|
|
|
|
|
# ── B-4: 본문 선택 정책 ───────────────────────────────────────────────────────
|
|
|
|
class TestEntryBodyPolicy:
    """Body text and version tag returned by ``_entry_body`` for each source setting."""

    def test_signal_only_preserves_full_abstract(self):
        """arXiv abstract of ~1.6K chars — applying the default 1000-char cap loses the tail."""
        first = _feed("arxiv_appph_rss.xml").entries[0]
        # Default path = truncated to 1000 chars.
        capped = _clean_html(first.get("summary", ""))
        source = _source(fetch_method="signal-only")
        body, version = _entry_body(source, first, capped)
        assert version == "rss-signal"
        assert len(body) > 1000
        assert len(capped) <= 1000
        assert "Abstract" in body

    def test_feed_full_promotes_ieee_description(self):
        """A feed-full source yields the 'rss-feed-full' version with a body over 1000 chars."""
        first = _feed("ieee_spectrum_energy_rss.xml").entries[0]
        capped = _clean_html(first.get("summary", ""))
        source = _source(fulltext_policy="feed-full")
        body, version = _entry_body(source, first, capped)
        assert version == "rss-feed-full"
        assert len(body) > 1000

    def test_default_source_keeps_capped_summary(self):
        """A default source returns the passed-in summary unchanged, tagged 'rss'."""
        first = _feed("arxiv_appph_rss.xml").entries[0]
        capped = _clean_html(first.get("summary", ""))
        body, version = _entry_body(_source(), first, capped)
        assert version == "rss"
        assert body == capped

    def test_signal_only_title_fallback_when_feed_has_no_summary(self):
        """Nikkei RDF has no description — falls back to the summary argument (= title)."""
        first = _feed("nikkei_asia_nar_rdf.xml").entries[0]
        source = _source(fetch_method="signal-only")
        body, version = _entry_body(source, first, first.get("title", ""))
        assert version == "rss-signal"
        title = first.get("title", "")
        assert body == title
        assert title != ""
|
|
|
|
|
|
# ── B-4: enqueue 가드 (signal-only = fulltext/summarize 절대 금지) ────────────
|
|
|
|
class TestSignalOnlyEnqueueGuard:
    """Stages enqueued by ``news_collector._enqueue_processing`` for signal-only sources."""

    @staticmethod
    def _patch(monkeypatch):
        """Replace ``news_collector.enqueue_stage`` with a recorder; return the stage list."""
        recorded_stages = []

        async def fake_enqueue(session, doc_id, stage):
            # Only the stage name is recorded; session and doc_id are ignored.
            recorded_stages.append(stage)

        monkeypatch.setattr(news_collector, "enqueue_stage", fake_enqueue)
        return recorded_stages

    @pytest.mark.asyncio
    async def test_signal_only_overrides_misconfigured_page_policy(self, monkeypatch):
        """Even if the registry wrongly sets fulltext_policy='page', page fetch stays at 0 (defensive)."""
        stages = self._patch(monkeypatch)
        document = SimpleNamespace(id=1, edit_url="https://x/a")
        source = _source(fetch_method="signal-only", fulltext_policy="page")
        published = datetime.now(timezone.utc)
        await news_collector._enqueue_processing(None, document, source, published)
        # No fulltext/summarize stage present.
        assert stages == ["embed", "chunk"]

    @pytest.mark.asyncio
    async def test_signal_only_news_respects_30day_gate(self, monkeypatch):
        """A 40-day-old item on the default (news) channel enqueues nothing."""
        stages = self._patch(monkeypatch)
        document = SimpleNamespace(id=1, edit_url="https://x/a")
        forty_days_ago = datetime.now(timezone.utc) - timedelta(days=40)
        source = _source(fetch_method="signal-only")
        await news_collector._enqueue_processing(None, document, source, forty_days_ago)
        assert stages == []

    @pytest.mark.asyncio
    async def test_signal_only_crawl_channel_indexes_regardless_of_age(self, monkeypatch):
        """A 400-day-old item on the crawl channel is still embedded and chunked."""
        stages = self._patch(monkeypatch)
        document = SimpleNamespace(id=1, edit_url="https://x/a")
        long_ago = datetime.now(timezone.utc) - timedelta(days=400)
        source = _source(fetch_method="signal-only", source_channel="crawl")
        await news_collector._enqueue_processing(None, document, source, long_ago)
        assert stages == ["embed", "chunk"]
|
|
|
|
|
|
# ── C-4 / B-4 피드 shape (시드 전 live 박제) ─────────────────────────────────
|
|
|
|
class TestNikkeiRdfNativeParsing:
    """B-5 'rdf' quirk = measured: no code branch needed — feedparser normalises RSS 1.0."""

    def test_rss10_entries_have_title_and_link(self):
        """The fixture parses as rss10, without bozo, with at least 10 titled, linked entries."""
        feed = _feed("nikkei_asia_nar_rdf.xml")
        assert feed.version == "rss10"
        assert not feed.bozo
        assert len(feed.entries) >= 10
        for entry in feed.entries:
            title = entry.get("title", "")
            link = entry.get("link", "")
            assert title.strip()
            assert link.startswith("https://asia.nikkei.com/")

    def test_no_summary_no_dates_means_title_signal(self):
        """The first entry carries neither a summary nor parsed published/updated dates."""
        first = _feed("nikkei_asia_nar_rdf.xml").entries[0]
        assert not first.get("summary", "")
        assert not first.get("published_parsed")
        assert not first.get("updated_parsed")
|
|
|
|
|
|
class TestBloombergFixture:
    """Shape checks on the captured Bloomberg Markets feed fixture."""

    def test_video_items_mixed_in_feed(self):
        """Video items are mixed in (measured) → basis for seed parser_quirk='skip-video'."""
        feed = _feed("bloomberg_markets_rss.xml")
        links = [entry.get("link", "") for entry in feed.entries]
        # Same pattern as the news_collector skip-video check.
        is_video = re.compile(r"/videos?/").search
        has_video = any(is_video(link) for link in links)
        has_plain_article = any(
            "/news/articles/" in link and not is_video(link) for link in links
        )
        assert has_video
        assert has_plain_article

    def test_articles_have_signal_grade_summary(self):
        """At least one entry has a summary of 100 characters or more."""
        feed = _feed("bloomberg_markets_rss.xml")
        summary_lengths = [len(entry.get("summary", "")) for entry in feed.entries]
        assert any(length >= 100 for length in summary_lengths)
|
|
|
|
|
|
class TestAsmeJpvtFixture:
    """Shape checks on the captured ASME JPVT open-issues feed fixture."""

    def test_journal_identity_and_abstract(self):
        """Feed title names the journal and every entry summary is at least 200 chars."""
        feed = _feed("asme_jpvt_openissues_rss.xml")
        feed_title = feed.feed.get("title", "")
        assert "Pressure Vessel Technology" in feed_title
        assert feed.entries
        # The abstract serves as the body.
        assert all(len(entry.get("summary", "")) >= 200 for entry in feed.entries)
|
|
|
|
|
|
class TestArxivFixture:
    """Shape checks on the captured arXiv feed fixture."""

    def test_abs_links_are_stable_dedup_keys(self):
        """replace/cross re-announcements share the same /abs/ URL — edit_url dedup blocks them naturally."""
        feed = _feed("arxiv_appph_rss.xml")
        abs_url = re.compile(r"https://arxiv\.org/abs/\d")
        assert feed.entries
        for entry in feed.entries:
            link = entry.get("link", "")
            assert abs_url.match(link)

    def test_announce_type_in_summary(self):
        """The first entry's summary contains the 'Announce Type:' marker."""
        first = _feed("arxiv_appph_rss.xml").entries[0]
        summary = first.get("summary", "")
        assert "Announce Type:" in summary
|
|
|
|
|
|
class TestEconomistFixture:
    """Shape checks on the captured Economist latest feed fixture."""

    def test_oneline_signal_summaries(self):
        """Every entry has a non-blank title and a link on www.economist.com."""
        feed = _feed("economist_latest_rss.xml")
        assert feed.entries
        for entry in feed.entries:
            title = entry.get("title", "")
            link = entry.get("link", "")
            assert title.strip()
            assert link.startswith("https://www.economist.com/")
|
|
|
|
|
|
# ── CSB sitemap diff 파서 ────────────────────────────────────────────────────
|
|
|
|
class TestCsbSitemapParsing:
    """Checks on ``_parse_sitemap`` / ``_should_skip`` against the CSB sitemap sample."""

    def test_parse_pairs_with_tz_aware_lastmod(self):
        """Each parsed pair is a csb.gov URL with a timezone-aware lastmod."""
        sitemap_path = FIXTURES / "csb_sitemap_sample.xml"
        parsed = _parse_sitemap(sitemap_path.read_text(encoding="utf-8"))
        assert parsed
        for url, lastmod in parsed:
            assert url.startswith("https://www.csb.gov/")
            assert lastmod.tzinfo is not None

    def test_skip_sections_vs_root_slugs(self):
        """Section/utility URLs are skipped; root slugs and recommendation pages are kept."""
        skipped = (
            "https://www.csb.gov/videos/some-video/",
            "https://www.csb.gov/investigations/completed-investigations/",
            "https://www.csb.gov/site-map/",
            "https://www.csb.gov/",  # home
        )
        # Investigation reports / news releases = root slugs — collection targets.
        collected = (
            "https://www.csb.gov/givaudan-sense-colour-explosion-/",
            "https://www.csb.gov/recommendations/preventive-maintenance/",
        )
        for url in skipped:
            assert _should_skip(url)
        for url in collected:
            assert not _should_skip(url)

    def test_watermark_diff_orders_oldest_first(self):
        """With the watermark at the minimum lastmod, every kept pair counts as changed."""

        def lastmod_of(pair):
            return pair[1]

        sitemap_path = FIXTURES / "csb_sitemap_sample.xml"
        sitemap_xml = sitemap_path.read_text(encoding="utf-8")
        kept = [pair for pair in _parse_sitemap(sitemap_xml) if not _should_skip(pair[0])]
        watermark = min(lastmod for _, lastmod in kept)
        changed = [(url, lastmod) for url, lastmod in kept if lastmod >= watermark]
        changed.sort(key=lastmod_of)
        assert changed == sorted(changed, key=lastmod_of)
        # >= includes the boundary.
        assert len(changed) == len(kept)
|
|
|
|
|
|
class TestCsbPdfLinks:
    """Checks on ``_pdf_links`` against a captured CSB investigation page excerpt."""

    # Fixture HTML is read once, when the class body executes.
    HTML = FIXTURES.joinpath("csb_investigation_page_excerpt.html").read_text(
        encoding="utf-8"
    )
    BASE = "https://www.csb.gov/givaudan-sense-colour-explosion-/"

    def test_report_pdfs_kept_with_cachebuster_query(self):
        """Report PDFs are returned as csb.gov URLs, keeping their query string."""
        found = _pdf_links(self.HTML, self.BASE)
        report_name = "Givaudan_Investigation_Report_Publication.pdf"
        assert any(report_name in url for url in found)
        # The cache-buster query stays on the download URL
        # (normalisation applies only on the filename/dedup axis).
        assert any("?" in url for url in found)
        for url in found:
            assert url.startswith("https://www.csb.gov/")

    def test_recommendation_status_summaries_excluded(self):
        """No returned link points under /assets/recommendation/."""
        found = _pdf_links(self.HTML, self.BASE)
        assert found
        recommendation_hits = [url for url in found if "/assets/recommendation/" in url]
        assert not recommendation_hits

    def test_dedup_by_path(self):
        """Two links to one path collapse to one; the foreign-host link is dropped."""
        anchors = (
            '<a href="/assets/1/6/r.pdf?100">a</a>',
            '<a href="/assets/1/6/r.pdf?200">b</a>',
            '<a href="https://evil.example.com/x.pdf">c</a>',
        )
        found = _pdf_links("".join(anchors), "https://www.csb.gov/page/")
        # Same path counted once + external host excluded.
        assert len(found) == 1
        assert found[0].startswith("https://www.csb.gov/assets/1/6/r.pdf")
|
|
|
|
|
|
# ── API 표준 공지 목록 파서 ──────────────────────────────────────────────────
|
|
|
|
class TestApiListingParsing:
    """Checks on ``_parse_listing`` / ``_parse_pub_date`` for the API announcements page."""

    # Fixture HTML is read once, when the class body executes; undecodable bytes are replaced.
    HTML = FIXTURES.joinpath("api_standards_announcements_listing.html").read_text(
        encoding="utf-8", errors="replace"
    )

    def test_ten_unique_detail_links_per_page(self):
        """The listing yields exactly 10 distinct detail URLs, none carrying a query."""
        detail_prefix = (
            "https://www.api.org/products-and-services/standards/"
            "important-standards-announcements/"
        )
        found = _parse_listing(self.HTML)
        assert len(found) == 10
        assert len(set(found)) == 10
        for url in found:
            assert url.startswith(detail_prefix)
            # Pagination links (?page=) must not be mixed in.
            assert "?" not in url

    def test_pub_date_parse(self):
        """A date in text parses to UTC midnight; missing or impossible dates give None."""
        parsed = _parse_pub_date("Published June 4, 2026 — API announces ...")
        expected = datetime(2026, 6, 4, tzinfo=timezone.utc)
        assert parsed == expected
        assert _parse_pub_date("no date here") is None
        # Not a valid calendar date = None.
        assert _parse_pub_date("February 31, 2026") is None
|
|
|
|
|
|
# ── CCPS beacon 링크 파서 ────────────────────────────────────────────────────
|
|
|
|
class TestCcpsBeaconLinks:
    """Checks on ``_beacon_pdf_links`` filtering and relative-URL resolution."""

    def test_beacon_filter_and_relative_resolve(self):
        """Beacon PDFs are kept and resolved against the page URL; the brochure is dropped."""
        page_url = "https://www.aiche.org/ccps/resources/process-safety-beacon"
        anchors = (
            '<a href="/sites/default/files/2026-06/Beacon-June-2026.pdf">June</a>',
            '<a href="/sites/default/files/beacon_korean_2026_06.pdf"><b>Korean</b></a>',
            '<a href="/sites/default/files/other-brochure.pdf">brochure</a>',
            '<a href="/sites/default/files/monthly.pdf">Process Safety Beacon June</a>',
        )
        found = _beacon_pdf_links("".join(anchors), page_url)
        resolved_june = "https://www.aiche.org/sites/default/files/2026-06/Beacon-June-2026.pdf"
        assert resolved_june in found
        assert any("beacon_korean" in url for url in found)
        # Matched through the anchor text.
        assert any(url.endswith("/monthly.pdf") for url in found)
        assert not any("other-brochure" in url for url in found)
|