Files
hyungi_document_server/tests/test_crawl_cycle3_shapes.py
T
hyungi 8583465c58 feat(news): crawl-24x7 사이클 3 — B-4 시그널·C-4 공학 지속·CSB sitemap·CCPS Beacon (마이그 327)
- B-4 fetch_method='signal-only': 페이지 fetch 0 + summarize 스킵(검색 색인만,
  맥미니 부하 0) + 본문 무절단(_entry_body — arXiv 초록 1.6K 보존). 다이제스트는
  ai_summary NULL 제외 규칙으로 자연 배제. 레지스트리 오설정(page) 방어 가드.
- 시드 9 소스 (전 URL 2026-06-11 live 검증): Bloomberg Markets/Technology(skip-video,
  비디오 혼재 실측)·Economist Latest·Nikkei Asia(RDF — feedparser 네이티브, 분기 불요
  fixture 박제)·ASME JPVT(site_1000037 실측 매핑)·arXiv 2종·IEEE Spectrum 2종(feed-full,
  피드 description 이 전문 7.9~14K자 실측).
- csb_collector: sitemap lastmod diff (weekly 월 06:50) — 워터마크(selector_override)
  + cap 40/회 점진 백필 + diff sanity 300 + 보고서 PDF(/assets/, recommendation 제외)
  → extract 파이프라인. 초기 일괄 = CLI --bulk.
- api_standards_collector: 공지 목록 링크 파싱(실측 — 페이지 diff 아님, 상세 URL
  10건/페이지) → 신규 상세만 ingest (monthly 5일 07:05). 초기 백필 = CLI --bulk.
- ccps_collector: aiche.org 평문 403(UA 무관 실측) → playwright-fetcher 익명 컨텍스트
  + referer 쿠키 승계 /download(base64) 신설로 월간 Beacon PDF (monthly 5일 07:20).
  헤드리스 차단 시 CrawlBlocked → health 가시화 (르몽드 PARK 선례).
- B-5 잔여: rdf/feed-reader-UA = 코드 분기 불요 실측 박제 (Economist 는 Archiver UA
  200). table-strip/gn-redirect 는 해당 소스 미진입 — 백로그 유지.
- 테스트 24건 신규 (fixture 9건 live 박제, economist/ieee 는 item trim) — 39 passed.
- 마이그 327 단일 statement (PKM 트랙과 번호 경합 주의 — 327 본 트랙 선점).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-11 07:13:17 +09:00

282 lines
12 KiB
Python

"""crawl-24x7 사이클 3 — 순수 함수/형태 회귀 테스트 (DB 불요).
B-4 signal-only(본문 무절단 + enqueue 가드) + C-4 피드 shape + CSB sitemap diff 파서
+ API 공지 목록 파서 + CCPS beacon 링크 파서 + B-5 (Nikkei RDF = feedparser 네이티브,
코드 분기 불요 박제).
fixture = 2026-06-11 live 박제 (tests/fixtures/, [[feedback_external_api_fixture_first]]).
economist/ieee 는 repo 크기 사유로 item 수만 trim (헤더/푸터/item 구조 byte-faithful).
"""
import re
from datetime import datetime, timedelta, timezone
from pathlib import Path
from types import SimpleNamespace
import feedparser
import pytest
from workers import news_collector
from workers.api_standards_collector import _parse_listing, _parse_pub_date
from workers.ccps_collector import _beacon_pdf_links
from workers.csb_collector import _parse_sitemap, _pdf_links, _should_skip
from workers.news_collector import _clean_html, _entry_body
# Directory named "fixtures" that sits next to this test module; every
# fixture file read by the tests below is resolved relative to it.
FIXTURES = Path(__file__).parent / "fixtures"
def _feed(name: str):
    """Read the fixture file *name* as UTF-8 text and return feedparser's parse result."""
    fixture_path = FIXTURES / name
    raw_text = fixture_path.read_text(encoding="utf-8")
    return feedparser.parse(raw_text)
def _source(**kw):
return SimpleNamespace(
fetch_method=kw.get("fetch_method", "rss"),
fulltext_policy=kw.get("fulltext_policy", "none"),
source_channel=kw.get("source_channel", "news"),
)
# ── B-4: 본문 선택 정책 ───────────────────────────────────────────────────────
class TestEntryBodyPolicy:
    """B-4: which text ``_entry_body`` returns as the body, and the version tag with it."""

    def test_signal_only_preserves_full_abstract(self):
        """arXiv abstract is ~1.6K chars; the default 1000-char cap would drop its tail."""
        first_entry = _feed("arxiv_appph_rss.xml").entries[0]
        # Default path: the cleaned summary is truncated to 1000 chars.
        capped = _clean_html(first_entry.get("summary", ""))
        signal_source = _source(fetch_method="signal-only")
        body, version = _entry_body(signal_source, first_entry, capped)
        assert version == "rss-signal"
        assert len(body) > 1000
        assert len(capped) <= 1000
        assert "Abstract" in body

    def test_feed_full_promotes_ieee_description(self):
        """A 'feed-full' source yields a body over 1000 chars tagged 'rss-feed-full'."""
        first_entry = _feed("ieee_spectrum_energy_rss.xml").entries[0]
        cleaned = _clean_html(first_entry.get("summary", ""))
        feed_full_source = _source(fulltext_policy="feed-full")
        body, version = _entry_body(feed_full_source, first_entry, cleaned)
        assert version == "rss-feed-full"
        assert len(body) > 1000

    def test_default_source_keeps_capped_summary(self):
        """A default source returns the passed-in summary unchanged, tagged 'rss'."""
        first_entry = _feed("arxiv_appph_rss.xml").entries[0]
        cleaned = _clean_html(first_entry.get("summary", ""))
        body, version = _entry_body(_source(), first_entry, cleaned)
        assert version == "rss"
        assert body == cleaned

    def test_signal_only_title_fallback_when_feed_has_no_summary(self):
        """Nikkei RDF has no description, so the summary argument (the title fallback) is used."""
        first_entry = _feed("nikkei_asia_nar_rdf.xml").entries[0]
        title = first_entry.get("title", "")
        signal_source = _source(fetch_method="signal-only")
        body, version = _entry_body(signal_source, first_entry, title)
        assert version == "rss-signal"
        assert body == title
        assert title != ""
# ── B-4: enqueue 가드 (signal-only = fulltext/summarize 절대 금지) ────────────
class TestSignalOnlyEnqueueGuard:
    """B-4 enqueue guard: signal-only sources must never enqueue fulltext/summarize."""

    @staticmethod
    def _patch(monkeypatch):
        """Swap ``news_collector.enqueue_stage`` for a recorder and return its stage list."""
        recorded_stages = []

        async def _record_stage(session, doc_id, stage):
            # Only the stage name matters to the assertions below.
            recorded_stages.append(stage)

        monkeypatch.setattr(news_collector, "enqueue_stage", _record_stage)
        return recorded_stages

    @pytest.mark.asyncio
    async def test_signal_only_overrides_misconfigured_page_policy(self, monkeypatch):
        """Even if the registry wrongly sets fulltext_policy='page', no page fetch is queued."""
        stages = self._patch(monkeypatch)
        document = SimpleNamespace(id=1, edit_url="https://x/a")
        misconfigured = _source(fetch_method="signal-only", fulltext_policy="page")
        published_at = datetime.now(timezone.utc)
        await news_collector._enqueue_processing(None, document, misconfigured, published_at)
        # No fulltext/summarize stage may appear.
        assert stages == ["embed", "chunk"]

    @pytest.mark.asyncio
    async def test_signal_only_news_respects_30day_gate(self, monkeypatch):
        """A 40-day-old item on the default 'news' channel enqueues nothing."""
        stages = self._patch(monkeypatch)
        document = SimpleNamespace(id=1, edit_url="https://x/a")
        forty_days_ago = datetime.now(timezone.utc) - timedelta(days=40)
        signal_source = _source(fetch_method="signal-only")
        await news_collector._enqueue_processing(None, document, signal_source, forty_days_ago)
        assert stages == []

    @pytest.mark.asyncio
    async def test_signal_only_crawl_channel_indexes_regardless_of_age(self, monkeypatch):
        """A 400-day-old item on the 'crawl' channel still enqueues embed and chunk."""
        stages = self._patch(monkeypatch)
        document = SimpleNamespace(id=1, edit_url="https://x/a")
        long_ago = datetime.now(timezone.utc) - timedelta(days=400)
        crawl_source = _source(fetch_method="signal-only", source_channel="crawl")
        await news_collector._enqueue_processing(None, document, crawl_source, long_ago)
        assert stages == ["embed", "chunk"]
# ── C-4 / B-4 피드 shape (시드 전 live 박제) ─────────────────────────────────
class TestNikkeiRdfNativeParsing:
    """B-5 'rdf' quirk: verified that no code branch is needed — feedparser normalizes RSS 1.0."""

    def test_rss10_entries_have_title_and_link(self):
        """The fixture parses cleanly as RSS 1.0 with at least 10 titled, linked entries."""
        parsed = _feed("nikkei_asia_nar_rdf.xml")
        assert parsed.version == "rss10"
        assert not parsed.bozo
        assert len(parsed.entries) >= 10
        for item in parsed.entries:
            title = item.get("title", "")
            link = item.get("link", "")
            assert title.strip()
            assert link.startswith("https://asia.nikkei.com/")

    def test_no_summary_no_dates_means_title_signal(self):
        """The first entry carries no summary and no published/updated timestamp."""
        first_entry = _feed("nikkei_asia_nar_rdf.xml").entries[0]
        assert not first_entry.get("summary", "")
        assert not first_entry.get("published_parsed")
        assert not first_entry.get("updated_parsed")
class TestBloombergFixture:
    """Shape checks on the recorded Bloomberg Markets feed."""

    def test_video_items_mixed_in_feed(self):
        """Video items are mixed into the feed — the rationale for seeding parser_quirk='skip-video'."""
        entries = _feed("bloomberg_markets_rss.xml").entries
        urls = [item.get("link", "") for item in entries]
        # Same pattern as news_collector's skip-video.
        video_re = re.compile(r"/videos?/")
        has_video = any(video_re.search(url) for url in urls)
        has_plain_article = any(
            "/news/articles/" in url and not video_re.search(url) for url in urls
        )
        assert has_video
        assert has_plain_article

    def test_articles_have_signal_grade_summary(self):
        """At least one entry has a summary of 100 characters or more."""
        parsed = _feed("bloomberg_markets_rss.xml")
        summary_lengths = [len(item.get("summary", "")) for item in parsed.entries]
        assert any(length >= 100 for length in summary_lengths)
class TestAsmeJpvtFixture:
    """Shape checks on the recorded ASME JPVT open-issues feed."""

    def test_journal_identity_and_abstract(self):
        """The feed title names the journal and every entry has a summary of 200+ chars."""
        parsed = _feed("asme_jpvt_openissues_rss.xml")
        feed_title = parsed.feed.get("title", "")
        assert "Pressure Vessel Technology" in feed_title
        assert parsed.entries
        for item in parsed.entries:
            # The abstract serves as the body.
            abstract = item.get("summary", "")
            assert len(abstract) >= 200
class TestArxivFixture:
    """Shape checks on the recorded arXiv feed."""

    def test_abs_links_are_stable_dedup_keys(self):
        """replace/cross re-announcements reuse the same /abs/ URL, so edit_url dedup blocks them."""
        parsed = _feed("arxiv_appph_rss.xml")
        assert parsed.entries
        abs_url = re.compile(r"https://arxiv\.org/abs/\d")
        for item in parsed.entries:
            assert abs_url.match(item.get("link", ""))

    def test_announce_type_in_summary(self):
        """The first entry's summary contains the 'Announce Type:' marker."""
        first_entry = _feed("arxiv_appph_rss.xml").entries[0]
        summary = first_entry.get("summary", "")
        assert "Announce Type:" in summary
class TestEconomistFixture:
    """Shape checks on the recorded (item-trimmed) Economist feed."""

    def test_oneline_signal_summaries(self):
        """Every entry has a non-blank title and a link on www.economist.com."""
        parsed = _feed("economist_latest_rss.xml")
        assert parsed.entries
        for item in parsed.entries:
            title = item.get("title", "")
            link = item.get("link", "")
            assert title.strip()
            assert link.startswith("https://www.economist.com/")
# ── CSB sitemap diff 파서 ────────────────────────────────────────────────────
class TestCsbSitemapParsing:
    """CSB sitemap diff: ``_parse_sitemap`` pairs and the ``_should_skip`` URL filter."""

    def test_parse_pairs_with_tz_aware_lastmod(self):
        """Every parsed pair is a csb.gov URL with a timezone-aware lastmod."""
        sitemap_path = FIXTURES / "csb_sitemap_sample.xml"
        parsed_pairs = _parse_sitemap(sitemap_path.read_text(encoding="utf-8"))
        assert parsed_pairs
        for page_url, modified in parsed_pairs:
            assert page_url.startswith("https://www.csb.gov/")
            assert modified.tzinfo is not None

    def test_skip_sections_vs_root_slugs(self):
        """Section/index pages are skipped; the two content URLs below are kept."""
        skipped = (
            "https://www.csb.gov/videos/some-video/",
            "https://www.csb.gov/investigations/completed-investigations/",
            "https://www.csb.gov/site-map/",
            "https://www.csb.gov/",  # home page
        )
        for page_url in skipped:
            assert _should_skip(page_url)
        # Investigation reports / news releases are root slugs — collection targets.
        collected = (
            "https://www.csb.gov/givaudan-sense-colour-explosion-/",
            "https://www.csb.gov/recommendations/preventive-maintenance/",
        )
        for page_url in collected:
            assert not _should_skip(page_url)

    def test_watermark_diff_orders_oldest_first(self):
        """With the watermark at the oldest lastmod, the >= diff keeps every non-skipped pair."""
        sitemap_path = FIXTURES / "csb_sitemap_sample.xml"
        xml = sitemap_path.read_text(encoding="utf-8")
        kept = [pair for pair in _parse_sitemap(xml) if not _should_skip(pair[0])]
        watermark = min(modified for _, modified in kept)
        changed = [(page_url, modified) for page_url, modified in kept if modified >= watermark]
        changed.sort(key=lambda pair: pair[1])
        # NOTE(review): this compares a freshly sorted list with a re-sort of
        # itself, so it always holds; the real check is the length assert below.
        assert changed == sorted(changed, key=lambda pair: pair[1])
        # >= includes the boundary.
        assert len(changed) == len(kept)
class TestCsbPdfLinks:
    """``_pdf_links`` on a recorded CSB investigation page excerpt and on inline HTML."""

    # Recorded page excerpt, read once when the class body is evaluated.
    HTML = (FIXTURES / "csb_investigation_page_excerpt.html").read_text(encoding="utf-8")
    # Page URL handed to _pdf_links together with HTML.
    BASE = "https://www.csb.gov/givaudan-sense-colour-explosion-/"

    def test_report_pdfs_kept_with_cachebuster_query(self):
        """The report PDF is found, a query string survives, and all links are on csb.gov."""
        found = _pdf_links(self.HTML, self.BASE)
        assert any("Givaudan_Investigation_Report_Publication.pdf" in url for url in found)
        # The cache-buster query stays on the download URL (normalization
        # applies only on the filename/dedup axis).
        assert any("?" in url for url in found)
        for url in found:
            assert url.startswith("https://www.csb.gov/")

    def test_recommendation_status_summaries_excluded(self):
        """No returned link points under /assets/recommendation/."""
        found = _pdf_links(self.HTML, self.BASE)
        assert found
        recommendation_hits = [url for url in found if "/assets/recommendation/" in url]
        assert not recommendation_hits

    def test_dedup_by_path(self):
        """Two links differing only in query collapse to one; the external host is dropped."""
        anchors = (
            '<a href="/assets/1/6/r.pdf?100">a</a>',
            '<a href="/assets/1/6/r.pdf?200">b</a>',
            '<a href="https://evil.example.com/x.pdf">c</a>',
        )
        found = _pdf_links("".join(anchors), "https://www.csb.gov/page/")
        # Same path counted once + external host excluded.
        assert len(found) == 1
        assert found[0].startswith("https://www.csb.gov/assets/1/6/r.pdf")
# ── API 표준 공지 목록 파서 ──────────────────────────────────────────────────
class TestApiListingParsing:
    """``_parse_listing`` on a recorded announcements page, plus ``_parse_pub_date``."""

    # Recorded listing page, read once when the class body is evaluated;
    # undecodable bytes are replaced rather than raising.
    HTML = (FIXTURES / "api_standards_announcements_listing.html").read_text(
        encoding="utf-8", errors="replace"
    )

    def test_ten_unique_detail_links_per_page(self):
        """The listing yields 10 distinct detail URLs, none carrying a query string."""
        detail_urls = _parse_listing(self.HTML)
        assert len(detail_urls) == 10
        assert len(set(detail_urls)) == 10
        expected_prefix = (
            "https://www.api.org/products-and-services/standards/"
            "important-standards-announcements/"
        )
        for url in detail_urls:
            assert url.startswith(expected_prefix)
            # Pagination links (?page=) must not be mixed in.
            assert "?" not in url

    def test_pub_date_parse(self):
        """A valid date parses to UTC midnight; missing or impossible dates give None."""
        parsed = _parse_pub_date("Published June 4, 2026 — API announces ...")
        assert parsed == datetime(2026, 6, 4, tzinfo=timezone.utc)
        assert _parse_pub_date("no date here") is None
        # Impossible calendar date -> None.
        assert _parse_pub_date("February 31, 2026") is None
# ── CCPS beacon 링크 파서 ────────────────────────────────────────────────────
class TestCcpsBeaconLinks:
    """``_beacon_pdf_links`` on inline HTML with beacon and non-beacon PDF anchors."""

    def test_beacon_filter_and_relative_resolve(self):
        """Beacon PDFs are kept and resolved to absolute URLs; the brochure is dropped."""
        anchors = (
            '<a href="/sites/default/files/2026-06/Beacon-June-2026.pdf">June</a>',
            '<a href="/sites/default/files/beacon_korean_2026_06.pdf"><b>Korean</b></a>',
            '<a href="/sites/default/files/other-brochure.pdf">brochure</a>',
            '<a href="/sites/default/files/monthly.pdf">Process Safety Beacon June</a>',
        )
        page_url = "https://www.aiche.org/ccps/resources/process-safety-beacon"
        found = _beacon_pdf_links("".join(anchors), page_url)
        june_pdf = "https://www.aiche.org/sites/default/files/2026-06/Beacon-June-2026.pdf"
        assert june_pdf in found
        assert any("beacon_korean" in url for url in found)
        # Matched via anchor text rather than the file name.
        assert any(url.endswith("/monthly.pdf") for url in found)
        assert not any("other-brochure" in url for url in found)