"""crawl-24x7 cycle 3 — pure-function / shape regression tests (no DB required).

Covers:
  * B-4 signal-only (untruncated body + enqueue guard)
  * C-4 feed shape
  * CSB sitemap diff parser
  * API announcement listing parser
  * CCPS beacon link parser
  * B-5 (Nikkei RDF = feedparser-native; pins that no code branch is needed)

Fixtures are live captures taken 2026-06-11 (tests/fixtures/,
[[feedback_external_api_fixture_first]]). The economist/ieee fixtures are
trimmed in item count only, for repo-size reasons (header/footer/item
structure is byte-faithful).
"""

import re
from datetime import datetime, timedelta, timezone
from pathlib import Path
from types import SimpleNamespace

import feedparser
import pytest

from workers import news_collector
from workers.api_standards_collector import _parse_listing, _parse_pub_date
from workers.ccps_collector import _beacon_pdf_links
from workers.csb_collector import _parse_sitemap, _pdf_links, _should_skip
from workers.news_collector import _clean_html, _entry_body

FIXTURES = Path(__file__).parent / "fixtures"


def _feed(name: str):
    """Read fixture *name* as UTF-8 text and return feedparser's parse result."""
    raw_xml = (FIXTURES / name).read_text(encoding="utf-8")
    return feedparser.parse(raw_xml)


def _source(**kw):
    """Build a stand-in source object for the collector functions under test.

    Only ``fetch_method``, ``fulltext_policy`` and ``source_channel`` are read
    from *kw* (defaults: "rss", "none", "news"); any other keyword is ignored.
    """
    defaults = {
        "fetch_method": "rss",
        "fulltext_policy": "none",
        "source_channel": "news",
    }
    attrs = {field: kw.get(field, fallback) for field, fallback in defaults.items()}
    return SimpleNamespace(**attrs)


# ── B-4: body selection policy ───────────────────────────────────────────────


class TestEntryBodyPolicy:
    """Which text and version tag ``_entry_body`` picks per source policy."""

    def test_signal_only_preserves_full_abstract(self):
        """arXiv abstract is ~1.6K chars — the default 1000-char cap would drop its tail."""
        first = _feed("arxiv_appph_rss.xml").entries[0]
        # default path = cut to 1000 chars
        capped = _clean_html(first.get("summary", ""))
        signal_src = _source(fetch_method="signal-only")

        body, ver = _entry_body(signal_src, first, capped)

        assert ver == "rss-signal"
        assert len(body) > 1000
        assert len(capped) <= 1000
        assert "Abstract" in body

    def test_feed_full_promotes_ieee_description(self):
        """A feed-full source yields the 'rss-feed-full' version with a body over 1000 chars."""
        first = _feed("ieee_spectrum_energy_rss.xml").entries[0]
        capped = _clean_html(first.get("summary", ""))
        feed_full_src = _source(fulltext_policy="feed-full")

        body, ver = _entry_body(feed_full_src, first, capped)

        assert ver == "rss-feed-full"
        assert len(body) > 1000

    def test_default_source_keeps_capped_summary(self):
        """A default source returns the passed-in summary unchanged, tagged 'rss'."""
        first = _feed("arxiv_appph_rss.xml").entries[0]
        capped = _clean_html(first.get("summary", ""))

        body, ver = _entry_body(_source(), first, capped)

        assert ver == "rss"
        assert body == capped

    def test_signal_only_title_fallback_when_feed_has_no_summary(self):
        """Nikkei RDF has no description — degrade to the summary argument (= title fallback)."""
        first = _feed("nikkei_asia_nar_rdf.xml").entries[0]

        body, ver = _entry_body(
            _source(fetch_method="signal-only"), first, first.get("title", "")
        )

        title = first.get("title", "")
        assert ver == "rss-signal"
        assert body == title
        assert title != ""


# ── B-4: enqueue guard (signal-only = fulltext/summarize strictly forbidden) ─


class TestSignalOnlyEnqueueGuard:
    """Stages that ``_enqueue_processing`` enqueues for signal-only sources."""

    @staticmethod
    def _patch(monkeypatch):
        """Swap ``news_collector.enqueue_stage`` for a recorder.

        Returns the list that accumulates the ``stage`` argument of each call.
        """
        recorded = []

        async def fake_enqueue(session, doc_id, stage):
            recorded.append(stage)

        monkeypatch.setattr(news_collector, "enqueue_stage", fake_enqueue)
        return recorded

    @pytest.mark.asyncio
    async def test_signal_only_overrides_misconfigured_page_policy(self, monkeypatch):
        """Even if the registry wrongly sets fulltext_policy='page', zero page fetches (defensive)."""
        stages = self._patch(monkeypatch)
        doc = SimpleNamespace(id=1, edit_url="https://x/a")
        misconfigured = _source(fetch_method="signal-only", fulltext_policy="page")

        await news_collector._enqueue_processing(
            None, doc, misconfigured, datetime.now(timezone.utc)
        )

        assert stages == ["embed", "chunk"]  # no fulltext/summarize

    @pytest.mark.asyncio
    async def test_signal_only_news_respects_30day_gate(self, monkeypatch):
        """A 40-day-old item on the default 'news' channel enqueues nothing."""
        stages = self._patch(monkeypatch)
        doc = SimpleNamespace(id=1, edit_url="https://x/a")
        forty_days_ago = datetime.now(timezone.utc) - timedelta(days=40)

        await news_collector._enqueue_processing(
            None, doc, _source(fetch_method="signal-only"), forty_days_ago
        )

        assert stages == []

    @pytest.mark.asyncio
    async def test_signal_only_crawl_channel_indexes_regardless_of_age(self, monkeypatch):
        """A 400-day-old item on the 'crawl' channel still gets embed + chunk."""
        stages = self._patch(monkeypatch)
        doc = SimpleNamespace(id=1, edit_url="https://x/a")
        long_ago = datetime.now(timezone.utc) - timedelta(days=400)
        crawl_src = _source(fetch_method="signal-only", source_channel="crawl")

        await news_collector._enqueue_processing(None, doc, crawl_src, long_ago)

        assert stages == ["embed", "chunk"]


# ── Connection-layer retry (observed: MOEL's first TLS handshake drops intermittently) ──


class TestConnectRetry:
    """Retry behaviour of ``news_collector._get_with_connect_retry``."""

    class _Client:
        """Fake client: ``get`` raises the queued errors in order, then returns "OK"."""

        def __init__(self, errors: list):
            self.errors = errors
            self.calls = 0

        async def get(self, url):
            self.calls += 1
            if not self.errors:
                return "OK"
            raise self.errors.pop(0)

    @pytest.mark.asyncio
    async def test_single_connect_error_retried_once(self):
        """One ConnectError is absorbed: two calls in total, response returned."""
        import httpx

        client = self._Client([httpx.ConnectError("")])

        resp = await news_collector._get_with_connect_retry(client, "https://x/feed")

        assert resp == "OK"
        assert client.calls == 2

    @pytest.mark.asyncio
    async def test_second_retry_absorbs_consecutive_drop(self):
        """Drops are random per connection, so a single retry can be hit again (observed on MOEL lawinfo)."""
        import httpx

        client = self._Client([httpx.ConnectError(""), httpx.ConnectError("")])

        resp = await news_collector._get_with_connect_retry(client, "https://x/feed")

        assert resp == "OK"
        assert client.calls == 3

    @pytest.mark.asyncio
    async def test_persistent_connect_error_propagates(self):
        """Three consecutive ConnectErrors surface to the caller after three calls."""
        import httpx

        client = self._Client([httpx.ConnectError("")] * 3)

        with pytest.raises(httpx.ConnectError):
            await news_collector._get_with_connect_retry(client, "https://x/feed")

        # at most 2 retries — a persistent outage is the circuit's job
        assert client.calls == 3

    @pytest.mark.asyncio
    async def test_non_connect_errors_not_retried(self):
        """A ReadTimeout propagates after a single call."""
        import httpx

        client = self._Client([httpx.ReadTimeout("")])

        with pytest.raises(httpx.ReadTimeout):
            await news_collector._get_with_connect_retry(client, "https://x/feed")

        assert client.calls == 1


# ── C-4 / B-4 feed shape (live capture taken before seeding) ─────────────────


class TestNikkeiRdfNativeParsing:
    """B-5 'rdf' quirk = measured: no code branch needed — feedparser normalizes RSS 1.0."""

    def test_rss10_entries_have_title_and_link(self):
        parsed = _feed("nikkei_asia_nar_rdf.xml")

        assert parsed.version == "rss10"
        assert not parsed.bozo
        assert len(parsed.entries) >= 10
        for item in parsed.entries:
            assert item.get("title", "").strip()
            assert item.get("link", "").startswith("https://asia.nikkei.com/")

    def test_no_summary_no_dates_means_title_signal(self):
        first = _feed("nikkei_asia_nar_rdf.xml").entries[0]

        assert not first.get("summary", "")
        assert not first.get("published_parsed")
        assert not first.get("updated_parsed")


class TestBloombergFixture:
    def test_video_items_mixed_in_feed(self):
        """Video items are observed mixed in → the basis for seed parser_quirk='skip-video'."""
        entries = _feed("bloomberg_markets_rss.xml").entries
        links = [item.get("link", "") for item in entries]
        video_pat = re.compile(r"/videos?/")  # same pattern as news_collector skip-video

        video_links = [u for u in links if video_pat.search(u)]
        article_links = [
            u for u in links if "/news/articles/" in u and not video_pat.search(u)
        ]

        assert video_links
        assert article_links

    def test_articles_have_signal_grade_summary(self):
        """At least one entry carries a summary of 100+ characters."""
        parsed = _feed("bloomberg_markets_rss.xml")

        longest = max((len(item.get("summary", "")) for item in parsed.entries), default=0)

        assert longest >= 100


class TestAsmeJpvtFixture:
    def test_journal_identity_and_abstract(self):
        """Feed title names the journal and every entry has a 200+ char summary."""
        parsed = _feed("asme_jpvt_openissues_rss.xml")

        assert "Pressure Vessel Technology" in parsed.feed.get("title", "")
        assert parsed.entries
        for item in parsed.entries:
            assert len(item.get("summary", "")) >= 200  # abstract = body


class TestArxivFixture:
    def test_abs_links_are_stable_dedup_keys(self):
        """replace/cross re-announcements share the same /abs/ URL — edit_url dedup blocks them naturally."""
        parsed = _feed("arxiv_appph_rss.xml")
        abs_url = re.compile(r"https://arxiv\.org/abs/\d")

        assert parsed.entries
        for item in parsed.entries:
            assert abs_url.match(item.get("link", ""))

    def test_announce_type_in_summary(self):
        first = _feed("arxiv_appph_rss.xml").entries[0]

        assert "Announce Type:" in first.get("summary", "")


class TestEconomistFixture:
    def test_oneline_signal_summaries(self):
        """Every entry has a non-blank title and an economist.com link."""
        parsed = _feed("economist_latest_rss.xml")

        assert parsed.entries
        for item in parsed.entries:
            assert item.get("title", "").strip()
            assert item.get("link", "").startswith("https://www.economist.com/")


# ── CSB sitemap diff parser ──────────────────────────────────────────────────


class TestCsbSitemapParsing:
    def test_parse_pairs_with_tz_aware_lastmod(self):
        """``_parse_sitemap`` yields (url, lastmod) pairs with tz-aware lastmod values."""
        sitemap_xml = (FIXTURES / "csb_sitemap_sample.xml").read_text(encoding="utf-8")

        pairs = _parse_sitemap(sitemap_xml)

        assert pairs
        for url, lastmod in pairs:
            assert url.startswith("https://www.csb.gov/")
            assert lastmod.tzinfo is not None

    def test_skip_sections_vs_root_slugs(self):
        skipped = (
            "https://www.csb.gov/videos/some-video/",
            "https://www.csb.gov/investigations/completed-investigations/",
            "https://www.csb.gov/site-map/",
            "https://www.csb.gov/",  # home
        )
        # investigation reports / news releases = root slugs — collection targets
        collected = (
            "https://www.csb.gov/givaudan-sense-colour-explosion-/",
            "https://www.csb.gov/recommendations/preventive-maintenance/",
        )

        for url in skipped:
            assert _should_skip(url)
        for url in collected:
            assert not _should_skip(url)

    def test_watermark_diff_orders_oldest_first(self):
        """With the watermark at the oldest lastmod, every non-skipped pair is selected."""

        def by_lastmod(pair):
            return pair[1]

        sitemap_xml = (FIXTURES / "csb_sitemap_sample.xml").read_text(encoding="utf-8")
        pairs = [p for p in _parse_sitemap(sitemap_xml) if not _should_skip(p[0])]
        watermark = min(lm for _, lm in pairs)

        changed = [(u, lm) for u, lm in pairs if lm >= watermark]
        changed.sort(key=by_lastmod)

        # NOTE(review): `changed` was just sorted by the same key, so this
        # comparison cannot fail on its own; kept as-is.
        assert changed == sorted(changed, key=by_lastmod)
        assert len(changed) == len(pairs)  # >= includes the boundary


class TestCsbPdfLinks:
    HTML = (FIXTURES / "csb_investigation_page_excerpt.html").read_text(encoding="utf-8")
    BASE = "https://www.csb.gov/givaudan-sense-colour-explosion-/"

    def test_report_pdfs_kept_with_cachebuster_query(self):
        links = _pdf_links(self.HTML, self.BASE)

        report_hits = [
            u for u in links if "Givaudan_Investigation_Report_Publication.pdf" in u
        ]
        assert report_hits
        # the cache-buster query stays on the download URL
        # (normalization happens only on the filename/dedup axis)
        with_query = [u for u in links if "?" in u]
        assert with_query
        for u in links:
            assert u.startswith("https://www.csb.gov/")

    def test_recommendation_status_summaries_excluded(self):
        links = _pdf_links(self.HTML, self.BASE)

        assert links
        recommendation_hits = [u for u in links if "/assets/recommendation/" in u]
        assert not recommendation_hits

    def test_dedup_by_path(self):
        # NOTE(review): these literals look like anchor markup whose tags were
        # lost somewhere upstream of this view (the assertions below expect
        # real links) — confirm against the original file.
        html = (
            'a'
            'b'
            'c'
        )

        links = _pdf_links(html, "https://www.csb.gov/page/")

        assert len(links) == 1  # same path once + external host excluded
        assert links[0].startswith("https://www.csb.gov/assets/1/6/r.pdf")


# ── API standards announcement listing parser ────────────────────────────────


class TestApiListingParsing:
    HTML = (FIXTURES / "api_standards_announcements_listing.html").read_text(
        encoding="utf-8", errors="replace"
    )

    def test_ten_unique_detail_links_per_page(self):
        detail_prefix = (
            "https://www.api.org/products-and-services/standards/"
            "important-standards-announcements/"
        )

        urls = _parse_listing(self.HTML)

        assert len(urls) == 10
        assert len(set(urls)) == 10
        for u in urls:
            assert u.startswith(detail_prefix)
            assert "?" not in u  # no pagination links (?page=) mixed in

    def test_pub_date_parse(self):
        parsed = _parse_pub_date("Published June 4, 2026 — API announces ...")

        assert parsed == datetime(2026, 6, 4, tzinfo=timezone.utc)
        assert _parse_pub_date("no date here") is None
        assert _parse_pub_date("February 31, 2026") is None  # impossible calendar date = None


# ── CCPS beacon link parser ──────────────────────────────────────────────────


class TestCcpsBeaconLinks:
    def test_beacon_filter_and_relative_resolve(self):
        # NOTE(review): these literals look like anchor markup whose tags were
        # lost somewhere upstream of this view (the assertions below expect
        # resolved hrefs) — confirm against the original file.
        html = (
            'June'
            'Korean'
            'brochure'
            'Process Safety Beacon June'
        )
        page_url = "https://www.aiche.org/ccps/resources/process-safety-beacon"

        links = _beacon_pdf_links(html, page_url)

        assert "https://www.aiche.org/sites/default/files/2026-06/Beacon-June-2026.pdf" in links
        assert [u for u in links if "beacon_korean" in u]
        assert [u for u in links if u.endswith("/monthly.pdf")]  # anchor-text match
        assert not [u for u in links if "other-brochure" in u]