Files
hyungi_document_server/tests/test_crawl_cycle3_shapes.py
T
hyungi b75307b89b fix(news): 연결 계층(TCP/TLS) 오류 1회 재시도 — MOEL 보안장비 첫 핸드셰이크 간헐 드랍 (재실측 진단)
GPU 회선에서 moel.go.kr 첫 TLS 연결이 간헐 드랍(curl rc=35, 직후 재시도 5/5 성공,
맥북 무발생·단일 A 레코드) → 사이클당 1회 fetch 인 피드가 ConnectError('') 누적,
입법행정예고 circuit open. ConnectError/ConnectTimeout 만 1.5s 후 1회 재시도,
HTTP 상태 오류 비대상. 회귀 테스트 3건 (42 passed).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-11 07:43:05 +09:00

320 lines
14 KiB
Python

"""crawl-24x7 cycle 3 — pure-function / shape regression tests (no DB required).

Covers B-4 signal-only (body not truncated + enqueue guard), the C-4 feed shapes,
the CSB sitemap diff parser, the API announcement listing parser, the CCPS Beacon
link parser, and B-5 (Nikkei RDF is parsed natively by feedparser — pins that no
code branch is needed).

Fixtures are live captures taken on 2026-06-11
(tests/fixtures/, [[feedback_external_api_fixture_first]]).
The economist/ieee fixtures only had their item count trimmed for repo-size
reasons (header/footer/item structure is byte-faithful).
"""
import re
from datetime import datetime, timedelta, timezone
from pathlib import Path
from types import SimpleNamespace
import feedparser
import pytest
from workers import news_collector
from workers.api_standards_collector import _parse_listing, _parse_pub_date
from workers.ccps_collector import _beacon_pdf_links
from workers.csb_collector import _parse_sitemap, _pdf_links, _should_skip
from workers.news_collector import _clean_html, _entry_body
# Directory of captured feed/HTML fixtures, located next to this test module.
FIXTURES = Path(__file__).parent / "fixtures"
def _feed(name: str):
    """Load the fixture file *name* as UTF-8 text and return feedparser's parse result."""
    fixture_path = FIXTURES / name
    xml_text = fixture_path.read_text(encoding="utf-8")
    return feedparser.parse(xml_text)
def _source(**kw):
return SimpleNamespace(
fetch_method=kw.get("fetch_method", "rss"),
fulltext_policy=kw.get("fulltext_policy", "none"),
source_channel=kw.get("source_channel", "news"),
)
# ── B-4: 본문 선택 정책 ───────────────────────────────────────────────────────
class TestEntryBodyPolicy:
    """Which text `_entry_body` returns as the body, and which version tag comes with it."""

    def test_signal_only_preserves_full_abstract(self):
        """The arXiv abstract (~1.6K chars) would lose its tail under the default 1000-char cap."""
        first = _feed("arxiv_appph_rss.xml").entries[0]
        # Default path: the cleaned summary is cut at 1000 characters.
        capped = _clean_html(first.get("summary", ""))
        text, version = _entry_body(_source(fetch_method="signal-only"), first, capped)
        assert version == "rss-signal"
        assert len(text) > 1000
        assert 1000 >= len(capped)
        assert "Abstract" in text

    def test_feed_full_promotes_ieee_description(self):
        """With fulltext_policy='feed-full' the IEEE entry yields a body longer than 1000 chars."""
        first = _feed("ieee_spectrum_energy_rss.xml").entries[0]
        capped = _clean_html(first.get("summary", ""))
        text, version = _entry_body(_source(fulltext_policy="feed-full"), first, capped)
        assert version == "rss-feed-full"
        assert len(text) > 1000

    def test_default_source_keeps_capped_summary(self):
        """A default source gets the passed-in summary back unchanged, tagged 'rss'."""
        first = _feed("arxiv_appph_rss.xml").entries[0]
        capped = _clean_html(first.get("summary", ""))
        text, version = _entry_body(_source(), first, capped)
        assert version == "rss"
        assert text == capped

    def test_signal_only_title_fallback_when_feed_has_no_summary(self):
        """Nikkei RDF has no description — falls back to the summary argument (= the title)."""
        first = _feed("nikkei_asia_nar_rdf.xml").entries[0]
        signal_source = _source(fetch_method="signal-only")
        text, version = _entry_body(signal_source, first, first.get("title", ""))
        assert version == "rss-signal"
        title = first.get("title", "")
        assert text == title
        assert title != ""
# ── B-4: enqueue 가드 (signal-only = fulltext/summarize 절대 금지) ────────────
class TestSignalOnlyEnqueueGuard:
    """Stages `_enqueue_processing` queues for signal-only sources (never fulltext/summarize)."""

    @staticmethod
    def _patch(monkeypatch):
        """Swap `news_collector.enqueue_stage` for a recorder and return the list it fills."""
        recorded = []

        async def recording_enqueue(session, doc_id, stage):
            recorded.append(stage)

        monkeypatch.setattr(news_collector, "enqueue_stage", recording_enqueue)
        return recorded

    @pytest.mark.asyncio
    async def test_signal_only_overrides_misconfigured_page_policy(self, monkeypatch):
        """Even if the registry wrongly sets fulltext_policy='page', no page fetch is queued (defensive)."""
        stages = self._patch(monkeypatch)
        document = SimpleNamespace(id=1, edit_url="https://x/a")
        misconfigured = _source(fetch_method="signal-only", fulltext_policy="page")
        await news_collector._enqueue_processing(
            None, document, misconfigured, datetime.now(timezone.utc)
        )
        # Neither fulltext nor summarize appears.
        assert stages == ["embed", "chunk"]

    @pytest.mark.asyncio
    async def test_signal_only_news_respects_30day_gate(self, monkeypatch):
        """A news-channel signal-only source with a 40-day-old timestamp queues nothing."""
        stages = self._patch(monkeypatch)
        document = SimpleNamespace(id=1, edit_url="https://x/a")
        forty_days_ago = datetime.now(timezone.utc) - timedelta(days=40)
        signal_source = _source(fetch_method="signal-only")
        await news_collector._enqueue_processing(
            None, document, signal_source, forty_days_ago
        )
        assert stages == []

    @pytest.mark.asyncio
    async def test_signal_only_crawl_channel_indexes_regardless_of_age(self, monkeypatch):
        """A crawl-channel signal-only source is still embedded/chunked at 400 days old."""
        stages = self._patch(monkeypatch)
        document = SimpleNamespace(id=1, edit_url="https://x/a")
        long_ago = datetime.now(timezone.utc) - timedelta(days=400)
        crawl_source = _source(fetch_method="signal-only", source_channel="crawl")
        await news_collector._enqueue_processing(None, document, crawl_source, long_ago)
        assert stages == ["embed", "chunk"]
# ── 연결 계층 1회 재시도 (MOEL 첫 TLS 핸드셰이크 간헐 드랍 실측) ──────────────
class TestConnectRetry:
    """`news_collector._get_with_connect_retry` retries a connect-layer failure once.

    The change description for this helper names two retry targets,
    ConnectError and ConnectTimeout. In httpx, ConnectTimeout is not a
    subclass of ConnectError, so each needs its own test to pin the behaviour.
    """

    class _Client:
        """Stub client: `get` raises the queued errors in order, then returns "OK"."""

        def __init__(self, errors: list):
            self.errors = errors
            self.calls = 0  # number of get() invocations, including failed ones

        async def get(self, url):
            self.calls += 1
            if self.errors:
                raise self.errors.pop(0)
            return "OK"

    @pytest.mark.asyncio
    async def test_single_connect_error_retried_once(self):
        """One ConnectError followed by success: the second attempt's response is returned."""
        import httpx

        client = self._Client([httpx.ConnectError("")])
        resp = await news_collector._get_with_connect_retry(client, "https://x/feed")
        # Separate asserts so a failure shows which expectation broke.
        assert resp == "OK"
        assert client.calls == 2

    @pytest.mark.asyncio
    async def test_single_connect_timeout_retried_once(self):
        """One ConnectTimeout followed by success is retried the same way as ConnectError."""
        import httpx

        client = self._Client([httpx.ConnectTimeout("")])
        resp = await news_collector._get_with_connect_retry(client, "https://x/feed")
        assert resp == "OK"
        assert client.calls == 2

    @pytest.mark.asyncio
    async def test_persistent_connect_error_propagates(self):
        """Two ConnectErrors in a row: the second one propagates to the caller."""
        import httpx

        client = self._Client([httpx.ConnectError(""), httpx.ConnectError("")])
        with pytest.raises(httpx.ConnectError):
            await news_collector._get_with_connect_retry(client, "https://x/feed")
        # Only one retry — a sustained outage is left to the circuit breaker.
        assert client.calls == 2

    @pytest.mark.asyncio
    async def test_non_connect_errors_not_retried(self):
        """A ReadTimeout is raised straight through after a single attempt."""
        import httpx

        client = self._Client([httpx.ReadTimeout("")])
        with pytest.raises(httpx.ReadTimeout):
            await news_collector._get_with_connect_retry(client, "https://x/feed")
        assert client.calls == 1
# ── C-4 / B-4 피드 shape (시드 전 live 박제) ─────────────────────────────────
class TestNikkeiRdfNativeParsing:
    """Pins that the B-5 'rdf' quirk needs no code branch — feedparser normalizes RSS 1.0."""

    def test_rss10_entries_have_title_and_link(self):
        """The fixture parses cleanly as rss10 with at least 10 titled, on-site entries."""
        parsed = _feed("nikkei_asia_nar_rdf.xml")
        assert parsed.version == "rss10"
        assert not parsed.bozo
        assert len(parsed.entries) >= 10
        for item in parsed.entries:
            title = item.get("title", "")
            link = item.get("link", "")
            assert title.strip()
            assert link.startswith("https://asia.nikkei.com/")

    def test_no_summary_no_dates_means_title_signal(self):
        """The first entry carries neither a summary nor parsed published/updated dates."""
        first = _feed("nikkei_asia_nar_rdf.xml").entries[0]
        assert not first.get("summary", "")
        assert not (first.get("published_parsed") or first.get("updated_parsed"))
class TestBloombergFixture:
    """Shape checks on the captured Bloomberg markets feed."""

    def test_video_items_mixed_in_feed(self):
        """Video items are mixed into the feed — the basis for seed parser_quirk='skip-video'."""
        # Same pattern as news_collector's skip-video handling.
        is_video = re.compile(r"/videos?/").search
        urls = [item.get("link", "") for item in _feed("bloomberg_markets_rss.xml").entries]
        video_urls = [u for u in urls if is_video(u)]
        article_urls = [u for u in urls if "/news/articles/" in u and not is_video(u)]
        assert video_urls
        assert article_urls

    def test_articles_have_signal_grade_summary(self):
        """At least one entry has a summary of 100 characters or more."""
        parsed = _feed("bloomberg_markets_rss.xml")
        longest = max((len(item.get("summary", "")) for item in parsed.entries), default=0)
        assert longest >= 100
class TestAsmeJpvtFixture:
    """Shape checks on the captured ASME JPVT open-issues feed."""

    def test_journal_identity_and_abstract(self):
        """The feed title names the journal and every entry summary is at least 200 chars."""
        parsed = _feed("asme_jpvt_openissues_rss.xml")
        feed_title = parsed.feed.get("title", "")
        assert "Pressure Vessel Technology" in feed_title
        assert parsed.entries
        # The abstract doubles as the body, so each one must be substantial.
        for item in parsed.entries:
            abstract = item.get("summary", "")
            assert len(abstract) >= 200
class TestArxivFixture:
    """Shape checks on the captured arXiv feed."""

    def test_abs_links_are_stable_dedup_keys(self):
        """replace/cross re-announcements share the same /abs/ URL, so edit_url dedup blocks them."""
        abs_url = re.compile(r"https://arxiv\.org/abs/\d")
        parsed = _feed("arxiv_appph_rss.xml")
        assert parsed.entries
        for item in parsed.entries:
            link = item.get("link", "")
            assert abs_url.match(link)

    def test_announce_type_in_summary(self):
        """The first entry's summary contains the 'Announce Type:' marker."""
        first = _feed("arxiv_appph_rss.xml").entries[0]
        summary = first.get("summary", "")
        assert "Announce Type:" in summary
class TestEconomistFixture:
    """Shape checks on the captured Economist latest feed."""

    def test_oneline_signal_summaries(self):
        """Every entry has a non-blank title and a link on www.economist.com."""
        parsed = _feed("economist_latest_rss.xml")
        assert parsed.entries
        for item in parsed.entries:
            title = item.get("title", "")
            link = item.get("link", "")
            assert title.strip()
            assert link.startswith("https://www.economist.com/")
# ── CSB sitemap diff 파서 ────────────────────────────────────────────────────
class TestCsbSitemapParsing:
    """`_parse_sitemap` / `_should_skip` behaviour against the captured CSB sitemap sample."""

    def test_parse_pairs_with_tz_aware_lastmod(self):
        """Each parsed pair is (csb.gov URL, timezone-aware lastmod)."""
        xml = (FIXTURES / "csb_sitemap_sample.xml").read_text(encoding="utf-8")
        pairs = _parse_sitemap(xml)
        assert pairs
        for url, lastmod in pairs:
            assert url.startswith("https://www.csb.gov/")
            assert lastmod.tzinfo is not None

    def test_skip_sections_vs_root_slugs(self):
        """Section/utility pages are skipped; root-slug report pages are kept."""
        assert _should_skip("https://www.csb.gov/videos/some-video/")
        assert _should_skip("https://www.csb.gov/investigations/completed-investigations/")
        assert _should_skip("https://www.csb.gov/site-map/")
        assert _should_skip("https://www.csb.gov/")  # home page
        # Investigation reports / news releases live at root slugs — collected.
        assert not _should_skip("https://www.csb.gov/givaudan-sense-colour-explosion-/")
        assert not _should_skip("https://www.csb.gov/recommendations/preventive-maintenance/")

    def test_watermark_diff_orders_oldest_first(self):
        """A watermark diff with an inclusive (>=) boundary keeps every pair, oldest first."""
        xml = (FIXTURES / "csb_sitemap_sample.xml").read_text(encoding="utf-8")
        pairs = [p for p in _parse_sitemap(xml) if not _should_skip(p[0])]
        # Guard first: min() on an empty list raises ValueError, which would
        # hide an empty or fully-skipped fixture behind an unrelated error.
        assert pairs
        watermark = min(lm for _, lm in pairs)
        changed = sorted(
            ((u, lm) for u, lm in pairs if lm >= watermark), key=lambda p: p[1]
        )
        assert changed == sorted(changed, key=lambda p: p[1])
        # Explicit direction check: oldest entry first, newest entry last.
        assert changed[0][1] == watermark
        assert changed[-1][1] == max(lm for _, lm in pairs)
        assert len(changed) == len(pairs)  # >= includes the boundary entry
class TestCsbPdfLinks:
    """`_pdf_links` against a captured CSB investigation page excerpt and inline HTML."""

    # Captured page excerpt, read once when the class body is evaluated.
    HTML = (FIXTURES / "csb_investigation_page_excerpt.html").read_text(encoding="utf-8")
    # Page URL passed to _pdf_links together with HTML.
    BASE = "https://www.csb.gov/givaudan-sense-colour-explosion-/"

    def test_report_pdfs_kept_with_cachebuster_query(self):
        """The report PDF is found, a query string survives, and every link is on csb.gov."""
        found = _pdf_links(self.HTML, self.BASE)
        report_hits = [u for u in found if "Givaudan_Investigation_Report_Publication.pdf" in u]
        assert report_hits
        # The cache-buster query stays on the download URL
        # (normalization happens only on the filename/dedup axis).
        with_query = [u for u in found if "?" in u]
        assert with_query
        off_site = [u for u in found if not u.startswith("https://www.csb.gov/")]
        assert not off_site

    def test_recommendation_status_summaries_excluded(self):
        """Links under /assets/recommendation/ are not returned."""
        found = _pdf_links(self.HTML, self.BASE)
        assert found
        recommendation_hits = [u for u in found if "/assets/recommendation/" in u]
        assert not recommendation_hits

    def test_dedup_by_path(self):
        """Two links to one path collapse to a single result; the external host is dropped."""
        anchors = [
            '<a href="/assets/1/6/r.pdf?100">a</a>',
            '<a href="/assets/1/6/r.pdf?200">b</a>',
            '<a href="https://evil.example.com/x.pdf">c</a>',
        ]
        found = _pdf_links("".join(anchors), "https://www.csb.gov/page/")
        # Same path counted once, external host excluded.
        assert len(found) == 1
        only_link = found[0]
        assert only_link.startswith("https://www.csb.gov/assets/1/6/r.pdf")
# ── API 표준 공지 목록 파서 ──────────────────────────────────────────────────
class TestApiListingParsing:
    """`_parse_listing` / `_parse_pub_date` against the captured API announcements listing."""

    # Captured listing page; undecodable bytes are replaced rather than raising.
    HTML = (FIXTURES / "api_standards_announcements_listing.html").read_text(
        encoding="utf-8", errors="replace"
    )

    def test_ten_unique_detail_links_per_page(self):
        """The listing yields 10 distinct detail URLs under the announcements path."""
        detail_prefix = (
            "https://www.api.org/products-and-services/standards/"
            "important-standards-announcements/"
        )
        detail_urls = _parse_listing(self.HTML)
        assert len(detail_urls) == 10
        assert len(set(detail_urls)) == 10
        for link in detail_urls:
            assert link.startswith(detail_prefix)
            # Pagination links (?page=) must not leak in.
            assert "?" not in link

    def test_pub_date_parse(self):
        """A valid date parses to UTC midnight; missing or impossible dates give None."""
        parsed = _parse_pub_date("Published June 4, 2026 — API announces ...")
        expected = datetime(2026, 6, 4, tzinfo=timezone.utc)
        assert parsed == expected
        assert _parse_pub_date("no date here") is None
        # A date that does not exist on the calendar yields None.
        assert _parse_pub_date("February 31, 2026") is None
# ── CCPS beacon 링크 파서 ────────────────────────────────────────────────────
class TestCcpsBeaconLinks:
    """`_beacon_pdf_links` filtering and relative-URL resolution."""

    def test_beacon_filter_and_relative_resolve(self):
        """Beacon PDFs are kept (by href or anchor text) and resolved; the brochure is dropped."""
        page_url = "https://www.aiche.org/ccps/resources/process-safety-beacon"
        anchors = [
            '<a href="/sites/default/files/2026-06/Beacon-June-2026.pdf">June</a>',
            '<a href="/sites/default/files/beacon_korean_2026_06.pdf"><b>Korean</b></a>',
            '<a href="/sites/default/files/other-brochure.pdf">brochure</a>',
            '<a href="/sites/default/files/monthly.pdf">Process Safety Beacon June</a>',
        ]
        found = _beacon_pdf_links("".join(anchors), page_url)
        assert "https://www.aiche.org/sites/default/files/2026-06/Beacon-June-2026.pdf" in found
        korean_hits = [u for u in found if "beacon_korean" in u]
        assert korean_hits
        # Matched through the anchor text rather than the href.
        monthly_hits = [u for u in found if u.endswith("/monthly.pdf")]
        assert monthly_hits
        brochure_hits = [u for u in found if "other-brochure" in u]
        assert not brochure_hits