# hyungi_document_server/tests/test_crawl_cycle3_shapes.py

"""crawl-24x7 사이클 3 — 순수 함수/형태 회귀 테스트 (DB 불요).

B-4 signal-only(본문 무절단 + enqueue 가드) + C-4 피드 shape + CSB sitemap diff 파서
+ API 공지 목록 파서 + CCPS beacon 링크 파서 + B-5 (Nikkei RDF = feedparser 네이티브,
코드 분기 불요 박제).

fixture = 2026-06-11 live 박제 (tests/fixtures/, [[feedback_external_api_fixture_first]]).
economist/ieee 는 repo 크기 사유로 item 수만 trim (헤더/푸터/item 구조 byte-faithful).
"""

import re
from datetime import datetime, timedelta, timezone
from pathlib import Path
from types import SimpleNamespace

import feedparser
import pytest

from workers import news_collector
from workers.api_standards_collector import _parse_listing, _parse_pub_date
from workers.ccps_collector import _beacon_pdf_links
from workers.csb_collector import _parse_sitemap, _pdf_links, _should_skip
from workers.news_collector import _clean_html, _entry_body

# Directory holding the fixture files these tests read, located next to this module.
FIXTURES = Path(__file__).parent / "fixtures"


def _feed(name: str):
    """Read the fixture file *name* as UTF-8 text and return feedparser's parse result."""
    fixture_path = FIXTURES / name
    raw_xml = fixture_path.read_text(encoding="utf-8")
    return feedparser.parse(raw_xml)


def _source(**kw):
    """Build a stand-in source object carrying only the attributes these tests set.

    Recognised keywords and their defaults are ``fetch_method="rss"``,
    ``fulltext_policy="none"`` and ``source_channel="news"``. Any other
    keyword is ignored.
    """
    defaults = {
        "fetch_method": "rss",
        "fulltext_policy": "none",
        "source_channel": "news",
    }
    attrs = {field: kw.get(field, fallback) for field, fallback in defaults.items()}
    return SimpleNamespace(**attrs)


# ── B-4: 본문 선택 정책 ───────────────────────────────────────────────────────

class TestEntryBodyPolicy:
    """Checks the (body, version-tag) pair `_entry_body` returns per source policy."""

    def test_signal_only_preserves_full_abstract(self):
        """arXiv abstract is ~1.6K chars; the default 1000-char cap would lose its tail."""
        first_entry = _feed("arxiv_appph_rss.xml").entries[0]
        # Default path: the cleaned summary is cut at 1000 chars.
        capped = _clean_html(first_entry.get("summary", ""))
        signal_source = _source(fetch_method="signal-only")
        text, version = _entry_body(signal_source, first_entry, capped)
        assert version == "rss-signal"
        assert len(text) > 1000
        assert 1000 >= len(capped)
        assert "Abstract" in text

    def test_feed_full_promotes_ieee_description(self):
        """A `feed-full` source yields a body beyond 1000 chars, tagged `rss-feed-full`."""
        ieee_entry = _feed("ieee_spectrum_energy_rss.xml").entries[0]
        capped = _clean_html(ieee_entry.get("summary", ""))
        feed_full_source = _source(fulltext_policy="feed-full")
        text, version = _entry_body(feed_full_source, ieee_entry, capped)
        assert version == "rss-feed-full"
        assert len(text) > 1000

    def test_default_source_keeps_capped_summary(self):
        """With default source settings the passed-in summary comes back as-is, tagged `rss`."""
        first_entry = _feed("arxiv_appph_rss.xml").entries[0]
        capped = _clean_html(first_entry.get("summary", ""))
        default_source = _source()
        text, version = _entry_body(default_source, first_entry, capped)
        assert version == "rss"
        assert text == capped

    def test_signal_only_title_fallback_when_feed_has_no_summary(self):
        """Nikkei RDF has no description — the body degrades to the summary argument (= title fallback)."""
        first_entry = _feed("nikkei_asia_nar_rdf.xml").entries[0]
        signal_source = _source(fetch_method="signal-only")
        text, version = _entry_body(signal_source, first_entry, first_entry.get("title", ""))
        assert version == "rss-signal"
        title = first_entry.get("title", "")
        assert text == title
        assert title != ""


# ── B-4: enqueue 가드 (signal-only = fulltext/summarize 절대 금지) ────────────

class TestSignalOnlyEnqueueGuard:
    """Checks which stages `_enqueue_processing` enqueues for signal-only sources."""

    @staticmethod
    def _patch(monkeypatch):
        """Replace `news_collector.enqueue_stage` with a recorder; return the recorded-stage list."""
        recorded = []

        async def fake_enqueue(session, doc_id, stage):
            # Only the stage name matters to the assertions below.
            recorded.append(stage)

        monkeypatch.setattr(news_collector, "enqueue_stage", fake_enqueue)
        return recorded

    @pytest.mark.asyncio
    async def test_signal_only_overrides_misconfigured_page_policy(self, monkeypatch):
        """Even if the registry wrongly sets fulltext_policy='page', no page fetch is queued (defensive)."""
        stages = self._patch(monkeypatch)
        document = SimpleNamespace(id=1, edit_url="https://x/a")
        source = _source(fetch_method="signal-only", fulltext_policy="page")
        published = datetime.now(timezone.utc)
        await news_collector._enqueue_processing(None, document, source, published)
        assert stages == ["embed", "chunk"]  # no fulltext / summarize stage

    @pytest.mark.asyncio
    async def test_signal_only_news_respects_30day_gate(self, monkeypatch):
        """A 40-day-old item from a signal-only news source enqueues nothing."""
        stages = self._patch(monkeypatch)
        document = SimpleNamespace(id=1, edit_url="https://x/a")
        forty_days_ago = datetime.now(timezone.utc) - timedelta(days=40)
        source = _source(fetch_method="signal-only")
        await news_collector._enqueue_processing(None, document, source, forty_days_ago)
        assert stages == []

    @pytest.mark.asyncio
    async def test_signal_only_crawl_channel_indexes_regardless_of_age(self, monkeypatch):
        """On the crawl channel a 400-day-old signal-only item still gets embed + chunk."""
        stages = self._patch(monkeypatch)
        document = SimpleNamespace(id=1, edit_url="https://x/a")
        long_ago = datetime.now(timezone.utc) - timedelta(days=400)
        source = _source(fetch_method="signal-only", source_channel="crawl")
        await news_collector._enqueue_processing(None, document, source, long_ago)
        assert stages == ["embed", "chunk"]


# ── 연결 계층 1회 재시도 (MOEL 첫 TLS 핸드셰이크 간헐 드랍 실측) ──────────────

class TestConnectRetry:
    """Checks `_get_with_connect_retry`: one retry on connect errors, none on other errors."""

    class _Client:
        """Async client stub: `get` raises the next queued error, or returns "OK" when none remain."""

        def __init__(self, errors: list):
            self.errors = errors  # exceptions still to be raised, in order
            self.calls = 0  # how many times `get` has been awaited

        async def get(self, url):
            self.calls += 1
            if not self.errors:
                return "OK"
            raise self.errors.pop(0)

    @pytest.mark.asyncio
    async def test_single_connect_error_retried_once(self):
        """One ConnectError is absorbed and the second attempt's response is returned."""
        import httpx

        stub = self._Client([httpx.ConnectError("")])
        response = await news_collector._get_with_connect_retry(stub, "https://x/feed")
        assert response == "OK"
        assert stub.calls == 2

    @pytest.mark.asyncio
    async def test_persistent_connect_error_propagates(self):
        """Two ConnectErrors in a row: the second one reaches the caller."""
        import httpx

        queued = [httpx.ConnectError(""), httpx.ConnectError("")]
        stub = self._Client(queued)
        with pytest.raises(httpx.ConnectError):
            await news_collector._get_with_connect_retry(stub, "https://x/feed")
        # Exactly one retry — persistent outages are left to the circuit.
        assert stub.calls == 2

    @pytest.mark.asyncio
    async def test_non_connect_errors_not_retried(self):
        """A ReadTimeout propagates after a single attempt."""
        import httpx

        stub = self._Client([httpx.ReadTimeout("")])
        with pytest.raises(httpx.ReadTimeout):
            await news_collector._get_with_connect_retry(stub, "https://x/feed")
        assert stub.calls == 1


# ── C-4 / B-4 피드 shape (시드 전 live 박제) ─────────────────────────────────

class TestNikkeiRdfNativeParsing:
    """B-5 'rdf' quirk: measured that no code branch is needed — feedparser normalizes RSS 1.0."""

    def test_rss10_entries_have_title_and_link(self):
        """The fixture parses cleanly as RSS 1.0 with at least ten titled, linked entries."""
        parsed = _feed("nikkei_asia_nar_rdf.xml")
        assert parsed.version == "rss10"
        assert not parsed.bozo
        assert len(parsed.entries) >= 10
        for item in parsed.entries:
            title = item.get("title", "")
            link = item.get("link", "")
            assert title.strip()
            assert link.startswith("https://asia.nikkei.com/")

    def test_no_summary_no_dates_means_title_signal(self):
        """The first entry has neither a summary nor parsed dates, leaving only the title."""
        first = _feed("nikkei_asia_nar_rdf.xml").entries[0]
        assert not first.get("summary", "")
        assert not (first.get("published_parsed") or first.get("updated_parsed"))


class TestBloombergFixture:
    """Shape checks on the recorded Bloomberg markets feed."""

    def test_video_items_mixed_in_feed(self):
        """Video items are observed mixed into the feed → the basis for seed parser_quirk='skip-video'."""
        # Same pattern as news_collector's skip-video handling.
        is_video = re.compile(r"/videos?/").search
        urls = []
        for item in _feed("bloomberg_markets_rss.xml").entries:
            urls.append(item.get("link", ""))
        assert any(is_video(url) for url in urls)
        assert any("/news/articles/" in url and not is_video(url) for url in urls)

    def test_articles_have_signal_grade_summary(self):
        """At least one entry carries a summary of 100 characters or more."""
        parsed = _feed("bloomberg_markets_rss.xml")
        summary_lengths = (len(item.get("summary", "")) for item in parsed.entries)
        assert any(length >= 100 for length in summary_lengths)


class TestAsmeJpvtFixture:
    """Shape checks on the recorded ASME JPVT open-issues feed."""

    def test_journal_identity_and_abstract(self):
        """The feed title names the journal and every entry has a summary of 200+ characters."""
        parsed = _feed("asme_jpvt_openissues_rss.xml")
        feed_title = parsed.feed.get("title", "")
        assert "Pressure Vessel Technology" in feed_title
        assert parsed.entries
        for item in parsed.entries:
            # The abstract serves as the body.
            abstract = item.get("summary", "")
            assert len(abstract) >= 200


class TestArxivFixture:
    """Shape checks on the recorded arXiv feed."""

    def test_abs_links_are_stable_dedup_keys(self):
        """replace/cross re-announcements share the same /abs/ URL — edit_url dedup blocks them naturally."""
        abs_url = re.compile(r"https://arxiv\.org/abs/\d")
        parsed = _feed("arxiv_appph_rss.xml")
        assert parsed.entries
        for item in parsed.entries:
            link = item.get("link", "")
            assert abs_url.match(link)

    def test_announce_type_in_summary(self):
        """The first entry's raw summary contains the "Announce Type:" marker."""
        first = _feed("arxiv_appph_rss.xml").entries[0]
        raw_summary = first.get("summary", "")
        assert "Announce Type:" in raw_summary


class TestEconomistFixture:
    """Shape checks on the recorded Economist feed."""

    def test_oneline_signal_summaries(self):
        """Every entry has a non-blank title and a link under www.economist.com."""
        # NOTE(review): despite the test name, summaries themselves are not asserted here.
        parsed = _feed("economist_latest_rss.xml")
        assert parsed.entries
        for item in parsed.entries:
            title = item.get("title", "")
            link = item.get("link", "")
            assert title.strip()
            assert link.startswith("https://www.economist.com/")


# ── CSB sitemap diff 파서 ────────────────────────────────────────────────────

class TestCsbSitemapParsing:
    """Checks `_parse_sitemap` output and the `_should_skip` URL filter."""

    def test_parse_pairs_with_tz_aware_lastmod(self):
        """Each parsed pair is a csb.gov URL with a timezone-aware lastmod."""
        sitemap_path = FIXTURES / "csb_sitemap_sample.xml"
        entries = _parse_sitemap(sitemap_path.read_text(encoding="utf-8"))
        assert entries
        for page_url, modified in entries:
            assert page_url.startswith("https://www.csb.gov/")
            assert modified.tzinfo is not None

    def test_skip_sections_vs_root_slugs(self):
        """Section, site-map and home URLs are skipped; the two content URLs below are kept."""
        skipped = (
            "https://www.csb.gov/videos/some-video/",
            "https://www.csb.gov/investigations/completed-investigations/",
            "https://www.csb.gov/site-map/",
            "https://www.csb.gov/",  # home page
        )
        for page_url in skipped:
            assert _should_skip(page_url)
        # Investigation reports / news releases are root slugs — collection targets.
        collected = (
            "https://www.csb.gov/givaudan-sense-colour-explosion-/",
            "https://www.csb.gov/recommendations/preventive-maintenance/",
        )
        for page_url in collected:
            assert not _should_skip(page_url)

    def test_watermark_diff_orders_oldest_first(self):
        """With the watermark at the oldest lastmod, a `>=` filter keeps every non-skipped pair."""
        sitemap_path = FIXTURES / "csb_sitemap_sample.xml"
        xml = sitemap_path.read_text(encoding="utf-8")
        kept = [pair for pair in _parse_sitemap(xml) if not _should_skip(pair[0])]
        watermark = min(modified for _, modified in kept)
        at_or_after = ((url, modified) for url, modified in kept if modified >= watermark)
        changed = sorted(at_or_after, key=lambda pair: pair[1])
        # NOTE(review): `changed` was just sorted by lastmod, so this comparison
        # holds by construction — confirm whether a production ordering was meant.
        assert changed == sorted(changed, key=lambda pair: pair[1])
        assert len(changed) == len(kept)  # `>=` includes the boundary itself


class TestCsbPdfLinks:
    """Checks `_pdf_links` against a recorded investigation page and a synthetic snippet."""

    # Page URL passed to `_pdf_links` alongside the HTML excerpt.
    BASE = "https://www.csb.gov/givaudan-sense-colour-explosion-/"
    # Investigation page excerpt, read once when the class body executes.
    HTML = (FIXTURES / "csb_investigation_page_excerpt.html").read_text(encoding="utf-8")

    def test_report_pdfs_kept_with_cachebuster_query(self):
        """The report PDF is present, some link keeps its query string, and all are on csb.gov."""
        found = _pdf_links(self.HTML, self.BASE)
        assert any("Givaudan_Investigation_Report_Publication.pdf" in url for url in found)
        # The cache-buster query stays on the download URL
        # (normalization applies only on the filename/dedup axis).
        assert any("?" in url for url in found)
        for url in found:
            assert url.startswith("https://www.csb.gov/")

    def test_recommendation_status_summaries_excluded(self):
        """Links are returned, but none point under /assets/recommendation/."""
        found = _pdf_links(self.HTML, self.BASE)
        assert found
        recommendation_hits = [url for url in found if "/assets/recommendation/" in url]
        assert not any(recommendation_hits)

    def test_dedup_by_path(self):
        """Two links differing only in query collapse to one; the off-site PDF is dropped."""
        snippet = (
            '<a href="/assets/1/6/r.pdf?100">a</a>'
            '<a href="/assets/1/6/r.pdf?200">b</a>'
            '<a href="https://evil.example.com/x.pdf">c</a>'
        )
        found = _pdf_links(snippet, "https://www.csb.gov/page/")
        # Same path counted once + external host excluded.
        assert len(found) == 1
        assert found[0].startswith("https://www.csb.gov/assets/1/6/r.pdf")


# ── API 표준 공지 목록 파서 ──────────────────────────────────────────────────

class TestApiListingParsing:
    """Checks the API announcements listing parser and the publication-date parser."""

    # Recorded listing page; undecodable bytes are replaced instead of raising.
    HTML = (FIXTURES / "api_standards_announcements_listing.html").read_text(
        errors="replace", encoding="utf-8"
    )

    def test_ten_unique_detail_links_per_page(self):
        """The listing yields exactly ten distinct detail URLs, none carrying a query string."""
        detail_prefix = (
            "https://www.api.org/products-and-services/standards/"
            "important-standards-announcements/"
        )
        detail_urls = _parse_listing(self.HTML)
        assert len(detail_urls) == 10
        assert len(set(detail_urls)) == 10
        for url in detail_urls:
            assert url.startswith(detail_prefix)
            # Pagination links (?page=) must not be mixed in.
            assert "?" not in url

    def test_pub_date_parse(self):
        """A date embedded in text parses to UTC midnight; missing or impossible dates give None."""
        parsed = _parse_pub_date("Published June 4, 2026 — API announces ...")
        expected = datetime(2026, 6, 4, tzinfo=timezone.utc)
        assert parsed == expected
        assert _parse_pub_date("no date here") is None
        # A date that cannot exist on the calendar → None.
        assert _parse_pub_date("February 31, 2026") is None


# ── CCPS beacon 링크 파서 ────────────────────────────────────────────────────

class TestCcpsBeaconLinks:
    """Checks `_beacon_pdf_links` filtering and relative-URL resolution."""

    def test_beacon_filter_and_relative_resolve(self):
        """Beacon PDFs (by href or anchor text) are kept and resolved; the brochure is dropped."""
        page_url = "https://www.aiche.org/ccps/resources/process-safety-beacon"
        snippet = (
            '<a href="/sites/default/files/2026-06/Beacon-June-2026.pdf">June</a>'
            '<a href="/sites/default/files/beacon_korean_2026_06.pdf"><b>Korean</b></a>'
            '<a href="/sites/default/files/other-brochure.pdf">brochure</a>'
            '<a href="/sites/default/files/monthly.pdf">Process Safety Beacon June</a>'
        )
        found = _beacon_pdf_links(snippet, page_url)
        resolved_june = "https://www.aiche.org/sites/default/files/2026-06/Beacon-June-2026.pdf"
        assert resolved_june in found
        assert any("beacon_korean" in url for url in found)
        # This one qualifies through its anchor text, not its href.
        assert any(url.endswith("/monthly.pdf") for url in found)
        assert not any("other-brochure" in url for url in found)