feat(briefing): add morning briefing schema + services + api (historical off)

야간 수집 뉴스 (KST 00:00~05:00) topic×country 비교 분석 1페이지 카드. Phase 4 Global Digest 와 코드/로직/테이블 분리, 알고리즘만 services/clustering_common 공유. Backend 신규: - migrations/255_morning_briefings.sql: morning_briefings + briefing_topics (briefing_date UNIQUE, UNIQUE(briefing_id,topic_rank), FK CASCADE, historical_* 3컬럼 nullable, cluster_members JSONB, country_perspectives JSONB, status 4-state success|partial|failed|empty) - app/models/briefing.py: SQLAlchemy ORM - app/services/briefing/loader.py: KST 5h 윈도우 + news_sources prefix fallback (Phase 4 패턴 미러) + historical candidate pool 로더 - app/services/briefing/clustering.py: cluster_global topic-first (LAMBDA=ln(2)/2h, MIN_COUNTRIES_PER_TOPIC=2, MAX_TOPICS=7) - app/services/briefing/comparator.py: call_primary 26B + JSON envelope sanitize (cap perspectives 10 / divergences 3 / convergences 2 / quotes 5) + fallback row 고정 형태 + retrieve_historical cosine top-K - app/services/briefing/pipeline.py: load→cluster→select(K=7,λ=0.6) →historical→compare→status 4-state→delete+insert transaction - app/workers/briefing_worker.py: APScheduler/수동 호출 공용 진입점, 600s hard cap - app/prompts/briefing_comparative.txt: 한국어 비교 분석 JSON 프롬프트, {articles_block} + {historical_block} 2섹션, 인용 금지 라벨 - app/api/briefing.py: GET /latest, GET ?date=, POST /regenerate?date= (admin, sync delete+insert tx, regenerated:true) Backend 수정: - app/main.py: briefing_router 등록 (/api/briefing prefix). scheduler 등록은 PR-3 에서. - app/services/digest/selection.py: select_for_llm 매개변수화 (K, λ caller 주입). Phase 4 동작은 default 값으로 보존. Historical 정책: - BRIEFING_HISTORICAL_ENABLED env flag, default off. - flag off → historical_* 컬럼 모두 NULL, prompt {historical_block} 빈 라벨, retrieval 호출 안 함. - flag on (PR-1b 에서 enable) → cluster centroid 와 과거 30일 doc embedding cosine top-K 5 (sim≥0.70), prompt 에 주입. 
Country canonical (실측 확인 후): - documents.country 컬럼 부재 확정 - document_chunks.country 매칭률 0% (chunks 자체가 뉴스에 안 만들어짐) - 유일 country 신호 = news_sources prefix 매핑 (Phase 4 와 동일) Tests: - tests/test_briefing_historical.py: 3 경로 회귀 (flag off/on with fixture/on zero match) + sanitize cap + fallback row 형태. Verification: PR-1.8 에서 GPU 컨테이너 pytest + 수동 regenerate.
2026-05-12 12:58:50 +09:00
parent 1ca6d8b522
commit 431d4fe010
13 changed files with 1466 additions and 7 deletions
@@ -0,0 +1,203 @@
+"""Briefing historical 분기 회귀 — Plan §"Verification 9".
+
+3 경로 검증:
+1. flag off → retrieve_historical 호출 안 함, prompt {historical_block} = "(과거 참고 자료 없음)"
+2. flag on + fixture top-K → similarity ≥0.70 docs 만 반환
+3. flag on + zero match → 빈 list (no fallback hallucination)
+"""
+
+import os
+import sys
+from datetime import datetime, timezone, timedelta
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+# Make the application modules importable: the tests assume they are run with
+# PYTHONPATH=/app (i.e. from inside that directory); otherwise the sibling
+# "app" directory is added to sys.path here before the project imports below.
+APP_DIR = Path(__file__).resolve().parent.parent / "app"
+if str(APP_DIR) not in sys.path:
+    sys.path.insert(0, str(APP_DIR))
+
+from services.briefing.comparator import (
+    HISTORICAL_SIMILARITY_MIN,
+    HISTORICAL_TOP_K,
+    _build_historical_block,
+    _make_fallback,
+    _sanitize_envelope,
+    build_prompt,
+    historical_enabled,
+    retrieve_historical,
+)
+from services.clustering_common import normalize_vector
+
+
+def _make_doc(doc_id: int, embedding: np.ndarray, hours_ago: int = 1) -> dict:
+    return {
+        "id": doc_id,
+        "title": f"doc {doc_id}",
+        "ai_summary": f"summary {doc_id}",
+        "embedding": embedding,
+        "created_at": datetime.now(timezone.utc) - timedelta(hours=hours_ago),
+    }
+
+
+def _make_cluster_with_centroid(centroid_vec: np.ndarray) -> dict:
+    return {
+        "centroid": normalize_vector(centroid_vec),
+        "members": [],
+    }
+
+
+def test_flag_default_off():
+    """env 미설정 → historical disabled."""
+    os.environ.pop("BRIEFING_HISTORICAL_ENABLED", None)
+    assert historical_enabled() is False
+
+
+def test_flag_on():
+    os.environ["BRIEFING_HISTORICAL_ENABLED"] = "true"
+    try:
+        assert historical_enabled() is True
+    finally:
+        os.environ.pop("BRIEFING_HISTORICAL_ENABLED", None)
+
+
+def test_historical_block_empty_when_no_docs():
+    """경로 1: flag off 또는 historical_docs=[] → 빈 라벨."""
+    block = _build_historical_block([])
+    assert block == "(과거 참고 자료 없음)"
+
+
+def test_historical_block_has_label_when_docs():
+    docs = [_make_doc(1, np.ones(1024, dtype=np.float32))]
+    block = _build_historical_block(docs)
+    assert "이전 30일 흐름" in block
+    assert "직접 인용 금지" in block
+    assert "[H1]" in block
+
+
+def test_retrieve_historical_topk():
+    """경로 2: flag on + fixture top-K similarity ≥ threshold."""
+    # cluster centroid = 모두 1 방향
+    centroid = np.ones(8, dtype=np.float32)
+    cluster = _make_cluster_with_centroid(centroid)
+
+    # 후보 10개: 5개는 centroid 와 유사 (sim≈1.0), 5개는 직교 (sim≈0)
+    similar_emb = np.ones(8, dtype=np.float32)
+    orthogonal_emb = np.array([1, -1, 1, -1, 1, -1, 1, -1], dtype=np.float32)
+    candidates = (
+        [_make_doc(i, similar_emb + np.random.rand(8).astype(np.float32) * 0.01) for i in range(1, 6)]
+        + [_make_doc(10 + i, orthogonal_emb) for i in range(5)]
+    )
+
+    out = retrieve_historical(cluster, candidates, top_k=5, sim_min=0.70)
+    assert len(out) == 5
+    # 모두 similar 그룹 (id 1~5) 만 선택됨
+    selected_ids = {d["id"] for d in out}
+    assert selected_ids.issubset({1, 2, 3, 4, 5})
+
+
+def test_retrieve_historical_zero_match():
+    """경로 3: 모든 candidate similarity < threshold → 빈 list."""
+    centroid = np.ones(8, dtype=np.float32)
+    cluster = _make_cluster_with_centroid(centroid)
+    orthogonal_emb = np.array([1, -1, 1, -1, 1, -1, 1, -1], dtype=np.float32)
+    candidates = [_make_doc(i, orthogonal_emb) for i in range(5)]
+
+    out = retrieve_historical(cluster, candidates, top_k=5, sim_min=0.70)
+    assert out == []
+
+
+def test_retrieve_historical_empty_candidates():
+    centroid = np.ones(8, dtype=np.float32)
+    cluster = _make_cluster_with_centroid(centroid)
+    assert retrieve_historical(cluster, [], top_k=5) == []
+
+
+def test_sanitize_envelope_valid():
+    cluster = {"members": [{"id": 1}, {"id": 2}]}
+    parsed = {
+        "topic_label": "이란 충돌",
+        "headline": "긴장 격화",
+        "country_perspectives": [
+            {"country": "kr", "summary": "유가 충격", "article_ids": [1]},
+            {"country": "us", "summary": "외교 압박", "article_ids": [2]},
+        ],
+        "divergences": ["KR=경제 / US=외교"],
+        "convergences": ["민간 사상 우려 공통"],
+        "key_quotes": [{"country": "US", "source": "NYT", "quote": "Tehran ..."}],
+        "historical_context": "지난 3주 6회 공방",
+    }
+    sanitized = _sanitize_envelope(parsed, cluster)
+    assert sanitized is not None
+    assert sanitized["topic_label"] == "이란 충돌"
+    # country 대문자 변환
+    assert sanitized["country_perspectives"][0]["country"] == "KR"
+    assert sanitized["historical_context"] == "지난 3주 6회 공방"
+    assert sanitized["llm_fallback_used"] is False
+
+
+def test_sanitize_envelope_empty_perspectives_to_fallback():
+    """country_perspectives 비어 있으면 None (caller 가 fallback 발동)."""
+    cluster = {"members": []}
+    parsed = {
+        "topic_label": "X",
+        "headline": "Y",
+        "country_perspectives": [],
+    }
+    assert _sanitize_envelope(parsed, cluster) is None
+
+
+def test_fallback_row_fixed_form():
+    """Plan §"Fallback Topic Row 고정 형태"."""
+    cluster = {"members": [{"id": 1}]}
+    fb = _make_fallback(cluster)
+    assert fb["topic_label"] == "주요 뉴스 묶음"
+    assert fb["country_perspectives"] == []
+    assert fb["divergences"] == []
+    assert fb["convergences"] == []
+    assert fb["key_quotes"] == []
+    assert fb["historical_context"] is None
+    assert fb["llm_fallback_used"] is True
+
+
+def test_prompt_includes_both_blocks():
+    selected = [_make_doc(1, np.ones(8, dtype=np.float32))]
+    selected[0]["country"] = "KR"
+    selected[0]["ai_sub_group"] = "경향신문"
+    selected[0]["ai_summary_truncated"] = "오늘 한국 뉴스"
+
+    prompt = build_prompt(selected, historical_docs=[])
+    assert "{articles_block}" not in prompt  # 치환됨
+    assert "{historical_block}" not in prompt
+    assert "(KR · 경향신문)" in prompt
+    assert "(과거 참고 자료 없음)" in prompt
+
+
+def test_perspective_summary_cap_enforced():
+    """sanitize 가 길이 cap 강제."""
+    cluster = {"members": []}
+    long_summary = "가" * 500  # 500자, cap=240
+    parsed = {
+        "topic_label": "T",
+        "headline": "H",
+        "country_perspectives": [{"country": "KR", "summary": long_summary, "article_ids": []}],
+    }
+    s = _sanitize_envelope(parsed, cluster)
+    assert s is not None
+    assert len(s["country_perspectives"][0]["summary"]) <= 241  # 240 + "…"
+
+
+def test_max_perspectives_cap():
+    cluster = {"members": []}
+    parsed = {
+        "topic_label": "T",
+        "headline": "H",
+        "country_perspectives": [
+            {"country": f"C{i}", "summary": "s", "article_ids": []} for i in range(20)
+        ],
+    }
+    s = _sanitize_envelope(parsed, cluster)
+    assert s is not None
+    assert len(s["country_perspectives"]) <= 10