fix(briefing): backfill country_perspectives[].article_ids from cluster members

LLM 이 article_ids 를 자율적으로 비워두는 케이스 (2026-05-12 첫 briefing 6 topics 모두 빈 list) 를 서버에서 보정.

후처리 정책 (_resolve_article_ids):
1. LLM 이 준 id ∩ cluster member id (엉뚱한 id 차단, hallucination 방어)
2. 비어있으면 같은 country cluster member top weight N 개 자동 주입
3. cluster 안 country 매칭 멤버 0 → []

per-country cap = MAX_ARTICLE_IDS_PER_COUNTRY = 5. weight 내림차순.

API 계약 강화: country_perspectives 가 있는 topic 은 article_ids ≥ 1 보장 (같은 country cluster member 존재 시). frontend / 외부 채널 / archive UI 모두 신뢰 가능.

tests 3 케이스 추가.
2026-05-12 13:15:26 +09:00
parent 36fea2789a
commit 6966be9cf6
2 changed files with 128 additions and 7 deletions
@@ -35,6 +35,7 @@ MAX_CONVERGENCES = 2
 MAX_KEY_QUOTES = 5
 MAX_PERSPECTIVE_SUMMARY_LEN = 240     # 한국어 1~2문장 ≤120자 × 2
 MAX_HISTORICAL_CONTEXT_LEN = 240
+MAX_ARTICLE_IDS_PER_COUNTRY = 5       # country_perspectives[].article_ids 후처리 cap
 FALLBACK_HEADLINE = "LLM 분석 실패로 원문 기사 묶음만 표시합니다."
 FALLBACK_TOPIC_LABEL = "주요 뉴스 묶음"

@@ -127,8 +128,58 @@ def _truncate_str(s: Any, limit: int) -> str:
    return s


+def _country_article_id_map(cluster: dict) -> dict[str, list[int]]:
+    """cluster.members 를 country 별 article_id list 로 그룹 (weight 내림차순).
+
+    Phase 4 selection 단계에서 m['weight'] 가 채워져 있음. 누락 시 0.0 으로 fallback.
+    """
+    grouped: dict[str, list[tuple[float, int]]] = {}
+    for m in cluster.get("members", []):
+        country = (m.get("country") or "").upper()
+        if not country:
+            continue
+        weight = float(m.get("weight", 0.0))
+        grouped.setdefault(country, []).append((weight, int(m["id"])))
+    out: dict[str, list[int]] = {}
+    for country, pairs in grouped.items():
+        pairs.sort(key=lambda x: -x[0])
+        out[country] = [doc_id for _, doc_id in pairs]
+    return out
+
+
+def _resolve_article_ids(
+    raw_ids: list,
+    country: str,
+    cluster_country_ids: dict[str, list[int]],
+) -> list[int]:
+    """country_perspectives[].article_ids 후처리.
+
+    1) LLM 이 준 id 가 cluster member 와 교집합인 것만 유지 (엉뚱한 id 차단).
+    2) 비어있으면 같은 country 의 cluster member top weight N 개 자동 주입.
+    3) 그래도 없으면 [] (country 매핑된 member 부재).
+    """
+    cluster_ids = cluster_country_ids.get(country, [])
+    cluster_id_set = set(cluster_ids)
+
+    # 1) LLM id ∩ cluster
+    cleaned = []
+    if isinstance(raw_ids, list):
+        for x in raw_ids:
+            try:
+                doc_id = int(x)
+            except (TypeError, ValueError):
+                continue
+            if doc_id in cluster_id_set and doc_id not in cleaned:
+                cleaned.append(doc_id)
+    if cleaned:
+        return cleaned[:MAX_ARTICLE_IDS_PER_COUNTRY]
+
+    # 2) Country fallback top-N
+    return cluster_ids[:MAX_ARTICLE_IDS_PER_COUNTRY]
+
+
 def _sanitize_envelope(parsed: dict, cluster: dict) -> dict | None:
-    """LLM 응답 envelope 검증 + cap 강제. None 반환 시 fallback 발동."""
+    """LLM 응답 envelope 검증 + cap 강제 + article_ids 후처리. None → fallback."""
    if not isinstance(parsed, dict):
        return None

@@ -137,6 +188,9 @@ def _sanitize_envelope(parsed: dict, cluster: dict) -> dict | None:
    if not topic_label or not headline:
        return None

+    # cluster.members 의 country → [id] 매핑을 미리 만들어 후처리 input 으로 사용
+    country_ids_map = _country_article_id_map(cluster)
+
    # country_perspectives
    raw_persp = parsed.get("country_perspectives")
    perspectives = []
@@ -146,14 +200,15 @@ def _sanitize_envelope(parsed: dict, cluster: dict) -> dict | None:
                continue
            country = _truncate_str(p.get("country"), 10).upper()
            summary = _truncate_str(p.get("summary"), MAX_PERSPECTIVE_SUMMARY_LEN)
-            ids = p.get("article_ids") or []
-            if not isinstance(ids, list):
-                ids = []
-            ids = [int(x) for x in ids if isinstance(x, (int, str)) and str(x).isdigit()]
+            raw_ids = p.get("article_ids") or []
+            article_ids = _resolve_article_ids(raw_ids, country, country_ids_map)
            if country and summary:
-                perspectives.append({"country": country, "summary": summary, "article_ids": ids})
+                perspectives.append({
+                    "country": country,
+                    "summary": summary,
+                    "article_ids": article_ids,
+                })
    if not perspectives:
-        # 비교 분석 가치가 없는 응답 → fallback
        return None

    def _str_array(key: str, cap: int, item_limit: int) -> list[str]:
@@ -189,6 +189,72 @@ def test_perspective_summary_cap_enforced():
    assert len(s["country_perspectives"][0]["summary"]) <= 241  # 240 + "…"


def test_article_ids_fallback_when_llm_empty():
    """Empty LLM article_ids are backfilled with same-country cluster members."""
    members = [
        {"id": 101, "country": "KR", "weight": 0.9},
        {"id": 102, "country": "KR", "weight": 0.8},
        {"id": 103, "country": "KR", "weight": 0.7},
        {"id": 201, "country": "US", "weight": 0.5},
    ]
    llm_perspectives = [
        {"country": "KR", "summary": "한국 시각", "article_ids": []},
        {"country": "US", "summary": "미국 시각", "article_ids": []},
    ]

    sanitized = _sanitize_envelope(
        {
            "topic_label": "T",
            "headline": "H",
            "country_perspectives": llm_perspectives,
        },
        {"members": members},
    )

    assert sanitized is not None
    ids_by_country = {
        persp["country"]: persp["article_ids"]
        for persp in sanitized["country_perspectives"]
    }
    # Backfilled ids follow member weight, highest first.
    assert ids_by_country["KR"] == [101, 102, 103]
    assert ids_by_country["US"] == [201]
+
+
def test_article_ids_intersect_with_cluster():
    """Ids the LLM invents are dropped; only cluster member ids survive."""
    members = [
        {"id": 101, "country": "KR", "weight": 0.9},
        {"id": 102, "country": "KR", "weight": 0.8},
    ]
    # 999 and 888 are not cluster members and must be filtered out.
    kr_perspective = {
        "country": "KR",
        "summary": "한국 시각",
        "article_ids": [101, 999, 888],
    }

    sanitized = _sanitize_envelope(
        {
            "topic_label": "T",
            "headline": "H",
            "country_perspectives": [kr_perspective],
        },
        {"members": members},
    )

    assert sanitized is not None
    first = sanitized["country_perspectives"][0]
    assert first["article_ids"] == [101]
+
+
def test_article_ids_capped_to_max():
    """Backfilled ids are capped per country and keep the top-weight members."""
    from services.briefing.comparator import MAX_ARTICLE_IDS_PER_COUNTRY

    # 14 KR members with weight 1/i: id 1 is the heaviest, id 14 the lightest.
    cluster = {
        "members": [
            {"id": i, "country": "KR", "weight": 1.0 / i} for i in range(1, 15)
        ]
    }
    parsed = {
        "topic_label": "T",
        "headline": "H",
        "country_perspectives": [
            {"country": "KR", "summary": "한국 시각", "article_ids": []},
        ],
    }
    s = _sanitize_envelope(parsed, cluster)
    assert s is not None
    article_ids = s["country_perspectives"][0]["article_ids"]
    assert len(article_ids) == MAX_ARTICLE_IDS_PER_COUNTRY
    # Checking the length alone would accept a cap that kept the wrong members;
    # pin both the selection (heaviest first) and the order.
    assert article_ids == list(range(1, MAX_ARTICLE_IDS_PER_COUNTRY + 1))
+
+
 def test_max_perspectives_cap():
    cluster = {"members": []}
    parsed = {