fix(briefing): backfill country_perspectives[].article_ids from cluster members

LLM 이 article_ids 를 자율적으로 비워두는 케이스 (2026-05-12 첫 briefing 6
topics 모두 빈 list) 를 서버에서 보정.

후처리 정책 (_resolve_article_ids):
1. LLM 이 준 id ∩ 같은 country 의 cluster member id (엉뚱한 id / 다른 country 의 id 차단, hallucination 방어)
2. 1 의 결과가 비어있으면 같은 country cluster member 를 weight 내림차순으로 top N 개 자동 주입
3. cluster 안에 해당 country 매칭 멤버가 0 개면 → []

per-country cap = MAX_ARTICLE_IDS_PER_COUNTRY = 5. weight 내림차순.

API 계약 강화: country_perspectives 가 있는 topic 은 article_ids ≥ 1 보장
(같은 country cluster member 존재 시). frontend / 외부 채널 / archive UI
모두 신뢰 가능.

tests 3 케이스 추가.
This commit is contained in:
Hyungi Ahn
2026-05-12 13:15:26 +09:00
parent 36fea2789a
commit 6966be9cf6
2 changed files with 128 additions and 7 deletions
+62 -7
View File
@@ -35,6 +35,7 @@ MAX_CONVERGENCES = 2
MAX_KEY_QUOTES = 5
MAX_PERSPECTIVE_SUMMARY_LEN = 240 # 1-2 Korean sentences, <=120 chars x 2
MAX_HISTORICAL_CONTEXT_LEN = 240
MAX_ARTICLE_IDS_PER_COUNTRY = 5 # post-processing cap for country_perspectives[].article_ids
FALLBACK_HEADLINE = "LLM 분석 실패로 원문 기사 묶음만 표시합니다."
FALLBACK_TOPIC_LABEL = "주요 뉴스 묶음"
@@ -127,8 +128,58 @@ def _truncate_str(s: Any, limit: int) -> str:
return s
def _country_article_id_map(cluster: dict) -> dict[str, list[int]]:
    """Group the cluster members' article ids by country, heaviest first.

    Args:
        cluster: Cluster dict. Its ``"members"`` entry is read as a list of
            member dicts carrying ``"id"``, ``"country"`` and, optionally,
            ``"weight"``. (The original note says weights are filled in by
            an upstream "Phase 4 selection" step -- not verifiable here.)

    Returns:
        Mapping of upper-cased country code to that country's article ids,
        ordered by weight descending. Members with equal weight keep their
        input order. A missing, ``None`` or non-numeric weight counts as
        ``0.0``. Members without a country or without an integer-convertible
        id are skipped instead of aborting the whole grouping.
    """
    grouped: dict[str, list[tuple[float, int]]] = {}
    # ``or []`` also covers an explicit ``"members": None``.
    for member in cluster.get("members") or []:
        if not isinstance(member, dict):
            continue

        country = member.get("country")
        if not isinstance(country, str) or not country:
            continue

        try:
            doc_id = int(member["id"])
        except (KeyError, TypeError, ValueError, OverflowError):
            # A member without a usable id can never be referenced by a
            # perspective, so dropping it is safe.
            continue

        try:
            # ``.get("weight", 0.0)`` would still hand ``None`` to float()
            # when the key is present but null.
            weight = float(member.get("weight") or 0.0)
        except (TypeError, ValueError):
            weight = 0.0

        grouped.setdefault(country.upper(), []).append((weight, doc_id))

    out: dict[str, list[int]] = {}
    for country, pairs in grouped.items():
        # list.sort is stable, so ties keep their insertion order.
        pairs.sort(key=lambda pair: -pair[0])
        out[country] = [doc_id for _, doc_id in pairs]
    return out
def _resolve_article_ids(
    raw_ids: list,
    country: str,
    cluster_country_ids: dict[str, list[int]],
) -> list[int]:
    """Post-process one ``country_perspectives[].article_ids`` value.

    1. Keep only the LLM-supplied ids that belong to a cluster member of the
       *same country* (ids of other countries or unknown ids are dropped),
       de-duplicated, in the order the LLM gave them.
    2. If nothing survives, fall back to that country's ids in the order
       stored in ``cluster_country_ids`` (built weight-descending by
       ``_country_article_id_map``).
    3. If the country has no entry in the map, the result is ``[]``.

    Args:
        raw_ids: Value taken from the LLM response; anything other than a
            list is treated as empty.
        country: Country code used as the lookup key (looked up as given).
        cluster_country_ids: Country code -> article ids of cluster members.

    Returns:
        At most ``MAX_ARTICLE_IDS_PER_COUNTRY`` article ids.
    """
    cluster_ids = cluster_country_ids.get(country, [])
    allowed = set(cluster_ids)

    # 1) LLM ids intersected with the same-country cluster ids.
    cleaned: list[int] = []
    seen: set[int] = set()
    if isinstance(raw_ids, list):
        for raw in raw_ids:
            # bool is an int subclass: True/False would otherwise be
            # accepted as article ids 1/0.
            if isinstance(raw, bool):
                continue
            try:
                doc_id = int(raw)
            except (TypeError, ValueError, OverflowError):
                # OverflowError covers int(float("inf")), which can show up
                # when the payload was parsed by a JSON decoder that accepts
                # ``Infinity`` (json.loads does by default).
                continue
            if doc_id in allowed and doc_id not in seen:
                seen.add(doc_id)
                cleaned.append(doc_id)

    if cleaned:
        return cleaned[:MAX_ARTICLE_IDS_PER_COUNTRY]

    # 2) / 3) Fallback: first N ids for the country, or [] when there are none.
    return cluster_ids[:MAX_ARTICLE_IDS_PER_COUNTRY]
def _sanitize_envelope(parsed: dict, cluster: dict) -> dict | None:
"""LLM 응답 envelope 검증 + cap 강제. None 반환 시 fallback 발동."""
"""LLM 응답 envelope 검증 + cap 강제 + article_ids 후처리. None fallback."""
if not isinstance(parsed, dict):
return None
@@ -137,6 +188,9 @@ def _sanitize_envelope(parsed: dict, cluster: dict) -> dict | None:
if not topic_label or not headline:
return None
# cluster.members 의 country → [id] 매핑을 미리 만들어 후처리 input 으로 사용
country_ids_map = _country_article_id_map(cluster)
# country_perspectives
raw_persp = parsed.get("country_perspectives")
perspectives = []
@@ -146,14 +200,15 @@ def _sanitize_envelope(parsed: dict, cluster: dict) -> dict | None:
continue
country = _truncate_str(p.get("country"), 10).upper()
summary = _truncate_str(p.get("summary"), MAX_PERSPECTIVE_SUMMARY_LEN)
ids = p.get("article_ids") or []
if not isinstance(ids, list):
ids = []
ids = [int(x) for x in ids if isinstance(x, (int, str)) and str(x).isdigit()]
raw_ids = p.get("article_ids") or []
article_ids = _resolve_article_ids(raw_ids, country, country_ids_map)
if country and summary:
perspectives.append({"country": country, "summary": summary, "article_ids": ids})
perspectives.append({
"country": country,
"summary": summary,
"article_ids": article_ids,
})
if not perspectives:
# 비교 분석 가치가 없는 응답 → fallback
return None
def _str_array(key: str, cap: int, item_limit: int) -> list[str]:
+66
View File
@@ -189,6 +189,72 @@ def test_perspective_summary_cap_enforced():
assert len(s["country_perspectives"][0]["summary"]) <= 241 # 240 + "…"
def test_article_ids_fallback_when_llm_empty():
    """Empty LLM article_ids are backfilled from same-country cluster members."""
    # Members are deliberately listed out of weight order (and interleaved
    # across countries) so the expected result can only come from sorting by
    # weight, not from insertion order.
    cluster = {
        "members": [
            {"id": 103, "country": "KR", "weight": 0.7},
            {"id": 201, "country": "US", "weight": 0.5},
            {"id": 101, "country": "KR", "weight": 0.9},
            {"id": 102, "country": "KR", "weight": 0.8},
        ]
    }
    parsed = {
        "topic_label": "T",
        "headline": "H",
        "country_perspectives": [
            {"country": "KR", "summary": "한국 시각", "article_ids": []},
            {"country": "US", "summary": "미국 시각", "article_ids": []},
        ],
    }
    s = _sanitize_envelope(parsed, cluster)
    assert s is not None
    kr = next(p for p in s["country_perspectives"] if p["country"] == "KR")
    us = next(p for p in s["country_perspectives"] if p["country"] == "US")
    assert kr["article_ids"] == [101, 102, 103]  # weight desc
    assert us["article_ids"] == [201]
def test_article_ids_intersect_with_cluster():
    """Ids supplied by the LLM that are not cluster members are dropped."""
    kr_members = [
        {"id": 101, "country": "KR", "weight": 0.9},
        {"id": 102, "country": "KR", "weight": 0.8},
    ]
    # 999 and 888 do not exist in the cluster; only 101 is a real member.
    llm_perspective = {
        "country": "KR",
        "summary": "한국 시각",
        "article_ids": [101, 999, 888],
    }
    envelope = {
        "topic_label": "T",
        "headline": "H",
        "country_perspectives": [llm_perspective],
    }

    sanitized = _sanitize_envelope(envelope, {"members": kr_members})

    assert sanitized is not None
    first_perspective = sanitized["country_perspectives"][0]
    assert first_perspective["article_ids"] == [101]
def test_article_ids_capped_to_max():
    """Backfilled ids are capped at MAX_ARTICLE_IDS_PER_COUNTRY per country."""
    from services.briefing.comparator import MAX_ARTICLE_IDS_PER_COUNTRY

    # 14 KR members with weight 1/i: id 1 is the heaviest, id 14 the lightest.
    cluster = {
        "members": [
            {"id": i, "country": "KR", "weight": 1.0 / i} for i in range(1, 15)
        ]
    }
    parsed = {
        "topic_label": "T",
        "headline": "H",
        "country_perspectives": [
            {"country": "KR", "summary": "한국 시각", "article_ids": []},
        ],
    }
    s = _sanitize_envelope(parsed, cluster)
    assert s is not None
    ids = s["country_perspectives"][0]["article_ids"]
    assert len(ids) == MAX_ARTICLE_IDS_PER_COUNTRY
    # The cap must keep the top-weighted members, not just any N of them.
    assert ids == list(range(1, MAX_ARTICLE_IDS_PER_COUNTRY + 1))
def test_max_perspectives_cap():
cluster = {"members": []}
parsed = {