From 6966be9cf659f90429b1aac076bddb11b9d94f5f Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Tue, 12 May 2026 13:15:26 +0900 Subject: [PATCH] fix(briefing): backfill country_perspectives[].article_ids from cluster members MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LLM 이 article_ids 를 자율적으로 비워두는 케이스 (2026-05-12 첫 briefing 6 topics 모두 빈 list) 를 서버에서 보정. 후처리 정책 (_resolve_article_ids): 1. LLM 이 준 id ∩ cluster member id (엉뚱한 id 차단, hallucination 방어) 2. 비어있으면 같은 country cluster member top weight N 개 자동 주입 3. cluster 안 country 매칭 멤버 0 → [] per-country cap = MAX_ARTICLE_IDS_PER_COUNTRY = 5. weight 내림차순. API 계약 강화: country_perspectives 가 있는 topic 은 article_ids ≥ 1 보장 (같은 country cluster member 존재 시). frontend / 외부 채널 / archive UI 모두 신뢰 가능. tests 3 케이스 추가. --- app/services/briefing/comparator.py | 69 ++++++++++++++++++++++++++--- tests/test_briefing_historical.py | 66 +++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 7 deletions(-) diff --git a/app/services/briefing/comparator.py b/app/services/briefing/comparator.py index 694bfb9..973826a 100644 --- a/app/services/briefing/comparator.py +++ b/app/services/briefing/comparator.py @@ -35,6 +35,7 @@ MAX_CONVERGENCES = 2 MAX_KEY_QUOTES = 5 MAX_PERSPECTIVE_SUMMARY_LEN = 240 # 한국어 1~2문장 ≤120자 × 2 MAX_HISTORICAL_CONTEXT_LEN = 240 +MAX_ARTICLE_IDS_PER_COUNTRY = 5 # country_perspectives[].article_ids 후처리 cap FALLBACK_HEADLINE = "LLM 분석 실패로 원문 기사 묶음만 표시합니다." FALLBACK_TOPIC_LABEL = "주요 뉴스 묶음" @@ -127,8 +128,58 @@ def _truncate_str(s: Any, limit: int) -> str: return s +def _country_article_id_map(cluster: dict) -> dict[str, list[int]]: + """cluster.members 를 country 별 article_id list 로 그룹 (weight 내림차순). + + Phase 4 selection 단계에서 m['weight'] 가 채워져 있음. 누락 시 0.0 으로 fallback. + """ + grouped: dict[str, list[tuple[float, int]]] = {} + for m in cluster.get("members", []): + country = (m.get("country") or "").upper() + if not country: + continue + weight = float(m.get("weight", 0.0)) + grouped.setdefault(country, []).append((weight, int(m["id"]))) + out: dict[str, list[int]] = {} + for country, pairs in grouped.items(): + pairs.sort(key=lambda x: -x[0]) + out[country] = [doc_id for _, doc_id in pairs] + return out + + +def _resolve_article_ids( + raw_ids: list, + country: str, + cluster_country_ids: dict[str, list[int]], +) -> list[int]: + """country_perspectives[].article_ids 후처리. + + 1) LLM 이 준 id 가 cluster member 와 교집합인 것만 유지 (엉뚱한 id 차단). + 2) 비어있으면 같은 country 의 cluster member top weight N 개 자동 주입. + 3) 그래도 없으면 [] (country 매핑된 member 부재). + """ + cluster_ids = cluster_country_ids.get(country, []) + cluster_id_set = set(cluster_ids) + + # 1) LLM id ∩ cluster + cleaned = [] + if isinstance(raw_ids, list): + for x in raw_ids: + try: + doc_id = int(x) + except (TypeError, ValueError): + continue + if doc_id in cluster_id_set and doc_id not in cleaned: + cleaned.append(doc_id) + if cleaned: + return cleaned[:MAX_ARTICLE_IDS_PER_COUNTRY] + + # 2) Country fallback top-N + return cluster_ids[:MAX_ARTICLE_IDS_PER_COUNTRY] + + def _sanitize_envelope(parsed: dict, cluster: dict) -> dict | None: - """LLM 응답 envelope 검증 + cap 강제. None 반환 시 fallback 발동.""" + """LLM 응답 envelope 검증 + cap 강제 + article_ids 후처리. None → fallback.""" if not isinstance(parsed, dict): return None @@ -137,6 +188,9 @@ def _sanitize_envelope(parsed: dict, cluster: dict) -> dict | None: if not topic_label or not headline: return None + # cluster.members 의 country → [id] 매핑을 미리 만들어 후처리 input 으로 사용 + country_ids_map = _country_article_id_map(cluster) + # country_perspectives raw_persp = parsed.get("country_perspectives") perspectives = [] @@ -146,14 +200,15 @@ def _sanitize_envelope(parsed: dict, cluster: dict) -> dict | None: continue country = _truncate_str(p.get("country"), 10).upper() summary = _truncate_str(p.get("summary"), MAX_PERSPECTIVE_SUMMARY_LEN) - ids = p.get("article_ids") or [] - if not isinstance(ids, list): - ids = [] - ids = [int(x) for x in ids if isinstance(x, (int, str)) and str(x).isdigit()] + raw_ids = p.get("article_ids") or [] + article_ids = _resolve_article_ids(raw_ids, country, country_ids_map) if country and summary: - perspectives.append({"country": country, "summary": summary, "article_ids": ids}) + perspectives.append({ + "country": country, + "summary": summary, + "article_ids": article_ids, + }) if not perspectives: - # 비교 분석 가치가 없는 응답 → fallback return None def _str_array(key: str, cap: int, item_limit: int) -> list[str]: diff --git a/tests/test_briefing_historical.py b/tests/test_briefing_historical.py index c6102eb..abf134e 100644 --- a/tests/test_briefing_historical.py +++ b/tests/test_briefing_historical.py @@ -189,6 +189,72 @@ def test_perspective_summary_cap_enforced(): assert len(s["country_perspectives"][0]["summary"]) <= 241 # 240 + "…" +def test_article_ids_fallback_when_llm_empty(): + """LLM 이 article_ids 를 비워두면 같은 country cluster member top-N 자동 주입.""" + cluster = { + "members": [ + {"id": 101, "country": "KR", "weight": 0.9}, + {"id": 102, "country": "KR", "weight": 0.8}, + {"id": 103, "country": "KR", "weight": 0.7}, + {"id": 201, "country": "US", "weight": 0.5}, + ] + } + parsed = { + "topic_label": "T", + "headline": "H", + "country_perspectives": [ + {"country": "KR", "summary": "한국 시각", "article_ids": []}, + {"country": "US", "summary": "미국 시각", "article_ids": []}, + ], + } + s = _sanitize_envelope(parsed, cluster) + assert s is not None + kr = next(p for p in s["country_perspectives"] if p["country"] == "KR") + us = next(p for p in s["country_perspectives"] if p["country"] == "US") + assert kr["article_ids"] == [101, 102, 103] # weight desc + assert us["article_ids"] == [201] + + +def test_article_ids_intersect_with_cluster(): + """LLM 이 엉뚱한 id 를 넣으면 cluster member 와 교집합만.""" + cluster = { + "members": [ + {"id": 101, "country": "KR", "weight": 0.9}, + {"id": 102, "country": "KR", "weight": 0.8}, + ] + } + parsed = { + "topic_label": "T", + "headline": "H", + "country_perspectives": [ + {"country": "KR", "summary": "한국 시각", "article_ids": [101, 999, 888]}, + ], + } + s = _sanitize_envelope(parsed, cluster) + assert s is not None + assert s["country_perspectives"][0]["article_ids"] == [101] + + +def test_article_ids_capped_to_max(): + """후처리 후에도 country 당 MAX_ARTICLE_IDS_PER_COUNTRY cap.""" + cluster = { + "members": [ + {"id": i, "country": "KR", "weight": 1.0 / i} for i in range(1, 15) + ] + } + parsed = { + "topic_label": "T", + "headline": "H", + "country_perspectives": [ + {"country": "KR", "summary": "한국 시각", "article_ids": []}, + ], + } + s = _sanitize_envelope(parsed, cluster) + assert s is not None + from services.briefing.comparator import MAX_ARTICLE_IDS_PER_COUNTRY + assert len(s["country_perspectives"][0]["article_ids"]) == MAX_ARTICLE_IDS_PER_COUNTRY + + def test_max_perspectives_cap(): cluster = {"members": []} parsed = {