6966be9cf6
LLM 이 article_ids 를 자율적으로 비워두는 케이스 (2026-05-12 첫 briefing 6 topics 모두 빈 list) 를 서버에서 보정. 후처리 정책 (_resolve_article_ids): 1. LLM 이 준 id ∩ cluster member id (엉뚱한 id 차단, hallucination 방어) 2. 비어있으면 같은 country cluster member top weight N 개 자동 주입 3. cluster 안 country 매칭 멤버 0 → [] per-country cap = MAX_ARTICLE_IDS_PER_COUNTRY = 5. weight 내림차순. API 계약 강화: country_perspectives 가 있는 topic 은 article_ids ≥ 1 보장 (같은 country cluster member 존재 시). frontend / 외부 채널 / archive UI 모두 신뢰 가능. tests 3 케이스 추가.
308 lines
10 KiB
Python
308 lines
10 KiB
Python
"""Cluster → 26B MLX 비교 분석 호출 + JSON envelope + historical context + fallback row.
|
|
|
|
Plan §"LLM Parse 실패 시 Fallback Topic Row (고정 형태)":
|
|
LLM JSON parse 2회 재시도 후 실패 → 고정 형태 fallback 저장 (drop 금지).
|
|
|
|
Plan §"Historical Context":
|
|
BRIEFING_HISTORICAL_ENABLED=true 시 cluster centroid 와 historical candidate
|
|
cosine top-K 5 (similarity ≥0.70) 추출 → 프롬프트 {historical_block} 주입.
|
|
LLM 응답 envelope 의 historical_context 옵션 필드.
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import numpy as np
|
|
|
|
from ai.client import parse_json_response
|
|
from core.utils import setup_logger
|
|
from services.clustering_common import normalize_vector
|
|
|
|
# Module-wide logger; the name passed to setup_logger identifies this component.
logger = setup_logger("briefing_comparator")

LLM_CALL_TIMEOUT = 25  # seconds; same value as Phase 4
# Defaults for retrieve_historical(): how many historical docs to keep and the
# minimum similarity a candidate must reach.
HISTORICAL_TOP_K = 5
HISTORICAL_SIMILARITY_MIN = 0.70
# NOTE(review): not referenced in the visible part of this module — presumably
# used by the caller that builds the historical candidate pool; confirm.
HISTORICAL_WINDOW_DAYS = 30

# JSON envelope caps (enforced both in the prompt and in post-processing)
MAX_PERSPECTIVES = 10
MAX_DIVERGENCES = 3
MAX_CONVERGENCES = 2
MAX_KEY_QUOTES = 5
MAX_PERSPECTIVE_SUMMARY_LEN = 240  # 1–2 Korean sentences, ≤120 chars × 2
MAX_HISTORICAL_CONTEXT_LEN = 240
MAX_ARTICLE_IDS_PER_COUNTRY = 5  # post-processing cap for country_perspectives[].article_ids
# Fixed texts used by _make_fallback() when the LLM analysis could not be used.
FALLBACK_HEADLINE = "LLM 분석 실패로 원문 기사 묶음만 표시합니다."
FALLBACK_TOPIC_LABEL = "주요 뉴스 묶음"

# Single-slot semaphore: _try_call_llm() holds it for the duration of a call,
# so at most one LLM request issued through this module is in flight at a time.
# NOTE(review): created at import time — on Python < 3.10 asyncio primitives
# bind to the event loop current at construction; confirm the runtime version.
_llm_sem = asyncio.Semaphore(1)
# Prompt template location: <three directories above this file>/prompts/briefing_comparative.txt
_PROMPT_PATH = Path(__file__).resolve().parent.parent.parent / "prompts" / "briefing_comparative.txt"
# Cache for the template text; filled on first use by _load_prompt().
_PROMPT_TEMPLATE: str | None = None
|
|
|
|
|
|
def historical_enabled() -> bool:
    """Report whether historical-context retrieval is switched on.

    The ``BRIEFING_HISTORICAL_ENABLED`` environment variable is read on every
    call; when it is unset the value ``"false"`` is assumed. The comparison is
    case-insensitive and only ``1``, ``true`` and ``yes`` count as enabled.
    """
    flag = os.environ.get("BRIEFING_HISTORICAL_ENABLED", "false")
    truthy = {"1", "true", "yes"}
    return flag.lower() in truthy
|
|
|
|
|
|
def _load_prompt() -> str:
    """Return the comparative-briefing prompt template, reading it at most once.

    The first call reads ``_PROMPT_PATH`` as UTF-8 and stores the text in the
    module-level ``_PROMPT_TEMPLATE``; every later call returns that cached
    text without touching the file again.
    """
    global _PROMPT_TEMPLATE
    cached = _PROMPT_TEMPLATE
    if cached is not None:
        return cached
    cached = _PROMPT_PATH.read_text(encoding="utf-8")
    _PROMPT_TEMPLATE = cached
    return cached
|
|
|
|
|
|
def _build_articles_block(selected: list[dict]) -> str:
    """Render the selected articles as numbered prompt lines.

    Each article becomes ``[i] (COUNTRY · SOURCE) text`` with ``i`` starting
    at 1. A missing country is shown as ``??`` and a missing ``ai_sub_group``
    as an empty string. The text is the first non-empty value among
    ``ai_summary_truncated``, ``ai_summary`` and ``title``, stripped of
    surrounding whitespace. Lines are joined with newlines; an empty input
    yields an empty string.
    """

    def render(position: int, member: dict) -> str:
        nation = member.get("country") or "??"
        outlet = member.get("ai_sub_group") or ""
        body = (
            member.get("ai_summary_truncated")
            or member.get("ai_summary")
            or member.get("title")
            or ""
        )
        return f"[{position}] ({nation} · {outlet}) {body.strip()}"

    return "\n".join(
        render(position, member) for position, member in enumerate(selected, start=1)
    )
|
|
|
|
|
|
def _build_historical_block(historical_docs: list[dict]) -> str:
    """Render historical reference documents for the prompt.

    With no documents a fixed "no reference material" placeholder is returned.
    Otherwise the output starts with a fixed notice line followed by one
    ``[H<i>] text`` line per document (``i`` starts at 1). The text is
    ``ai_summary`` or, failing that, ``title``; it is stripped and, when
    longer than 200 characters, cut to 200 and suffixed with an ellipsis.
    """
    if not historical_docs:
        return "(과거 참고 자료 없음)"

    def render(position: int, doc: dict) -> str:
        body = (doc.get("ai_summary") or doc.get("title") or "").strip()
        # Historical summaries can be long, so each one is capped at 200 chars.
        clipped = body if len(body) <= 200 else body[:200] + "…"
        return f"[H{position}] {clipped}"

    notice = "※ 이전 30일 흐름 참고용 — 본 분석에서 직접 인용 금지, 맥락 파악 용도."
    entries = [
        render(position, doc) for position, doc in enumerate(historical_docs, start=1)
    ]
    return "\n".join([notice, *entries])
|
|
|
|
|
|
def build_prompt(selected: list[dict], historical_docs: list[dict]) -> str:
    """Fill the prompt template with the article and historical blocks.

    The template's ``{articles_block}`` placeholder is replaced first, then
    ``{historical_block}``; both use plain string replacement, so every
    occurrence of each placeholder is substituted.
    """
    prompt = _load_prompt()
    substitutions = (
        ("{articles_block}", _build_articles_block(selected)),
        ("{historical_block}", _build_historical_block(historical_docs)),
    )
    for placeholder, rendered in substitutions:
        prompt = prompt.replace(placeholder, rendered)
    return prompt
|
|
|
|
|
|
def retrieve_historical(
    cluster: dict,
    candidates: list[dict],
    *,
    top_k: int = HISTORICAL_TOP_K,
    sim_min: float = HISTORICAL_SIMILARITY_MIN,
) -> list[dict]:
    """Pick the historical candidates most similar to the cluster centroid.

    Each candidate's ``embedding`` is passed through ``normalize_vector`` and
    dotted with ``cluster["centroid"]``; candidates scoring at least
    ``sim_min`` are kept, ordered by descending score (ties keep input order),
    and the first ``top_k`` are returned.

    Returns an empty list when ``candidates`` is empty or nothing reaches
    ``sim_min``.

    NOTE(review): the dot product equals cosine similarity only if the
    centroid is already unit-length — confirm with the clustering code.
    """
    if not candidates:
        return []

    centroid = cluster["centroid"]
    scored_pairs = (
        (float(np.dot(centroid, normalize_vector(doc["embedding"]))), doc)
        for doc in candidates
    )
    # Keep the ">=" form: a NaN score compares False and is therefore dropped.
    qualifying = [pair for pair in scored_pairs if pair[0] >= sim_min]
    ranked = sorted(qualifying, key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked[:top_k]]
|
|
|
|
|
|
async def _try_call_llm(client: Any, prompt: str) -> str:
    """Send ``prompt`` through ``client.call_primary`` under the module semaphore.

    The call is awaited with a ``LLM_CALL_TIMEOUT``-second limit, which starts
    only after the semaphore has been acquired. ``asyncio.TimeoutError`` and
    any exception from the client propagate to the caller.
    """
    async with _llm_sem:
        pending = client.call_primary(prompt)
        return await asyncio.wait_for(pending, timeout=LLM_CALL_TIMEOUT)
|
|
|
|
|
|
def _truncate_str(s: Any, limit: int) -> str:
    """Coerce ``s`` to a stripped string of bounded length.

    Non-string input yields ``""``. A string is stripped; if the result is
    longer than ``limit`` it is cut to ``limit`` characters, right-stripped,
    and suffixed with ``…`` (so the returned text can be ``limit + 1``
    characters long).
    """
    if not isinstance(s, str):
        return ""
    trimmed = s.strip()
    if len(trimmed) <= limit:
        return trimmed
    return trimmed[:limit].rstrip() + "…"
|
|
|
|
|
|
def _country_article_id_map(cluster: dict) -> dict[str, list[int]]:
    """Group cluster member ids by country, heaviest member first.

    Args:
        cluster: Cluster dict; its ``members`` entry is read as a list of
            member dicts carrying ``country``, ``id`` and optionally
            ``weight``. A missing or ``None`` ``members`` is treated as empty.

    Returns:
        Mapping of upper-cased country code to member ids sorted by weight,
        descending. Members with equal weight keep their original order.

    Malformed members never raise: a member with an empty country or without
    an integer-convertible ``id`` is skipped, and a missing, ``None`` or
    non-numeric ``weight`` counts as 0.0. This matters because the function
    runs while post-processing an LLM answer, where an exception would cost
    the whole topic.
    """
    grouped: dict[str, list[tuple[float, int]]] = {}
    for member in cluster.get("members") or []:
        country = (member.get("country") or "").upper()
        if not country:
            continue
        try:
            doc_id = int(member["id"])
        except (KeyError, TypeError, ValueError):
            # Without a usable id the member cannot be referenced; skip it.
            continue
        try:
            # "or 0.0" covers an explicit None as well as a missing key.
            weight = float(member.get("weight") or 0.0)
        except (TypeError, ValueError):
            weight = 0.0
        grouped.setdefault(country, []).append((weight, doc_id))

    # sorted() is stable, also with reverse=True, so ties keep member order.
    return {
        country: [
            doc_id
            for _, doc_id in sorted(pairs, key=lambda pair: pair[0], reverse=True)
        ]
        for country, pairs in grouped.items()
    }
|
|
|
|
|
|
def _resolve_article_ids(
    raw_ids: list,
    country: str,
    cluster_country_ids: dict[str, list[int]],
) -> list[int]:
    """Post-process one ``country_perspectives[].article_ids`` value.

    1) Keep only LLM-supplied ids that belong to this country's cluster
       members (blocks made-up ids), de-duplicated in first-seen order.
    2) If nothing survives, inject the country's top-weighted member ids.
    3) If the country has no members in the cluster, return ``[]``.

    Args:
        raw_ids: Value taken from the LLM answer; anything that is not a list
            is treated as "no ids".
        country: Country code used as the lookup key (the caller upper-cases
            it, matching the keys of ``cluster_country_ids``).
        cluster_country_ids: Country → member ids, already ordered by
            descending weight.

    Returns:
        At most ``MAX_ARTICLE_IDS_PER_COUNTRY`` ids.

    Entries that are not genuine integers are ignored instead of coerced:
    booleans (``True`` would otherwise become id 1), non-integral or
    non-finite floats (``3.7`` would become id 3; ``inf`` would raise
    ``OverflowError``), and anything ``int()`` rejects.
    """
    cluster_ids = cluster_country_ids.get(country, [])
    allowed = set(cluster_ids)

    # 1) LLM ids ∩ cluster member ids
    cleaned: list[int] = []
    seen: set[int] = set()
    if isinstance(raw_ids, list):
        for raw in raw_ids:
            if isinstance(raw, bool):
                continue
            if isinstance(raw, float) and not raw.is_integer():
                # Also covers inf / nan, for which is_integer() is False.
                continue
            try:
                doc_id = int(raw)
            except (TypeError, ValueError, OverflowError):
                continue
            if doc_id in allowed and doc_id not in seen:
                seen.add(doc_id)
                cleaned.append(doc_id)
    if cleaned:
        return cleaned[:MAX_ARTICLE_IDS_PER_COUNTRY]

    # 2) / 3) Country fallback: top-N by weight, or [] when the country is absent.
    return cluster_ids[:MAX_ARTICLE_IDS_PER_COUNTRY]
|
|
|
|
|
|
def _sanitize_envelope(parsed: dict, cluster: dict) -> dict | None:
    """Validate an LLM response envelope, clamp its sizes and repair article ids.

    Returns ``None`` (the caller then retries or falls back) when ``parsed``
    is not a dict, when ``topic_label`` or ``headline`` is empty after
    truncation, or when no usable country perspective remains.

    Otherwise returns a new dict with:
      * ``topic_label`` (≤120 chars) and ``headline`` (≤200 chars);
      * ``country_perspectives``: taken from the first ``MAX_PERSPECTIVES``
        raw entries; each kept entry has an upper-cased country, a summary
        and article ids resolved against the cluster members;
      * ``divergences`` / ``convergences``: non-empty strings from the first
        ``MAX_DIVERGENCES`` / ``MAX_CONVERGENCES`` raw items;
      * ``key_quotes``: entries with a non-empty quote from the first
        ``MAX_KEY_QUOTES`` raw items;
      * ``historical_context``: truncated text, or ``None`` when empty;
      * ``llm_fallback_used``: always ``False``.

    Character limits are applied by ``_truncate_str``.
    """
    if not isinstance(parsed, dict):
        return None

    label = _truncate_str(parsed.get("topic_label"), 120)
    title = _truncate_str(parsed.get("headline"), 200)
    if not (label and title):
        return None

    # Country → member ids lookup, built once and shared by every perspective.
    ids_by_country = _country_article_id_map(cluster)

    def capped_strings(field: str, cap: int, item_limit: int) -> list[str]:
        """Return the non-empty truncated strings among the first ``cap`` items."""
        values = parsed.get(field)
        if not isinstance(values, list):
            return []
        return [
            text
            for value in values[:cap]
            if (text := _truncate_str(value, item_limit))
        ]

    # country_perspectives
    viewpoints: list[dict] = []
    persp_field = parsed.get("country_perspectives")
    persp_items = persp_field[:MAX_PERSPECTIVES] if isinstance(persp_field, list) else ()
    for item in persp_items:
        if not isinstance(item, dict):
            continue
        nation = _truncate_str(item.get("country"), 10).upper()
        gist = _truncate_str(item.get("summary"), MAX_PERSPECTIVE_SUMMARY_LEN)
        # Ids are resolved before the emptiness check, as in the original flow.
        resolved = _resolve_article_ids(
            item.get("article_ids") or [], nation, ids_by_country
        )
        if not (nation and gist):
            continue
        viewpoints.append({
            "country": nation,
            "summary": gist,
            "article_ids": resolved,
        })
    if not viewpoints:
        return None

    differing = capped_strings("divergences", MAX_DIVERGENCES, 200)
    shared = capped_strings("convergences", MAX_CONVERGENCES, 200)

    # key_quotes: [{country, source, quote}]
    citations: list[dict] = []
    quote_field = parsed.get("key_quotes")
    if isinstance(quote_field, list):
        shaped = (
            {
                "country": _truncate_str(item.get("country"), 10).upper(),
                "source": _truncate_str(item.get("source"), 60),
                "quote": _truncate_str(item.get("quote"), 240),
            }
            for item in quote_field[:MAX_KEY_QUOTES]
            if isinstance(item, dict)
        )
        citations = [entry for entry in shaped if entry["quote"]]

    background = _truncate_str(
        parsed.get("historical_context"), MAX_HISTORICAL_CONTEXT_LEN
    )

    return {
        "topic_label": label,
        "headline": title,
        "country_perspectives": viewpoints,
        "divergences": differing,
        "convergences": shared,
        "key_quotes": citations,
        "historical_context": background or None,
        "llm_fallback_used": False,
    }
|
|
|
|
|
|
def _make_fallback(cluster: dict) -> dict:
    """Build the fixed-shape fallback topic row (Plan § "Fallback Topic Row").

    The topic is never dropped: the row carries the fixed fallback label and
    headline, empty lists for perspectives, divergences, convergences and
    quotes, no historical context, and ``llm_fallback_used=True``.
    ``cluster`` is accepted for signature symmetry but not read here.
    """
    row: dict = {
        "topic_label": FALLBACK_TOPIC_LABEL,
        "headline": FALLBACK_HEADLINE,
    }
    # Each list field gets its own fresh list so rows never share state.
    for list_field in ("country_perspectives", "divergences", "convergences", "key_quotes"):
        row[list_field] = []
    row["historical_context"] = None
    row["llm_fallback_used"] = True
    return row
|
|
|
|
|
|
async def compare_cluster_with_fallback(
    client: Any,
    cluster: dict,
    selected: list[dict],
    historical_docs: list[dict] | None = None,
) -> dict:
    """Run the comparative analysis for one cluster; never drop the topic.

    The prompt is built once, then the LLM is tried up to two times. An
    attempt fails when the call times out or raises, when the answer cannot
    be parsed, or when the parsed envelope does not pass ``_sanitize_envelope``.
    If both attempts fail the fixed fallback row is returned.

    Args:
        client: Object exposing an awaitable ``call_primary(prompt)``.
        cluster: Cluster dict, forwarded to envelope sanitising and fallback.
        selected: Articles rendered into the prompt.
        historical_docs: Optional historical reference documents; ``None``
            is treated as an empty list.

    Returns:
        Sanitised envelope dict (Plan § "LLM prompt output envelope")
        including ``llm_fallback_used``.

    Errors while building the prompt (e.g. an unreadable template file)
    still propagate, since retrying cannot fix them.
    """
    historical_docs = historical_docs or []
    prompt = build_prompt(selected, historical_docs)

    for attempt in range(2):
        try:
            raw = await _try_call_llm(client, prompt)
        except asyncio.TimeoutError:
            # .get(): a cluster without "members" must not turn the timeout
            # report itself into a KeyError.
            logger.warning(
                f"LLM timeout {LLM_CALL_TIMEOUT}s "
                f"(attempt={attempt + 1}, cluster size={len(cluster.get('members') or [])})"
            )
            continue
        except Exception as e:
            logger.warning(f"LLM 호출 실패 attempt={attempt + 1}: {e}")
            continue

        # Parsing and sanitising operate on untrusted LLM output. An unexpected
        # exception here counts as a failed attempt, so the fallback-row
        # guarantee holds instead of the error escaping to the caller.
        try:
            parsed = parse_json_response(raw)
            sanitized = _sanitize_envelope(parsed, cluster) if parsed else None
        except Exception as e:
            logger.warning(f"envelope 검증 중 예외 attempt={attempt + 1}: {e!r}")
            continue

        if sanitized:
            return sanitized
        logger.warning(
            f"envelope 검증 실패 attempt={attempt + 1} "
            f"(raw_len={len(raw) if raw else 0}, parsed_keys={list(parsed.keys()) if isinstance(parsed, dict) else None})"
        )

    return _make_fallback(cluster)
|