"""Cluster 내 LLM 입력 선정 — top-k + MMR diversity + ai_summary truncate. 순수 top-relevance 는 동일 사건 중복 요약문에 편향되므로 MMR 로 다양성 확보. ai_summary 길이는 LLM 토큰 보호를 위해 SUMMARY_TRUNCATE 로 제한. """ import numpy as np from services.clustering_common import normalize_vector as _normalize K_PER_CLUSTER = 5 LAMBDA_MMR = 0.7 # relevance 70% / diversity 30% SUMMARY_TRUNCATE = 300 # long tail ai_summary 방어 def select_for_llm( cluster: dict, k: int = K_PER_CLUSTER, *, lambda_mmr: float = LAMBDA_MMR, summary_truncate: int = SUMMARY_TRUNCATE, ) -> list[dict]: """cluster 내 LLM 호출용 대표 article 들 선정. Args: cluster: clustering.cluster_country / briefing.cluster_global 결과 단일 cluster k: 선정 개수 (Phase 4=5, briefing=7) lambda_mmr: relevance vs diversity (Phase 4=0.7, briefing=0.6) summary_truncate: ai_summary 자르기 길이 (LLM 토큰 보호) Returns: 선정된 doc dict 리스트. 각 항목에 ai_summary_truncated 필드가 추가됨. """ members = cluster["members"] if len(members) <= k: selected = list(members) else: centroid = cluster["centroid"] for m in members: v = _normalize(m["embedding"]) m["_rel"] = float(np.dot(centroid, v)) * m["weight"] first = max(members, key=lambda x: x["_rel"]) selected = [first] candidates = [m for m in members if m is not first] while len(selected) < k and candidates: def mmr_score(c: dict) -> float: v = _normalize(c["embedding"]) max_sim = max( float(np.dot(v, _normalize(s["embedding"]))) for s in selected ) return lambda_mmr * c["_rel"] - (1.0 - lambda_mmr) * max_sim pick = max(candidates, key=mmr_score) selected.append(pick) candidates.remove(pick) for m in selected: m["ai_summary_truncated"] = (m.get("ai_summary") or "")[:summary_truncate] return selected