"""Select LLM inputs within a cluster — top-k + MMR diversity + ai_summary truncation.

Pure top-relevance selection is biased toward duplicate summaries of the same
event, so MMR is used to add diversity. The ai_summary length is capped at
SUMMARY_TRUNCATE to protect the LLM token budget.
"""
import numpy as np

K_PER_CLUSTER = 5
LAMBDA_MMR = 0.7  # relevance 70% / diversity 30%
SUMMARY_TRUNCATE = 300  # guards against long-tail (very long) ai_summary values


def _normalize(v: np.ndarray) -> np.ndarray:
    """Return *v* scaled to unit L2 norm; a zero-norm vector is returned unchanged."""
    norm = float(np.linalg.norm(v))
    if norm == 0.0:
        # Avoid division by zero: hand the vector back as-is.
        return v
    return v / norm


def _mmr_select(members: list[dict], centroid: np.ndarray, k: int) -> list[dict]:
    """Greedily pick up to *k* members by Maximal Marginal Relevance.

    Relevance of a member is ``dot(centroid, unit_embedding) * weight`` and is
    stored on the member dict under ``"_rel"``. Each following pick maximises
    ``LAMBDA_MMR * relevance - (1 - LAMBDA_MMR) * max_similarity_to_selected``.
    Ties go to the earliest member in ``members`` order.
    """
    # Normalise every embedding exactly once instead of once per comparison.
    units = [_normalize(m["embedding"]) for m in members]

    # relevance = centroid similarity x decay weight
    for member, unit in zip(members, units):
        member["_rel"] = float(np.dot(centroid, unit)) * member["weight"]
    relevance = [m["_rel"] for m in members]

    newest = max(range(len(members)), key=relevance.__getitem__)
    selected = [members[newest]]

    # Candidates are tracked by index and dropped by identity. Comparing the
    # member dicts with ``==`` (as ``list.remove`` does) would compare their
    # NumPy embeddings, which raises ValueError or may drop the wrong dict.
    remaining = [
        i for i in range(len(members)) if members[i] is not members[newest]
    ]

    # Highest similarity of each remaining candidate to anything selected so far.
    max_sim: dict[int, float] = {}

    def mmr_score(i: int) -> float:
        return LAMBDA_MMR * relevance[i] - (1.0 - LAMBDA_MMR) * max_sim[i]

    while len(selected) < k and remaining:
        # Only the most recent pick can raise a candidate's max similarity.
        newest_unit = units[newest]
        for i in remaining:
            sim = float(np.dot(units[i], newest_unit))
            max_sim[i] = sim if i not in max_sim else max(max_sim[i], sim)

        newest = max(remaining, key=mmr_score)
        picked = members[newest]
        selected.append(picked)
        remaining = [i for i in remaining if members[i] is not picked]

    return selected


def select_for_llm(cluster: dict, k: int = K_PER_CLUSTER) -> list[dict]:
    """Select the representative articles of a cluster to send to the LLM.

    If the cluster has at most *k* members, all of them are returned in their
    original order. Otherwise the most relevant member is taken first and the
    rest are chosen by MMR so that near-duplicate articles are penalised.

    Member dicts are modified in place: every returned item gets an
    ``ai_summary_truncated`` field, and when MMR selection runs every member
    of the cluster gets a ``_rel`` relevance score.

    Args:
        cluster: a single cluster from the result of clustering.cluster_country.
            Must contain ``"members"``; ``"centroid"`` is read only when the
            cluster has more than *k* members.
        k: number of articles to select (default 5). A value <= 0 selects
            nothing.

    Returns:
        List of the selected doc dicts, each with ``ai_summary_truncated`` added.
    """
    members = cluster["members"]

    if k <= 0:
        # Asking for no articles must not yield the top article anyway.
        selected = []
    elif len(members) <= k:
        selected = list(members)
    else:
        selected = _mmr_select(members, cluster["centroid"], k)

    # Protect the LLM input token budget; a missing or None summary becomes "".
    for m in selected:
        m["ai_summary_truncated"] = (m.get("ai_summary") or "")[:SUMMARY_TRUNCATE]
    return selected