hyungi_document_server/app/services/briefing/comparator.py

"""Cluster → 26B MLX 비교 분석 호출 + JSON envelope + historical context + fallback row.

Plan §"LLM Parse 실패 시 Fallback Topic Row (고정 형태)":
LLM JSON parse 2회 재시도 후 실패 → 고정 형태 fallback 저장 (drop 금지).

Plan §"Historical Context":
BRIEFING_HISTORICAL_ENABLED=true 시 cluster centroid 와 historical candidate
cosine top-K 5 (similarity ≥0.70) 추출 → 프롬프트 {historical_block} 주입.
LLM 응답 envelope 의 historical_context 옵션 필드.
"""

import asyncio
import json
import os
from pathlib import Path
from typing import Any

import numpy as np

from ai.client import parse_json_response
from core.utils import setup_logger
from services.clustering_common import normalize_vector

# Module-level logger used for LLM timeout / failure / validation warnings below.
logger = setup_logger("briefing_comparator")

LLM_CALL_TIMEOUT = 25                 # Seconds per LLM call (same value as Phase 4); applied via asyncio.wait_for.
HISTORICAL_TOP_K = 5                  # Default `top_k` of retrieve_historical.
HISTORICAL_SIMILARITY_MIN = 0.70      # Default `sim_min` of retrieve_historical.
HISTORICAL_WINDOW_DAYS = 30           # NOTE(review): not referenced in this module's visible code — presumably used by the caller that builds the candidate pool; confirm.

# JSON envelope caps (enforced both in the prompt and in post-processing)
MAX_PERSPECTIVES = 10
MAX_DIVERGENCES = 3
MAX_CONVERGENCES = 2
MAX_KEY_QUOTES = 5
MAX_PERSPECTIVE_SUMMARY_LEN = 240     # Korean 1-2 sentences, <=120 chars x 2
MAX_HISTORICAL_CONTEXT_LEN = 240
MAX_ARTICLE_IDS_PER_COUNTRY = 5       # Post-processing cap for country_perspectives[].article_ids
# Fixed texts stored in the fallback topic row produced by _make_fallback.
FALLBACK_HEADLINE = "LLM 분석 실패로 원문 기사 묶음만 표시합니다."
FALLBACK_TOPIC_LABEL = "주요 뉴스 묶음"

# Single-slot semaphore: _try_call_llm holds it for the duration of each LLM call,
# so at most one call made through this module is in flight at a time.
_llm_sem = asyncio.Semaphore(1)
# Prompt template location: three `.parent` hops above this file, then prompts/briefing_comparative.txt.
_PROMPT_PATH = Path(__file__).resolve().parent.parent.parent / "prompts" / "briefing_comparative.txt"
# Lazy cache for the template text; filled by _load_prompt on first use.
_PROMPT_TEMPLATE: str | None = None


def historical_enabled() -> bool:
    """Report whether historical-context retrieval is switched on.

    Reads the ``BRIEFING_HISTORICAL_ENABLED`` environment variable on every
    call (default ``"false"``) and treats ``1``, ``true`` and ``yes`` —
    compared case-insensitively — as enabled. Any other value is disabled.
    """
    flag = os.environ.get("BRIEFING_HISTORICAL_ENABLED", "false")
    return flag.lower() in {"1", "true", "yes"}


def _load_prompt() -> str:
    """Return the prompt template text, reading the file only on first use.

    The text is read from ``_PROMPT_PATH`` as UTF-8 and cached in the
    module-level ``_PROMPT_TEMPLATE``; later calls return the cached string
    without touching the filesystem.
    """
    global _PROMPT_TEMPLATE
    cached = _PROMPT_TEMPLATE
    if cached is not None:
        return cached
    cached = _PROMPT_PATH.read_text(encoding="utf-8")
    _PROMPT_TEMPLATE = cached
    return cached


def _build_articles_block(selected: list[dict]) -> str:
    lines = []
    for i, m in enumerate(selected, start=1):
        country = m.get("country") or "??"
        source = m.get("ai_sub_group") or ""
        text = (m.get("ai_summary_truncated") or m.get("ai_summary") or m.get("title") or "").strip()
        lines.append(f"[{i}] ({country} · {source}) {text}")
    return "\n".join(lines)


def _build_historical_block(historical_docs: list[dict]) -> str:
    if not historical_docs:
        return "(과거 참고 자료 없음)"
    lines = ["※ 이전 30일 흐름 참고용 — 본 분석에서 직접 인용 금지, 맥락 파악 용도."]
    for i, d in enumerate(historical_docs, start=1):
        text = (d.get("ai_summary") or d.get("title") or "").strip()
        # historical 은 ai_summary 가 길 수 있어 200자 cap
        if len(text) > 200:
            text = text[:200] + "…"
        lines.append(f"[H{i}] {text}")
    return "\n".join(lines)


def build_prompt(selected: list[dict], historical_docs: list[dict]) -> str:
    """Fill the prompt template with the article and historical blocks.

    Args:
        selected: Articles rendered into the ``{articles_block}`` placeholder.
        historical_docs: Reference documents rendered into the
            ``{historical_block}`` placeholder (may be empty).

    Returns:
        The template text with both placeholders substituted.
    """
    template = _load_prompt()
    articles_block = _build_articles_block(selected)
    historical_block = _build_historical_block(historical_docs)
    # Substitute placeholders only where they occur in the template itself.
    # Chaining two str.replace calls would re-scan the already-inserted
    # article text, so an article summary that happens to contain the literal
    # "{historical_block}" would be rewritten. Splitting on the article
    # placeholder first keeps inserted content out of the second substitution.
    template_parts = template.split("{articles_block}")
    return articles_block.join(
        part.replace("{historical_block}", historical_block)
        for part in template_parts
    )


def retrieve_historical(
    cluster: dict,
    candidates: list[dict],
    *,
    top_k: int = HISTORICAL_TOP_K,
    sim_min: float = HISTORICAL_SIMILARITY_MIN,
) -> list[dict]:
    """Pick the candidates most similar to the cluster centroid.

    Each candidate's ``embedding`` is passed through ``normalize_vector`` and
    scored by its dot product with ``cluster["centroid"]``. This equals cosine
    similarity only if the centroid is itself unit-length — presumably
    guaranteed upstream; verify against the clustering step.

    Args:
        cluster: Must provide ``"centroid"``.
        candidates: Documents that each provide ``"embedding"``.
        top_k: Maximum number of documents returned.
        sim_min: Minimum score (inclusive) a candidate needs to qualify.

    Returns:
        Up to ``top_k`` qualifying candidates, highest score first; an empty
        list when there are no candidates or none reaches ``sim_min``.
    """
    if not candidates:
        return []
    centroid = cluster["centroid"]
    similarities = (
        (float(np.dot(centroid, normalize_vector(doc["embedding"]))), doc)
        for doc in candidates
    )
    qualifying = [pair for pair in similarities if pair[0] >= sim_min]
    # Stable sort on the negated score: ties keep their candidate order.
    ranked = sorted(qualifying, key=lambda pair: -pair[0])
    return [doc for _, doc in ranked[:top_k]]


async def _try_call_llm(client: Any, prompt: str) -> str:
    """Run one LLM call under the module semaphore with a timeout.

    The semaphore is acquired first, so the ``LLM_CALL_TIMEOUT`` budget covers
    only ``client.call_primary(prompt)`` itself, not the time spent waiting
    for the semaphore. A timeout raises ``asyncio.TimeoutError``; any
    exception from the client propagates unchanged.
    """
    async with _llm_sem:
        pending_call = client.call_primary(prompt)
        return await asyncio.wait_for(pending_call, timeout=LLM_CALL_TIMEOUT)


def _truncate_str(s: Any, limit: int) -> str:
    if not isinstance(s, str):
        return ""
    s = s.strip()
    if len(s) > limit:
        s = s[:limit].rstrip() + "…"
    return s


def _country_article_id_map(cluster: dict) -> dict[str, list[int]]:
    """cluster.members 를 country 별 article_id list 로 그룹 (weight 내림차순).

    Phase 4 selection 단계에서 m['weight'] 가 채워져 있음. 누락 시 0.0 으로 fallback.
    """
    grouped: dict[str, list[tuple[float, int]]] = {}
    for m in cluster.get("members", []):
        country = (m.get("country") or "").upper()
        if not country:
            continue
        weight = float(m.get("weight", 0.0))
        grouped.setdefault(country, []).append((weight, int(m["id"])))
    out: dict[str, list[int]] = {}
    for country, pairs in grouped.items():
        pairs.sort(key=lambda x: -x[0])
        out[country] = [doc_id for _, doc_id in pairs]
    return out


def _resolve_article_ids(
    raw_ids: list,
    country: str,
    cluster_country_ids: dict[str, list[int]],
) -> list[int]:
    """Post-process the ``article_ids`` the LLM gave for one country.

    1) Keep only LLM-provided ids that belong to this country's cluster
       members (blocks unrelated ids), de-duplicated in first-seen order.
    2) If nothing survives, fall back to the country's own member ids in
       their existing order.
    3) If the country has no mapped members either, the result is ``[]``.

    The result is capped at ``MAX_ARTICLE_IDS_PER_COUNTRY`` in all cases.

    Args:
        raw_ids: Value taken from the LLM response; anything that is not a
            list is ignored.
        country: Country code used to look up ``cluster_country_ids``.
        cluster_country_ids: Mapping of country code to cluster member ids.

    Returns:
        A list of integer ids.
    """
    cluster_ids = cluster_country_ids.get(country, [])
    allowed = set(cluster_ids)

    # 1) LLM ids ∩ cluster ids
    kept: list[int] = []
    seen: set[int] = set()
    if isinstance(raw_ids, list):
        for candidate in raw_ids:
            try:
                doc_id = int(candidate)
            except (TypeError, ValueError, OverflowError):
                # OverflowError: int(float("inf")) — json accepts `Infinity`,
                # so untrusted LLM output can reach this branch.
                continue
            if doc_id in allowed and doc_id not in seen:
                seen.add(doc_id)
                kept.append(doc_id)
    if kept:
        return kept[:MAX_ARTICLE_IDS_PER_COUNTRY]

    # 2) Country fallback: first N of the country's member ids
    return cluster_ids[:MAX_ARTICLE_IDS_PER_COUNTRY]


def _sanitize_envelope(parsed: dict, cluster: dict) -> dict | None:
    """Validate the LLM response envelope, enforce caps, and fix up article ids.

    Returns ``None`` — signalling the caller to retry or fall back — when
    ``parsed`` is not a dict, when ``topic_label`` or ``headline`` is missing
    or empty, or when no usable country perspective remains.

    Args:
        parsed: Decoded LLM response.
        cluster: Cluster whose members are used to validate and back-fill
            each perspective's ``article_ids``.

    Returns:
        A sanitized envelope dict with ``llm_fallback_used`` set to ``False``,
        or ``None`` when validation fails.
    """
    if not isinstance(parsed, dict):
        return None

    topic_label = _truncate_str(parsed.get("topic_label"), 120)
    headline = _truncate_str(parsed.get("headline"), 200)
    if not (topic_label and headline):
        return None

    # Build the country -> [id] lookup once; every perspective uses it.
    ids_by_country = _country_article_id_map(cluster)

    def _capped_strings(key: str, cap: int, item_limit: int) -> list[str]:
        # First `cap` items of parsed[key], each truncated; empties dropped.
        values = parsed.get(key)
        if not isinstance(values, list):
            return []
        trimmed = (_truncate_str(value, item_limit) for value in values[:cap])
        return [text for text in trimmed if text]

    # country_perspectives: the cap applies to raw items, before filtering.
    raw_perspectives = parsed.get("country_perspectives")
    if not isinstance(raw_perspectives, list):
        raw_perspectives = []
    perspectives = []
    for item in raw_perspectives[:MAX_PERSPECTIVES]:
        if not isinstance(item, dict):
            continue
        country = _truncate_str(item.get("country"), 10).upper()
        summary = _truncate_str(item.get("summary"), MAX_PERSPECTIVE_SUMMARY_LEN)
        article_ids = _resolve_article_ids(
            item.get("article_ids") or [], country, ids_by_country
        )
        if not (country and summary):
            continue
        perspectives.append(
            {"country": country, "summary": summary, "article_ids": article_ids}
        )
    if not perspectives:
        return None

    divergences = _capped_strings("divergences", MAX_DIVERGENCES, 200)
    convergences = _capped_strings("convergences", MAX_CONVERGENCES, 200)

    # key_quotes: [{country, source, quote}]; entries with no quote are dropped.
    raw_quotes = parsed.get("key_quotes")
    if not isinstance(raw_quotes, list):
        raw_quotes = []
    quotes = []
    for item in raw_quotes[:MAX_KEY_QUOTES]:
        if not isinstance(item, dict):
            continue
        quote_country = _truncate_str(item.get("country"), 10).upper()
        quote_source = _truncate_str(item.get("source"), 60)
        quote_text = _truncate_str(item.get("quote"), 240)
        if quote_text:
            quotes.append(
                {"country": quote_country, "source": quote_source, "quote": quote_text}
            )

    # Optional field: an empty or missing value is stored as None.
    historical_context = _truncate_str(
        parsed.get("historical_context"), MAX_HISTORICAL_CONTEXT_LEN
    )
    if not historical_context:
        historical_context = None

    return {
        "topic_label": topic_label,
        "headline": headline,
        "country_perspectives": perspectives,
        "divergences": divergences,
        "convergences": convergences,
        "key_quotes": quotes,
        "historical_context": historical_context,
        "llm_fallback_used": False,
    }


def _make_fallback(cluster: dict) -> dict:
    """Build the fixed-shape fallback topic row used when LLM analysis fails.

    The row uses the fixed fallback label and headline, empty lists for all
    list fields, no historical context, and ``llm_fallback_used=True``. The
    ``cluster`` argument is accepted but not read.
    """
    fallback: dict = {
        "topic_label": FALLBACK_TOPIC_LABEL,
        "headline": FALLBACK_HEADLINE,
    }
    # A fresh list per field so callers can mutate one without affecting others.
    for list_field in ("country_perspectives", "divergences", "convergences", "key_quotes"):
        fallback[list_field] = []
    fallback["historical_context"] = None
    fallback["llm_fallback_used"] = True
    return fallback


async def compare_cluster_with_fallback(
    client: Any,
    cluster: dict,
    selected: list[dict],
    historical_docs: list[dict] | None = None,
) -> dict:
    """Run the comparative analysis for one cluster, falling back on failure.

    The prompt is built once and the LLM is called up to two times. An
    attempt fails on timeout, on any exception raised by the call, or when
    the response does not parse or validate as an envelope. If both attempts
    fail, the fixed fallback row is returned instead of dropping the cluster.

    Args:
        client: Object exposing an awaitable ``call_primary(prompt)``.
        cluster: Cluster dict; its members are used for article-id
            post-processing and for the size shown in timeout logs.
        selected: Articles rendered into the prompt.
        historical_docs: Optional reference documents for the prompt.

    Returns:
        The sanitized envelope dict (``llm_fallback_used=False``) or the
        fallback row (``llm_fallback_used=True``).
    """
    historical_docs = historical_docs or []
    prompt = build_prompt(selected, historical_docs)
    # Only used in a log line. Computed defensively so a cluster without
    # "members" cannot raise KeyError from inside the timeout handler (the
    # rest of this module already treats members as optional).
    cluster_size = len(cluster.get("members") or [])

    for attempt in range(1, 3):
        try:
            raw = await _try_call_llm(client, prompt)
        except asyncio.TimeoutError:
            logger.warning(
                f"LLM timeout {LLM_CALL_TIMEOUT}s "
                f"(attempt={attempt}, cluster size={cluster_size})"
            )
            continue
        except Exception as e:
            # Boundary handler: any client failure counts as a failed attempt.
            logger.warning(f"LLM 호출 실패 attempt={attempt}: {e}")
            continue

        parsed = parse_json_response(raw)
        sanitized = _sanitize_envelope(parsed, cluster) if parsed else None
        if sanitized:
            return sanitized
        logger.warning(
            f"envelope 검증 실패 attempt={attempt} "
            f"(raw_len={len(raw) if raw else 0}, parsed_keys={list(parsed.keys()) if isinstance(parsed, dict) else None})"
        )

    return _make_fallback(cluster)