# hyungi_document_server/app/services/briefing/clustering.py

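"""Topic-first clustering of overnight news.

Opposite axis from Phase 4: instead of one clustering per country, **all docs are merged and clustered by topic**.
The country distribution then falls out of each cluster automatically (via the country field of each doc dict).

Parameters (tuned for the 5h window):
- LAMBDA = ln(2)/2h ≈ 0.347 per hour (2-hour half-life; fast decay because the overnight window is only 5h).
  The code stores the same rate per day: ln(2)/(2/24) ≈ 8.318.
- threshold = 0.70 (adjusted 2026-05-13 — relaxed from 0.78 after a spread case ended with kept=1)
- MIN_ARTICLES_PER_TOPIC = 2 (relaxed because overnight news is sparse)
- MIN_COUNTRIES_PER_TOPIC = 2 (cross-country coverage is the core value)
- MAX_TOPICS = 7 (about one page of output)
"""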

import math

from core.utils import setup_logger
from services.clustering_common import (
    greedy_assign_cluster,
    normalize_importance_scores,
)

# Module-level logger used by cluster_global for its summary lines.
logger = setup_logger("briefing_clustering")

# Decay rate passed to greedy_assign_cluster as lambda_val:
# ln(2) divided by 2 hours expressed in days (2/24).
LAMBDA = math.log(2) / (2.0 / 24.0)   # 2-hour half-life (unit: days)
# Passed to greedy_assign_cluster as threshold.
# NOTE(review): presumably a similarity cutoff for joining a cluster — confirm in clustering_common.
THRESHOLD = 0.70
# Passed to greedy_assign_cluster as centroid_alpha.
# NOTE(review): presumably the centroid update weight — confirm in clustering_common.
CENTROID_ALPHA = 0.7
# Passed to greedy_assign_cluster as min_articles (minimum articles per topic).
MIN_ARTICLES_PER_TOPIC = 2
# Minimum number of distinct countries a cluster needs to survive cluster_global's filter.
MIN_COUNTRIES_PER_TOPIC = 2
# Maximum number of clusters cluster_global returns; also scaled x4 as the pre-filter buffer.
MAX_TOPICS = 7


def _count_distinct_countries(cluster: dict) -> int:
    return len({m.get("country") for m in cluster["members"] if m.get("country")})


def cluster_global(docs: list[dict]) -> list[dict]:
    """Build topic clusters from the docs of all countries combined.

    Args:
        docs: Output of ``loader.load_night_window``; each dict is expected to
            carry a ``country`` field.

    Returns:
        A list of cluster dicts shaped like
        ``{centroid, members, weight_sum, raw_weight_sum, importance_score,
        country_count}``. ``country_count`` is set here; the other keys come
        from ``greedy_assign_cluster`` / ``normalize_importance_scores``.

        - Only clusters meeting both MIN_ARTICLES_PER_TOPIC and
          MIN_COUNTRIES_PER_TOPIC are kept.
        - At most MAX_TOPICS clusters are returned, in the order produced by
          ``greedy_assign_cluster`` (intended to be importance_score
          descending; this function does not re-sort).
        - An empty list is returned when ``docs`` is empty.
    """
    if not docs:
        logger.info("[briefing] docs=0 → skip")
        return []

    clusters, raw_count = greedy_assign_cluster(
        docs,
        threshold=THRESHOLD,
        centroid_alpha=CENTROID_ALPHA,
        min_articles=MIN_ARTICLES_PER_TOPIC,
        max_topics=MAX_TOPICS * 4,  # buffer ahead of the MIN_COUNTRIES filter
        lambda_val=LAMBDA,
    )

    # MIN_COUNTRIES_PER_TOPIC filter — drop single-country clusters.
    pre_country_filter = len(clusters)
    multi_country = []
    for c in clusters:
        cc = _count_distinct_countries(c)
        if cc >= MIN_COUNTRIES_PER_TOPIC:
            c["country_count"] = cc
            multi_country.append(c)

    # Count the country-filter drops BEFORE applying the MAX_TOPICS cap, so
    # clusters trimmed by the cap are not reported as single-country drops.
    dropped_country = pre_country_filter - len(multi_country)
    # NOTE(review): this difference may also include clusters trimmed by the
    # helper's own max_topics cap, not only min_articles drops — confirm in
    # clustering_common.
    dropped_min_articles = raw_count - pre_country_filter

    clusters = multi_country[:MAX_TOPICS]

    # Re-normalize importance after the MIN_COUNTRIES + MAX_TOPICS filtering
    # (0~1 within this briefing).
    normalize_importance_scores(clusters)

    logger.info(
        f"[briefing] docs={len(docs)} threshold={THRESHOLD} "
        f"raw_clusters={raw_count} dropped_min_articles={dropped_min_articles} "
        f"dropped_single_country={dropped_country} kept={len(clusters)}"
    )
    return clusters