# hyungi_document_server/app/services/digest/clustering.py

"""Phase 4 Global Digest — country 내 topic cluster (time-decay + EMA + adaptive threshold).

알고리즘 코어는 `app/services/clustering_common.py` 로 추출되어 briefing 모듈과 공유.
본 파일은 Phase 4 고유 파라미터 (LAMBDA = ln(2)/3 일, MIN 3, MAX 10) 와 country 축 호출만 담당.
"""

import math

from core.utils import setup_logger
from services.clustering_common import (
    adaptive_threshold_by_density,
    greedy_assign_cluster,
)

# Module-level logger obtained from the project's setup_logger helper.
logger = setup_logger("digest_clustering")

# Phase 4 clustering parameters; cluster_country forwards each of them to
# greedy_assign_cluster.
LAMBDA = math.log(2) / 3        # 3-day half-life — user-confirmed value
CENTROID_ALPHA = 0.7            # EMA: keep 70% of the existing centroid, blend in 30% from the new member
MIN_ARTICLES_PER_TOPIC = 3      # passed as min_articles; presumably the smallest cluster that is kept — confirm in clustering_common
MAX_TOPICS_PER_COUNTRY = 10     # passed as max_topics; presumably caps the clusters returned per country — confirm in clustering_common


def adaptive_threshold(n_docs: int) -> float:
    """Return the Phase 4 similarity threshold (0.75 / 0.78 / 0.80) for *n_docs*.

    Kept as an alias so existing external imports of this name keep working;
    the value itself comes from the shared ``adaptive_threshold_by_density``.

    Args:
        n_docs: Number of documents being clustered.

    Returns:
        The threshold produced by ``adaptive_threshold_by_density``.
    """
    density_threshold = adaptive_threshold_by_density(n_docs)
    return density_threshold


def cluster_country(country: str, docs: list[dict]) -> list[dict]:
    """Cluster one country's documents and return the resulting clusters.

    The grouping itself is delegated to the shared ``greedy_assign_cluster``
    helper (which, per the original note, returns clusters already sorted and
    normalized); this function supplies the Phase 4 parameters and adds
    country-labelled log lines.

    Args:
        country: Country label; used here only as the log prefix.
        docs: Documents belonging to that country. Their schema is defined by
            ``greedy_assign_cluster`` and is not visible in this module.

    Returns:
        The cluster list produced by ``greedy_assign_cluster``, or an empty
        list when *docs* is empty.
    """
    # Guard clause: nothing to cluster, so skip the helper call entirely.
    if not docs:
        logger.info(f"[{country}] docs=0 → skip")
        return []

    n_docs = len(docs)
    similarity_cutoff = adaptive_threshold(n_docs)

    # Phase 4 tuning knobs, gathered in one place before the shared call.
    phase4_params = {
        "threshold": similarity_cutoff,
        "centroid_alpha": CENTROID_ALPHA,
        "min_articles": MIN_ARTICLES_PER_TOPIC,
        "max_topics": MAX_TOPICS_PER_COUNTRY,
        "lambda_val": LAMBDA,
    }
    kept_clusters, total_formed = greedy_assign_cluster(docs, **phase4_params)

    # Difference between the helper's reported raw count and what it returned.
    n_kept = len(kept_clusters)
    n_dropped = total_formed - n_kept

    summary = (
        f"[{country}] docs={n_docs} threshold={similarity_cutoff} "
        f"raw_clusters={total_formed} dropped={n_dropped} kept={n_kept}"
    )
    logger.info(summary)
    return kept_clusters