feat(digest): Phase 4 Global News Digest (cluster-level batch summarization)

7일 rolling window 뉴스를 country × topic 2-level로 묶어 매일 04:00 KST 배치 생성.
search 파이프라인 미사용. documents → clustering → cluster-level LLM summarization → digest.

핵심 결정:
- adaptive threshold (0.75/0.78/0.80) + EMA centroid (α=0.7) + time-decay (λ=ln(2)/3)
- min_articles=3, max_topics=10/country, top-5 MMR diversity, ai_summary[:300] truncate
- cluster-level LLM only, drop금지 fallback (topic_label="주요 뉴스 묶음" + top member ai_summary[:200])
- importance_score country별 0~1 normalize + raw_weight_sum 별도 보존, max(score, 0.01) floor
- per-call timeout 25s + pipeline hard cap 600s
- DELETE+INSERT idempotent (UNIQUE digest_date), AIClient._call_chat 직접 호출 (client.py 수정 없음)

신규:
- migrations/101_global_digests.sql (2테이블 정규화)
- app/models/digest.py (GlobalDigest + DigestTopic ORM)
- app/services/digest/{loader,clustering,selection,summarizer,pipeline}.py
- app/workers/digest_worker.py (PIPELINE_HARD_CAP + CLI 진입점)
- app/api/digest.py (/latest, ?date|country, /regenerate, inline Pydantic)
- app/prompts/digest_topic.txt (JSON-only + 절대 금지 블록)

main.py 4줄: import 2 + scheduler add_job 1 + include_router 1.
plan: ~/.claude/plans/quiet-herding-tome.md
This commit is contained in:
Hyungi Ahn
2026-04-09 07:45:11 +09:00
parent 64322e4f6f
commit 75a1919342
12 changed files with 991 additions and 0 deletions

View File

@@ -0,0 +1 @@
"""Phase 4 Global Digest 서비스 레이어 — 7일 뉴스 batch clustering + summarization."""

View File

@@ -0,0 +1,118 @@
"""Time-decay weight + adaptive threshold + EMA centroid greedy clustering.
플랜의 핵심 결정:
- λ = ln(2)/3 (3일 반감기)
- threshold: 0.75 / 0.78 / 0.80 (밀도 기반 adaptive)
- centroid: EMA α=0.7 (단순 평균의 seed bias / drift 방어)
- min_articles_per_topic = 3, max_topics_per_country = 10
- importance_score: country 내 0~1 normalize + max(score, 0.01) floor
- raw_weight_sum 별도 보존 (cross-day 트렌드 분석용)
"""
import math
from datetime import datetime, timezone
import numpy as np
from core.utils import setup_logger
logger = setup_logger("digest_clustering")
LAMBDA = math.log(2) / 3 # 3-day half-life for the time-decay weight — user-confirmed value
CENTROID_ALPHA = 0.7 # EMA: keep 70% of the existing centroid, blend 30% of the new member
MIN_ARTICLES_PER_TOPIC = 3
MAX_TOPICS_PER_COUNTRY = 10
SCORE_FLOOR = 0.01 # prevents importance_score from rendering as 0 in the UI
def adaptive_threshold(n_docs: int) -> float:
    """Pick a similarity threshold from document density.

    Sparse pools get a looser threshold (guards against fragmentation);
    dense pools get a stricter one (guards against one giant blob).
    """
    if n_docs < 50:
        return 0.75
    return 0.80 if n_docs > 200 else 0.78
def _normalize(v: np.ndarray) -> np.ndarray:
norm = float(np.linalg.norm(v))
if norm == 0.0:
return v
return v / norm
def _decay_weight(now: datetime, created_at: datetime) -> float:
    """exp(-λ · age_in_days); a naive ``created_at`` is assumed to be UTC."""
    ts = created_at
    if ts.tzinfo is None:
        ts = ts.replace(tzinfo=timezone.utc)
    age_days = max((now - ts).total_seconds() / 86400.0, 0.0)
    return math.exp(-LAMBDA * age_days)
def cluster_country(country: str, docs: list[dict]) -> list[dict]:
    """Greedy-cluster a single country's docs, then rank and normalize clusters.

    Args:
        country: country code (KR, US, ...)
        docs: output of loader.load_news_window (slice for one country).
            NOTE: mutated in place — every doc gains a "weight" key and the
            list is re-sorted by descending weight.

    Returns:
        [{centroid, members, weight_sum, raw_weight_sum, importance_score}, ...]
        - members is the list of doc dicts (each carries its decay weight)
        - sorted by importance_score descending, at most MAX_TOPICS_PER_COUNTRY
    """
    if not docs:
        logger.info(f"[{country}] docs=0 → skip")
        return []
    threshold = adaptive_threshold(len(docs))
    now = datetime.now(timezone.utc)
    # Compute time-decay weights, then seed clusters from the heaviest docs first.
    for d in docs:
        d["weight"] = _decay_weight(now, d["created_at"])
    docs.sort(key=lambda d: -d["weight"])
    clusters: list[dict] = []
    for d in docs:
        v = _normalize(d["embedding"])
        best_idx, best_sim = -1, 0.0
        # Attach to the most-similar existing cluster whose cosine similarity
        # clears the adaptive threshold; otherwise start a new cluster.
        for i, c in enumerate(clusters):
            sim = float(np.dot(c["centroid"], v))
            if sim > best_sim and sim >= threshold:
                best_sim, best_idx = sim, i
        if best_idx >= 0:
            c = clusters[best_idx]
            # EMA centroid update — damps seed bias / drift vs. a plain mean.
            c["centroid"] = CENTROID_ALPHA * c["centroid"] + (1.0 - CENTROID_ALPHA) * v
            c["centroid"] = _normalize(c["centroid"])
            c["members"].append(d)
            c["weight_sum"] += d["weight"]
        else:
            clusters.append({
                "centroid": v,
                "members": [d],
                "weight_sum": d["weight"],
            })
    raw_count = len(clusters)
    # Drop thin clusters, keep only the heaviest MAX_TOPICS_PER_COUNTRY.
    clusters = [c for c in clusters if len(c["members"]) >= MIN_ARTICLES_PER_TOPIC]
    dropped = raw_count - len(clusters)
    clusters.sort(key=lambda c: -c["weight_sum"])
    clusters = clusters[:MAX_TOPICS_PER_COUNTRY]
    # Normalize scores to 0..1 within the country (floor avoids a bare 0 in
    # the UI); raw_weight_sum is preserved for cross-day trend analysis.
    if clusters:
        max_w = max(c["weight_sum"] for c in clusters)
        for c in clusters:
            normalized = (c["weight_sum"] / max_w) if max_w > 0 else 0.0
            c["raw_weight_sum"] = c["weight_sum"]
            c["importance_score"] = max(normalized, SCORE_FLOOR)
    logger.info(
        f"[{country}] docs={len(docs)} threshold={threshold} "
        f"raw_clusters={raw_count} dropped={dropped} kept={len(clusters)}"
    )
    return clusters

View File

@@ -0,0 +1,135 @@
"""뉴스 7일 window 로드 + country 정규화
- documents 테이블엔 country 컬럼이 없으므로 document_chunks.country 를 first non-null 로 조인.
- chunk-level country 도 NULL 이면 news_sources.name prefix(ai_sub_group) 매칭으로 fallback.
- 그래도 NULL 이면 drop(로그 경고).
- ai_summary / embedding 이 NULL 이면 처음부터 제외 (재요약/재임베딩 0회 원칙).
"""
from collections import defaultdict
from datetime import datetime
from typing import Any
import numpy as np
from sqlalchemy import text
from core.database import async_session
from core.utils import setup_logger
logger = setup_logger("digest_loader")
# Window query: one row per news document that already has an embedding and an
# ai_summary. chunk_country is the first non-null country among the document's
# chunks (the documents table itself has no country column).
_NEWS_WINDOW_SQL = text("""
SELECT
d.id,
d.title,
d.ai_summary,
d.embedding,
d.created_at,
d.edit_url,
d.ai_sub_group,
(
SELECT c.country
FROM document_chunks c
WHERE c.doc_id = d.id AND c.country IS NOT NULL
LIMIT 1
) AS chunk_country
FROM documents d
WHERE d.source_channel = 'news'
AND d.deleted_at IS NULL
AND d.created_at >= :window_start
AND d.created_at < :window_end
AND d.embedding IS NOT NULL
AND d.ai_summary IS NOT NULL
""")
# name → country rows used for the news_sources fallback when chunk_country is NULL.
_SOURCE_COUNTRY_SQL = text("""
SELECT name, country FROM news_sources WHERE country IS NOT NULL
""")
def _to_numpy_embedding(raw: Any) -> np.ndarray | None:
"""pgvector 컬럼을 numpy array(float32)로 정규화."""
if raw is None:
return None
arr = np.asarray(raw, dtype=np.float32)
if arr.size == 0:
return None
return arr
async def _load_source_country_map(session) -> dict[str, str]:
    """Build a news_sources name-prefix → country index.

    Source names look like '경향신문 문화' while documents.ai_sub_group holds
    only the first token ('경향신문'), so the index is keyed on the first
    whitespace-separated token. The first occurrence of a prefix wins.
    """
    rows = await session.execute(_SOURCE_COUNTRY_SQL)
    mapping: dict[str, str] = {}
    for name, country in rows:
        if not (name and country):
            continue
        prefix = name.split(" ")[0].strip()
        if prefix:
            mapping.setdefault(prefix, country)
    return mapping
async def load_news_window(
    window_start: datetime,
    window_end: datetime,
) -> dict[str, list[dict]]:
    """Load news documents inside the window, grouped per country.

    Country resolution order: chunk-level country → news_sources name-prefix
    fallback → drop (counted and logged). Docs lacking an embedding or an
    ai_summary never enter the result (no re-summarize / re-embed policy).

    Args:
        window_start: inclusive lower bound on documents.created_at.
        window_end: exclusive upper bound on documents.created_at.

    Returns:
        {"KR": [doc_dict, ...], "US": [...], ...}
    """
    docs_by_country: dict[str, list[dict]] = defaultdict(list)
    null_country_count = 0
    total = 0
    async with async_session() as session:
        source_country = await _load_source_country_map(session)
        result = await session.execute(
            _NEWS_WINDOW_SQL,
            {"window_start": window_start, "window_end": window_end},
        )
        for row in result.mappings():
            embedding = _to_numpy_embedding(row["embedding"])
            if embedding is None:
                continue
            country = row["chunk_country"]
            if not country:
                # news_sources prefix fallback
                ai_sub_group = (row["ai_sub_group"] or "").strip()
                if ai_sub_group:
                    country = source_country.get(ai_sub_group)
            if not country:
                null_country_count += 1
                continue
            country = country.upper()
            docs_by_country[country].append({
                "id": int(row["id"]),
                "title": row["title"] or "",
                "ai_summary": row["ai_summary"] or "",
                "embedding": embedding,
                "created_at": row["created_at"],
                "edit_url": row["edit_url"] or "",
                "ai_sub_group": row["ai_sub_group"] or "",
            })
            total += 1
    if null_country_count:
        # FIX: the adjacent f-strings previously joined without a separator,
        # producing "문서 12(chunk_country..." in the log output.
        logger.warning(
            f"[loader] country 분류 실패로 drop된 문서 {null_country_count}건 "
            f"(chunk_country + news_sources fallback 모두 실패)"
        )
    logger.info(
        # FIX: same missing-separator issue ("...04-09150건" previously).
        f"[loader] window {window_start.date()} ~ {window_end.date()} "
        f"{total}건 ({len(docs_by_country)}개 국가)"
    )
    return dict(docs_by_country)

View File

@@ -0,0 +1,177 @@
"""Phase 4 digest pipeline orchestration.
Step:
1. AIClient 생성
2. 7일 window 로 documents 로드 (loader)
3. country 별 cluster_country (clustering)
4. cluster 별 select_for_llm (selection)
5. cluster 별 summarize_cluster_with_fallback (summarizer, LLM)
6. DELETE+INSERT 단일 트랜잭션 (idempotent)
7. start/end 로그 + generation_ms + fallback 비율 health metric
"""
import hashlib
import time
from datetime import date, datetime, timedelta, timezone
from zoneinfo import ZoneInfo

from sqlalchemy import delete

from ai.client import AIClient
from core.database import async_session
from core.utils import setup_logger
from models.digest import DigestTopic, GlobalDigest

from .clustering import LAMBDA, cluster_country
from .loader import load_news_window
from .selection import select_for_llm
from .summarizer import summarize_cluster_with_fallback
logger = setup_logger("digest_pipeline")
WINDOW_DAYS = 7  # rolling news window size in days
KST = ZoneInfo("Asia/Seoul")  # digest_date is keyed to the daily 04:00 KST batch
def _kst_today() -> date:
    """Return today's calendar date in KST (used as the digest_date key).

    FIX: the annotation previously claimed ``-> datetime`` while the function
    actually returns a ``datetime.date``.
    """
    return datetime.now(KST).date()
def _summary_hash(text: str) -> str:
return hashlib.sha256((text or "").encode("utf-8")).hexdigest()[:16]
def _build_topic_row(
    country: str,
    rank: int,
    cluster: dict,
    selected: list[dict],
    llm_result: dict,
    primary_model: str,
) -> DigestTopic:
    """Assemble a DigestTopic ORM row from the LLM result plus cluster metadata.

    article_ids are injected by code straight from cluster.members — the LLM
    never produces them, so it cannot fabricate ids.
    """
    member_ids = [int(m["id"]) for m in cluster["members"]]
    audit_sample = {
        "selected_doc_ids": [int(m["id"]) for m in selected],
        "summary_hashes": [_summary_hash(m.get("ai_summary") or "") for m in selected],
    }
    return DigestTopic(
        country=country,
        topic_rank=rank,
        topic_label=llm_result["topic_label"],
        summary=llm_result["summary"],
        article_ids=member_ids,
        article_count=len(member_ids),
        importance_score=float(cluster["importance_score"]),
        raw_weight_sum=float(cluster["raw_weight_sum"]),
        centroid_sample=audit_sample,
        llm_model=primary_model,
        llm_fallback_used=bool(llm_result["llm_fallback_used"]),
    )
async def run_digest_pipeline() -> dict:
    """Run the full digest pipeline; called from the worker entry point.

    Steps: load the 7-day window → per-country clustering → per-cluster
    selection + LLM summarization → single DELETE+INSERT transaction keyed
    on digest_date (idempotent) → health logging.

    Returns:
        Run statistics dict {llm_calls, fallback_used, total_topics,
        generation_ms, status}.
    """
    start = time.time()
    window_end = datetime.now(timezone.utc)
    window_start = window_end - timedelta(days=WINDOW_DAYS)
    digest_date = _kst_today()
    logger.info(
        f"[global_digest] start window={window_start.date()} ~ {window_end.date()} "
        f"digest_date={digest_date} decay_lambda={LAMBDA:.4f}"
    )
    docs_by_country = await load_news_window(window_start, window_end)
    if not docs_by_country:
        logger.warning("[global_digest] 7일 window에 뉴스 0건 — digest 생성 스킵")
        # NOTE(review): this early return omits the "status" key that the
        # normal path includes — confirm callers tolerate its absence.
        return {
            "llm_calls": 0,
            "fallback_used": 0,
            "total_topics": 0,
            "generation_ms": int((time.time() - start) * 1000),
        }
    client = AIClient()
    primary_model = client.ai.primary.model
    all_topic_rows: list[DigestTopic] = []
    stats = {"llm_calls": 0, "fallback_used": 0}
    try:
        for country, docs in docs_by_country.items():
            clusters = cluster_country(country, docs)
            if not clusters:
                continue  # sparse countries are silently excluded
            for rank, cluster in enumerate(clusters, start=1):
                selected = select_for_llm(cluster)
                stats["llm_calls"] += 1
                llm_result = await summarize_cluster_with_fallback(client, cluster, selected)
                if llm_result["llm_fallback_used"]:
                    stats["fallback_used"] += 1
                all_topic_rows.append(
                    _build_topic_row(country, rank, cluster, selected, llm_result, primary_model)
                )
    finally:
        # Always release the AI client, even on mid-run failure.
        await client.close()
    generation_ms = int((time.time() - start) * 1000)
    total_articles = sum(len(d) for d in docs_by_country.values())
    countries_with_topics = len({r.country for r in all_topic_rows})
    # Health status: clean run → success; >50% fallbacks → failed; else partial.
    if stats["fallback_used"] == 0:
        status = "success"
    elif stats["llm_calls"] and stats["fallback_used"] / stats["llm_calls"] > 0.5:
        status = "failed"
    else:
        status = "partial"
    async with async_session() as session:
        # Idempotent: deleting today's GlobalDigest row removes its topics too
        # via ON DELETE CASCADE, so the fresh rows below fully replace them.
        await session.execute(
            delete(GlobalDigest).where(GlobalDigest.digest_date == digest_date)
        )
        new_digest = GlobalDigest(
            digest_date=digest_date,
            window_start=window_start,
            window_end=window_end,
            decay_lambda=LAMBDA,
            total_articles=total_articles,
            total_countries=countries_with_topics,
            total_topics=len(all_topic_rows),
            generation_ms=generation_ms,
            llm_calls=stats["llm_calls"],
            llm_failures=stats["fallback_used"],
            status=status,
        )
        new_digest.topics = all_topic_rows
        session.add(new_digest)
        await session.commit()
    fallback_pct = (
        (stats["fallback_used"] / stats["llm_calls"] * 100.0)
        if stats["llm_calls"] else 0.0
    )
    logger.info(
        f"[global_digest] done countries={countries_with_topics} "
        f"topics={len(all_topic_rows)} llm_calls={stats['llm_calls']} "
        f"fallback={stats['fallback_used']}/{stats['llm_calls']} ({fallback_pct:.2f}%) "
        f"status={status} elapsed={generation_ms / 1000:.1f}s"
    )
    return {
        "llm_calls": stats["llm_calls"],
        "fallback_used": stats["fallback_used"],
        "total_topics": len(all_topic_rows),
        "generation_ms": generation_ms,
        "status": status,
    }

View File

@@ -0,0 +1,62 @@
"""Cluster 내 LLM 입력 선정 — top-k + MMR diversity + ai_summary truncate.
순수 top-relevance 는 동일 사건 중복 요약문에 편향되므로 MMR 로 다양성 확보.
ai_summary 길이는 LLM 토큰 보호를 위해 SUMMARY_TRUNCATE 로 제한.
"""
import numpy as np
K_PER_CLUSTER = 5
LAMBDA_MMR = 0.7  # MMR trade-off: 70% relevance / 30% diversity
SUMMARY_TRUNCATE = 300  # guards against long-tail ai_summary inflating the prompt
def _normalize(v: np.ndarray) -> np.ndarray:
norm = float(np.linalg.norm(v))
if norm == 0.0:
return v
return v / norm
def select_for_llm(cluster: dict, k: int = K_PER_CLUSTER) -> list[dict]:
    """Pick the representative articles of *cluster* for the LLM call.

    Clusters with <= k members are taken whole. Larger clusters are ranked by
    relevance (centroid similarity × decay weight) and greedily picked with
    MMR so near-duplicate summaries don't crowd out distinct angles.

    Side effects: every selected doc gains "ai_summary_truncated"; when the
    MMR branch runs, every member gains a "_rel" score (the summarizer
    fallback reads it via .get).

    Args:
        cluster: one cluster dict from clustering.cluster_country
        k: number of docs to pick (default 5)

    Returns:
        The selected doc dicts, each with an ai_summary_truncated field.
    """
    members = cluster["members"]
    if len(members) <= k:
        selected = list(members)
    else:
        centroid = cluster["centroid"]
        # PERF: normalize each member embedding exactly once. The original
        # re-normalized every selected embedding inside the MMR inner loop
        # (O(k·n) redundant normalizations); the values are identical.
        unit = {id(m): _normalize(m["embedding"]) for m in members}
        # relevance = centroid similarity × decay weight
        for m in members:
            m["_rel"] = float(np.dot(centroid, unit[id(m)])) * m["weight"]
        first = max(members, key=lambda x: x["_rel"])
        selected = [first]
        candidates = [m for m in members if m is not first]
        while len(selected) < k and candidates:
            def mmr_score(c: dict) -> float:
                v = unit[id(c)]
                max_sim = max(float(np.dot(v, unit[id(s)])) for s in selected)
                return LAMBDA_MMR * c["_rel"] - (1.0 - LAMBDA_MMR) * max_sim
            pick = max(candidates, key=mmr_score)
            selected.append(pick)
            candidates.remove(pick)
    # Protect the LLM token budget.
    for m in selected:
        m["ai_summary_truncated"] = (m.get("ai_summary") or "")[:SUMMARY_TRUNCATE]
    return selected

View File

@@ -0,0 +1,123 @@
"""Cluster-level LLM 호출 + JSON 파싱 + timeout + drop금지 fallback.
핵심 결정:
- AIClient._call_chat 직접 호출 (client.py 수정 회피, fallback 로직 재사용)
- Semaphore(1) 로 MLX 과부하 회피
- Per-call timeout 25초 (asyncio.wait_for) — MLX hang/Ollama stall 방어
- JSON 파싱 실패 → 1회 재시도 → 그래도 실패 시 minimal fallback (drop 금지)
- fallback: topic_label="주요 뉴스 묶음", summary = top member ai_summary[:200]
"""
import asyncio
from pathlib import Path
from typing import Any
from ai.client import parse_json_response
from core.utils import setup_logger
logger = setup_logger("digest_summarizer")
LLM_CALL_TIMEOUT = 25 # seconds; MLX averages ~5s, margin covers tail latency
FALLBACK_SUMMARY_LIMIT = 200
# Serialize LLM calls — per module docstring, avoids overloading the MLX backend.
_llm_sem = asyncio.Semaphore(1)
_PROMPT_PATH = Path(__file__).resolve().parent.parent.parent / "prompts" / "digest_topic.txt"
_PROMPT_TEMPLATE: str | None = None  # lazy prompt cache, populated by _load_prompt()
def _load_prompt() -> str:
    """Return the digest prompt template, reading it from disk on first use only."""
    global _PROMPT_TEMPLATE
    if _PROMPT_TEMPLATE is not None:
        return _PROMPT_TEMPLATE
    _PROMPT_TEMPLATE = _PROMPT_PATH.read_text(encoding="utf-8")
    return _PROMPT_TEMPLATE
def build_prompt(selected: list[dict]) -> str:
    """Render digest_topic.txt with the selected articles' truncated summaries.

    Template placeholder: {articles_block}
    """
    numbered = []
    for idx, doc in enumerate(selected, start=1):
        body = (doc.get("ai_summary_truncated") or doc.get("ai_summary") or doc.get("title") or "").strip()
        numbered.append(f"[{idx}] {body}")
    return _load_prompt().replace("{articles_block}", "\n".join(numbered))
async def _try_call_llm(client: Any, prompt: str) -> str:
    """One LLM request, serialized by the module semaphore and hard-capped per call."""
    async with _llm_sem:
        pending = client._call_chat(client.ai.primary, prompt)
        return await asyncio.wait_for(pending, timeout=LLM_CALL_TIMEOUT)
def _make_fallback(cluster: dict) -> dict:
    """Build a minimal digest entry from the cluster's top member (no-drop policy)."""
    members = cluster["members"]
    summary = ""
    if members:
        top = max(members, key=lambda m: m.get("_rel", m.get("weight", 0.0)))
        summary = (top.get("ai_summary") or top.get("title") or "").strip()[:FALLBACK_SUMMARY_LIMIT]
    return {
        "topic_label": "주요 뉴스 묶음",
        "summary": summary,
        "llm_fallback_used": True,
    }
async def summarize_cluster_with_fallback(
    client: Any,
    cluster: dict,
    selected: list[dict],
) -> dict:
    """Summarize one cluster via LLM with JSON validation and a no-drop fallback.

    Flow: build prompt → up to 2 attempts (a timeout, a transport error, or
    invalid JSON each consume one attempt) → on success return the parsed
    label + summary, otherwise fall back to _make_fallback(cluster).

    Returns:
        {topic_label, summary, llm_fallback_used}
    """
    prompt = build_prompt(selected)
    for attempt in range(2):  # includes one retry
        try:
            raw = await _try_call_llm(client, prompt)
        except asyncio.TimeoutError:
            logger.warning(
                f"LLM 호출 timeout {LLM_CALL_TIMEOUT}s "
                f"(attempt={attempt + 1}, cluster size={len(cluster['members'])})"
            )
            continue
        except Exception as e:
            # Broad catch is deliberate: any backend failure must route to the
            # fallback rather than abort the whole pipeline run.
            logger.warning(
                f"LLM 호출 실패 attempt={attempt + 1} "
                f"(cluster size={len(cluster['members'])}): {e}"
            )
            continue
        parsed = parse_json_response(raw)
        # Accept only a well-formed payload: both fields present, string-typed,
        # and non-blank after stripping.
        if (
            parsed
            and isinstance(parsed.get("topic_label"), str)
            and isinstance(parsed.get("summary"), str)
            and parsed["topic_label"].strip()
            and parsed["summary"].strip()
        ):
            return {
                "topic_label": parsed["topic_label"].strip(),
                "summary": parsed["summary"].strip(),
                "llm_fallback_used": False,
            }
        logger.warning(
            f"JSON 파싱 실패 attempt={attempt + 1} "
            f"(cluster size={len(cluster['members'])}, raw_len={len(raw) if raw else 0})"
        )
    return _make_fallback(cluster)