feat(digest): Phase 4 Global News Digest (cluster-level batch summarization)
7일 rolling window 뉴스를 country × topic 2-level로 묶어 매일 04:00 KST 배치 생성.
search 파이프라인 미사용. documents → clustering → cluster-level LLM summarization → digest.
핵심 결정:
- adaptive threshold (0.75/0.78/0.80) + EMA centroid (α=0.7) + time-decay (λ=ln(2)/3)
- min_articles=3, max_topics=10/country, top-5 MMR diversity, ai_summary[:300] truncate
- cluster-level LLM only, drop금지 fallback (topic_label="주요 뉴스 묶음" + top member ai_summary[:200])
- importance_score country별 0~1 normalize + raw_weight_sum 별도 보존, max(score, 0.01) floor
- per-call timeout 25s + pipeline hard cap 600s
- DELETE+INSERT idempotent (UNIQUE digest_date), AIClient._call_chat 직접 호출 (client.py 수정 없음)
신규:
- migrations/101_global_digests.sql (2테이블 정규화)
- app/models/digest.py (GlobalDigest + DigestTopic ORM)
- app/services/digest/{loader,clustering,selection,summarizer,pipeline}.py
- app/workers/digest_worker.py (PIPELINE_HARD_CAP + CLI 진입점)
- app/api/digest.py (/latest, ?date|country, /regenerate, inline Pydantic)
- app/prompts/digest_topic.txt (JSON-only + 절대 금지 블록)
main.py 4줄: import 2 + scheduler add_job 1 + include_router 1.
plan: ~/.claude/plans/quiet-herding-tome.md
This commit is contained in:
135
app/services/digest/loader.py
Normal file
135
app/services/digest/loader.py
Normal file
@@ -0,0 +1,135 @@
|
||||
"""뉴스 7일 window 로드 + country 정규화
|
||||
|
||||
- documents 테이블엔 country 컬럼이 없으므로 document_chunks.country 를 first non-null 로 조인.
|
||||
- chunk-level country 도 NULL 이면 news_sources.name prefix(ai_sub_group) 매칭으로 fallback.
|
||||
- 그래도 NULL 이면 drop(로그 경고).
|
||||
- ai_summary / embedding 이 NULL 이면 처음부터 제외 (재요약/재임베딩 0회 원칙).
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
from sqlalchemy import text
|
||||
|
||||
from core.database import async_session
|
||||
from core.utils import setup_logger
|
||||
|
||||
logger = setup_logger("digest_loader")
|
||||
|
||||
|
||||
# Rolling-window news query. documents has no country column, so a correlated
# subquery picks a non-NULL document_chunks.country per document. Rows missing
# embedding or ai_summary are filtered in SQL so they are never re-summarized
# or re-embedded downstream.
# NOTE(review): the subquery uses LIMIT 1 without ORDER BY — which chunk's
# country wins is unspecified; confirm chunks of one doc always share a country.
_NEWS_WINDOW_SQL = text("""
    SELECT
        d.id,
        d.title,
        d.ai_summary,
        d.embedding,
        d.created_at,
        d.edit_url,
        d.ai_sub_group,
        (
            SELECT c.country
            FROM document_chunks c
            WHERE c.doc_id = d.id AND c.country IS NOT NULL
            LIMIT 1
        ) AS chunk_country
    FROM documents d
    WHERE d.source_channel = 'news'
      AND d.deleted_at IS NULL
      AND d.created_at >= :window_start
      AND d.created_at < :window_end
      AND d.embedding IS NOT NULL
      AND d.ai_summary IS NOT NULL
""")


# Fallback source of country classification: news_sources name → country.
_SOURCE_COUNTRY_SQL = text("""
    SELECT name, country FROM news_sources WHERE country IS NOT NULL
""")
|
||||
|
||||
|
||||
def _to_numpy_embedding(raw: Any) -> np.ndarray | None:
|
||||
"""pgvector 컬럼을 numpy array(float32)로 정규화."""
|
||||
if raw is None:
|
||||
return None
|
||||
arr = np.asarray(raw, dtype=np.float32)
|
||||
if arr.size == 0:
|
||||
return None
|
||||
return arr
|
||||
|
||||
|
||||
async def _load_source_country_map(session) -> dict[str, str]:
    """Build a first-token index of news_sources name → country.

    news_sources.name looks like '경향신문 문화' while documents.ai_sub_group
    holds only the leading token ('경향신문'), so the map is keyed by the first
    space-separated token to enable prefix matching. The first country seen
    for a given prefix wins.
    """
    result = await session.execute(_SOURCE_COUNTRY_SQL)
    mapping: dict[str, str] = {}
    for name, country in result:
        if name and country:
            key = name.split(" ")[0].strip()
            if key:
                # setdefault keeps the first occurrence, like the original
                # "not in mapping" guard.
                mapping.setdefault(key, country)
    return mapping
|
||||
|
||||
|
||||
async def load_news_window(
    window_start: datetime,
    window_end: datetime,
) -> dict[str, list[dict]]:
    """Return news documents inside the window, grouped per country.

    Documents lacking a usable embedding are skipped; documents whose
    country cannot be resolved (chunk column, then news_sources prefix
    fallback) are dropped and counted in a warning log.

    Returns:
        {"KR": [doc_dict, ...], "US": [...], ...}
    """
    grouped: dict[str, list[dict]] = defaultdict(list)
    dropped = 0
    loaded = 0

    async with async_session() as session:
        prefix_map = await _load_source_country_map(session)

        rows = await session.execute(
            _NEWS_WINDOW_SQL,
            {"window_start": window_start, "window_end": window_end},
        )
        for row in rows.mappings():
            vec = _to_numpy_embedding(row["embedding"])
            if vec is None:
                continue

            country = row["chunk_country"]
            if not country:
                # news_sources prefix fallback
                sub_group = (row["ai_sub_group"] or "").strip()
                if sub_group:
                    country = prefix_map.get(sub_group)
            if not country:
                dropped += 1
                continue

            grouped[country.upper()].append({
                "id": int(row["id"]),
                "title": row["title"] or "",
                "ai_summary": row["ai_summary"] or "",
                "embedding": vec,
                "created_at": row["created_at"],
                "edit_url": row["edit_url"] or "",
                "ai_sub_group": row["ai_sub_group"] or "",
            })
            loaded += 1

    if dropped:
        logger.warning(
            f"[loader] country 분류 실패로 drop된 문서 {dropped}건 "
            f"(chunk_country + news_sources fallback 모두 실패)"
        )
    logger.info(
        f"[loader] window {window_start.date()} ~ {window_end.date()} → "
        f"{loaded}건 ({len(grouped)}개 국가)"
    )
    return dict(grouped)
|
||||
Reference in New Issue
Block a user