431d4fe010
야간 수집 뉴스 (KST 00:00~05:00) topic×country 비교 분석 1페이지 카드.
Phase 4 Global Digest 와 코드/로직/테이블 분리, 알고리즘만 services/clustering_common 공유.
Backend 신규:
- migrations/255_morning_briefings.sql: morning_briefings + briefing_topics
(briefing_date UNIQUE, UNIQUE(briefing_id,topic_rank), FK CASCADE,
historical_* 3컬럼 nullable, cluster_members JSONB, country_perspectives
JSONB, status 4-state success|partial|failed|empty)
- app/models/briefing.py: SQLAlchemy ORM
- app/services/briefing/loader.py: KST 5h 윈도우 + news_sources prefix
fallback (Phase 4 패턴 미러) + historical candidate pool 로더
- app/services/briefing/clustering.py: cluster_global topic-first
(LAMBDA=ln(2)/2h, MIN_COUNTRIES_PER_TOPIC=2, MAX_TOPICS=7)
- app/services/briefing/comparator.py: call_primary 26B + JSON envelope
sanitize (cap perspectives 10 / divergences 3 / convergences 2 /
quotes 5) + fallback row 고정 형태 + retrieve_historical cosine top-K
- app/services/briefing/pipeline.py: load→cluster→select(K=7,λ=0.6)
→historical→compare→status 4-state→delete+insert transaction
- app/workers/briefing_worker.py: APScheduler/수동 호출 공용 진입점,
600s hard cap
- app/prompts/briefing_comparative.txt: 한국어 비교 분석 JSON 프롬프트,
{articles_block} + {historical_block} 2섹션, 인용 금지 라벨
- app/api/briefing.py: GET /latest, GET ?date=, POST /regenerate?date=
(admin, sync delete+insert tx, regenerated:true)
Backend 수정:
- app/main.py: briefing_router 등록 (/api/briefing prefix). scheduler
등록은 PR-3 에서.
- app/services/digest/selection.py: select_for_llm 매개변수화 (K, λ
caller 주입). Phase 4 동작은 default 값으로 보존.
Historical 정책:
- BRIEFING_HISTORICAL_ENABLED env flag, default off.
- flag off → historical_* 컬럼 모두 NULL, prompt {historical_block} 빈
라벨, retrieval 호출 안 함.
- flag on (PR-1b 에서 enable) → cluster centroid 와 과거 30일 doc
embedding cosine top-K 5 (sim≥0.70), prompt 에 주입.
Country canonical (실측 확인 후):
- documents.country 컬럼 부재 확정
- document_chunks.country 매칭률 0% (chunks 자체가 뉴스에 안 만들어짐)
- 유일 country 신호 = news_sources prefix 매핑 (Phase 4 와 동일)
Tests:
- tests/test_briefing_historical.py: 3 경로 회귀 (flag off/on with
fixture/on zero match) + sanitize cap + fallback row 형태.
Verification: PR-1.8 에서 GPU 컨테이너 pytest + 수동 regenerate.
262 lines
9.3 KiB
Python
262 lines
9.3 KiB
Python
"""야간 수집 뉴스 브리핑 파이프라인 (Plan §"PR-MorningBriefing-1 Backend").
|
|
|
|
흐름: load_night_window → cluster_global → select_for_llm (k=7) →
|
|
(옵션) historical retrieval → compare_cluster_with_fallback → DB save.
|
|
|
|
regenerate 정책: briefing_date UNIQUE 충돌 시 transaction 안에서 DELETE+INSERT.
|
|
"""
|
|
|
|
import time
|
|
from datetime import date, datetime, timedelta, timezone
|
|
from typing import Any
|
|
from zoneinfo import ZoneInfo
|
|
|
|
from sqlalchemy import delete
|
|
|
|
from ai.client import AIClient
|
|
from core.database import async_session
|
|
from core.utils import setup_logger
|
|
from models.briefing import BriefingTopic, MorningBriefing
|
|
from services.briefing.clustering import LAMBDA, cluster_global
|
|
from services.briefing.comparator import (
|
|
HISTORICAL_WINDOW_DAYS,
|
|
compare_cluster_with_fallback,
|
|
historical_enabled,
|
|
retrieve_historical,
|
|
)
|
|
from services.briefing.loader import load_historical_candidates, load_night_window
|
|
from services.digest.selection import select_for_llm
|
|
|
|
logger = setup_logger("briefing_pipeline")
|
|
|
|
KST = ZoneInfo("Asia/Seoul")
|
|
NIGHT_WINDOW_HOURS = 5 # window length in hours: KST 00:00 ~ 05:00
|
|
SELECT_K = 7 # Plan section "Clustering parameters": briefing K_PER_CLUSTER=7
|
|
SELECT_LAMBDA_MMR = 0.6 # Plan: briefing MMR lambda 0.6
|
|
PIPELINE_HARD_CAP = 600 # seconds; same value as Phase 4
|
|
|
|
|
|
def _compute_window(target_date: date | None = None) -> tuple[datetime, datetime, date]:
    """Translate a KST calendar date into the UTC bounds of its night window.

    The window opens at KST midnight of ``target_date`` and lasts
    ``NIGHT_WINDOW_HOURS`` hours.

    Args:
        target_date: KST date whose midnight starts the window. ``None``
            selects the current date in KST.

    Returns:
        ``(window_start_utc, window_end_utc, kst_date)``.
    """
    kst_date = target_date if target_date is not None else datetime.now(KST).date()
    midnight_kst = datetime.combine(kst_date, datetime.min.time(), tzinfo=KST)
    window_close_kst = midnight_kst + timedelta(hours=NIGHT_WINDOW_HOURS)
    return (
        midnight_kst.astimezone(timezone.utc),
        window_close_kst.astimezone(timezone.utc),
        kst_date,
    )
|
|
|
|
|
|
def _is_usable_topic(envelope: dict, topic_label: str) -> bool:
|
|
"""fallback row 가 아닌 진짜 LLM 결과인지 판정."""
|
|
if envelope.get("llm_fallback_used"):
|
|
return False
|
|
if not envelope.get("country_perspectives"):
|
|
return False
|
|
if topic_label == "주요 뉴스 묶음":
|
|
return False
|
|
return True
|
|
|
|
|
|
def _compute_status(llm_calls: int, fallback_count: int, usable_count: int, has_topics: bool) -> str:
|
|
"""Plan §"Status 4-state 판정표"."""
|
|
if not has_topics or llm_calls == 0:
|
|
return "empty"
|
|
if usable_count == 0:
|
|
return "failed"
|
|
fallback_pct = (fallback_count / llm_calls) if llm_calls else 0.0
|
|
if fallback_pct >= 0.5:
|
|
return "failed"
|
|
if fallback_count > 0 or usable_count < llm_calls:
|
|
return "partial"
|
|
return "success"
|
|
|
|
|
|
def _build_topic_row(
    rank: int,
    cluster: dict,
    envelope: dict,
    historical_docs: list[dict] | None,
    primary_model: str,
) -> BriefingTopic:
    """Assemble the ``BriefingTopic`` row for one ranked cluster.

    Historical columns are filled only while ``historical_enabled()`` is true;
    otherwise ``historical_article_ids`` and ``historical_window_days`` stay
    ``None``.

    Args:
        rank: Rank stored as ``topic_rank``.
        cluster: Cluster dict; ``members`` is required, the score and count
            keys fall back to zero when absent.
        envelope: Comparison result; the label, headline, perspectives,
            divergences, convergences and quotes keys are required.
        historical_docs: Retrieved historical documents, or ``None``.
        primary_model: Model name stored as ``llm_model``.

    Returns:
        A new, unsaved ``BriefingTopic``.

    Raises:
        KeyError: If a required ``envelope`` or ``cluster`` key is missing.
    """
    if historical_enabled():
        past_ids = [doc["id"] for doc in historical_docs or ()]
        past_window = HISTORICAL_WINDOW_DAYS
    else:
        past_ids = past_window = None

    columns: dict = {"topic_rank": rank}
    # Required envelope keys are indexed directly so a missing one surfaces as KeyError.
    for key in (
        "topic_label",
        "headline",
        "country_perspectives",
        "divergences",
        "convergences",
        "key_quotes",
    ):
        columns[key] = envelope[key]
    columns["historical_article_ids"] = past_ids
    columns["historical_context"] = envelope.get("historical_context")
    columns["historical_window_days"] = past_window

    members = cluster["members"]
    columns["cluster_members"] = [member["id"] for member in members]
    columns["article_count"] = len(members)
    columns["country_count"] = cluster.get("country_count", 0)
    columns["importance_score"] = cluster.get("importance_score", 0.0)
    columns["raw_weight_sum"] = cluster.get("raw_weight_sum", 0.0)
    columns["llm_model"] = primary_model
    columns["llm_fallback_used"] = envelope.get("llm_fallback_used", False)
    return BriefingTopic(**columns)
|
|
|
|
|
|
async def _save_briefing(
    briefing_date: date,
    window_start: datetime,
    window_end: datetime,
    total_articles: int,
    total_countries: int,
    topic_rows: list[BriefingTopic],
    llm_calls: int,
    llm_failures: int,
    generation_ms: int,
    status: str,
) -> int:
    """Persist one briefing with its topic rows, replacing any row for the same date.

    ``briefing_date`` is UNIQUE, so an existing briefing for that date is
    deleted and the new one inserted within the same session before a single
    commit. Removal of the old topic rows presumably relies on the database FK
    cascade - confirm against the migration.

    Args:
        briefing_date: KST date the briefing belongs to.
        window_start: Start of the collection window.
        window_end: End of the collection window.
        total_articles: Number of articles loaded for the window.
        total_countries: Number of distinct countries in the window.
        topic_rows: Unsaved topic rows to attach to the briefing.
        llm_calls: Number of LLM calls made during the run.
        llm_failures: Number of LLM calls that fell back.
        generation_ms: Pipeline duration in milliseconds.
        status: Briefing status string.

    Returns:
        Primary key of the newly inserted ``MorningBriefing``.
    """
    async with async_session() as session:
        await session.execute(
            delete(MorningBriefing).where(MorningBriefing.briefing_date == briefing_date)
        )
        briefing = MorningBriefing(
            briefing_date=briefing_date,
            window_start=window_start,
            window_end=window_end,
            decay_lambda=LAMBDA,
            total_articles=total_articles,
            total_countries=total_countries,
            total_topics=len(topic_rows),
            generation_ms=generation_ms,
            llm_calls=llm_calls,
            llm_failures=llm_failures,
            status=status,
        )
        briefing.topics = topic_rows
        session.add(briefing)
        # Flush to get the primary key assigned, and read it BEFORE commit:
        # with expire_on_commit enabled, touching an attribute after commit
        # triggers an implicit refresh, which an AsyncSession cannot perform.
        await session.flush()
        briefing_id = briefing.id
        await session.commit()
    return briefing_id
|
|
|
|
|
|
async def run_briefing_pipeline(target_date: date | None = None) -> dict[str, Any]:
    """Run the night-news briefing pipeline once and persist the result.

    Called by the scheduled job or by the manual regenerate API. Steps: load
    the night window, cluster the documents, then for each cluster select
    articles, optionally retrieve historical documents, run the LLM comparison
    and build a topic row; finally compute the status and save.

    Args:
        target_date: KST date to process. ``None`` means today in KST.

    Returns:
        Dict with keys ``briefing_id``, ``status``, ``total_topics``,
        ``total_articles``, ``llm_calls``, ``llm_failures``, ``generation_ms``
        and ``regenerated``. ``generation_ms`` is the same value that is stored
        on the briefing row.
    """
    # Monotonic clock: wall-clock adjustments must not distort the duration.
    started = time.monotonic()

    def _elapsed_ms() -> int:
        return int((time.monotonic() - started) * 1000)

    window_start, window_end, briefing_date = _compute_window(target_date)
    # Read the flag once so every step of this run applies the same policy.
    use_historical = historical_enabled()
    logger.info(
        f"[briefing] start date={briefing_date} window {window_start} ~ {window_end} "
        f"decay_lambda={LAMBDA:.4f} historical={'on' if use_historical else 'off'}"
    )

    # 1. Load the night window.
    docs = await load_night_window(window_start, window_end)
    total_articles = len(docs)
    total_countries_in_window = len({d["country"] for d in docs})

    # Fields shared by both save paths (empty and regular).
    header = {
        "briefing_date": briefing_date,
        "window_start": window_start,
        "window_end": window_end,
        "total_articles": total_articles,
        "total_countries": total_countries_in_window,
    }

    # 2. Cluster (topic-first).
    clusters = cluster_global(docs)

    if not clusters:
        # Measure once so the stored and the returned duration agree.
        generation_ms = _elapsed_ms()
        briefing_id = await _save_briefing(
            **header,
            topic_rows=[],
            llm_calls=0,
            llm_failures=0,
            generation_ms=generation_ms,
            status="empty",
        )
        logger.info(f"[briefing] empty (no usable clusters) → briefing_id={briefing_id}")
        return {
            "briefing_id": briefing_id,
            "status": "empty",
            "total_topics": 0,
            "total_articles": total_articles,
            "llm_calls": 0,
            "llm_failures": 0,
            "generation_ms": generation_ms,
            "regenerated": True,
        }

    # 3. (Optional) load the historical candidate pool once for all clusters.
    historical_candidates: list[dict] = []
    if use_historical:
        hist_end = window_start  # up to just before today's window
        hist_start = hist_end - timedelta(days=HISTORICAL_WINDOW_DAYS)
        exclude = {d["id"] for d in docs}
        historical_candidates = await load_historical_candidates(hist_start, hist_end, exclude)

    # 4. One LLM comparison per cluster.
    client = AIClient()
    topic_rows: list[BriefingTopic] = []
    llm_calls = 0
    llm_failures = 0
    usable_count = 0

    try:
        # Read inside the try so the client is closed even if this lookup fails.
        primary_model = client.ai.primary.model
        for rank, cluster in enumerate(clusters, start=1):
            selected = select_for_llm(cluster, k=SELECT_K, lambda_mmr=SELECT_LAMBDA_MMR)
            historical_docs = (
                retrieve_historical(cluster, historical_candidates)
                if use_historical else []
            )
            llm_calls += 1
            envelope = await compare_cluster_with_fallback(
                client, cluster, selected, historical_docs=historical_docs
            )
            if envelope.get("llm_fallback_used"):
                llm_failures += 1
            if _is_usable_topic(envelope, envelope["topic_label"]):
                usable_count += 1
            # Fallback rows are stored too; only the usable counter skips them.
            topic_rows.append(
                _build_topic_row(rank, cluster, envelope, historical_docs, primary_model)
            )
    finally:
        await client.close()

    generation_ms = _elapsed_ms()
    status = _compute_status(llm_calls, llm_failures, usable_count, has_topics=bool(topic_rows))

    briefing_id = await _save_briefing(
        **header,
        topic_rows=topic_rows,
        llm_calls=llm_calls,
        llm_failures=llm_failures,
        generation_ms=generation_ms,
        status=status,
    )

    fallback_pct = (llm_failures / llm_calls * 100.0) if llm_calls else 0.0
    logger.info(
        f"[briefing] done id={briefing_id} status={status} topics={len(topic_rows)} "
        f"usable={usable_count}/{llm_calls} fallback={llm_failures}/{llm_calls} ({fallback_pct:.1f}%) "
        f"elapsed={generation_ms / 1000:.1f}s"
    )

    return {
        "briefing_id": briefing_id,
        "status": status,
        "total_topics": len(topic_rows),
        "total_articles": total_articles,
        "llm_calls": llm_calls,
        "llm_failures": llm_failures,
        "generation_ms": generation_ms,
        "regenerated": True,
    }
|