"""Load the overnight (5h) news window, normalize country, and optionally load past-N-day candidates.

- Documents collected between KST midnight and 05:00
  (source_channel='news' OR ai_domain='News').
- Canonical country = first non-null document_chunks.country, falling back to
  the news_sources prefix mapping (same as Phase 4).
- Rows with NULL ai_summary/embedding are excluded (principle: zero
  re-summarization / re-embedding).
- Returns a list of doc dicts (input for topic-first clustering; country is a
  field on each dict).
- Historical doc candidates for past retrieval are loaded by a separate
  function (used when BRIEFING_HISTORICAL_ENABLED is on).
"""

import json
from datetime import datetime
from typing import Any

import numpy as np
from sqlalchemy import text

from core.database import async_session
from core.utils import setup_logger
from services.search.license_filter import restricted_exclude_sql

logger = setup_logger("briefing_loader")

# NOTE(review): the chunk_country subquery uses LIMIT 1 without ORDER BY, so
# which non-null chunk country is picked is up to the database when a document
# has chunks with differing countries — confirm whether an explicit ordering
# is wanted.
_NEWS_WINDOW_SQL = text(f"""
    SELECT
        d.id,
        d.title,
        d.ai_summary,
        d.embedding,
        d.created_at,
        d.edit_url,
        d.ai_sub_group,
        (
            SELECT c.country
            FROM document_chunks c
            WHERE c.doc_id = d.id AND c.country IS NOT NULL
            LIMIT 1
        ) AS chunk_country
    FROM documents d
    WHERE (d.source_channel = 'news' OR d.ai_domain = 'News')
      AND d.deleted_at IS NULL
      AND d.created_at >= :window_start
      AND d.created_at < :window_end
      AND d.embedding IS NOT NULL
      AND d.ai_summary IS NOT NULL
      AND length(d.ai_summary) > 0
      -- 안전 자료실 B-4: licensed_restricted 발행 차단 (digest 와 동일 공유 술어, 경로 일관성)
      AND {restricted_exclude_sql("d")}
""")

_SOURCE_COUNTRY_SQL = text("""
    SELECT name, country
    FROM news_sources
    WHERE country IS NOT NULL
""")

_HISTORICAL_CANDIDATES_SQL = text(f"""
    SELECT
        d.id,
        d.title,
        d.ai_summary,
        d.embedding,
        d.created_at
    FROM documents d
    WHERE (d.source_channel = 'news' OR d.ai_domain = 'News')
      AND d.deleted_at IS NULL
      AND d.created_at >= :hist_start
      AND d.created_at < :hist_end
      AND d.embedding IS NOT NULL
      AND d.ai_summary IS NOT NULL
      AND length(d.ai_summary) > 0
      -- 안전 자료실 B-4: licensed_restricted 발행 차단 (공유 술어)
      AND {restricted_exclude_sql("d")}
""")


def _to_numpy_embedding(raw: Any) -> np.ndarray | None:
    """Convert a raw DB embedding value into a 1-D float32 vector.

    Args:
        raw: The value of the ``embedding`` column. ``None``, a JSON-encoded
            string, or anything ``np.asarray`` can turn into float32.

    Returns:
        A non-empty, one-dimensional, all-finite ``float32`` array, or
        ``None`` when the value cannot be interpreted as such a vector.
    """
    if raw is None:
        return None

    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except json.JSONDecodeError:
            return None

    try:
        arr = np.asarray(raw, dtype=np.float32)
    except (TypeError, ValueError):
        return None

    # A JSON scalar ("1.5", "null") becomes a 0-d array whose size is 1, and
    # nested lists become 2-D; neither is a usable embedding vector, so a
    # size check alone is not enough.
    if arr.ndim != 1 or arr.size == 0:
        return None

    # NaN/Inf components would propagate into every similarity computed from
    # this vector, so treat them the same as a missing embedding.
    if not np.isfinite(arr).all():
        return None

    return arr


async def _load_source_country_map(session) -> dict[str, str]:
    """Build a lookup from news_sources name prefixes to country (mirrors the Phase 4 pattern).

    For every ``(name, country)`` row two keys may be registered:

    * the first space-separated token of ``name``;
    * when ``name`` has at least three tokens, all tokens except the last,
      re-joined with single spaces.

    The first row to claim a key wins; later rows never overwrite it.

    Args:
        session: An open DB session used to execute ``_SOURCE_COUNTRY_SQL``.

    Returns:
        Mapping of prefix string to country value.
    """
    rows = await session.execute(_SOURCE_COUNTRY_SQL)
    mapping: dict[str, str] = {}

    for name, country in rows:
        if not name or not country:
            continue

        tokens = name.split(" ")

        first_token = tokens[0].strip()
        if first_token:
            mapping.setdefault(first_token, country)

        if len(tokens) >= 3:
            all_but_last = " ".join(tokens[:-1]).strip()
            if all_but_last:
                mapping.setdefault(all_but_last, country)

    return mapping


async def load_night_window(
    window_start: datetime,
    window_end: datetime,
) -> list[dict[str, Any]]:
    """Return the night-window news docs, each with its country filled in.

    Country resolution order: the ``chunk_country`` column from the query,
    then the news_sources prefix map looked up by the stripped
    ``ai_sub_group``. Docs whose country cannot be resolved are dropped
    (cross-country comparison is the core use case), as are docs whose
    embedding cannot be parsed into a vector.

    Args:
        window_start: Inclusive lower bound on ``documents.created_at``.
        window_end: Exclusive upper bound on ``documents.created_at``.

    Returns:
        ``[{id, title, ai_summary, embedding, created_at, edit_url,
        ai_sub_group, country}, ...]`` with ``country`` upper-cased.
    """
    docs: list[dict[str, Any]] = []
    null_country = 0

    async with async_session() as session:
        source_country = await _load_source_country_map(session)
        result = await session.execute(
            _NEWS_WINDOW_SQL,
            {"window_start": window_start, "window_end": window_end},
        )

        for row in result.mappings():
            # Embedding is checked first, so unparseable-embedding rows are
            # not counted in the country-mapping failure tally below.
            embedding = _to_numpy_embedding(row["embedding"])
            if embedding is None:
                continue

            country = row["chunk_country"]
            if not country:
                ai_sub_group = (row["ai_sub_group"] or "").strip()
                if ai_sub_group:
                    country = source_country.get(ai_sub_group)

            if not country:
                null_country += 1
                continue

            docs.append({
                "id": int(row["id"]),
                "title": row["title"] or "",
                "ai_summary": row["ai_summary"] or "",
                "embedding": embedding,
                "created_at": row["created_at"],
                "edit_url": row["edit_url"] or "",
                "ai_sub_group": row["ai_sub_group"] or "",
                "country": country.upper(),
            })

    if null_country:
        logger.warning(
            f"[loader] country 매핑 실패 drop {null_country}건 "
            f"(chunk_country + news_sources prefix 둘 다 fail)"
        )

    logger.info(
        f"[loader] night window {window_start} ~ {window_end} → "
        f"{len(docs)}건 ({len({d['country'] for d in docs})}개 국가)"
    )
    return docs


async def load_historical_candidates(
    hist_start: datetime,
    hist_end: datetime,
    exclude_ids: set[int],
) -> list[dict[str, Any]]:
    """Load past-N-day doc candidates (called only when BRIEFING_HISTORICAL_ENABLED=true).

    This is the raw candidate pool for cosine comparison against cluster
    centroids. No country mapping is performed (the docs are used only as LLM
    analysis input and are not displayed).

    Args:
        hist_start: Inclusive lower bound on ``documents.created_at``.
        hist_end: Exclusive upper bound on ``documents.created_at``.
        exclude_ids: Article ids from today's window, skipped to avoid
            retrieving the same article twice.

    Returns:
        ``[{id, title, ai_summary, embedding, created_at}, ...]``; rows whose
        embedding cannot be parsed into a vector are omitted.
    """
    out: list[dict[str, Any]] = []

    async with async_session() as session:
        result = await session.execute(
            _HISTORICAL_CANDIDATES_SQL,
            {"hist_start": hist_start, "hist_end": hist_end},
        )

        for row in result.mappings():
            doc_id = int(row["id"])
            if doc_id in exclude_ids:
                continue

            embedding = _to_numpy_embedding(row["embedding"])
            if embedding is None:
                continue

            out.append({
                "id": doc_id,
                "title": row["title"] or "",
                "ai_summary": row["ai_summary"] or "",
                "embedding": embedding,
                "created_at": row["created_at"],
            })

    logger.info(f"[loader] historical candidates: {len(out)} docs (window {hist_start.date()} ~ {hist_end.date()})")
    return out