# hyungi_document_server/app/services/briefing/loader.py

"""야간 5h 수집 뉴스 윈도우 로드 + country 정규화 + (옵션) 과거 N일 후보 로드.

- KST 자정~05:00 사이 수집된 documents (source_channel='news' OR ai_domain='News').
- country canonical = document_chunks.country first non-null → news_sources prefix fallback (Phase 4 동일).
- ai_summary/embedding NULL 제외 (재요약/재임베딩 0회 원칙).
- 반환: doc dict 의 list (topic-first cluster 입력. country 는 각 dict 의 field).
- 과거 retrieval 용 historical doc 후보는 별도 함수 (BRIEFING_HISTORICAL_ENABLED on 시).
"""

from datetime import datetime
from typing import Any

import numpy as np
from sqlalchemy import text

from core.database import async_session
from core.utils import setup_logger

logger = setup_logger("briefing_loader")


# Night-window query: non-deleted documents that are news (source_channel='news'
# or ai_domain='News'), created in the half-open range [:window_start, :window_end),
# and that already have both an embedding and an ai_summary.
# `chunk_country` comes from a correlated subquery that picks one non-null
# document_chunks.country for the document (NULL when no chunk has a country).
# NOTE(review): the subquery uses LIMIT 1 without ORDER BY, so when a document's
# chunks carry different countries the chosen one is not deterministic — confirm
# whether an explicit ordering is needed.
_NEWS_WINDOW_SQL = text("""
    SELECT
        d.id,
        d.title,
        d.ai_summary,
        d.embedding,
        d.created_at,
        d.edit_url,
        d.ai_sub_group,
        (
            SELECT c.country
            FROM document_chunks c
            WHERE c.doc_id = d.id AND c.country IS NOT NULL
            LIMIT 1
        ) AS chunk_country
    FROM documents d
    WHERE (d.source_channel = 'news' OR d.ai_domain = 'News')
      AND d.deleted_at IS NULL
      AND d.created_at >= :window_start
      AND d.created_at < :window_end
      AND d.embedding IS NOT NULL
      AND d.ai_summary IS NOT NULL
""")


# (name, country) pairs for every news_sources row whose country is not NULL.
_SOURCE_COUNTRY_SQL = text("""
    SELECT name, country FROM news_sources WHERE country IS NOT NULL
""")


# Historical candidate query: non-deleted news documents (source_channel='news'
# or ai_domain='News') created in the half-open range [:hist_start, :hist_end)
# that have both an embedding and an ai_summary. Only id, title, ai_summary,
# embedding and created_at are selected; no country column is fetched.
_HISTORICAL_CANDIDATES_SQL = text("""
    SELECT
        d.id,
        d.title,
        d.ai_summary,
        d.embedding,
        d.created_at
    FROM documents d
    WHERE (d.source_channel = 'news' OR d.ai_domain = 'News')
      AND d.deleted_at IS NULL
      AND d.created_at >= :hist_start
      AND d.created_at < :hist_end
      AND d.embedding IS NOT NULL
      AND d.ai_summary IS NOT NULL
""")


def _to_numpy_embedding(raw: Any) -> np.ndarray | None:
    if raw is None:
        return None
    if isinstance(raw, str):
        import json
        try:
            raw = json.loads(raw)
        except json.JSONDecodeError:
            return None
    try:
        arr = np.asarray(raw, dtype=np.float32)
    except (TypeError, ValueError):
        return None
    if arr.size == 0:
        return None
    return arr


async def _load_source_country_map(session) -> dict[str, str]:
    """Build a lookup from news_sources name prefixes to their country.

    Each source name is split on single spaces and may register two keys:
    the first token, and (for names with three or more tokens) every token
    except the last, re-joined with spaces. The first row to claim a key
    wins; later rows never overwrite it. Rows with an empty name or country
    are ignored.

    Args:
        session: An open DB session whose ``execute`` is awaited with the
            news_sources query.

    Returns:
        Mapping of prefix string to country value.
    """
    lookup: dict[str, str] = {}
    result = await session.execute(_SOURCE_COUNTRY_SQL)
    for source_name, source_country in result:
        if not (source_name and source_country):
            continue
        parts = source_name.split(" ")
        # Candidate keys in priority order: first token, then "all but last".
        candidates = [parts[0].strip()]
        if len(parts) >= 3:
            candidates.append(" ".join(parts[:-1]).strip())
        for key in candidates:
            if key:
                # setdefault keeps the first country registered for a key.
                lookup.setdefault(key, source_country)
    return lookup


async def load_night_window(
    window_start: datetime,
    window_end: datetime,
) -> list[dict]:
    """Load the night-window news docs, each tagged with a resolved country.

    Country resolution per row: use the chunk-level country when present,
    otherwise look up the stripped ``ai_sub_group`` in the news_sources
    prefix map. Rows whose embedding cannot be converted are skipped
    silently; rows with no resolvable country are dropped and counted for a
    single warning log.

    Args:
        window_start: Inclusive lower bound on ``created_at``.
        window_end: Exclusive upper bound on ``created_at``.

    Returns:
        A list of dicts with keys id, title, ai_summary, embedding,
        created_at, edit_url, ai_sub_group and country (upper-cased).
    """
    query_params = {"window_start": window_start, "window_end": window_end}
    docs: list[dict] = []
    null_country = 0

    async with async_session() as session:
        # The fallback map is loaded first, on the same session as the main query.
        source_country = await _load_source_country_map(session)
        result = await session.execute(_NEWS_WINDOW_SQL, query_params)

        for record in result.mappings():
            embedding = _to_numpy_embedding(record["embedding"])
            if embedding is None:
                continue

            country = record["chunk_country"]
            if not country:
                # Fall back to the source-prefix map keyed by ai_sub_group.
                sub_group_key = (record["ai_sub_group"] or "").strip()
                if sub_group_key:
                    country = source_country.get(sub_group_key)
            if not country:
                null_country += 1
                continue

            entry = {
                "id": int(record["id"]),
                "title": record["title"] or "",
                "ai_summary": record["ai_summary"] or "",
                "embedding": embedding,
                "created_at": record["created_at"],
                "edit_url": record["edit_url"] or "",
                "ai_sub_group": record["ai_sub_group"] or "",
                "country": country.upper(),
            }
            docs.append(entry)

    if null_country:
        logger.warning(
            f"[loader] country 매핑 실패 drop {null_country}건 "
            f"(chunk_country + news_sources prefix 둘 다 fail)"
        )
    logger.info(
        f"[loader] night window {window_start} ~ {window_end} → "
        f"{len(docs)}건 ({len({d['country'] for d in docs})}개 국가)"
    )
    return docs


async def load_historical_candidates(
    hist_start: datetime,
    hist_end: datetime,
    exclude_ids: set[int],
) -> list[dict]:
    """Load the raw pool of historical news docs for a past date range.

    No country is resolved for these rows. Rows whose id is in
    ``exclude_ids`` and rows whose embedding cannot be converted are skipped.

    Args:
        hist_start: Inclusive lower bound on ``created_at``.
        hist_end: Exclusive upper bound on ``created_at``.
        exclude_ids: Document ids to leave out of the result.

    Returns:
        A list of dicts with keys id, title, ai_summary, embedding and
        created_at.
    """
    query_params = {"hist_start": hist_start, "hist_end": hist_end}
    out: list[dict] = []

    async with async_session() as session:
        result = await session.execute(_HISTORICAL_CANDIDATES_SQL, query_params)
        for record in result.mappings():
            doc_id = int(record["id"])
            # Check the exclusion set first so excluded rows never reach the
            # embedding conversion.
            if doc_id in exclude_ids:
                embedding = None
            else:
                embedding = _to_numpy_embedding(record["embedding"])
            if embedding is None:
                continue
            candidate = {
                "id": doc_id,
                "title": record["title"] or "",
                "ai_summary": record["ai_summary"] or "",
                "embedding": embedding,
                "created_at": record["created_at"],
            }
            out.append(candidate)

    logger.info(f"[loader] historical candidates: {len(out)} docs (window {hist_start.date()} ~ {hist_end.date()})")
    return out