# hyungi_document_server/app/workers/news_collector.py

"""News collection worker — collects articles from RSS/API and stores them in documents."""

import hashlib
import re
from datetime import datetime, timezone
from html import unescape
from urllib.parse import urlparse, urlunparse

import feedparser
import httpx
from sqlalchemy import select

from core.database import async_session
from core.utils import setup_logger
from models.document import Document
from models.news_source import NewsSource
from models.queue import ProcessingQueue

# Module-level logger named "news_collector", created by the project's setup_logger helper.
logger = setup_logger("news_collector")

# Maps raw category labels (in several languages) to standardized
# English category names.
CATEGORY_MAP = {
    # Korean
    "국제": "International",
    "정치": "Politics",
    "경제": "Economy",
    "사회": "Society",
    "문화": "Culture",
    "산업": "Industry",
    "환경": "Environment",
    "기술": "Technology",
    # English
    "World": "International",
    "International": "International",
    "Technology": "Technology",
    "Tech": "Technology",
    "Sci-Tech": "Technology",
    "Arts": "Culture",
    "Culture": "Culture",
    "Climate": "Environment",
    "Environment": "Environment",
    # Japanese
    "国際": "International",
    "文化": "Culture",
    "科学": "Technology",
    # German
    "Kultur": "Culture",
    "Wissenschaft": "Technology",
    # French
    "Environnement": "Environment",
}


def _normalize_category(raw: str) -> str:
    """Map a raw category label onto a standardized category name.

    Lookup order: the exact label, the whitespace-stripped label, then a
    case-insensitive comparison against the keys of ``CATEGORY_MAP`` (so
    lower-case labels such as ``"world"`` resolve like ``"World"``).

    Args:
        raw: Category label as received from a feed, an API response or the
            source configuration. ``None`` and other non-string values are
            tolerated.

    Returns:
        The standardized category name, or ``"Other"`` when nothing matches.
    """
    # Upstream payloads may carry null instead of a string; avoid calling
    # str methods on it.
    if not isinstance(raw, str):
        return "Other"

    if raw in CATEGORY_MAP:
        return CATEGORY_MAP[raw]

    label = raw.strip()
    if label in CATEGORY_MAP:
        return CATEGORY_MAP[label]

    # Last resort: ignore letter case.
    folded = label.casefold()
    for key, standard in CATEGORY_MAP.items():
        if key.casefold() == folded:
            return standard
    return "Other"


def _clean_html(text: str) -> str:
    """HTML 태그 제거 + 정제"""
    if not text:
        return ""
    text = re.sub(r"<[^>]+>", "", text)
    text = unescape(text)
    return text.strip()[:1000]


def _normalize_url(url: str) -> str:
    """URL 정규화 (tracking params 제거)"""
    parsed = urlparse(url)
    return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))


def _article_hash(title: str, published: str, source_name: str) -> str:
    """기사 고유 해시 (중복 체크용)"""
    key = f"{title}|{published}|{source_name}"
    return hashlib.sha256(key.encode()).hexdigest()[:32]


def _normalize_to_utc(dt) -> datetime:
    """다양한 시간 형식을 UTC로 정규화"""
    if isinstance(dt, datetime):
        if dt.tzinfo is None:
            return dt.replace(tzinfo=timezone.utc)
        return dt.astimezone(timezone.utc)
    return datetime.now(timezone.utc)


async def run():
    """Collect news from every enabled source and commit the results.

    Loads all ``NewsSource`` rows whose ``enabled`` flag is true, fetches each
    one through ``_fetch_api`` (when ``feed_type == "api"``) or ``_fetch_rss``
    (otherwise), stamps ``last_fetched_at`` on the source whether or not the
    fetch succeeded, and commits once at the end. A failure in one source is
    logged with its traceback and does not stop the remaining sources from
    being attempted.
    """
    async with async_session() as session:
        result = await session.execute(
            # .is_(True) is the SQLAlchemy spelling of a boolean test; a plain
            # Python `is True` would not build a SQL expression.
            select(NewsSource).where(NewsSource.enabled.is_(True))
        )
        sources = result.scalars().all()

        if not sources:
            logger.info("활성화된 뉴스 소스 없음")
            return

        total = 0
        for source in sources:
            fetch = _fetch_api if source.feed_type == "api" else _fetch_rss
            try:
                total += await fetch(session, source)
            except Exception as e:
                # Best-effort per source: log with traceback and move on.
                # NOTE(review): if the failure was a database error raised
                # during flush, the session may need a rollback before later
                # statements and the final commit can succeed — confirm
                # whether per-source savepoints are wanted here.
                logger.error(f"[{source.name}] 수집 실패: {e}", exc_info=True)
            finally:
                # Stamped on success and on failure alike.
                source.last_fetched_at = datetime.now(timezone.utc)

        await session.commit()
        logger.info(f"뉴스 수집 완료: {total}건 신규")


async def _fetch_rss(session, source: NewsSource) -> int:
    """Fetch an RSS feed and stage its new articles as documents.

    For every feed entry with a title, a ``Document`` is added to *session*
    (flushed, not committed) unless a document with the same article hash or
    the same link already exists. Each new document gets a ``classify`` queue
    row, and an ``embed`` queue row when the article is at most 30 days old.

    Args:
        session: Async database session used for lookups and inserts.
        source: The news source; ``feed_url`` and ``name`` are read here.

    Returns:
        The number of newly added articles.

    Raises:
        httpx.HTTPError: When the feed cannot be downloaded or the response
            status is an error.
    """
    async with httpx.AsyncClient(timeout=10) as client:
        resp = await client.get(source.feed_url)
        resp.raise_for_status()

    feed = feedparser.parse(resp.text)
    count = 0

    for entry in feed.entries:
        # `or ""` covers keys that are present but hold None.
        title = (entry.get("title") or "").strip()
        if not title:
            continue

        summary = _clean_html(entry.get("summary") or entry.get("description") or "")
        if not summary:
            summary = title

        link = entry.get("link") or ""
        published = entry.get("published_parsed") or entry.get("updated_parsed")
        try:
            pub_dt = datetime(*published[:6], tzinfo=timezone.utc) if published else None
        except (TypeError, ValueError):
            # Out-of-range or malformed time tuple: treat as undated.
            pub_dt = None
        if pub_dt is None:
            pub_dt = datetime.now(timezone.utc)

        # Duplicate check: same article hash, or same link. The stored
        # edit_url is the raw link, so both the raw and the normalized form
        # are compared. The URL clause is skipped for link-less entries,
        # otherwise "" would match every document with an empty edit_url.
        article_id = _article_hash(title, pub_dt.strftime("%Y%m%d"), source.name)
        duplicate = Document.file_hash == article_id
        url_candidates = sorted({u for u in (link, _normalize_url(link)) if u}) if link else []
        if url_candidates:
            duplicate = duplicate | Document.edit_url.in_(url_candidates)

        # LIMIT 1 + first(): more than one matching row must not raise.
        existing = await session.execute(select(Document.id).where(duplicate).limit(1))
        if existing.first() is not None:
            continue

        # NOTE(review): the previous code computed a normalized category from
        # source.category here but never stored it; no category field of
        # Document is visible from this file, so the dead computation was
        # removed — confirm whether it should be persisted.

        doc = Document(
            file_path=f"news/{source.name}/{article_id}",
            file_hash=article_id,
            file_format="article",
            file_size=len(summary.encode()),
            file_type="note",
            title=title,
            extracted_text=f"{title}\n\n{summary}",
            extracted_at=datetime.now(timezone.utc),
            extractor_version="rss",
            source_channel="news",
            data_origin="external",
            edit_url=link,
            review_status="approved",
        )
        session.add(doc)
        # Flush so doc.id is populated for the queue rows below.
        await session.flush()

        # Register classify (+ embed) queue rows; no extract stage is queued.
        session.add(ProcessingQueue(document_id=doc.id, stage="classify", status="pending"))

        # Only articles at most 30 days old are embedded.
        days_old = (datetime.now(timezone.utc) - pub_dt).days
        if days_old <= 30:
            session.add(ProcessingQueue(document_id=doc.id, stage="embed", status="pending"))

        count += 1

    logger.info(f"[{source.name}] RSS → {count}건 수집")
    return count


async def _fetch_api(session, source: NewsSource) -> int:
    """Fetch NYT Top Stories and stage the new articles as documents.

    The section requested is ``source.category`` (``"world"`` when unset) and
    the API key is read from the ``NYT_API_KEY`` environment variable. For
    every result with a title, a ``Document`` is added to *session* (flushed,
    not committed) unless a document with the same article hash or the same
    URL already exists. Each new document gets a ``classify`` queue row, and
    an ``embed`` queue row when the article is at most 30 days old.

    Args:
        session: Async database session used for lookups and inserts.
        source: The news source; ``category`` and ``name`` are read here.

    Returns:
        The number of newly added articles; 0 when no API key is configured.

    Raises:
        httpx.HTTPError: When the request fails or the response status is an
            error.
    """
    import os
    nyt_key = os.getenv("NYT_API_KEY", "")
    if not nyt_key:
        logger.warning("NYT_API_KEY 미설정")
        return 0

    async with httpx.AsyncClient(timeout=10) as client:
        resp = await client.get(
            f"https://api.nytimes.com/svc/topstories/v2/{source.category or 'world'}.json",
            params={"api-key": nyt_key},
        )
        resp.raise_for_status()

    data = resp.json()
    count = 0

    # `or []` / `or ""` cover JSON fields that are present but null.
    for article in data.get("results") or []:
        title = (article.get("title") or "").strip()
        if not title:
            continue

        summary = _clean_html(article.get("abstract") or "")
        if not summary:
            summary = title

        link = article.get("url") or ""
        pub_str = article.get("published_date") or ""
        try:
            # Route through _normalize_to_utc so a timestamp without an
            # offset becomes timezone-aware; subtracting a naive datetime
            # from an aware one below would raise TypeError.
            pub_dt = _normalize_to_utc(datetime.fromisoformat(pub_str.replace("Z", "+00:00")))
        except (ValueError, AttributeError, TypeError):
            pub_dt = datetime.now(timezone.utc)

        # Duplicate check: same article hash, or same URL. The stored
        # edit_url is the raw URL, so both the raw and the normalized form
        # are compared. The URL clause is skipped for URL-less results,
        # otherwise "" would match every document with an empty edit_url.
        article_id = _article_hash(title, pub_dt.strftime("%Y%m%d"), source.name)
        duplicate = Document.file_hash == article_id
        url_candidates = sorted({u for u in (link, _normalize_url(link)) if u}) if link else []
        if url_candidates:
            duplicate = duplicate | Document.edit_url.in_(url_candidates)

        # LIMIT 1 + first(): more than one matching row must not raise.
        existing = await session.execute(select(Document.id).where(duplicate).limit(1))
        if existing.first() is not None:
            continue

        # NOTE(review): the previous code computed a normalized category from
        # the article's "section" here but never stored it; no category field
        # of Document is visible from this file, so the dead computation was
        # removed — confirm whether it should be persisted.

        doc = Document(
            file_path=f"news/{source.name}/{article_id}",
            file_hash=article_id,
            file_format="article",
            file_size=len(summary.encode()),
            file_type="note",
            title=title,
            extracted_text=f"{title}\n\n{summary}",
            extracted_at=datetime.now(timezone.utc),
            extractor_version="nyt_api",
            source_channel="news",
            data_origin="external",
            edit_url=link,
            review_status="approved",
        )
        session.add(doc)
        # Flush so doc.id is populated for the queue rows below.
        await session.flush()

        session.add(ProcessingQueue(document_id=doc.id, stage="classify", status="pending"))

        # Only articles at most 30 days old are embedded.
        days_old = (datetime.now(timezone.utc) - pub_dt).days
        if days_old <= 30:
            session.add(ProcessingQueue(document_id=doc.id, stage="embed", status="pending"))

        count += 1

    logger.info(f"[{source.name}] API → {count}건 수집")
    return count