"""뉴스 수집 워커 — RSS/API에서 기사 수집, documents에 저장""" import hashlib import re from datetime import datetime, timezone from html import unescape from urllib.parse import urlparse, urlunparse import feedparser import httpx from sqlalchemy import select from core.database import async_session from core.utils import setup_logger from models.document import Document from models.news_source import NewsSource from models.queue import ProcessingQueue logger = setup_logger("news_collector") # 카테고리 표준화 매핑 CATEGORY_MAP = { # 한국어 "국제": "International", "정치": "Politics", "경제": "Economy", "사회": "Society", "문화": "Culture", "산업": "Industry", "환경": "Environment", "기술": "Technology", # 영어 "World": "International", "International": "International", "Technology": "Technology", "Tech": "Technology", "Sci-Tech": "Technology", "Arts": "Culture", "Culture": "Culture", "Climate": "Environment", "Environment": "Environment", # 일본어 "国際": "International", "文化": "Culture", "科学": "Technology", # 독일어 "Kultur": "Culture", "Wissenschaft": "Technology", # 프랑스어 "Environnement": "Environment", } def _normalize_category(raw: str) -> str: """카테고리 표준화""" return CATEGORY_MAP.get(raw, CATEGORY_MAP.get(raw.strip(), "Other")) def _clean_html(text: str) -> str: """HTML 태그 제거 + 정제""" if not text: return "" text = re.sub(r"<[^>]+>", "", text) text = unescape(text) return text.strip()[:1000] def _normalize_url(url: str) -> str: """URL 정규화 (tracking params 제거)""" parsed = urlparse(url) return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", "")) def _article_hash(title: str, published: str, source_name: str) -> str: """기사 고유 해시 (중복 체크용)""" key = f"{title}|{published}|{source_name}" return hashlib.sha256(key.encode()).hexdigest()[:32] def _normalize_to_utc(dt) -> datetime: """다양한 시간 형식을 UTC로 정규화""" if isinstance(dt, datetime): if dt.tzinfo is None: return dt.replace(tzinfo=timezone.utc) return dt.astimezone(timezone.utc) return datetime.now(timezone.utc) async def run(): """뉴스 수집 실행""" async with async_session() as session: result = await session.execute( select(NewsSource).where(NewsSource.enabled == True) ) sources = result.scalars().all() if not sources: logger.info("활성화된 뉴스 소스 없음") return total = 0 for source in sources: try: if source.feed_type == "api": count = await _fetch_api(session, source) else: count = await _fetch_rss(session, source) source.last_fetched_at = datetime.now(timezone.utc) total += count except Exception as e: logger.error(f"[{source.name}] 수집 실패: {e}") source.last_fetched_at = datetime.now(timezone.utc) await session.commit() logger.info(f"뉴스 수집 완료: {total}건 신규") async def _fetch_rss(session, source: NewsSource) -> int: """RSS 피드 수집""" async with httpx.AsyncClient(timeout=10) as client: resp = await client.get(source.feed_url) resp.raise_for_status() feed = feedparser.parse(resp.text) count = 0 for entry in feed.entries: title = entry.get("title", "").strip() if not title: continue summary = _clean_html(entry.get("summary", "") or entry.get("description", "")) if not summary: summary = title link = entry.get("link", "") published = entry.get("published_parsed") or entry.get("updated_parsed") pub_dt = datetime(*published[:6], tzinfo=timezone.utc) if published else datetime.now(timezone.utc) # 중복 체크 article_id = _article_hash(title, pub_dt.strftime("%Y%m%d"), source.name) normalized_url = _normalize_url(link) existing = await session.execute( select(Document).where( (Document.file_hash == article_id) | (Document.edit_url == normalized_url) ) ) if existing.scalar_one_or_none(): continue category = _normalize_category(source.category or "") doc = Document( file_path=f"news/{source.name}/{article_id}", file_hash=article_id, file_format="article", file_size=len(summary.encode()), file_type="note", title=title, extracted_text=f"{title}\n\n{summary}", extracted_at=datetime.now(timezone.utc), extractor_version="rss", source_channel="news", data_origin="external", edit_url=link, review_status="approved", ) session.add(doc) await session.flush() # classify + embed 큐 등록 (extract 불필요) session.add(ProcessingQueue(document_id=doc.id, stage="classify", status="pending")) # 30일 이내만 embed days_old = (datetime.now(timezone.utc) - pub_dt).days if days_old <= 30: session.add(ProcessingQueue(document_id=doc.id, stage="embed", status="pending")) count += 1 logger.info(f"[{source.name}] RSS → {count}건 수집") return count async def _fetch_api(session, source: NewsSource) -> int: """NYT API 수집""" import os nyt_key = os.getenv("NYT_API_KEY", "") if not nyt_key: logger.warning("NYT_API_KEY 미설정") return 0 async with httpx.AsyncClient(timeout=10) as client: resp = await client.get( f"https://api.nytimes.com/svc/topstories/v2/{source.category or 'world'}.json", params={"api-key": nyt_key}, ) resp.raise_for_status() data = resp.json() count = 0 for article in data.get("results", []): title = article.get("title", "").strip() if not title: continue summary = _clean_html(article.get("abstract", "")) if not summary: summary = title link = article.get("url", "") pub_str = article.get("published_date", "") try: pub_dt = datetime.fromisoformat(pub_str.replace("Z", "+00:00")) except (ValueError, AttributeError): pub_dt = datetime.now(timezone.utc) article_id = _article_hash(title, pub_dt.strftime("%Y%m%d"), source.name) normalized_url = _normalize_url(link) existing = await session.execute( select(Document).where( (Document.file_hash == article_id) | (Document.edit_url == normalized_url) ) ) if existing.scalar_one_or_none(): continue category = _normalize_category(article.get("section", source.category or "")) doc = Document( file_path=f"news/{source.name}/{article_id}", file_hash=article_id, file_format="article", file_size=len(summary.encode()), file_type="note", title=title, extracted_text=f"{title}\n\n{summary}", extracted_at=datetime.now(timezone.utc), extractor_version="nyt_api", source_channel="news", data_origin="external", edit_url=link, review_status="approved", ) session.add(doc) await session.flush() session.add(ProcessingQueue(document_id=doc.id, stage="classify", status="pending")) days_old = (datetime.now(timezone.utc) - pub_dt).days if days_old <= 30: session.add(ProcessingQueue(document_id=doc.id, stage="embed", status="pending")) count += 1 logger.info(f"[{source.name}] API → {count}건 수집") return count