Files
hyungi_document_server/app/workers/news_collector.py
Hyungi Ahn a6c19ef76c feat: 뉴스 자동 수집 시스템 — 6개국 신문 RSS/API
- news_sources 테이블 (소스 관리, UI 동적 제어)
- news_collector 워커: RSS(feedparser) + NYT API
  - 중복 체크: hash(title+date+source) + URL normalize
  - category 표준화, summary HTML 정제, timezone UTC
  - 30일 이내만 embed, source별 try/catch
- News API: 소스 CRUD + 수동 수집 트리거
- APScheduler: 6시간 간격 자동 수집
- 대상: 경향/아사히/NYT/르몽드/신화/슈피겔

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 13:38:07 +09:00

251 lines
8.0 KiB
Python

"""뉴스 수집 워커 — RSS/API에서 기사 수집, documents에 저장"""
import hashlib
import re
from datetime import datetime, timezone
from html import unescape
from urllib.parse import urlparse, urlunparse
import feedparser
import httpx
from sqlalchemy import select
from core.database import async_session
from core.utils import setup_logger
from models.document import Document
from models.news_source import NewsSource
from models.queue import ProcessingQueue
# Module-level logger used by the collection functions in this worker.
logger = setup_logger("news_collector")
# Lookup table from publisher-specific category labels to the canonical
# English category names. Labels that are not listed here are handled by
# the caller of this table, not by the table itself.
CATEGORY_MAP = {
    # Korean
    "국제": "International",
    "정치": "Politics",
    "경제": "Economy",
    "사회": "Society",
    "문화": "Culture",
    "산업": "Industry",
    "환경": "Environment",
    "기술": "Technology",
    # English
    "World": "International",
    "International": "International",
    "Technology": "Technology",
    "Tech": "Technology",
    "Sci-Tech": "Technology",
    "Arts": "Culture",
    "Culture": "Culture",
    "Climate": "Environment",
    "Environment": "Environment",
    # Japanese
    "国際": "International",
    "文化": "Culture",
    "科学": "Technology",
    # German
    "Kultur": "Culture",
    "Wissenschaft": "Technology",
    # French
    "Environnement": "Environment",
}
def _normalize_category(raw: str) -> str:
    """Translate a publisher category label into its canonical name.

    The label is looked up in ``CATEGORY_MAP`` exactly as given first and
    then with surrounding whitespace removed. ``"Other"`` is returned when
    neither form is a key of the table.
    """
    # Computed up front, so a value without ``.strip()`` fails immediately
    # regardless of whether the exact lookup would have matched.
    trimmed = raw.strip()
    if raw in CATEGORY_MAP:
        return CATEGORY_MAP[raw]
    return CATEGORY_MAP.get(trimmed, "Other")
def _clean_html(text: str) -> str:
"""HTML 태그 제거 + 정제"""
if not text:
return ""
text = re.sub(r"<[^>]+>", "", text)
text = unescape(text)
return text.strip()[:1000]
def _normalize_url(url: str) -> str:
"""URL 정규화 (tracking params 제거)"""
parsed = urlparse(url)
return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
def _article_hash(title: str, published: str, source_name: str) -> str:
"""기사 고유 해시 (중복 체크용)"""
key = f"{title}|{published}|{source_name}"
return hashlib.sha256(key.encode()).hexdigest()[:32]
def _normalize_to_utc(dt) -> datetime:
"""다양한 시간 형식을 UTC로 정규화"""
if isinstance(dt, datetime):
if dt.tzinfo is None:
return dt.replace(tzinfo=timezone.utc)
return dt.astimezone(timezone.utc)
return datetime.now(timezone.utc)
async def run():
    """Collect articles from every enabled news source and commit the result.

    Loads the ``NewsSource`` rows whose ``enabled`` flag is set, sends each
    one to ``_fetch_api`` (``feed_type == "api"``) or ``_fetch_rss`` (any
    other value), stamps ``last_fetched_at`` whether or not the fetch
    succeeded, and commits once after the last source.

    Every source is processed inside its own SAVEPOINT. When a source fails,
    only the rows that source added are rolled back and the session stays
    usable, so one failing source can neither poison the following sources
    nor make the final commit fail. Failures are logged with a traceback and
    are never propagated to the caller.
    """
    async with async_session() as session:
        result = await session.execute(
            # "== True" is required here: SQLAlchemy overloads the operator
            # to build the SQL comparison.
            select(NewsSource).where(NewsSource.enabled == True)  # noqa: E712
        )
        sources = result.scalars().all()
        if not sources:
            logger.info("활성화된 뉴스 소스 없음")
            return

        total = 0
        for source in sources:
            # Read the attributes needed for dispatch and error reporting
            # before any database work for this source starts.
            source_name = source.name
            fetch = _fetch_api if source.feed_type == "api" else _fetch_rss
            try:
                # begin_nested() opens a SAVEPOINT; leaving the block through
                # an exception rolls back to it and keeps the outer
                # transaction (and the earlier sources' rows) intact.
                async with session.begin_nested():
                    total += await fetch(session, source)
            except Exception as e:
                logger.exception(f"[{source_name}] 수집 실패: {e}")
            # Recorded for failed attempts too, as a "last tried" timestamp.
            source.last_fetched_at = datetime.now(timezone.utc)

        await session.commit()
        logger.info(f"뉴스 수집 완료: {total}건 신규")
async def _fetch_rss(session, source: NewsSource) -> int:
    """Download one RSS/Atom feed and store the entries that are new.

    An entry is skipped when it has no title, or when a ``Document`` already
    exists with the same article hash or with the same link (compared both
    as delivered and in normalized form). Each stored document gets a
    ``classify`` queue row, plus an ``embed`` row when it was published at
    most 30 days ago.

    Args:
        session: Open async database session; rows are flushed but not
            committed here.
        source: Feed definition; ``feed_url`` and ``name`` are read.

    Returns:
        Number of documents added.

    Raises:
        httpx.HTTPError: The feed could not be downloaded or the server
            answered with a non-success status.
    """
    # httpx does not follow redirects unless asked, and raise_for_status()
    # rejects a 3xx answer, so a feed that moved (e.g. http -> https) would
    # otherwise fail on every run.
    async with httpx.AsyncClient(timeout=10, follow_redirects=True) as client:
        resp = await client.get(source.feed_url)
        resp.raise_for_status()
    feed = feedparser.parse(resp.text)

    count = 0
    for entry in feed.entries:
        title = (entry.get("title") or "").strip()
        if not title:
            continue

        summary = _clean_html(entry.get("summary", "") or entry.get("description", ""))
        if not summary:
            summary = title
        link = entry.get("link") or ""

        now = datetime.now(timezone.utc)
        published = entry.get("published_parsed") or entry.get("updated_parsed")
        try:
            # The parsed time tuple is interpreted as UTC.
            pub_dt = datetime(*published[:6], tzinfo=timezone.utc) if published else now
        except (TypeError, ValueError):
            # A malformed time tuple must not abort the whole feed.
            pub_dt = now

        # Duplicate check: same (title, day, source) hash, or same link.
        article_id = _article_hash(title, pub_dt.strftime("%Y%m%d"), source.name)
        duplicate = Document.file_hash == article_id
        if link:
            # edit_url stores the link as delivered, so match the raw form
            # as well as the normalized one. Entries without a link skip the
            # URL test entirely instead of matching on an empty string.
            duplicate = duplicate | Document.edit_url.in_([link, _normalize_url(link)])
        # Only existence matters, and several rows may legitimately match
        # (one by hash, another by URL); take the first instead of requiring
        # at most one.
        found = await session.execute(select(Document.id).where(duplicate).limit(1))
        if found.first() is not None:
            continue

        # NOTE(review): a normalized category used to be computed here but
        # was never stored on the Document; the dead computation was removed.
        doc = Document(
            file_path=f"news/{source.name}/{article_id}",
            file_hash=article_id,
            file_format="article",
            file_size=len(summary.encode()),
            file_type="note",
            title=title,
            extracted_text=f"{title}\n\n{summary}",
            extracted_at=datetime.now(timezone.utc),
            extractor_version="rss",
            source_channel="news",
            data_origin="external",
            edit_url=link,
            review_status="approved",
        )
        session.add(doc)
        # Flush so that doc.id is assigned before the queue rows reference it.
        await session.flush()

        # Queue classify (+ embed); no extract stage is queued because the
        # text is already set above.
        session.add(ProcessingQueue(document_id=doc.id, stage="classify", status="pending"))
        # Only articles published within the last 30 days are embedded.
        days_old = (datetime.now(timezone.utc) - pub_dt).days
        if days_old <= 30:
            session.add(ProcessingQueue(document_id=doc.id, stage="embed", status="pending"))
        count += 1

    logger.info(f"[{source.name}] RSS → {count}건 수집")
    return count
async def _fetch_api(session, source: NewsSource) -> int:
    """Fetch the NYT Top Stories section for *source* and store new articles.

    The section is ``source.category`` (``"world"`` when unset) and the key
    is read from the ``NYT_API_KEY`` environment variable; without a key
    nothing is fetched. An article is skipped when it has no title, or when
    a ``Document`` already exists with the same article hash or the same URL
    (compared both as delivered and in normalized form). Each stored
    document gets a ``classify`` queue row, plus an ``embed`` row when it
    was published at most 30 days ago.

    Args:
        session: Open async database session; rows are flushed but not
            committed here.
        source: Source definition; ``category`` and ``name`` are read.

    Returns:
        Number of documents added (0 when the API key is missing).

    Raises:
        RuntimeError: The API answered with a non-success HTTP status.
        httpx.HTTPError: The request itself failed (timeout, connection).
    """
    import os

    nyt_key = os.getenv("NYT_API_KEY", "")
    if not nyt_key:
        logger.warning("NYT_API_KEY 미설정")
        return 0

    async with httpx.AsyncClient(timeout=10) as client:
        resp = await client.get(
            f"https://api.nytimes.com/svc/topstories/v2/{source.category or 'world'}.json",
            params={"api-key": nyt_key},
        )
    try:
        resp.raise_for_status()
    except httpx.HTTPStatusError as exc:
        # The httpx message embeds the full request URL, which contains the
        # api-key query parameter; re-raise without it so the key cannot end
        # up in logs.
        raise RuntimeError(
            f"NYT API returned HTTP {exc.response.status_code}"
        ) from None
    data = resp.json()

    count = 0
    # "or" guards against explicit JSON nulls, which .get() defaults do not.
    for article in data.get("results") or []:
        title = (article.get("title") or "").strip()
        if not title:
            continue

        summary = _clean_html(article.get("abstract", ""))
        if not summary:
            summary = title
        link = article.get("url") or ""

        pub_str = article.get("published_date", "")
        try:
            pub_dt = datetime.fromisoformat(pub_str.replace("Z", "+00:00"))
        except (ValueError, AttributeError):
            pub_dt = datetime.now(timezone.utc)
        if pub_dt.tzinfo is None:
            # A timestamp without an offset would make the aware/naive
            # subtraction below raise; treat it as UTC.
            pub_dt = pub_dt.replace(tzinfo=timezone.utc)

        # Duplicate check: same (title, day, source) hash, or same URL.
        article_id = _article_hash(title, pub_dt.strftime("%Y%m%d"), source.name)
        duplicate = Document.file_hash == article_id
        if link:
            # edit_url stores the URL as delivered, so match the raw form as
            # well as the normalized one. Articles without a URL skip the
            # URL test entirely instead of matching on an empty string.
            duplicate = duplicate | Document.edit_url.in_([link, _normalize_url(link)])
        # Only existence matters, and several rows may legitimately match
        # (one by hash, another by URL); take the first instead of requiring
        # at most one.
        found = await session.execute(select(Document.id).where(duplicate).limit(1))
        if found.first() is not None:
            continue

        # NOTE(review): a normalized category (from the article's "section")
        # used to be computed here but was never stored on the Document; the
        # dead computation was removed.
        doc = Document(
            file_path=f"news/{source.name}/{article_id}",
            file_hash=article_id,
            file_format="article",
            file_size=len(summary.encode()),
            file_type="note",
            title=title,
            extracted_text=f"{title}\n\n{summary}",
            extracted_at=datetime.now(timezone.utc),
            extractor_version="nyt_api",
            source_channel="news",
            data_origin="external",
            edit_url=link,
            review_status="approved",
        )
        session.add(doc)
        # Flush so that doc.id is assigned before the queue rows reference it.
        await session.flush()

        session.add(ProcessingQueue(document_id=doc.id, stage="classify", status="pending"))
        # Only articles published within the last 30 days are embedded.
        days_old = (datetime.now(timezone.utc) - pub_dt).days
        if days_old <= 30:
            session.add(ProcessingQueue(document_id=doc.id, stage="embed", status="pending"))
        count += 1

    logger.info(f"[{source.name}] API → {count}건 수집")
    return count