async def _already_ingested(session, article_id: str, normalized_url: str, link: str) -> bool:
    """Return whether an article has already been stored as a ``Document``.

    A row counts as a match when its ``file_hash`` equals ``article_id`` or
    its ``edit_url`` equals either ``normalized_url`` or the raw ``link``.
    Checking the raw link as well keeps rows that were stored with a
    non-normalized URL detectable. Several rows may match (for example
    cross-posted articles); only the first one is fetched, so multiple
    matches never raise.

    This is the single existence check shared by ``_fetch_rss``,
    ``_fetch_api_guardian`` and ``_fetch_api_nyt``.

    Args:
        session: Async database session used to run the query.
        article_id: Article hash compared against ``Document.file_hash``.
        normalized_url: Normalized article URL.
        link: Raw article URL as delivered by the feed/API.

    Returns:
        ``True`` if at least one matching document exists, else ``False``.
    """
    candidate_urls = [normalized_url, link]
    same_hash = Document.file_hash == article_id
    same_url = Document.edit_url.in_(candidate_urls)

    # LIMIT 1 + first(): any single hit is enough to answer the question.
    lookup = select(Document).where(same_hash | same_url).limit(1)
    outcome = await session.execute(lookup)
    first_match = outcome.scalars().first()
    return first_match is not None
async def _fetch_api_nyt(session, source: NewsSource) -> tuple[int, str]: normalized_url = _normalize_url(link) # RSS 수집부와 동일: 레거시 raw URL + 교차 게시 다중 매칭 내성 (first) - existing = await session.execute( - select(Document).where( - (Document.file_hash == article_id) | - (Document.edit_url.in_([normalized_url, link])) - ).limit(1) - ) - if existing.scalars().first(): + if await _already_ingested(session, article_id, normalized_url, link): continue if await _is_portal_duplicate(session, title):