From 4d5f35b26e52e4af90b85a754be1ffc3a0d6d8d5 Mon Sep 17 00:00:00 2001 From: hyungi Date: Tue, 16 Jun 2026 14:30:47 +0900 Subject: [PATCH] =?UTF-8?q?refactor(news):=203=20fetch=20=EA=B3=B5?= =?UTF-8?q?=ED=86=B5=20=EC=A1=B4=EC=9E=AC=EC=B2=B4=ED=81=AC=20=5Falready?= =?UTF-8?q?=5Fingested=20=ED=97=AC=ED=8D=BC=20=EC=B6=94=EC=B6=9C=20(R11c)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _fetch_rss/_fetch_api_guardian/_fetch_api_nyt 가 복제하던 동일 존재체크 (file_hash 또는 edit_url.in_([normalized,link]) 매칭) 를 단일 헬퍼로 — byte-identical 블록이라 동작 100% 보존. news_collector god-file 중복 일부 감소. (채널별 Document 빌드 30줄 3중복 통합은 채널별 필드 차이 검증 필요 → staging/별도.) 검증: py_compile 통과. Co-Authored-By: Claude Opus 4.8 (1M context) --- app/workers/news_collector.py | 39 ++++++++++++++++------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/app/workers/news_collector.py b/app/workers/news_collector.py index ea6945a..f1eb6d0 100644 --- a/app/workers/news_collector.py +++ b/app/workers/news_collector.py @@ -397,6 +397,21 @@ def _doc_identity(source: NewsSource, source_short: str, category: str) -> dict: } +async def _already_ingested(session, article_id: str, normalized_url: str, link: str) -> bool: + """이미 적재된 기사인지 — file_hash 또는 정규화/raw edit_url 매칭 (3 fetch 공통, R11c). + + 레거시 raw URL + 교차 게시 다중 매칭 내성(first). _fetch_rss/_fetch_api_guardian/ + _fetch_api_nyt 가 복제하던 동일 존재체크를 단일화. + """ + existing = await session.execute( + select(Document).where( + (Document.file_hash == article_id) + | (Document.edit_url.in_([normalized_url, link])) + ).limit(1) + ) + return existing.scalars().first() is not None + + async def _fetch_rss(session, source: NewsSource) -> tuple[int, str]: """RSS 피드 수집 — redirect 재검증 + 크기/content-type 제한 + 조건부 GET (A-1). @@ -515,13 +530,7 @@ async def _fetch_rss(session, source: NewsSource) -> tuple[int, str]: article_id = _article_hash(title, pub_dt.strftime("%Y%m%d"), source.name) normalized_url = _normalize_url(link) - existing = await session.execute( - select(Document).where( - (Document.file_hash == article_id) | - (Document.edit_url.in_([normalized_url, link])) - ).limit(1) - ) - if existing.scalars().first(): + if await _already_ingested(session, article_id, normalized_url, link): continue # A-6 2차: 포털 전재 dedup (first-wins — 먼저 적재된 쪽이 정본) @@ -658,13 +667,7 @@ async def _fetch_api_guardian(session, source: NewsSource) -> tuple[int, str]: normalized_url = _normalize_url(link) # RSS 수집부와 동일: 레거시 raw URL + 교차 게시 다중 매칭 내성 (first) - existing = await session.execute( - select(Document).where( - (Document.file_hash == article_id) | - (Document.edit_url.in_([normalized_url, link])) - ).limit(1) - ) - if existing.scalars().first(): + if await _already_ingested(session, article_id, normalized_url, link): continue if await _is_portal_duplicate(session, title): @@ -755,13 +758,7 @@ async def _fetch_api_nyt(session, source: NewsSource) -> tuple[int, str]: normalized_url = _normalize_url(link) # RSS 수집부와 동일: 레거시 raw URL + 교차 게시 다중 매칭 내성 (first) - existing = await session.execute( - select(Document).where( - (Document.file_hash == article_id) | - (Document.edit_url.in_([normalized_url, link])) - ).limit(1) - ) - if existing.scalars().first(): + if await _already_ingested(session, article_id, normalized_url, link): continue if await _is_portal_duplicate(session, title):