From acd595244a0bad0b5c083f32e594af770c415e53 Mon Sep 17 00:00:00 2001
From: hyungi <hyun49196@gmail.com>
Date: Tue, 9 Jun 2026 22:26:22 +0000
Subject: [PATCH] =?UTF-8?q?fix(news):=20URL=20dedup=20=EC=A0=95=EA=B7=9C?=
 =?UTF-8?q?=ED=99=94=20=EC=A0=80=EC=9E=A5=C2=B7=EC=A1=B0=ED=9A=8C=20?=
 =?UTF-8?q?=ED=86=B5=EC=9D=BC=20+=20=EB=8B=A4=EC=A4=91=EB=A7=A4=EC=B9=AD?=
 =?UTF-8?q?=20=EB=82=B4=EC=84=B1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

BBC Technology 매 사이클 MultipleResultsFound (06-04~) 해소.
- 저장 edit_url=raw vs 조회 normalized 비대칭으로 URL dedup 무력화돼
  교차게시(HN x BBC) 시 2행 동시매칭 -> scalar_one_or_none raise.
- _normalize_url: query 전체 제거 -> tracking 파라미터만 제거로 교정
  (hada.io/topic?id= 등 query-식별 사이트 870건 붕괴 방지, 리뷰 게이트).
- 조회 .first() + edit_url IN (normalized, raw) 레거시 행 내성. RSS/NYT 양쪽.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 app/workers/news_collector.py | 42 +++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/app/workers/news_collector.py b/app/workers/news_collector.py
index dc8ad90..5b4c8ad 100644
--- a/app/workers/news_collector.py
+++ b/app/workers/news_collector.py
@@ -4,7 +4,7 @@ import hashlib
 import re
 from datetime import datetime, timezone
 from html import unescape
-from urllib.parse import urlparse, urlunparse
+from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
 
 import feedparser
 import httpx
@@ -52,10 +52,24 @@ def _clean_html(text: str) -> str:
     return text.strip()[:1000]
 
 
+# Tracking-parameter detection: key prefixes (utm_, at_/ns_ = BBC, mc_ = Mailchimp) plus exact-match keys.
+_TRACKING_PREFIXES = ("utm_", "at_", "ns_", "mc_")
+_TRACKING_PARAMS = {"fbclid", "gclid", "igshid", "ref", "smid", "partner", "cmp", "ocid", "ftag"}
+
+
 def _normalize_url(url: str) -> str:
-    """URL 정규화 (tracking params 제거)"""
+    """Normalize a URL: strip tracking query parameters, keep content-identifying ones.
+
+    Never drop the whole query: on query-identified sites (hada.io/topic?id=,
+    aitimes articleView.html?idxno=, HN item?id=) distinct articles would collapse
+    into one URL. Storage (edit_url) and lookup must share this function for dedup.
+    """
     parsed = urlparse(url)
-    return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
+    kept = [
+        (k, v) for k, v in parse_qsl(parsed.query, keep_blank_values=True)
+        if not (k.lower().startswith(_TRACKING_PREFIXES) or k.lower() in _TRACKING_PARAMS)
+    ]
+    return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", urlencode(kept), ""))
 
 
 def _article_hash(title: str, published: str, source_name: str) -> str:
@@ -180,17 +194,19 @@ async def _fetch_rss(session, source: NewsSource) -> int:
         published = entry.get("published_parsed") or entry.get("updated_parsed")
         pub_dt = datetime(*published[:6], tzinfo=timezone.utc) if published else datetime.now(timezone.utc)
 
-        # 중복 체크
+        # 중복 체크 — 레거시 행은 raw URL 로 저장돼 있어 normalized/raw 양쪽 매칭.
+        # 교차 게시(같은 기사가 두 피드에 존재)로 2행 이상 매칭될 수 있어 first() 사용
+        # (scalar_one_or_none 은 MultipleResultsFound raise — 2026-06 BBC 수집 중단 원인).
         article_id = _article_hash(title, pub_dt.strftime("%Y%m%d"), source.name)
         normalized_url = _normalize_url(link)
 
         existing = await session.execute(
             select(Document).where(
                 (Document.file_hash == article_id) |
-                (Document.edit_url == normalized_url)
-            )
+                (Document.edit_url.in_([normalized_url, link]))
+            ).limit(1)
         )
-        if existing.scalar_one_or_none():
+        if existing.scalars().first():
             continue
 
         category = _normalize_category(source.category or "")
@@ -213,7 +229,8 @@ async def _fetch_rss(session, source: NewsSource) -> int:
             md_extraction_error="news article: 텍스트 네이티브, markdown 변환 비대상",
             source_channel="news",
             data_origin="external",
-            edit_url=link,
+            # 조회와 동일하게 정규화해 저장 — raw(tracking param 포함) 저장 시 URL dedup 무력화
+            edit_url=normalized_url,
             review_status="approved",
             ai_domain="News",
             ai_sub_group=source_short,
@@ -282,13 +299,14 @@ async def _fetch_api(session, source: NewsSource) -> int:
         article_id = _article_hash(title, pub_dt.strftime("%Y%m%d"), source.name)
         normalized_url = _normalize_url(link)
 
+        # RSS 수집부와 동일: 레거시 raw URL + 교차 게시 다중 매칭 내성 (first)
         existing = await session.execute(
             select(Document).where(
                 (Document.file_hash == article_id) |
-                (Document.edit_url == normalized_url)
-            )
+                (Document.edit_url.in_([normalized_url, link]))
+            ).limit(1)
         )
-        if existing.scalar_one_or_none():
+        if existing.scalars().first():
             continue
 
         category = _normalize_category(article.get("section", source.category or ""))
@@ -311,7 +329,7 @@ async def _fetch_api(session, source: NewsSource) -> int:
             md_extraction_error="news article: 텍스트 네이티브, markdown 변환 비대상",
             source_channel="news",
             data_origin="external",
-            edit_url=link,
+            edit_url=normalized_url,
             review_status="approved",
             ai_domain="News",
             ai_sub_group=source_short,