From 141eb7793829f45568514cb16f76f71bda1ea528 Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Mon, 13 Apr 2026 15:05:05 +0900 Subject: [PATCH] fix(news): allow HTTP redirect for HTTP_EXCEPTION_DOMAINS sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SCMP(www.scmp.com)처럼 HTTPS 원본이 HTTP로 301 redirect하는 소스에서 redirect target이 차단되던 문제 수정. allow_http를 원본 스킴이 아닌 소스 도메인의 allowlist 등록 여부로 판단하도록 변경. Co-Authored-By: Claude Opus 4.6 (1M context) --- app/workers/news_collector.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/app/workers/news_collector.py b/app/workers/news_collector.py index 9945f3a..d709195 100644 --- a/app/workers/news_collector.py +++ b/app/workers/news_collector.py @@ -113,16 +113,19 @@ async def _fetch_rss(session, source: NewsSource) -> int: from urllib.parse import urljoin from core.url_validator import validate_feed_url, HTTP_EXCEPTION_DOMAINS - # HTTP allowlist 체크 - if source.feed_url.startswith("http://"): - hostname = urlparse(source.feed_url).hostname - if hostname not in HTTP_EXCEPTION_DOMAINS: - logger.error(f"[{source.name}] HTTP 차단 (allowlist 미등록): {hostname}") - return 0 + # HTTP 허용 여부: 소스 도메인이 allowlist에 있으면 HTTP 허용 + # SCMP처럼 HTTPS 원본이 HTTP로 redirect하는 경우도 커버 + source_hostname = urlparse(source.feed_url).hostname + http_allowed = source_hostname in HTTP_EXCEPTION_DOMAINS + + # 순수 HTTP 소스인데 allowlist에 없으면 차단 + if source.feed_url.startswith("http://") and not http_allowed: + logger.error(f"[{source.name}] HTTP 차단 (allowlist 미등록): {source_hostname}") + return 0 # fetch 전 URL 재검증 (등록 이후 DNS 변경 대비) try: - validate_feed_url(source.feed_url, allow_http=source.feed_url.startswith("http://")) + validate_feed_url(source.feed_url, allow_http=http_allowed) except ValueError as e: logger.error(f"[{source.name}] URL 검증 실패: {e}") return 0 @@ -131,12 +134,13 @@ async def _fetch_rss(session, source: NewsSource) -> int: resp = await client.get(source.feed_url) # redirect 수동 처리 (최대 3회, 각 target 재검증) + # allowlist 도메인이면 redirect target의 HTTP도 허용 redirects = 0 while resp.is_redirect and redirects < 3: location = resp.headers.get("location", "") location = urljoin(str(resp.request.url), location) try: - validate_feed_url(location, allow_http=source.feed_url.startswith("http://")) + validate_feed_url(location, allow_http=http_allowed) except ValueError as e: logger.error(f"[{source.name}] redirect target 차단: {e}") return 0