fix(news): allow HTTP redirect for HTTP_EXCEPTION_DOMAINS sources

SCMP(www.scmp.com)처럼 HTTPS 피드 URL이 HTTP로 301 redirect되는 소스에서
redirect target이 차단되던 문제 수정. allow_http를 피드 URL의 스킴이 아닌
소스 도메인의 HTTP_EXCEPTION_DOMAINS(allowlist) 등록 여부로 판단하도록 변경
(fetch 전 URL 검증과 redirect target 검증 모두에 적용).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hyungi Ahn
2026-04-13 15:05:05 +09:00
parent cbef646a3f
commit 141eb77938
+12 -8
View File
@@ -113,16 +113,19 @@ async def _fetch_rss(session, source: NewsSource) -> int:
from urllib.parse import urljoin
from core.url_validator import validate_feed_url, HTTP_EXCEPTION_DOMAINS
# HTTP allowlist 체크
if source.feed_url.startswith("http://"):
hostname = urlparse(source.feed_url).hostname
if hostname not in HTTP_EXCEPTION_DOMAINS:
logger.error(f"[{source.name}] HTTP 차단 (allowlist 미등록): {hostname}")
return 0
# HTTP 허용 여부: 소스 도메인이 allowlist에 있으면 HTTP 허용
# SCMP처럼 HTTPS 원본이 HTTP로 redirect하는 경우도 커버
source_hostname = urlparse(source.feed_url).hostname
http_allowed = source_hostname in HTTP_EXCEPTION_DOMAINS
# 순수 HTTP 소스인데 allowlist에 없으면 차단
if source.feed_url.startswith("http://") and not http_allowed:
logger.error(f"[{source.name}] HTTP 차단 (allowlist 미등록): {source_hostname}")
return 0
# fetch 전 URL 재검증 (등록 이후 DNS 변경 대비)
try:
validate_feed_url(source.feed_url, allow_http=source.feed_url.startswith("http://"))
validate_feed_url(source.feed_url, allow_http=http_allowed)
except ValueError as e:
logger.error(f"[{source.name}] URL 검증 실패: {e}")
return 0
@@ -131,12 +134,13 @@ async def _fetch_rss(session, source: NewsSource) -> int:
resp = await client.get(source.feed_url)
# redirect 수동 처리 (최대 3회, 각 target 재검증)
# allowlist 도메인이면 redirect target의 HTTP도 허용
redirects = 0
while resp.is_redirect and redirects < 3:
location = resp.headers.get("location", "")
location = urljoin(str(resp.request.url), location)
try:
validate_feed_url(location, allow_http=source.feed_url.startswith("http://"))
validate_feed_url(location, allow_http=http_allowed)
except ValueError as e:
logger.error(f"[{source.name}] redirect target 차단: {e}")
return 0