diff --git a/app/workers/news_collector.py b/app/workers/news_collector.py index 9945f3a..d709195 100644 --- a/app/workers/news_collector.py +++ b/app/workers/news_collector.py @@ -113,16 +113,19 @@ async def _fetch_rss(session, source: NewsSource) -> int: from urllib.parse import urljoin from core.url_validator import validate_feed_url, HTTP_EXCEPTION_DOMAINS - # HTTP allowlist 체크 - if source.feed_url.startswith("http://"): - hostname = urlparse(source.feed_url).hostname - if hostname not in HTTP_EXCEPTION_DOMAINS: - logger.error(f"[{source.name}] HTTP 차단 (allowlist 미등록): {hostname}") - return 0 + # HTTP 허용 여부: 소스 도메인이 allowlist에 있으면 HTTP 허용 + # SCMP처럼 HTTPS 원본이 HTTP로 redirect하는 경우도 커버 + source_hostname = urlparse(source.feed_url).hostname + http_allowed = source_hostname in HTTP_EXCEPTION_DOMAINS + + # 순수 HTTP 소스인데 allowlist에 없으면 차단 + if source.feed_url.startswith("http://") and not http_allowed: + logger.error(f"[{source.name}] HTTP 차단 (allowlist 미등록): {source_hostname}") + return 0 # fetch 전 URL 재검증 (등록 이후 DNS 변경 대비) try: - validate_feed_url(source.feed_url, allow_http=source.feed_url.startswith("http://")) + validate_feed_url(source.feed_url, allow_http=http_allowed) except ValueError as e: logger.error(f"[{source.name}] URL 검증 실패: {e}") return 0 @@ -131,12 +134,13 @@ async def _fetch_rss(session, source: NewsSource) -> int: resp = await client.get(source.feed_url) # redirect 수동 처리 (최대 3회, 각 target 재검증) + # allowlist 도메인이면 redirect target의 HTTP도 허용 redirects = 0 while resp.is_redirect and redirects < 3: location = resp.headers.get("location", "") location = urljoin(str(resp.request.url), location) try: - validate_feed_url(location, allow_http=source.feed_url.startswith("http://")) + validate_feed_url(location, allow_http=http_allowed) except ValueError as e: logger.error(f"[{source.name}] redirect target 차단: {e}") return 0