diff --git a/app/core/crawl_politeness.py b/app/core/crawl_politeness.py index bba760a..9a36a33 100644 --- a/app/core/crawl_politeness.py +++ b/app/core/crawl_politeness.py @@ -164,8 +164,10 @@ async def fetch_page( resp = await client.get(url) redirects = 0 - while resp.is_redirect and redirects < _MAX_REDIRECTS: - location = urljoin(str(resp.request.url), resp.headers.get("location", "")) + # has_redirect_location = location 헤더 있는 진짜 redirect 만 (httpx 의 + # is_redirect 는 3xx 전체라 304 등을 redirect 로 오인 — news_collector 동일 함정) + while resp.has_redirect_location and redirects < _MAX_REDIRECTS: + location = urljoin(str(resp.request.url), resp.headers["location"]) try: validate_feed_url(location) except ValueError as e: @@ -173,7 +175,7 @@ async def fetch_page( # redirect 도 같은 도메인 연속 요청 — 간격은 lock 보유로 충분 (즉시 1회) resp = await client.get(location) redirects += 1 - if resp.is_redirect: + if resp.has_redirect_location: raise CrawlSkip(f"redirect {_MAX_REDIRECTS}회 초과: {url}") except httpx.TimeoutException as e: raise CrawlFetchError(f"timeout: {url}") from e diff --git a/app/workers/news_collector.py b/app/workers/news_collector.py index e733fa9..69f7649 100644 --- a/app/workers/news_collector.py +++ b/app/workers/news_collector.py @@ -351,25 +351,30 @@ async def _fetch_rss(session, source: NewsSource) -> tuple[int, str]: ) as client: resp = await client.get(source.feed_url) - # redirect 수동 처리 (최대 3회, 각 target 재검증) + # 304 는 redirect 처리보다 먼저 — httpx 의 is_redirect 는 3xx 전체(304 포함)에 + # True 라, 304 를 redirect 로 오인하면 location 없는 같은 URL 을 재요청해 + # "redirect 3회 초과" 로 오류 처리됨(조건부 GET 안정 피드 전멸 버그). + if resp.status_code == 304: + logger.info(f"[{source.name}] 304 Not Modified — 본문 미전송") + return 0, "not_modified" + + # redirect 수동 처리 (최대 3회, 각 target 재검증) — location 있는 진짜 redirect 만. # allowlist 도메인이면 redirect target의 HTTP도 허용 redirects = 0 - while resp.is_redirect and redirects < 3: - location = resp.headers.get("location", "") - location = urljoin(str(resp.request.url), location) + while resp.has_redirect_location and redirects < 3: + location = urljoin(str(resp.request.url), resp.headers["location"]) try: validate_feed_url(location, allow_http=http_allowed) except ValueError as e: raise FeedError(f"redirect target 차단: {e}") from e resp = await client.get(location) + if resp.status_code == 304: + logger.info(f"[{source.name}] 304 Not Modified (redirect 후) — 본문 미전송") + return 0, "not_modified" redirects += 1 - if resp.is_redirect: + if resp.has_redirect_location: raise FeedError("redirect 3회 초과") - if resp.status_code == 304: - logger.info(f"[{source.name}] 304 Not Modified — 본문 미전송") - return 0, "not_modified" - resp.raise_for_status() if len(resp.content) > MAX_RESPONSE_SIZE: diff --git a/tests/test_crawl_cycle2_shapes.py b/tests/test_crawl_cycle2_shapes.py index 940aede..afde1d8 100644 --- a/tests/test_crawl_cycle2_shapes.py +++ b/tests/test_crawl_cycle2_shapes.py @@ -108,6 +108,30 @@ class TestSkipVideoQuirk: assert not self.PATTERN.search("https://psyche.co/ideas/how-to-think") +class TestRedirect304Distinction: + """httpx is_redirect 가 304(3xx 전체)에 True 라 redirect 로 오인 → 조건부 GET + 안정 피드가 'redirect 3회 초과'로 전멸하던 버그. has_redirect_location 으로 구분.""" + + def test_304_is_not_a_redirect_location(self): + import httpx + r = httpx.Response(304, request=httpx.Request("GET", "https://x/")) + assert r.is_redirect is True # httpx 함정: 304 도 is_redirect + assert r.has_redirect_location is False # 우리가 써야 하는 정확한 판별 + + def test_real_redirect_has_location(self): + import httpx + r = httpx.Response(301, headers={"location": "https://y/"}, + request=httpx.Request("GET", "https://x/")) + assert r.has_redirect_location is True + + def test_collector_uses_has_redirect_location(self): + import inspect + from workers import news_collector + src = inspect.getsource(news_collector._fetch_rss) + assert "has_redirect_location" in src + assert "while resp.is_redirect" not in src # 옛 버그 패턴 부재 + + class TestArticleHashStability: def test_static_corpus_hash_deterministic(self): a = _article_hash("Creep and Creep Failures", "static", "National Board 기술 아티클")