From b75307b89bbf27d84ca2f9127ce6450c2793a8aa Mon Sep 17 00:00:00 2001 From: hyungi Date: Thu, 11 Jun 2026 07:43:05 +0900 Subject: [PATCH] =?UTF-8?q?fix(news):=20=EC=97=B0=EA=B2=B0=20=EA=B3=84?= =?UTF-8?q?=EC=B8=B5(TCP/TLS)=20=EC=98=A4=EB=A5=98=201=ED=9A=8C=20?= =?UTF-8?q?=EC=9E=AC=EC=8B=9C=EB=8F=84=20=E2=80=94=20MOEL=20=EB=B3=B4?= =?UTF-8?q?=EC=95=88=EC=9E=A5=EB=B9=84=20=EC=B2=AB=20=ED=95=B8=EB=93=9C?= =?UTF-8?q?=EC=85=B0=EC=9D=B4=ED=81=AC=20=EA=B0=84=ED=97=90=20=EB=93=9C?= =?UTF-8?q?=EB=9E=8D=20(=EC=9E=AC=EC=8B=A4=EC=B8=A1=20=EC=A7=84=EB=8B=A8)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GPU 회선에서 moel.go.kr 첫 TLS 연결이 간헐 드랍(curl rc=35, 직후 재시도 5/5 성공, 맥북 무발생·단일 A 레코드) → 사이클당 1회 fetch 인 피드가 ConnectError('') 누적, 입법행정예고 circuit open. ConnectError/ConnectTimeout 만 1.5s 후 1회 재시도, HTTP 상태 오류 비대상. 회귀 테스트 3건 (42 passed). Co-Authored-By: Claude Fable 5 --- app/workers/news_collector.py | 17 +++++++++++++- tests/test_crawl_cycle3_shapes.py | 38 +++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/app/workers/news_collector.py b/app/workers/news_collector.py index 36f6cc0..8f501f0 100644 --- a/app/workers/news_collector.py +++ b/app/workers/news_collector.py @@ -244,6 +244,21 @@ ALLOWED_CONTENT_TYPES = ("application/rss+xml", "application/atom+xml", "application/xml", "text/xml") +async def _get_with_connect_retry(client, url: str): + """Retry connection-layer (TCP/TLS) errors once — HTTP status errors are not retried (caller-side branching is preserved). + + Measured against MOEL (2026-06-11): the government site's security appliance intermittently drops the first TLS handshake + (curl rc=35, an immediate retry succeeds) → feed collection, which fetches once per cycle, accumulated ConnectError('') + failures and the circuit opened. A single retry absorbs this — a persistent outage is still left to the circuit. 
+ """ + try: + return await client.get(url) + except (httpx.ConnectError, httpx.ConnectTimeout) as e: + logger.info(f"연결 오류 1회 재시도 ({url.split('?')[0]}): {repr(e)}") + await asyncio.sleep(1.5) + return await client.get(url) + + async def _is_portal_duplicate(session, title: str) -> bool: """A-6 2차 dedup: 포털 전재본 vs 원본이 다른 URL 로 이중 적재되는 케이스. @@ -380,7 +395,7 @@ async def _fetch_rss(session, source: NewsSource) -> tuple[int, str]: async with httpx.AsyncClient( timeout=10, follow_redirects=False, headers=headers ) as client: - resp = await client.get(source.feed_url) + resp = await _get_with_connect_retry(client, source.feed_url) # 304 는 redirect 처리보다 먼저 — httpx 의 is_redirect 는 3xx 전체(304 포함)에 # True 라, 304 를 redirect 로 오인하면 location 없는 같은 URL 을 재요청해 diff --git a/tests/test_crawl_cycle3_shapes.py b/tests/test_crawl_cycle3_shapes.py index 5d2455a..68e254f 100644 --- a/tests/test_crawl_cycle3_shapes.py +++ b/tests/test_crawl_cycle3_shapes.py @@ -117,6 +117,44 @@ class TestSignalOnlyEnqueueGuard: assert calls == ["embed", "chunk"] +# ── Single connection-layer retry (measured: MOEL intermittently drops the first TLS handshake) ────────────── + +class TestConnectRetry: + class _Client: + def __init__(self, errors: list): + self.errors = errors + self.calls = 0 + + async def get(self, url): + self.calls += 1 + if self.errors: + raise self.errors.pop(0) + return "OK" + + @pytest.mark.asyncio + async def test_single_connect_error_retried_once(self): + import httpx + client = self._Client([httpx.ConnectError("")]) + resp = await news_collector._get_with_connect_retry(client, "https://x/feed") + assert resp == "OK" and client.calls == 2 + + @pytest.mark.asyncio + async def test_persistent_connect_error_propagates(self): + import httpx + client = self._Client([httpx.ConnectError(""), httpx.ConnectError("")]) + with pytest.raises(httpx.ConnectError): + await news_collector._get_with_connect_retry(client, "https://x/feed") + assert client.calls == 2 # retried only once — a persistent outage is left to the circuit + + @pytest.mark.asyncio + async def 
test_non_connect_errors_not_retried(self): + import httpx + client = self._Client([httpx.ReadTimeout("")]) + with pytest.raises(httpx.ReadTimeout): + await news_collector._get_with_connect_retry(client, "https://x/feed") + assert client.calls == 1 + + # ── C-4 / B-4 피드 shape (시드 전 live 박제) ───────────────────────────────── class TestNikkeiRdfNativeParsing: