From 61e5a416d069ba7c5476b9f38f761b7b250461b1 Mon Sep 17 00:00:00 2001 From: hyungi Date: Wed, 10 Jun 2026 16:41:30 +0900 Subject: [PATCH] =?UTF-8?q?fix(news):=20fetch=5Fpage=20content-type=20?= =?UTF-8?q?=ED=97=88=EC=9A=A9=20=ED=8C=8C=EB=9D=BC=EB=AF=B8=ED=84=B0=20?= =?UTF-8?q?=E2=80=94=20TWI=20sitemap(text/xml)=20=EC=88=98=EC=A7=91=20(?= =?UTF-8?q?=EA=B2=80=EC=A6=9D=20=EA=B2=8C=EC=9D=B4=ED=8A=B8=20=EB=B0=9C?= =?UTF-8?q?=EA=B2=AC)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Fable 5 --- app/core/crawl_politeness.py | 9 ++++++--- app/workers/static_corpus_ingest.py | 5 ++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/app/core/crawl_politeness.py b/app/core/crawl_politeness.py index 7e78ef0..abcbaf7 100644 --- a/app/core/crawl_politeness.py +++ b/app/core/crawl_politeness.py @@ -124,7 +124,10 @@ async def _robots_allows(client: httpx.AsyncClient, url: str) -> bool: return rp.can_fetch(CRAWL_UA, url) -async def fetch_page(url: str, *, check_robots: bool = True) -> tuple[str, str]: +async def fetch_page( + url: str, *, check_robots: bool = True, + content_types: tuple[str, ...] = _HTML_CONTENT_TYPES, +) -> tuple[str, str]: """공개 페이지 1건 politeness fetch. (html_text, final_url) 반환. - SSRF 검증 (redirect target 포함, news_collector 피드 fetch 와 동일 이중 검증) @@ -181,8 +184,8 @@ async def fetch_page(url: str, *, check_robots: bool = True) -> tuple[str, str]: raise CrawlSkip(f"{resp.status_code}: {url}") ct = resp.headers.get("content-type", "").lower() - if ct and not any(t in ct for t in _HTML_CONTENT_TYPES): - raise CrawlSkip(f"비-HTML content-type: {ct}: {url}") + if ct and not any(t in ct for t in content_types): + raise CrawlSkip(f"비허용 content-type: {ct}: {url}") if len(resp.content) > _MAX_PAGE_BYTES: raise CrawlSkip(f"크기 초과: {len(resp.content)} bytes: {url}") diff --git a/app/workers/static_corpus_ingest.py b/app/workers/static_corpus_ingest.py index 06d4a41..1ab4703 100644 --- a/app/workers/static_corpus_ingest.py +++ b/app/workers/static_corpus_ingest.py @@ -60,7 +60,10 @@ async def _discover_national_board() -> list[str]: async def _discover_twi() -> list[str]: """sitemap 에서 job-knowledge 시리즈만 (faqs/published-papers 는 향후 증분 후보).""" - xml_text, _ = await fetch_page(_TWI_SITEMAP) + xml_text, _ = await fetch_page( + _TWI_SITEMAP, + content_types=("text/xml", "application/xml", "text/html"), + ) urls = re.findall( r"(https://www\.twi-global\.com/technical-knowledge/job-knowledge/[^<]+)", xml_text,