fix(news): fetch_page에 허용 content-type 파라미터 추가 — TWI sitemap(text/xml) 수집 가능 (검증 게이트에서 발견)
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -124,7 +124,10 @@ async def _robots_allows(client: httpx.AsyncClient, url: str) -> bool:
|
||||
return rp.can_fetch(CRAWL_UA, url)
|
||||
|
||||
|
||||
async def fetch_page(url: str, *, check_robots: bool = True) -> tuple[str, str]:
|
||||
async def fetch_page(
|
||||
url: str, *, check_robots: bool = True,
|
||||
content_types: tuple[str, ...] = _HTML_CONTENT_TYPES,
|
||||
) -> tuple[str, str]:
|
||||
"""공개 페이지 1건 politeness fetch. (html_text, final_url) 반환.
|
||||
|
||||
- SSRF 검증 (redirect target 포함, news_collector 피드 fetch 와 동일 이중 검증)
|
||||
@@ -181,8 +184,8 @@ async def fetch_page(url: str, *, check_robots: bool = True) -> tuple[str, str]:
|
||||
raise CrawlSkip(f"{resp.status_code}: {url}")
|
||||
|
||||
ct = resp.headers.get("content-type", "").lower()
|
||||
if ct and not any(t in ct for t in _HTML_CONTENT_TYPES):
|
||||
raise CrawlSkip(f"비-HTML content-type: {ct}: {url}")
|
||||
if ct and not any(t in ct for t in content_types):
|
||||
raise CrawlSkip(f"비허용 content-type: {ct}: {url}")
|
||||
if len(resp.content) > _MAX_PAGE_BYTES:
|
||||
raise CrawlSkip(f"크기 초과: {len(resp.content)} bytes: {url}")
|
||||
|
||||
|
||||
@@ -60,7 +60,10 @@ async def _discover_national_board() -> list[str]:
|
||||
|
||||
async def _discover_twi() -> list[str]:
|
||||
"""sitemap 에서 job-knowledge 시리즈만 (faqs/published-papers 는 향후 증분 후보)."""
|
||||
xml_text, _ = await fetch_page(_TWI_SITEMAP)
|
||||
xml_text, _ = await fetch_page(
|
||||
_TWI_SITEMAP,
|
||||
content_types=("text/xml", "application/xml", "text/html"),
|
||||
)
|
||||
urls = re.findall(
|
||||
r"<loc>(https://www\.twi-global\.com/technical-knowledge/job-knowledge/[^<]+)</loc>",
|
||||
xml_text,
|
||||
|
||||
Reference in New Issue
Block a user