# hyungi_document_server/app/core/crawl_politeness.py

"""크롤링 politeness 코어 (A-4, plan crawl-24x7-1)

개인 아카이빙 권장치를 그대로 박은 공용 fetch 계층:
- per-domain 동시성 1 (asyncio.Lock) + 같은 도메인 연속 요청 5–15초 지연 + jitter
- robots.txt 존중 (urllib.robotparser, 24h 캐시) — 비로그인 공개 크롤링 한정.
  로그인 세션 fetch (B-3) 는 사용자 행위 성격이라 robots 대신 사람 속도가 기준.
- 정직 식별 UA + 연락처 (익명 크롤링 트랙. 로그인 세션은 브라우저 UA 유지 — B-3)
- 429 = Retry-After 존중 / 5xx = 재시도 가능 / 403 = 차단 신호 (호출측 circuit 연동)

도메인별 마지막 요청 시각 등 rate 상태는 in-process (영속 워터마크는 DB — news_sources).
SSRF 차단은 core.url_validator.validate_feed_url 재사용 (redirect target 재검증 포함).
"""

import asyncio
import logging
import random
import time
import urllib.robotparser
from urllib.parse import urljoin, urlparse

import httpx

from core.url_validator import validate_feed_url

logger = logging.getLogger("crawl_politeness")

# Honest, self-identifying User-Agent carrying a contact address, so an operator
# has a channel to reach out before blocking the crawler (A-4).
CRAWL_UA = "HyungiPKM-Archiver/1.0 (personal archive; +mailto:hyun49196@gmail.com)"

# Lower/upper bound (seconds) of the randomized gap enforced between two
# consecutive requests to the same domain.
_DOMAIN_DELAY_MIN, _DOMAIN_DELAY_MAX = 5.0, 15.0

# How long a fetched robots.txt result stays cached, in seconds (24 hours).
_ROBOTS_CACHE_TTL = 24 * 60 * 60
# Largest page body accepted, in bytes (5 MiB; the original note says this
# matches the cap used for feed fetches).
_MAX_PAGE_BYTES = 5 * 1024**2
# Timeout handed to the httpx client used for page fetches.
_PAGE_TIMEOUT = 20.0
# Maximum number of redirects followed for a single page fetch.
_MAX_REDIRECTS = 3

# Substrings of Content-Type that mark a response as HTML.
_HTML_CONTENT_TYPES = ("text/html", "application/xhtml+xml")


class CrawlError(Exception):
    """Common base of every crawl outcome raised by this module.

    Lets callers handle "any crawl failure" with a single ``except CrawlError``
    while still being able to distinguish the three concrete categories below.
    """


class CrawlFetchError(CrawlError):
    """Transient failure (5xx / timeout / network error) — the queue should retry."""


class CrawlBlocked(CrawlError):
    """Block signal (403 / 429 / robots disallow) — back off or trip a circuit
    breaker rather than retrying."""


class CrawlSkip(CrawlError):
    """Permanently out of scope (non-HTML / too large / SSRF-blocked / other 4xx)
    — the item should be downgraded instead of retried."""


# Per-domain serialization state, kept in process memory only.
# _domain_locks: one asyncio.Lock per domain, created lazily by _get_lock().
# _domain_last_request: time.monotonic() value recorded when fetch_page() last
# finished an attempt against that domain.
_domain_locks: dict[str, asyncio.Lock] = {}
_domain_last_request: dict[str, float] = {}
# robots.txt cache: key built by _robots_allows() from the URL's host →
# (cached_at, RobotFileParser | None).  None = no robots.txt / 4xx (allow everything).
_robots_cache: dict[str, tuple[float, urllib.robotparser.RobotFileParser | None]] = {}


def _domain_of(url: str) -> str:
    """Return the lowercased hostname of *url*, or ``""`` when it has none."""
    hostname = urlparse(url).hostname
    if not hostname:
        return ""
    return hostname.lower()


def _get_lock(domain: str) -> asyncio.Lock:
    """Return the lock that serializes requests to *domain*, creating it on first use."""
    try:
        return _domain_locks[domain]
    except KeyError:
        # First request for this domain: register a fresh lock and hand it out.
        created = _domain_locks[domain] = asyncio.Lock()
        return created


async def _respect_domain_rate(domain: str) -> None:
    """Sleep until a randomized 5–15 s gap has passed since *domain*'s last request.

    Returns immediately when no previous request to *domain* has been recorded.
    """
    previous = _domain_last_request.get(domain)
    if previous is None:
        return
    # A fresh gap is drawn on every call, so the spacing is jittered per request.
    gap = random.uniform(_DOMAIN_DELAY_MIN, _DOMAIN_DELAY_MAX)
    remaining = previous + gap - time.monotonic()
    if remaining > 0:
        await asyncio.sleep(remaining)


async def _fetch_robots(
    client: httpx.AsyncClient, scheme: str, host: str
) -> urllib.robotparser.RobotFileParser | None:
    """Fetch and parse ``{scheme}://{host}/robots.txt``.

    The caller's client does not follow redirects on its own, so they are
    followed here (up to 5 hops, the minimum RFC 9309 asks crawlers to support).
    Without this, the body of a 3xx answer (http→https, apex→www, ...) would be
    parsed as an empty rule set and silently allow everything.

    Args:
        client: httpx client used for the request(s).
        scheme: URL scheme of the site being crawled.
        host: Authority part of the site (host, optionally with ``:port``).

    Returns:
        A parsed ``RobotFileParser``, or ``None`` when robots.txt is unavailable
        (other 4xx, a 3xx without a usable Location, or too many redirects),
        which the caller treats as "everything allowed".

    Raises:
        CrawlFetchError: network error, 5xx, or a redirect target rejected by
            the SSRF validator — conservatively block for this cycle.
        CrawlBlocked: the server answered 429; a rate-limit signal must not be
            cached as "allow all".
    """
    max_redirects = 5
    request_headers = {"User-Agent": CRAWL_UA}
    target = f"{scheme}://{host}/robots.txt"
    try:
        resp = await client.get(target, headers=request_headers)
        hops = 0
        while resp.is_redirect and resp.headers.get("location") and hops < max_redirects:
            target = urljoin(str(resp.request.url), resp.headers["location"])
            try:
                # Redirect targets get the same SSRF screening as page URLs.
                validate_feed_url(target)
            except ValueError as e:
                raise CrawlFetchError(f"robots.txt redirect target 차단: {host}: {e}") from e
            resp = await client.get(target, headers=request_headers)
            hops += 1
    except httpx.HTTPError as e:
        raise CrawlFetchError(f"robots.txt 조회 실패: {host}: {e}") from e

    if resp.is_redirect:
        # Still redirecting after the hop budget (or 3xx without Location):
        # RFC 9309 lets the crawler treat robots.txt as unavailable.
        return None
    if resp.status_code >= 500:
        # 5xx says nothing about intent — standard practice is to block this cycle.
        raise CrawlFetchError(f"robots.txt 5xx: {host}: {resp.status_code}")
    if resp.status_code == 429:
        raise CrawlBlocked(f"robots.txt 429 rate limited: {host}")
    if resp.status_code >= 400:
        return None  # no robots.txt = everything allowed
    rp = urllib.robotparser.RobotFileParser()
    rp.parse(resp.text.splitlines())
    return rp


async def _robots_allows(client: httpx.AsyncClient, url: str) -> bool:
    """Return whether robots.txt of *url*'s origin permits ``CRAWL_UA`` to fetch it.

    robots.txt is scoped to an origin (scheme + host + port), so both the fetch
    and the cache entry use the full origin. Keying by hostname alone made
    ``https://host:8443/...`` consult ``https://host/robots.txt`` and share one
    cache entry between http and https.

    Results are cached for ``_ROBOTS_CACHE_TTL`` seconds. Failures raised by
    ``_fetch_robots`` propagate and are not cached.
    """
    parsed = urlparse(url)
    scheme = parsed.scheme or "https"
    host = (parsed.hostname or "").lower()
    if ":" in host:
        # urlparse strips the brackets from IPv6 literals; restore them.
        host = f"[{host}]"
    try:
        port = parsed.port
    except ValueError:
        port = None  # malformed port: fall back to the scheme default
    authority = host if port is None else f"{host}:{port}"
    cache_key = f"{scheme}://{authority}"

    cached = _robots_cache.get(cache_key)
    if cached is None or time.monotonic() - cached[0] > _ROBOTS_CACHE_TTL:
        rp = await _fetch_robots(client, scheme, authority)
        cached = (time.monotonic(), rp)
        _robots_cache[cache_key] = cached
    rp = cached[1]
    if rp is None:
        # No robots.txt for this origin: everything is allowed.
        return True
    return rp.can_fetch(CRAWL_UA, url)


async def _read_body_capped(resp: httpx.Response, url: str) -> bytes:
    """Read a streamed response body, aborting once it exceeds ``_MAX_PAGE_BYTES``."""
    declared = resp.headers.get("content-length", "")
    try:
        declared_size = int(declared)
    except ValueError:
        declared_size = -1  # absent or malformed: rely on the streaming check
    if declared_size > _MAX_PAGE_BYTES:
        raise CrawlSkip(f"크기 초과: {declared_size} bytes: {url}")

    received = bytearray()
    async for chunk in resp.aiter_bytes():
        received += chunk
        if len(received) > _MAX_PAGE_BYTES:
            # Stop here; leaving the stream context closes the connection.
            raise CrawlSkip(f"크기 초과: >{_MAX_PAGE_BYTES} bytes: {url}")
    return bytes(received)


async def _fetch_once(client: httpx.AsyncClient, url: str) -> tuple[str | None, str, str]:
    """Issue one GET (redirects are not followed) and classify the answer.

    Returns ``(redirect_target, "", "")`` for a redirect, otherwise
    ``(None, html_text, final_url)``. Status and content-type are judged before
    the body is read, so rejected responses are never downloaded.
    """
    async with client.stream("GET", url) as resp:
        status = resp.status_code
        if resp.is_redirect:
            location = resp.headers.get("location", "")
            if not location:
                # A 3xx without Location cannot be followed; re-requesting the
                # same URL would only hammer the server.
                raise CrawlSkip(f"redirect Location 없음: {status}: {url}")
            return urljoin(str(resp.request.url), location), "", ""

        if status == 429:
            retry_after = resp.headers.get("retry-after", "")
            logger.warning(
                "[politeness] 429 %s (Retry-After=%s)", _domain_of(url), retry_after or "-"
            )
            raise CrawlBlocked(f"429 rate limited: {url} (Retry-After={retry_after or '-'})")
        if status == 403:
            raise CrawlBlocked(f"403 forbidden: {url}")
        if status >= 500:
            raise CrawlFetchError(f"{status}: {url}")
        if status >= 400:
            raise CrawlSkip(f"{status}: {url}")

        ct = resp.headers.get("content-type", "").lower()
        # A missing Content-Type is tolerated; an explicit non-HTML one is not.
        if ct and not any(t in ct for t in _HTML_CONTENT_TYPES):
            raise CrawlSkip(f"비-HTML content-type: {ct}: {url}")

        body = await _read_body_capped(resp, url)
        try:
            text = body.decode(resp.encoding or "utf-8", errors="replace")
        except LookupError:
            # Server declared a charset Python does not know.
            text = body.decode("utf-8", errors="replace")
        return None, text, str(resp.request.url)


async def fetch_page(url: str, *, check_robots: bool = True) -> tuple[str, str]:
    """Politely fetch one public page and return ``(html_text, final_url)``.

    - SSRF validation of the URL and of every redirect target.
    - Per-domain concurrency of 1 plus a jittered 5–15 s gap, keyed on the
      domain of *url*.
    - robots.txt is consulted for the initial URL and for every redirect
      target (when *check_robots* is true).
    - The body is streamed and the read is aborted once it exceeds
      ``_MAX_PAGE_BYTES``, so the cap bounds memory and bandwidth.

    Args:
        url: Page to fetch.
        check_robots: Set to ``False`` to skip robots.txt checks.

    Raises:
        CrawlSkip: URL/redirect target fails validation, too many redirects,
            3xx without Location, 4xx other than 403/429, non-HTML
            content-type, or body larger than the cap.
        CrawlBlocked: robots.txt disallows the URL, or the server answered
            403 / 429 (Retry-After is logged).
        CrawlFetchError: 5xx, timeout or network error — retryable.
    """
    try:
        validate_feed_url(url)
    except ValueError as e:
        raise CrawlSkip(f"URL 검증 실패: {e}") from e

    domain = _domain_of(url)
    # Every domain actually contacted gets its last-request time stamped, so a
    # cross-domain redirect also delays the next direct request to the target.
    contacted = {domain}
    async with _get_lock(domain):
        await _respect_domain_rate(domain)
        try:
            async with httpx.AsyncClient(
                timeout=_PAGE_TIMEOUT,
                follow_redirects=False,
                headers={"User-Agent": CRAWL_UA},
            ) as client:
                target = url
                for hop in range(_MAX_REDIRECTS + 1):
                    if check_robots and not await _robots_allows(client, target):
                        raise CrawlBlocked(f"robots.txt disallow: {target}")
                    location, html_text, final_url = await _fetch_once(client, target)
                    if location is None:
                        return html_text, final_url
                    if hop == _MAX_REDIRECTS:
                        break  # redirect budget spent and still redirecting
                    try:
                        validate_feed_url(location)
                    except ValueError as e:
                        raise CrawlSkip(f"redirect target 차단: {e}") from e
                    contacted.add(_domain_of(location))
                    # Redirect hops are followed immediately; the held lock
                    # keeps other fetches for the original domain out.
                    target = location
                raise CrawlSkip(f"redirect {_MAX_REDIRECTS}회 초과: {url}")
        except httpx.TimeoutException as e:
            raise CrawlFetchError(f"timeout: {url}") from e
        except httpx.HTTPError as e:
            raise CrawlFetchError(f"네트워크 오류: {url}: {e}") from e
        finally:
            stamp = time.monotonic()
            for host in contacted:
                _domain_last_request[host] = stamp