hyungi_document_server/app/workers/fulltext_worker.py

"""fulltext 승격 워커 (A-2 + A-7, plan crawl-24x7-1)

news_collector 가 fulltext_policy='page' 소스의 기사에 enqueue 한 'fulltext' stage 를 소비:
  기사 페이지 politeness fetch (A-4) → 원본 HTML NAS gzip 보존 (A-7)
  → extract_worker 4-tier 재사용 (tier 2 sibling .md 는 디스크 원본이 없어 비적용)
  → extracted_text/md_content 승격 → summarize + (30일 게이트) embed/chunk enqueue.

실패 처리 (큐 어휘 = DB enum, 분기만 워커):
  - 일시 오류 (5xx/timeout)            : raise → 큐 재시도 (max_attempts 3)
  - 차단/비대상 (403/429/robots/비HTML/추출부족): RSS 요약으로 격하(degrade) 후 완료
    → summarize/embed/chunk enqueue 보장 (기사 유실 0). 격하 사유는 extract_meta.fulltext 에 기록.
  - 영구 실패 (3회 소진)                : 야간 reconcile_unresolved() 가 summarize 안전망 enqueue
    ([[feedback_silent_skip_accumulation]] — 조건부 skip 이 영구 침묵으로 누적되지 않게).

승격 게이트: 전 tier 공통 본문 >= 200자 (devonagent 와 달리 tier 4 도 게이트 적용 —
페이월/오류 페이지의 nav 찌꺼기를 본문으로 승격하느니 RSS 요약 격하가 낫다).
"""

import gzip
import hashlib
import re
from datetime import datetime, timezone
from pathlib import Path

from sqlalchemy import exists, select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import aliased

from core.config import settings
from core.crawl_politeness import CrawlBlocked, CrawlFetchError, CrawlSkip, fetch_page
from core.database import async_session
from core.utils import setup_logger
from models.document import Document
from models.queue import ProcessingQueue, enqueue_stage
from workers.extract_worker import (
    _WEB_MIN_BODY_LEN,
    _extract_web_with_bs4,
    _extract_web_with_readability,
    _extract_web_with_trafilatura,
)

logger = setup_logger("fulltext_worker")

# 한국 기사 푸터 1층 후처리 (A-2) — 보수적으로 라인 단위만 제거
_FOOTER_PATTERNS = [
    re.compile(r"^.{0,120}(무단\s*전재|무단\s*복제|재배포\s*금지|저작권자\s*[ⓒ©(]).*$", re.M),
    re.compile(r"^[\w.+-]+@[\w.-]+\.[A-Za-z]{2,}\s*$", re.M),  # 단독 이메일 라인
    re.compile(r"^\s*\S{2,4}\s*기자\s*$", re.M),               # 단독 '◯◯◯ 기자' 라인
]


def _strip_article_footer(body: str) -> str:
    for pat in _FOOTER_PATTERNS:
        body = pat.sub("", body)
    return re.sub(r"\n{3,}", "\n\n", body).strip()


def _extract_body(html_text: str) -> tuple[str, str | None, str | None]:
    """(body, engine, engine_version). 전 tier >= 200자 게이트, 미달이면 ("", None, None)."""
    body, ver = _extract_web_with_trafilatura(html_text)
    if body and len(body) >= _WEB_MIN_BODY_LEN:
        return body, "trafilatura", ver
    body, ver = _extract_web_with_readability(html_text)
    if body and len(body) >= _WEB_MIN_BODY_LEN:
        return body, "readability", ver
    body, ver = _extract_web_with_bs4(html_text)
    if body and len(body) >= _WEB_MIN_BODY_LEN:
        return body, "bs4_text", ver
    return "", None, None


def _raw_html_path(source_id: int | None, file_hash: str, now: datetime) -> Path:
    """A-7 원본 보존 경로 — NAS 본진. 한글 디렉토리의 NFC/NFD 비대칭을 피해 source_id 사용."""
    src_dir = f"src_{source_id}" if source_id is not None else "src_unknown"
    return (
        Path(settings.nas_mount_path) / "crawl_raw" / src_dir
        / now.strftime("%Y-%m") / f"{file_hash}.html.gz"
    )


def _save_raw_html(path: Path, html_text: str) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with gzip.open(path, "wb") as f:
        f.write(html_text.encode("utf-8", errors="replace"))


async def _enqueue_downstream(session: AsyncSession, doc: Document) -> None:
    """승격/격하 공통 후속 — summarize 무조건 + 30일 게이트 통과 시 embed/chunk."""
    await enqueue_stage(session, doc.id, "summarize")
    published_raw = (doc.extract_meta or {}).get("published_at")
    days_old = 0
    if published_raw:
        try:
            pub_dt = datetime.fromisoformat(published_raw)
            days_old = (datetime.now(timezone.utc) - pub_dt).days
        except ValueError:
            days_old = 0  # 파싱 불가 = 신규 취급 (수집 시점 기본과 동일)
    if days_old <= 30:
        await enqueue_stage(session, doc.id, "embed")
        await enqueue_stage(session, doc.id, "chunk")


def _set_fulltext_meta(doc: Document, **fields) -> None:
    """extract_meta.fulltext 갱신 — JSONB 변경 감지를 위해 dict 재할당."""
    meta = dict(doc.extract_meta or {})
    meta["fulltext"] = {**meta.get("fulltext", {}), **fields}
    doc.extract_meta = meta


async def _degrade(session: AsyncSession, doc: Document, reason: str) -> None:
    """본문 승격 실패 — RSS 요약 그대로 후속 단계 진행 (기사 유실 0)."""
    _set_fulltext_meta(
        doc, status="degraded", reason=reason[:300],
        resolved_at=datetime.now(timezone.utc).isoformat(),
    )
    await _enqueue_downstream(session, doc)
    logger.warning(f"[fulltext] doc={doc.id} 격하(RSS 요약 유지): {reason}")


async def process(document_id: int, session: AsyncSession) -> None:
    """기사 1건 풀텍스트 승격. queue_consumer 컨벤션 시그니처 (커밋은 consumer 가)."""
    doc = await session.get(Document, document_id)
    if not doc:
        raise ValueError(f"문서 ID {document_id}를 찾을 수 없음")
    if not doc.edit_url:
        await _degrade(session, doc, "edit_url 없음")
        return

    meta = doc.extract_meta or {}
    source_id = meta.get("source_id")

    try:
        html_text, final_url = await fetch_page(doc.edit_url)
    except (CrawlBlocked, CrawlSkip) as e:
        await _degrade(session, doc, f"{type(e).__name__}: {e}")
        return
    except CrawlFetchError:
        raise  # 일시 오류 — 큐 재시도

    now = datetime.now(timezone.utc)

    # A-7: 원본 HTML 보존 (추출기 교체 시 전체 재추출 가능 상태 유지)
    raw_path = _raw_html_path(source_id, doc.file_hash, now)
    try:
        _save_raw_html(raw_path, html_text)
        raw_saved = True
    except OSError as e:
        # NAS 일시 장애 시 보존만 누락하고 승격은 진행 — 사유 기록 (silent 누락 회피)
        raw_saved = False
        logger.error(f"[fulltext] doc={doc.id} 원본 보존 실패 (승격은 진행): {e}")

    body, engine, engine_ver = _extract_body(html_text)
    if not engine:
        await _degrade(session, doc, f"추출 실패 (전 tier < {_WEB_MIN_BODY_LEN}자)")
        return

    clean_body = _strip_article_footer(body.replace("\x00", ""))
    if len(clean_body) < _WEB_MIN_BODY_LEN:
        await _degrade(session, doc, "푸터 제거 후 본문 부족")
        return

    title = doc.title or ""
    doc.extracted_text = f"{title}\n\n{clean_body}" if title else clean_body
    doc.extracted_at = now
    doc.extractor_version = f"rss+page@{engine}"
    doc.md_content = clean_body
    doc.md_status = "success"
    doc.md_extraction_engine = engine
    doc.md_extraction_engine_version = engine_ver
    doc.md_format_version = "1.0"
    doc.md_generated_at = now
    doc.md_source_hash = hashlib.sha256(html_text.encode("utf-8", errors="replace")).hexdigest()
    doc.md_content_hash = hashlib.sha256(clean_body.encode("utf-8")).hexdigest()
    doc.md_extraction_error = None  # 수집 시점의 '변환 비대상' 마커 해제
    doc.content_origin = "extracted"
    doc.file_size = len(doc.extracted_text.encode())
    _set_fulltext_meta(
        doc, status="promoted", engine=engine,
        raw_html_path=str(raw_path) if raw_saved else None,
        final_url=final_url, body_chars=len(clean_body),
        resolved_at=now.isoformat(),
    )

    await _enqueue_downstream(session, doc)
    logger.info(
        f"[fulltext/{engine}] doc={doc.id} {len(clean_body)}자 승격 "
        f"(raw={'saved' if raw_saved else 'MISSING'})"
    )


async def reconcile_unresolved() -> None:
    """안전망 (야간 1회): fulltext 영구 실패(3회 소진)로 summarize 가 영영 안 잡힌
    뉴스 문서에 RSS 요약 기준 후속 단계를 enqueue. 멱등 — enqueue 후엔 조건 불일치."""
    async with async_session() as session:
        # 외부 쿼리 FROM 에 ProcessingQueue 가 이미 있어 alias 없이는 auto-correlation 이
        # 서브쿼리 FROM 을 전부 제거 → InvalidRequestError (queue_consumer.reset_stale_items 패턴)
        pq = aliased(ProcessingQueue)
        summarize_q = (
            select(pq.id)
            .where(
                pq.document_id == Document.id,
                pq.stage == "summarize",
            )
        )
        result = await session.execute(
            select(Document)
            .join(ProcessingQueue, ProcessingQueue.document_id == Document.id)
            .where(
                ProcessingQueue.stage == "fulltext",
                ProcessingQueue.status == "failed",
                Document.source_channel == "news",
                ~exists(summarize_q),
            )
            .limit(200)
        )
        docs = result.scalars().unique().all()
        for doc in docs:
            _set_fulltext_meta(doc, status="failed_reconciled")
            await _enqueue_downstream(session, doc)
        if docs:
            await session.commit()
            logger.warning(f"[fulltext] reconcile: 영구 실패 {len(docs)}건 RSS 요약으로 후속 enqueue")