# hyungi_document_server/app/services/search/rerank_service.py

"""Reranker 서비스 — bge-reranker-v2-m3 통합 (Phase 1.3).

TEI 컨테이너 호출 + asyncio.Semaphore(2) + soft timeout fallback.

데이터 흐름 원칙:
- fusion = doc 기준 / reranker = chunk 기준 — 절대 섞지 말 것
- raw chunks를 끝까지 보존, fusion은 압축본만 사용
- reranker는 chunks_by_doc dict에서 raw chunks 회수해서 chunk 단위로 호출
- diversity는 reranker 직후 마지막 단계에서만 적용

snippet 생성:
- 200~400 토큰(800~1500자) 기준
- query keyword 위치 중심 ±target_chars/2 윈도우
- keyword 매치 없으면 첫 target_chars 문자 fallback (성능 손실 방지)
"""

from __future__ import annotations

import asyncio
import re
from typing import TYPE_CHECKING

import httpx

from ai.client import AIClient
from core.utils import setup_logger

if TYPE_CHECKING:
    from api.search import SearchResult

# Module-level logger obtained from the project's setup_logger helper.
logger = setup_logger("rerank")

# Cap on concurrent rerank calls (prevents GPU saturation)
RERANK_SEMAPHORE = asyncio.Semaphore(2)

# Rerank input size limits (hard cap for latency / VRAM)
MAX_RERANK_INPUT = 200
MAX_CHUNKS_PER_DOC = 2

# Soft timeout (seconds)
RERANK_TIMEOUT = 5.0


def _extract_window(text: str, query: str, target_chars: int = 800) -> str:
    """query keyword 위치 중심으로 ±target_chars/2 윈도우 추출.

    fallback: keyword 매치 없으면 첫 target_chars 문자 그대로.
    이게 없으면 reranker가 무관한 텍스트만 보고 점수 매겨 성능 급락.
    """
    keywords = [k for k in re.split(r"\s+", query) if len(k) >= 2]
    best_pos = -1
    for kw in keywords:
        pos = text.lower().find(kw.lower())
        if pos >= 0:
            best_pos = pos
            break

    if best_pos < 0:
        # Fallback: 첫 target_chars 문자
        return text[:target_chars]

    half = target_chars // 2
    start = max(0, best_pos - half)
    end = min(len(text), start + target_chars)
    return text[start:end]


def _make_snippet(c: "SearchResult", query: str, max_chars: int = 1500) -> str:
    """Reranker input snippet — title + query 중심 본문 윈도우.

    feedback_search_phase1_implementation.md 3번 항목 강제:
    snippet 200~400 토큰(800~1500자), full document 절대 안 됨.
    """
    title = c.title or ""
    text = c.snippet or ""

    # snippet은 chunk text 앞 200자 또는 doc text 앞 200자
    # 더 긴 chunk text가 필요하면 호출자가 따로 채워서 넘김
    if len(text) > max_chars:
        text = _extract_window(text, query, target_chars=max_chars - 100)

    return f"{title}\n\n{text}"


def _wrap_doc_as_chunk(doc: "SearchResult") -> "SearchResult":
    """text-only 매치 doc(chunks_by_doc에 없는 doc)을 ChunkResult 형태로 변환.

    Phase 1.3 reranker 입력에 doc 자체가 들어가야 하는 경우.
    snippet은 documents.extracted_text 앞 200자 (이미 SearchResult.snippet에 채워짐).
    chunk_id 등은 None 그대로.
    """
    return doc


async def rerank_chunks(
    query: str,
    candidates: list["SearchResult"],
    limit: int,
) -> list["SearchResult"]:
    """Re-order RRF candidates using scores from the reranker.

    Args:
        query: User query.
        candidates: Chunk-level SearchResult list in RRF order.
        limit: Maximum number of results to return.

    Returns:
        Candidates in the order returned by the reranker, truncated to
        ``limit``. Each returned candidate has ``score`` overwritten with the
        rerank score and "+rerank" appended to ``match_reason``. Candidates
        the reranker response does not reference are dropped.

    Fallback: on timeout, HTTP error, or any other exception, the RRF order
    is kept and ``candidates[:limit]`` is returned. The timeout covers both
    the wait for the concurrency semaphore and the rerank call itself.
    """
    if not candidates:
        return []

    # Hard cap on input size (latency / VRAM).
    if len(candidates) > MAX_RERANK_INPUT:
        logger.warning(
            f"rerank input {len(candidates)} > MAX {MAX_RERANK_INPUT}, 자름"
        )
        candidates = candidates[:MAX_RERANK_INPUT]

    snippets = [_make_snippet(c, query) for c in candidates]
    client = AIClient()

    try:
        async with asyncio.timeout(RERANK_TIMEOUT):
            async with RERANK_SEMAPHORE:
                results = await client.rerank(query, snippets)

        # results: [{"index": int, "score": float}, ...] (assumed already
        # sorted by the client). Validate every entry BEFORE touching any
        # candidate, so a malformed entry cannot leave the fallback list
        # half-updated with a mix of RRF and rerank scores.
        scored: list[tuple["SearchResult", float]] = []
        seen_indexes: set[int] = set()
        for r in results:
            idx = r.get("index")
            sc = r.get("score")
            if idx is None or sc is None:
                continue
            # Reject out-of-range indexes on both sides: a negative index
            # would otherwise silently select a candidate from the end.
            if not 0 <= idx < len(candidates):
                continue
            # A repeated index would return the same object twice.
            if idx in seen_indexes:
                continue
            seen_indexes.add(idx)
            scored.append((candidates[idx], float(sc)))

        reranked: list["SearchResult"] = []
        for chunk, score in scored:
            chunk.score = score
            chunk.match_reason = (chunk.match_reason or "") + "+rerank"
            reranked.append(chunk)
        return reranked[:limit]
    except (asyncio.TimeoutError, httpx.HTTPError) as e:
        logger.warning(f"rerank failed → RRF fallback: {type(e).__name__}: {e}")
        return candidates[:limit]
    except Exception as e:
        # Deliberate best-effort: search must still answer if reranking breaks.
        logger.warning(f"rerank unexpected error → RRF fallback: {type(e).__name__}: {e}")
        return candidates[:limit]
    finally:
        await client.close()


async def warmup_reranker(max_attempts: int = 10, retry_delay: float = 3.0) -> bool:
    """Send dummy rerank requests until one succeeds or attempts run out.

    Per the original note, TEI answers its health check quickly, but rerank
    requests fail or are very slow until the first model load (10~30 s) has
    finished. Intended to be called at FastAPI startup or before the first
    real request.

    Args:
        max_attempts: Number of warmup requests to try before giving up.
        retry_delay: Seconds to wait between failed attempts.

    Returns:
        True as soon as one warmup request succeeds, False if every attempt
        failed.
    """
    client = AIClient()
    try:
        for attempt in range(1, max_attempts + 1):
            try:
                await client.rerank("warmup", ["dummy text for model load"])
            except Exception as e:
                logger.info(f"reranker warmup retry {attempt}: {e}")
                # No point sleeping after the last attempt: nothing follows it.
                if attempt < max_attempts:
                    await asyncio.sleep(retry_delay)
            else:
                logger.info(f"reranker warmup OK (attempt {attempt})")
                return True
        logger.error(f"reranker warmup failed after {max_attempts} attempts")
        return False
    finally:
        await client.close()


def apply_diversity(
    results: list["SearchResult"],
    max_per_doc: int = MAX_CHUNKS_PER_DOC,
    top_score_threshold: float = 0.90,
) -> list["SearchResult"]:
    """Limit how many results that share the same ``id`` are kept.

    Results are walked in their given order and at most ``max_per_doc``
    entries are kept for each distinct ``id``; later ones are dropped.

    Conditional relaxation: if the first result's score is at least
    ``top_score_threshold``, the input list is returned untouched
    (high-confidence relevance wins over diversity).

    Returns an empty list for empty input.
    """
    if not results:
        return []

    # A confident top hit switches the diversity cap off entirely.
    if results[0].score >= top_score_threshold:
        return results

    kept: list["SearchResult"] = []
    per_doc_count: dict[int, int] = {}
    for item in results:
        key = item.id
        already_kept = per_doc_count.get(key, 0)
        if already_kept < max_per_doc:
            kept.append(item)
            per_doc_count[key] = already_kept + 1
    return kept