# hyungi_document_server/app/services/search/fusion_service.py

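"""Search-result fusion strategies (Phase 0.5).

Replaces the previous weighted sum with Reciprocal Rank Fusion (RRF) as the
default, plus boosts for strong signals.

Strategy comparison:
- LegacyWeightedSum : the previous _merge_results (text score + 0.5 * vector score). Kept for A/B comparison.
- RRFOnly           : pure RRF, k=60. Stable, but can weaken strong keyword signals.
- RRFWithBoost      : RRF + strong-signal boosts (title / tags / legal article / high text score).
                       Compensates for RRF's limits on exact-keyword queries. **default**.

The .score on fuse() results is the internal fusion score (for RRF each term is
at most 1/(60+1), so the values are small).
The SearchResult.score exposed to users is returned by search.py after
normalize_display_scores rewrites it to a rank-based value in [0.05..1.0].
"""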

from __future__ import annotations

import re
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from api.search import SearchResult


# ─── 추상 인터페이스 ─────────────────────────────────────


class FusionStrategy(ABC):
    """Abstract interface shared by every fusion strategy in this module.

    A concrete strategy overrides ``name`` and implements :meth:`fuse`.
    """

    # Identifier of the strategy; each subclass overrides it.
    name: str = "abstract"

    @abstractmethod
    def fuse(
        self,
        text_results: list["SearchResult"],
        vector_results: list["SearchResult"],
        query: str,
        limit: int,
    ) -> list["SearchResult"]:
        """Combine text and vector results into one list.

        Args:
            text_results: Results from the text retriever.
            vector_results: Results from the vector retriever.
            query: The search query string.
            limit: Size bound for the returned list (the strategies in this
                file apply it as a slice).

        Returns:
            The fused results.
        """


# ─── 1) 기존 가중합 (legacy) ─────────────────────────────


class LegacyWeightedSum(FusionStrategy):
    """Weighted-sum fusion, kept as the pre-RRF baseline for A/B comparison.

    A document present in both lists gets ``text score + 0.5 * vector score``.
    A document present only in the vector list is kept only when its score is
    above 0.3 (the original author describes the vector score as a cosine
    similarity).
    """

    name = "legacy"

    def fuse(self, text_results, vector_results, query, limit):
        """Merge both lists by document id and return the top ``limit`` by score.

        ``query`` is accepted for interface compatibility and is not used.
        """
        from api.search import SearchResult  # local import to avoid a circular import

        # Start from the text hits, keyed by document id.
        by_id: dict[int, SearchResult] = {hit.id: hit for hit in text_results}

        for vec_hit in vector_results:
            if vec_hit.id not in by_id:
                # Vector-only hit: admitted only above the score cut-off.
                if vec_hit.score > 0.3:
                    by_id[vec_hit.id] = vec_hit
                continue

            # Present in both lists: keep the stored entry's fields, add half
            # the vector score and tag the match reason.
            current = by_id[vec_hit.id]
            by_id[vec_hit.id] = SearchResult(
                id=current.id,
                title=current.title,
                ai_domain=current.ai_domain,
                ai_summary=current.ai_summary,
                file_format=current.file_format,
                score=current.score + vec_hit.score * 0.5,
                snippet=current.snippet,
                match_reason=f"{current.match_reason}+vector",
            )

        ranked = list(by_id.values())
        ranked.sort(key=lambda hit: hit.score, reverse=True)
        return ranked[:limit]


# ─── 2) Reciprocal Rank Fusion ──────────────────────────


class RRFOnly(FusionStrategy):
    """Plain Reciprocal Rank Fusion.

    ``RRF_score(doc) = sum(1 / (K + rank_i))`` over the lists the document
    appears in, with ranks starting at 1 and ``K = 60``. Only the rank is used,
    never the retriever's own score, so differing score scales between
    retrievers do not matter; the trade-off noted by the original author is
    that a dominant full-text signal is flattened as well.
    """

    name = "rrf"
    K = 60

    def fuse(self, text_results, vector_results, query, limit):
        """Return up to ``limit`` results ordered by descending RRF score.

        Each returned ``SearchResult`` copies its fields from the text hit when
        one exists, otherwise from the vector hit; ``score`` is the RRF score and
        ``match_reason`` joins the contributing sources with ``+``.
        ``query`` is not used.
        """
        from api.search import SearchResult

        rrf_totals: dict[int, float] = {}
        hits_by_doc: dict[int, dict[str, SearchResult]] = {}

        # Text first, then vector: same accumulation and insertion order for
        # both retrievers.
        for label, ranked in (("text", text_results), ("vector", vector_results)):
            for position, hit in enumerate(ranked, start=1):
                rrf_totals[hit.id] = rrf_totals.get(hit.id, 0.0) + 1.0 / (self.K + position)
                hits_by_doc.setdefault(hit.id, {})[label] = hit

        # Stable descending sort: ties keep first-seen order.
        ordering = sorted(rrf_totals, key=rrf_totals.get, reverse=True)

        fused: list[SearchResult] = []
        for doc_id in ordering:
            per_source = hits_by_doc[doc_id]
            text_hit = per_source.get("text")
            primary = text_hit or per_source.get("vector")
            assert primary is not None

            labels: list[str] = []
            if "text" in per_source:
                labels.append(per_source["text"].match_reason or "text")
            if "vector" in per_source:
                labels.append("vector")

            fused.append(
                SearchResult(
                    id=primary.id,
                    title=primary.title,
                    ai_domain=primary.ai_domain,
                    ai_summary=primary.ai_summary,
                    file_format=primary.file_format,
                    score=rrf_totals[doc_id],
                    snippet=primary.snippet,
                    match_reason="+".join(labels),
                )
            )
        return fused[:limit]


# ─── 3) RRF + 강한 시그널 boost ─────────────────────────


class RRFWithBoost(RRFOnly):
    """RRF plus additive boosts for strong signals (the Phase 0.5 default).

    On top of the RRF score, a candidate receives:
      - +0.020 when the text hit's match_reason contains "title", otherwise
        +0.015 when it contains "tags" (the two are mutually exclusive);
      - +0.010 when the original text score is >= 5.0;
      - +0.050 when a legal article reference in the query (e.g. 제80조) also
        appears in the candidate's title.

    The boosts are deliberately moderate: they lift strong signals while
    keeping RRF's stability.
    """

    name = "rrf_boost"

    BOOST_TITLE = 0.020
    BOOST_TAGS = 0.015
    BOOST_LEGAL_ARTICLE = 0.050
    BOOST_HIGH_TEXT_SCORE = 0.010

    LEGAL_ARTICLE_RE = re.compile(r"제\s*\d+\s*조")
    HIGH_TEXT_SCORE_THRESHOLD = 5.0

    def _article_refs(self, text):
        """Return the legal article references in ``text`` with whitespace removed."""
        return {re.sub(r"\s+", "", ref) for ref in self.LEGAL_ARTICLE_RE.findall(text)}

    def fuse(self, text_results, vector_results, query, limit):
        """Fuse with RRF, add the boosts, re-sort and return the top ``limit``."""
        # Take a wider RRF pool than requested so boosted items can move up
        # into the final top ``limit``.
        pool = super().fuse(text_results, vector_results, query, max(limit * 3, 30))

        # Signals from the original text hits, looked up by document id.
        score_by_doc = {hit.id: hit.score for hit in text_results}
        reason_by_doc = {hit.id: (hit.match_reason or "") for hit in text_results}

        # Article references mentioned in the query, if any.
        query_refs = self._article_refs(query)

        for cand in pool:
            reason = reason_by_doc.get(cand.id, "")
            bonus = 0.0

            if "title" in reason:
                bonus += self.BOOST_TITLE
            elif "tags" in reason:
                bonus += self.BOOST_TAGS

            if score_by_doc.get(cand.id, 0.0) >= self.HIGH_TEXT_SCORE_THRESHOLD:
                bonus += self.BOOST_HIGH_TEXT_SCORE

            if query_refs and cand.title:
                if query_refs & self._article_refs(cand.title):
                    bonus += self.BOOST_LEGAL_ARTICLE

            if bonus > 0:
                # In-place update; the original author notes this also works
                # on pydantic v2 models.
                cand.score = cand.score + bonus

        reranked = sorted(pool, key=lambda cand: cand.score, reverse=True)
        return reranked[:limit]


# ─── factory ─────────────────────────────────────────────


# Registry of selectable strategies, keyed by each class's ``name`` attribute
# ("legacy", "rrf", "rrf_boost").
_STRATEGIES: dict[str, type[FusionStrategy]] = {
    strategy_cls.name: strategy_cls
    for strategy_cls in (LegacyWeightedSum, RRFOnly, RRFWithBoost)
}

# Registry key of the default strategy (its consumers live outside this file).
DEFAULT_FUSION = "rrf_boost"


def get_strategy(name: str) -> FusionStrategy:
    """Create a new instance of the fusion strategy registered under ``name``.

    Raises:
        ValueError: If ``name`` is not a key of the strategy registry.
    """
    if name not in _STRATEGIES:
        raise ValueError(f"unknown fusion strategy: {name}")
    return _STRATEGIES[name]()


# ─── Phase 2.3: soft filter boost ───────────────────────

SOFT_FILTER_MAX_BOOST = 0.05  # plan 룰 (CRITICAL)
# ↑ RRF score는 0.01~0.05 범위 (k=60). 상한 초과 시 기존 랭킹 왜곡.
#   기존 RRFWithBoost의 legal article boost(0.05)와 동일 최대값 → 일관성.
SOFT_FILTER_DOMAIN_BOOST = 0.03
SOFT_FILTER_DOCTYPE_BOOST = 0.02


def apply_soft_filter_boost(
    results: list["SearchResult"],
    soft_filters: dict | None,
) -> int:
    """Phase 2.3 — QueryAnalyzer soft_filters 기반 score boost.

    ai_domain / ai_tags 매칭 시 소량 boost 적용. 총 boost는
    SOFT_FILTER_MAX_BOOST(0.05) 상한을 넘지 않음.

    Args:
        results: fusion 직후 SearchResult 리스트 (in-place 수정)
        soft_filters: query_analysis.soft_filters = {"domain": [...], "document_type": [...]}

    Returns:
        int — boost 적용된 결과 개수 (debug/notes용)
    """
    if not soft_filters:
        return 0
    domain_list = [str(d).lower() for d in soft_filters.get("domain", []) or []]
    doctype_list = [str(t).lower() for t in soft_filters.get("document_type", []) or []]
    if not domain_list and not doctype_list:
        return 0

    boosted_count = 0
    for r in results:
        boost = 0.0

        # domain 매칭 — ai_domain 부분 문자열 매칭 (Industrial_Safety/Legislation 같은 경로 매칭)
        if domain_list and r.ai_domain:
            ai_dom_lower = r.ai_domain.lower()
            for d in domain_list:
                if d in ai_dom_lower or ai_dom_lower in d:
                    boost += SOFT_FILTER_DOMAIN_BOOST
                    break  # 한 번만

        # document_type 매칭 — ai_tags JSON 문자열 또는 ai_domain 내 keyword 탐지
        # (ai_domain에 "Law_Document"는 안 들어감. ai_tags에 law/law_document 같은 태그가 있음.)
        # 간단화: ai_domain 경로에 keyword가 포함되면 매칭
        if doctype_list:
            hay = (r.ai_domain or "").lower() + " " + (getattr(r, "match_reason", "") or "").lower()
            for t in doctype_list:
                if t in hay or any(word in hay for word in t.split("_")):
                    boost += SOFT_FILTER_DOCTYPE_BOOST
                    break

        if boost > 0:
            boost = min(boost, SOFT_FILTER_MAX_BOOST)
            r.score += boost
            boosted_count += 1

    # boost 적용 후 재정렬
    results.sort(key=lambda x: x.score, reverse=True)
    return boosted_count


# ─── display score 정규화 ────────────────────────────────


def normalize_display_scores(results: list["SearchResult"]) -> None:
    """Overwrite each ``score`` in place with a rank-based value in [0.05..1.0].

    The frontend shows ``score * 100`` as a percentage, so a [0..1] range fits.
    Internal fusion scores only carry relative order, so only the rank is
    exposed: the first item gets 1.0, the last gets 0.05, and the items in
    between are evenly spaced (for 20 items: 1.0, 0.95, ..., 0.05). A single
    item gets 1.0. An empty list is left untouched.
    """
    total = len(results)
    if not total:
        return
    # Number of steps between first and last rank; at least 1 so a single
    # result does not divide by zero.
    span = max(total - 1, 1)
    for position, item in enumerate(results):
        item.score = round(1.0 - (position / span) * 0.95, 4)