From e91c19953781b0371d0e56aec6c6ddae1abfaaf6 Mon Sep 17 00:00:00 2001
From: Hyungi Ahn
Date: Wed, 8 Apr 2026 15:30:23 +0900
Subject: [PATCH] feat(search): Phase 2.3 soft_filter boost (domain/doctype)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## 변경

### fusion_service.py
- SOFT_FILTER_MAX_BOOST = 0.05 (plan 영구 룰, RRF score 왜곡 방지)
- SOFT_FILTER_DOMAIN_BOOST = 0.03, SOFT_FILTER_DOCTYPE_BOOST = 0.02
- apply_soft_filter_boost(results, soft_filters) → int
  - ai_domain 부분 문자열 매칭 (path 포함 e.g. "Industrial_Safety/Legislation")
  - document_type 토큰 매칭 (ai_domain + match_reason 헤이스택)
  - 상한선 0.05 강제
  - boost 후 score 기준 재정렬

### api/search.py
- fusion 직후 호출 조건:
  - analyzer_cache_hit == True
  - analyzer_tier != "ignore" (confidence >= 0.5)
  - query_analysis.soft_filters 존재
- notes에 "soft_filter_boost applied=N" 기록

## Phase 2.3 범위
- hard_filter SQL WHERE는 현재 평가셋에 명시 필터 쿼리 없어 효과 측정 불가 → Phase 2.4 v0.2 확장 후
- document_type의 file_format 직접 매칭은 의미론적 mismatch → 제외
- hard_filter는 Phase 2.4 이후 iteration

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 app/api/search.py                     | 20 ++++++++-
 app/services/search/fusion_service.py | 84 +++++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 1 deletion(-)

diff --git a/app/api/search.py b/app/api/search.py
index a0e8f54..7e5dc9e 100644
--- a/app/api/search.py
+++ b/app/api/search.py
@@ -16,7 +16,12 @@ from core.database import get_session
 from core.utils import setup_logger
 from models.user import User
 from services.search import query_analyzer
-from services.search.fusion_service import DEFAULT_FUSION, get_strategy, normalize_display_scores
+from services.search.fusion_service import (
+    DEFAULT_FUSION,
+    apply_soft_filter_boost,
+    get_strategy,
+    normalize_display_scores,
+)
 from services.search.rerank_service import (
     MAX_CHUNKS_PER_DOC,
     MAX_RERANK_INPUT,
@@ -258,6 +263,19 @@ async def search(
         f"unique_docs={len(chunks_by_doc)}"
     )
 
+    # Phase 2.3: soft_filter boost (only on analyzer cache hit and tier != ignore).
+    # Disabled when analyzer_confidence < 0.5 (tier=ignore).
+    if (
+        analyzer_cache_hit
+        and analyzer_tier != "ignore"
+        and query_analysis
+    ):
+        soft_filters = query_analysis.get("soft_filters") or {}
+        if soft_filters:
+            boosted = apply_soft_filter_boost(fused_docs, soft_filters)
+            if boosted > 0:
+                notes.append(f"soft_filter_boost applied={boosted}")
+
     if rerank:
         # Phase 1.3: reranker — chunk 기준 입력
         # fusion 결과 doc_id로 chunks_by_doc에서 raw chunks 회수
diff --git a/app/services/search/fusion_service.py b/app/services/search/fusion_service.py
index 77de3ac..7e1e3f5 100644
--- a/app/services/search/fusion_service.py
+++ b/app/services/search/fusion_service.py
@@ -219,6 +219,90 @@ def get_strategy(name: str) -> FusionStrategy:
     return cls()
 
 
+# ─── Phase 2.3: soft filter boost ───────────────────────
+
+SOFT_FILTER_MAX_BOOST = 0.05  # plan rule (CRITICAL)
+# ↑ RRF scores are in the 0.01~0.05 range (k=60); exceeding the cap distorts the existing ranking.
+# Same maximum as the legal article boost (0.05) of the existing RRFWithBoost → consistency.
+SOFT_FILTER_DOMAIN_BOOST = 0.03
+SOFT_FILTER_DOCTYPE_BOOST = 0.02
+
+
+def _normalize_soft_filter_values(values: list | str | None) -> list[str]:
+    """Return the non-empty, lower-cased entries of one soft_filters value.
+
+    A bare string counts as a single entry instead of being iterated character
+    by character. None and blank entries are dropped: an empty string is a
+    substring of every haystack, so it would boost every result.
+    """
+    if not values:
+        return []
+    if isinstance(values, str):
+        values = [values]
+    cleaned = (str(v).strip().lower() for v in values if v is not None)
+    return [v for v in cleaned if v]
+
+
+def apply_soft_filter_boost(
+    results: list["SearchResult"],
+    soft_filters: dict | None,
+) -> int:
+    """Phase 2.3 — boost scores of results that match QueryAnalyzer soft_filters.
+
+    A result gains SOFT_FILTER_DOMAIN_BOOST when a "domain" value is a
+    substring of its ai_domain (or vice versa), and SOFT_FILTER_DOCTYPE_BOOST
+    when a "_"-separated word of a "document_type" value occurs in its
+    ai_domain or match_reason. Each boost applies at most once per result and
+    the total is capped at SOFT_FILTER_MAX_BOOST. Matching is case-insensitive;
+    blank filter values and empty words are ignored.
+
+    Args:
+        results: SearchResult list right after fusion. Scores are updated in
+            place, and the list is re-sorted by score (descending) when at
+            least one result was boosted.
+        soft_filters: query_analysis.soft_filters, i.e.
+            {"domain": [...], "document_type": [...]}.
+
+    Returns:
+        Number of results that received a boost (for debug/notes).
+    """
+    if not soft_filters:
+        return 0
+    domain_list = _normalize_soft_filter_values(soft_filters.get("domain"))
+    doctype_list = _normalize_soft_filter_values(soft_filters.get("document_type"))
+    # Match document types by their "_"-separated words. Empty fragments (from
+    # "law_" or "__") are dropped because "" is contained in every haystack.
+    doctype_words = [w for t in doctype_list for w in t.split("_") if w]
+    if not domain_list and not doctype_words:
+        return 0
+
+    boosted_count = 0
+    for r in results:
+        boost = 0.0
+
+        # Domain: substring match in either direction, so a short filter value
+        # matches path-style domains such as "Industrial_Safety/Legislation".
+        if domain_list and r.ai_domain:
+            ai_dom_lower = r.ai_domain.lower()
+            if any(d in ai_dom_lower or ai_dom_lower in d for d in domain_list):
+                boost += SOFT_FILTER_DOMAIN_BOOST
+
+        # Document type: keyword lookup in ai_domain plus match_reason.
+        if doctype_words:
+            hay = (r.ai_domain or "").lower() + " " + (getattr(r, "match_reason", "") or "").lower()
+            if any(w in hay for w in doctype_words):
+                boost += SOFT_FILTER_DOCTYPE_BOOST
+
+        if boost > 0:
+            r.score += min(boost, SOFT_FILTER_MAX_BOOST)
+            boosted_count += 1
+
+    # Re-sort only when a score changed; otherwise keep the fused order as is.
+    if boosted_count:
+        results.sort(key=lambda x: x.score, reverse=True)
+    return boosted_count
+
+
 # ─── display score 정규화 ────────────────────────────────