From e595283e27a476ad223bbeb36ba62ebbc17edb5c Mon Sep 17 00:00:00 2001
From: Hyungi Ahn <hyungiahn@Hyungiui-MacBookPro.local>
Date: Wed, 8 Apr 2026 15:20:05 +0900
Subject: [PATCH] =?UTF-8?q?fix(search):=20Phase=202.2=20multilingual=20?=
 =?UTF-8?q?=ED=99=9C=EC=84=B1=20=EC=A1=B0=EA=B1=B4=EC=9D=84=20news/global?=
 =?UTF-8?q?=20=ED=95=9C=EC=A0=95=EC=9C=BC=EB=A1=9C=20=EC=A2=81=ED=9E=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## 1차 측정 결과

| metric | Phase 1.3 | Phase 2.2 (all domains) | Δ |
|---|---|---|---|
| Recall@10 | 0.730 | 0.683 | -0.047 ❌ |
| natural_language_ko NDCG | 0.73 | 0.63 | -0.10 ❌ |
| news_crosslingual NDCG | 0.27 | 0.37 | +0.10 ✓ |
| crosslingual_ko_en NDCG | 0.53 | 0.50 | -0.03 ❌ |

document 도메인에서는 ko→en 번역 쿼리가 한국어 법령 검색에 noise로 작용함.
예: "기계 사고 관련 법령" → "machinery accident laws". 영어 쿼리의 embedding은
한국어 법령 문서와의 매칭이 약해서, ko 결과를 오히려 밀어냄.

## 수정

use_multilingual 조건 강화 (아래 조건을 모두 만족해야 활성):
 - 기존: analyzer cache hit + analyzer_tier == "analyzed" + 유효한 normalized_queries 2개 이상
 - 추가: domain_hint == "news" OR language_scope == "global"

즉 document 도메인은 (language_scope가 "global"이 아닌 한) 기존 single-query 경로 유지 → 회귀 복구.
news 도메인 / global scope 쿼리만 multilingual → news_crosslingual 개선 유지.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 app/api/search.py | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/app/api/search.py b/app/api/search.py
index d9a3235..a0e8f54 100644
--- a/app/api/search.py
+++ b/app/api/search.py
@@ -185,19 +185,33 @@ async def search(
                 + (" (bg triggered)" if triggered else " (bg inflight)")
             )
 
-    # Phase 2.2: multilingual vector search 활성 조건
+    # Phase 2.2: multilingual vector search 활성 조건 (보수적)
     #   - cache hit + analyzer_tier == "analyzed" (≥0.85 고신뢰)
     #   - normalized_queries 2개 이상 (lang 다양성 있음)
-    # 그 외 케이스는 기존 single-query search_vector 그대로 사용 (회귀 0).
+    #   - domain_hint == "news" 또는 language_scope == "global"
+    #   ↑ 1차 측정 결과: document 도메인에서 multilingual이 natural_language_ko
+    #     -0.10 악화시킴. 영어 번역이 한국어 법령 검색에서 noise로 작용.
+    #     news / global 영역에서만 multilingual 활성 (news_crosslingual +0.10 개선 확인).
     use_multilingual: bool = False
     normalized_queries: list[dict] = []
     if analyzer_cache_hit and analyzer_tier == "analyzed" and query_analysis:
-        raw_nq = query_analysis.get("normalized_queries") or []
-        if isinstance(raw_nq, list) and len(raw_nq) >= 2:
-            normalized_queries = [nq for nq in raw_nq if isinstance(nq, dict) and nq.get("text")]
-            if len(normalized_queries) >= 2:
-                use_multilingual = True
-                notes.append(f"multilingual langs={[nq.get('lang') for nq in normalized_queries]}")
+        domain_hint = query_analysis.get("domain_hint", "mixed")
+        language_scope = query_analysis.get("language_scope", "limited")
+        is_multilingual_candidate = (
+            domain_hint == "news" or language_scope == "global"
+        )
+        if is_multilingual_candidate:
+            raw_nq = query_analysis.get("normalized_queries") or []
+            if isinstance(raw_nq, list) and len(raw_nq) >= 2:
+                normalized_queries = [
+                    nq for nq in raw_nq if isinstance(nq, dict) and nq.get("text")
+                ]
+                if len(normalized_queries) >= 2:
+                    use_multilingual = True
+                    notes.append(
+                        f"multilingual langs={[nq.get('lang') for nq in normalized_queries]}"
+                        f" hint={domain_hint}/{language_scope}"
+                    )
 
     if mode == "vector":
         t0 = time.perf_counter()