From 73f328cb65046dde0b7ab01af61682a5bc25c7f4 Mon Sep 17 00:00:00 2001
From: Hyungi Ahn <hyungiahn@Hyungiui-MacBookPro.local>
Date: Sun, 17 May 2026 08:01:22 +0900
Subject: [PATCH] =?UTF-8?q?fix(search):=20DS=20RAG=20LLM=5FTIMEOUT=5FMS=20?=
 =?UTF-8?q?align=2015s/3s=20=E2=86=92=2030s/10s=20(B-3=20Synthesis-Timeout?=
 =?UTF-8?q?-Calibration-1)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR-Hermes-Docsrv-Search-1 closure 측정 (synthesis_ms=30~48s / ev_ms=15005 /
query_analyze 45s) 으로 15s LLM_TIMEOUT 빈발 timeout 확인. Mac mini 26B 동시
호출 (gate Semaphore 1 직렬화 후에도 evidence + synthesis + classifier +
query_analyzer + verifier 가 sequential 누적) 시 각 호출 30s 까지 필요.

6곳 변경 (5개 파일):
- synthesis_service.LLM_TIMEOUT_MS 15000 → 30000
- evidence_service.LLM_TIMEOUT_MS 15000 → 30000
- verifier_service.LLM_TIMEOUT_MS 3000 → 10000
- query_analyzer.LLM_TIMEOUT_MS 15000 → 30000
- search.py:522 classifier wait_for 15.0 → 30.0 (classifier_service align)
- search.py:641 verifier wait_for 4.0 → 10.0 (verifier_service align)

classifier (이전 PR 에서 30s 로 align 완료) 와 동일 정책 — outer wait_for
가 inner LLM_TIMEOUT_MS 를 override 하지 않도록 align.

ask 응답 latency 상한 ↑ 의도된 trade-off — 안정성 (refusal_gate
conservative_refuse 회피 + grounding/verifier 정상 동작) 우선.

영향: PR-1 fixture 회귀 0 예상 (이전 timeout 이 새 한도 안). B-1 Throughput-1
(priority queue / 모델 분리) 별 PR 진입 시 latency 본격 단축 검토.
---
 app/api/search.py                        | 8 ++++++--
 app/services/search/evidence_service.py  | 2 +-
 app/services/search/query_analyzer.py    | 2 +-
 app/services/search/synthesis_service.py | 2 +-
 app/services/search/verifier_service.py  | 2 +-
 5 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/app/api/search.py b/app/api/search.py
index e1badb3..8c0c517 100644
--- a/app/api/search.py
+++ b/app/api/search.py
@@ -518,8 +518,10 @@ async def ask(
     # 거의 모든 classifier 호출 timeout → conservative_refuse(no_classifier) 경로. 15s 로 상향 — classifier
     # 가 실제 작동하도록 (단, ask 전체 응답 시간 상한 영향: ev_ms + max(classifier_wait, evidence_extract) +
     # synth_ms + verifier 누적).
+    # 2026-05-17 B-3: 15s 도 동시 부하 시 부족 (classifier_service LLM_TIMEOUT_MS 30s 와 misalign).
+    # 30s 로 align → classifier 동작 안정. ask 응답 latency 상한 ↑ 의도.
     try:
-        classifier_result = await asyncio.wait_for(classifier_task, timeout=15.0)
+        classifier_result = await asyncio.wait_for(classifier_task, timeout=30.0)
     except (asyncio.TimeoutError, Exception):
         classifier_result = ClassifierResult("timeout", None, [], [], 0.0)
 
@@ -637,8 +639,10 @@ async def ask(
         verifier_task = asyncio.create_task(
             verify(q, sr.answer or "", evidence)
         )
+        # 2026-05-17 B-3: 4s outer wait_for 가 verifier_service LLM_TIMEOUT_MS (10s) 를 override
+        # → classifier 와 동일 패턴 (search.py:522 가 6s→15s swap 했던 case). 10s 로 align.
         try:
-            verifier_result = await asyncio.wait_for(verifier_task, timeout=4.0)
+            verifier_result = await asyncio.wait_for(verifier_task, timeout=10.0)
         except (asyncio.TimeoutError, Exception):
             verifier_result = VerifierResult("timeout", [], 0.0)
 
diff --git a/app/services/search/evidence_service.py b/app/services/search/evidence_service.py
index 84ee264..453b29c 100644
--- a/app/services/search/evidence_service.py
+++ b/app/services/search/evidence_service.py
@@ -78,7 +78,7 @@ SPAN_MIN_CHARS = 80  # 이 미만이면 window enlarge
 SPAN_ENLARGE_TARGET = 120  # enlarge 시 재윈도우 target_chars
 SPAN_MAX_CHARS = 300  # 이 초과면 cut (synthesis token budget 보호)
 
-LLM_TIMEOUT_MS = 15000
+LLM_TIMEOUT_MS = 30000  # 2026-05-17 B-3: 15s 시 ev_ms=15005 timeout 빈발 — classifier (30s) 와 align
 PROMPT_VERSION = "v2-triage"   # B-2: primary(26B MLX) → triage path 전환. PR #20 이후 triage/primary 동일 endpoint (Mac mini 26B).
 
 # 확장 여지 — None 이면 비활성 (baseline). 실측 후 0.8 등으로 켠다.
diff --git a/app/services/search/query_analyzer.py b/app/services/search/query_analyzer.py
index e407f50..105383c 100644
--- a/app/services/search/query_analyzer.py
+++ b/app/services/search/query_analyzer.py
@@ -44,7 +44,7 @@ logger = setup_logger("query_analyzer")
 PROMPT_VERSION = "v2"  # prompts/query_analyze.txt 축소판
 CACHE_TTL = 86400  # 24h
 CACHE_MAXSIZE = 1000
-LLM_TIMEOUT_MS = 15000  # async 구조 (background), 동기 경로 금지
+LLM_TIMEOUT_MS = 30000  # 2026-05-17 B-3: 동시 부하 시 query_analyze 45s 측정 (fastapi log) — 15s 부족, classifier (30s) 와 align. async 구조 (background), 동기 경로 금지
 # ↑ 실측: gemma-4-26b-a4b-it-8bit MLX, 축소 프롬프트(prompt_tok=802) 7~11초.
 #   generation이 dominant (max_tokens 무효, 자연 EOS ~289 tok 생성).
 #   background 실행이라 15초도 안전. 상향 필요 시 여기서만 조정.
diff --git a/app/services/search/synthesis_service.py b/app/services/search/synthesis_service.py
index 51248a6..f0e5452 100644
--- a/app/services/search/synthesis_service.py
+++ b/app/services/search/synthesis_service.py
@@ -40,7 +40,7 @@ logger = setup_logger("synthesis")
 
 # ─── 상수 (plan 영구 룰) ─────────────────────────────────
 PROMPT_VERSION = "v2"
-LLM_TIMEOUT_MS = 15000
+LLM_TIMEOUT_MS = 30000  # 2026-05-17 B-3: 15s 시 동시 부하 (Mac mini 26B classifier+evidence+synthesis serialized) 빈발 timeout — classifier (30s) 와 align
 CACHE_TTL = 3600  # 1h (answer 는 원문 변경에 민감 → query_analyzer 24h 보다 짧게)
 CACHE_MAXSIZE = 300
 MAX_ANSWER_CHARS = 600
diff --git a/app/services/search/verifier_service.py b/app/services/search/verifier_service.py
index 8cda6bd..757986b 100644
--- a/app/services/search/verifier_service.py
+++ b/app/services/search/verifier_service.py
@@ -31,7 +31,7 @@ if TYPE_CHECKING:
 
 logger = setup_logger("verifier")
 
-LLM_TIMEOUT_MS = 3000
+LLM_TIMEOUT_MS = 10000  # 2026-05-17 B-3: 3s 시 동시 부하 시 verifier 빈발 skip → grounding 약화. Mac mini 26B 가 verifier-style 짧은 LLM call 도 concurrent 호출 시 3s 초과 빈번 — 10s 로 raise
 CIRCUIT_THRESHOLD = 5
 CIRCUIT_RECOVERY_SEC = 60