From 3d60008965317378a05eefa4cde807be474d41f2 Mon Sep 17 00:00:00 2001
From: hyungi <hyun49196@gmail.com>
Date: Thu, 11 Jun 2026 17:19:35 +0900
Subject: [PATCH] =?UTF-8?q?ops(ai)!:=20=EB=A7=A5=EB=AF=B8=EB=8B=88=20?=
 =?UTF-8?q?=EC=83=9D=EC=84=B1=20=EB=AA=A8=EB=8D=B8=20Qwen3.6-27B-6bit=20?=
 =?UTF-8?q?=EC=A0=84=ED=99=98=20+=20=EC=83=9D=EC=84=B1=20LLM=20=ED=99=80?=
 =?UTF-8?q?=EB=93=9C=20=ED=95=B4=EC=A0=9C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

B안(사용자 2026-06-11): Gemma 26B-A4B → Qwen3.6-27B-6bit 풀교체.
- config.yaml triage/primary model 교체 + dense 감속 반영 timeout 상향(30→120/180→300)
- held_stages [] (홀드 해제 — 적체 자연 드레인, deep_summary 는 primary 복귀)
- eid deep 모드 = mac-mini-default 재지정(맥북 백지화). llm_gate '예외 없이 gate' invariant 에
  따라 deep 도 alias 조건으로 자동 게이트 (구 무게이트 = 맥북 별 endpoint 예외였음)
- deep probe 실패 reason = router_unreachable 로 정정 + 테스트 동기화
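잔여(별 PR): ask 표면 qwen-macbook 옵션/백엔드 클래스/처리보드 맥북 카드 정리,
  EidAIClient pre-stream 503 reason 'macbook_unavailable' 명칭 정리
  (test_prestream_503_maps_reason 이 backend_name=mac-mini-default 와 함께 여전히 단언 중)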

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 app/api/eid_chat.py               | 11 +++++-----
 app/eid/ai.py                     | 17 +++++++++-------
 config.yaml                       | 34 +++++++++++++++----------------
 tests/eid/test_eid_chat_deep.py   |  4 ++--
 tests/eid/test_eid_chat_stream.py | 10 ++++-----
 5 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/app/api/eid_chat.py b/app/api/eid_chat.py
index 4b4cf77..4639466 100644
--- a/app/api/eid_chat.py
+++ b/app/api/eid_chat.py
@@ -2,8 +2,9 @@
 
 확정 결정:
   - D-1 경로 = /api/eid/chat (main.py prefix=/api/eid + 본 라우터 POST /chat)
-  - D-2 mode 닫힌 어휘: daily(mac-mini-default) / deep(qwen-macbook). 클라는 mode 만 보냄 —
-    claude-cloud / auto 금지 (Literal 로 422 차단). 심층(deep) 모드 무게이트.
+  - D-2 mode 닫힌 어휘: daily / deep — 둘 다 mac-mini-default (맥북 백지화 2026-06-11,
+    맥미니 Qwen 27B 단일 호스트. deep = ReAct 자동검색 모드 구분). 클라는 mode 만 보냄 —
+    claude-cloud / auto 금지 (Literal 로 422 차단). 게이트 = alias 기준 자동 적용(무게이트 폐지).
   - D-3 독립 /chat 라우트 (frontend) — 본 모듈은 백엔드 API 만.
   - D-5 LLM 호출 = EidAIClient.call_stream 한 곳 (이드 egress 봉쇄 불변식 #5,
     RouterBackend 직접 호출 금지).
@@ -43,7 +44,7 @@ logger = setup_logger("eid_chat")
 
 router = APIRouter()
 
-# ── ds-eid-ask-absorb P1: deep 모드 = ReAct 자동검색 (qwen-macbook 27B) ──
+# ── ds-eid-ask-absorb P1: deep 모드 = ReAct 자동검색 (맥미니 Qwen 27B, 2026-06-11~) ──
-# 비생성 reachability probe — router 도달만 확인(coarse). 27B(맥북) 자체 미가용은
+# 비생성 reachability probe — router 도달만 확인(coarse). 27B(맥미니) 자체 미가용은
 # 첫 generate_with_tools 호출의 BackendUnavailable → mid-stream error envelope 로 커버
 # (plan: probe 정밀도 불필요, TOCTOU 는 in-stream error 가 처리). ~2s 타임아웃·생성 슬롯 비점유.
@@ -160,10 +161,10 @@ async def _eid_chat_deep(body: ChatRequest, session: AsyncSession) -> StreamingR
     """
     # ① 첫 SSE 바이트(=HTTP 200 확정) 전 비생성 probe — router 도달 실패 시 503 (재매핑 가능 구간)
     if not await _probe_router_reachable():
-        return _backend_unavailable_response(body, "macbook_unavailable", "qwen-macbook")
+        return _backend_unavailable_response(body, "router_unreachable", "mac-mini-default")
 
     query = body.messages[-1].content  # 메시지 단독 처리 (마지막 user 턴)
-    backend = get_backend("qwen-macbook")
+    backend = get_backend("mac-mini-default")
 
     async def _stream() -> AsyncIterator[bytes]:
         # ② phase:searching 방출 = HTTP 200 확정. 이후 미가용은 503 불가 → in-stream error.
diff --git a/app/eid/ai.py b/app/eid/ai.py
index aeb3603..24892b2 100644
--- a/app/eid/ai.py
+++ b/app/eid/ai.py
@@ -29,16 +29,19 @@ import httpx
 from ai.client import AIClient
 from services.llm.backends import (
     MAC_MINI_DEFAULT,
-    QWEN_MACBOOK,
     BackendUnavailable,
     _router_url,  # router URL 단일 출처 재사용 (settings → env LLM_ROUTER_URL → MVP default)
 )
 from services.search.llm_gate import Priority, acquire_mlx_gate
 
 # 이드 채팅 mode → router alias 닫힌 매핑 (D-2). 클라는 mode 만 보냄 — claude-cloud/auto 금지.
+# 2026-06-11 맥북 백지화: deep 도 mac-mini-default (맥미니 Qwen 27B 단일 호스트).
+# mode 구분은 유지 — deep = ReAct 자동검색 경로(모델이 아니라 동작이 다름).
+# 게이트는 alias==MAC_MINI_DEFAULT 조건이라 deep 도 자동으로 mlx gate 적용
+# (llm_gate "예외 없이 gate 획득 필수" invariant 충족 — 구 무게이트는 맥북 예외였음).
 _CHAT_ALIAS: dict[str, str] = {
-    "daily": MAC_MINI_DEFAULT,  # router tier_b → Mac mini :8801 gemma-4-26b
-    "deep": QWEN_MACBOOK,       # router named upstream → M5 Max Qwen3.6-27B (무게이트, D-2)
+    "daily": MAC_MINI_DEFAULT,  # router tier_b → Mac mini :8801
+    "deep": MAC_MINI_DEFAULT,   # 맥북 폐기로 동일 upstream — ReAct 검색 모드 구분만 유지
 }
 
 # read 는 per-chunk 적용이라 MacBook wake(24s)+토큰 생성 간격 커버. connect 는 내부 router 라 짧게.
@@ -161,10 +164,10 @@ class EidAIClient(AIClient):
         _rewrite_sse_line 으로 model 치환(mode 어휘)·usage 제거만 하고 프레이밍은 보존.
         취소/disconnect 시 AsyncExitStack 이 response·client 정리(upstream 닫힘 보장).
 
-        daily(mac-mini-default)는 Mac mini MLX 단일 inference 영구 룰(llm_gate docstring
-        "예외 없이 gate 획득 필수")에 따라 acquire_mlx_gate(FOREGROUND) 안에서 스트리밍 —
-        RouterBackend 의 requires_gate=True 와 동일한 client-side mutex 효과.
-        deep(qwen-macbook)은 별 endpoint 라 무게이트 (D-2, RouterBackend 동형).
+        daily/deep 모두 mac-mini-default(2026-06-11 맥북 백지화) → Mac mini MLX 단일
+        inference 영구 룰(llm_gate docstring "예외 없이 gate 획득 필수")에 따라
+        acquire_mlx_gate(FOREGROUND) 안에서 스트리밍 — 게이트 조건이 alias 기준이라
+        deep 도 자동 적용 (구 무게이트는 맥북 별 endpoint 시절 예외였음).
 
         중계 전체(업스트림 진입~종료)는 asyncio.timeout(_STREAM_DEADLINE_S) wall-clock
         deadline 안 — llm_gate 계약 "timeout 은 gate 안쪽" 준수(gate 대기엔 미적용).
diff --git a/config.yaml b/config.yaml
index 20d59d2..58a2c45 100644
--- a/config.yaml
+++ b/config.yaml
@@ -6,25 +6,26 @@ ai:
 
   models:
     # ─── 단일 generation 호스트 routing (2026-05-14 GPU LLM 제거) ───
-    # GPU Ollama gemma4:e4b-it-q8_0 제거. Mac mini 26B-A4B 가 triage + primary + classifier 모두 흡수.
-    # fallback 은 Claude Sonnet 4 API (Mac mini 다운 시 자동 trigger, premium 과 budget 공유).
-    # plan: ~/.claude/plans/rosy-launching-otter.md §C/§D/§E
+    # 2026-06-11 B안: 맥미니 모델 = Gemma 26B-A4B → Qwen3.6-27B-6bit 풀교체 (사용자 결정).
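+    # dense 27B 라 디코드 ~13 tok/s 급 (a4b ~42 대비 감속) → timeout 상향 (triage 30→120, primary 180→300). NOTE(review): 13 tok/s 기준 120s≈1.5k tok / 300s≈3.9k tok 로 max_tokens(4096/8192) 미달 — 장문 출력 timeout 여부 확인 필요.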
+    # fallback 은 Claude Sonnet 4 API (CLAUDE_API_KEY 미주입 = 비활성).
+    # plan: ~/.claude/plans/rosy-launching-otter.md §C/§D/§E + project_macmini_model_decision
 
-    # triage: 상시 분류·요약·근거 선별. Mac mini 26B (primary 와 동일 endpoint, 짧은 max_tokens).
+    # triage: 상시 분류·요약·근거 선별. Mac mini Qwen 27B (primary 와 동일 endpoint, 짧은 max_tokens).
     triage:
       endpoint: "http://100.76.254.116:8801/v1/chat/completions"
-      model: "mlx-community/gemma-4-26b-a4b-it-8bit"
+      model: "mlx-community/Qwen3.6-27B-6bit"
       max_tokens: 4096
-      timeout: 30
+      timeout: 120
       context_char_limit: 120000
       temperature: 0.0
 
-    # primary: 에스컬레이션 전용. 26B MLX (맥미니 Semaphore(1) 보호 대상).
+    # primary: 에스컬레이션 전용. Qwen 27B MLX (맥미니 Semaphore(1) 보호 대상).
     primary:
       endpoint: "http://100.76.254.116:8801/v1/chat/completions"
-      model: "mlx-community/gemma-4-26b-a4b-it-8bit"
+      model: "mlx-community/Qwen3.6-27B-6bit"
       max_tokens: 8192
-      timeout: 180
+      timeout: 300
       context_char_limit: 260000
       temperature: 0.3
       top_p: 0.9
@@ -177,13 +178,10 @@ schedule:
   file_watcher_interval_minutes: 5
   queue_consumer_interval_minutes: 10
 
-# 생성 LLM 홀드 (2026-06-11, 사용자 지시): 맥미니 모델 확정까지 생성 LLM 소비 스테이지 보류.
-# - 큐: classify(triage)/summarize/deep_summary — claim 자체를 안 함 (attempts 미소모, pending 적체 = 의도)
-# - cron/컨슈머: digest(global 04:00), briefing(05:10), study_explanation/study_session_analysis/
-#   study_memo_card (1분 컨슈머)
-# - 무영향: extract/embed/chunk/markdown/stt/preview/thumbnail/fulltext, 수집기 전부,
-#   인터랙티브(ask/eid chat), daily_digest(LLM 미사용)
-# 유효 키 = 위 8개 — 그 외 문자열은 무동작(오타 주의). 해제 = held_stages: [] 후 fastapi 재기동.
+# 생성 LLM 홀드 게이트 (2026-06-11 신설): held_stages 에 든 이름의 컨슈머/워커는 claim 자체를
+# 하지 않는다 (attempts 미소모, pending 적체). 유효 키 8 = classify/summarize/deep_summary(큐) +
+# digest/briefing(cron) + study_explanation/study_session_analysis/study_memo_card(컨슈머).
+# 그 외 문자열은 무동작(오타 주의). 적용/해제 = 리스트 수정 후 fastapi 재기동.
+# 이력: 2026-06-11 맥미니 모델 확정까지 8키 홀드 → 같은 날 Qwen3.6-27B-6bit 전환과 함께 해제([]).
 pipeline:
-  held_stages: ["classify", "summarize", "deep_summary", "digest", "briefing",
-                "study_explanation", "study_session_analysis", "study_memo_card"]
+  held_stages: []
diff --git a/tests/eid/test_eid_chat_deep.py b/tests/eid/test_eid_chat_deep.py
index 7ca89be..6ffce26 100644
--- a/tests/eid/test_eid_chat_deep.py
+++ b/tests/eid/test_eid_chat_deep.py
@@ -126,11 +126,11 @@ async def test_deep_conversational_no_sources(client, monkeypatch):
 
 @pytest.mark.asyncio
 async def test_deep_probe_fail_503(client, monkeypatch):
-    """probe 실패(router 미도달) → 첫 바이트 전 503 macbook_unavailable."""
+    """probe 실패(router 미도달) → 첫 바이트 전 503 router_unreachable."""
     monkeypatch.setattr(eid_chat, "_probe_router_reachable", _async_false)
     r = await client.post("/api/eid/chat", json=_DEEP)
     assert r.status_code == 503
-    assert r.json()["error_reason"] == "macbook_unavailable"
+    assert r.json()["error_reason"] == "router_unreachable"
 
 
 @pytest.mark.asyncio
diff --git a/tests/eid/test_eid_chat_stream.py b/tests/eid/test_eid_chat_stream.py
index f0a1635..f7890b7 100644
--- a/tests/eid/test_eid_chat_stream.py
+++ b/tests/eid/test_eid_chat_stream.py
@@ -104,7 +104,7 @@ async def test_anthropic_router_url_blocked(monkeypatch):
 
 @pytest.mark.asyncio
 async def test_deep_mode_alias_and_sse_line_rewrite(monkeypatch):
-    """deep → qwen-macbook alias, system 은 messages[0] 단일 주입, 라인 단위 정화 중계."""
+    """deep → mac-mini-default alias (맥북 백지화 2026-06-11), system 은 messages[0] 단일 주입, 라인 단위 정화 중계."""
     seen: dict = {}
 
     def handler(request: httpx.Request) -> httpx.Response:
@@ -139,7 +139,7 @@ async def test_deep_mode_alias_and_sse_line_rewrite(monkeypatch):
     ]
     assert seen["url"].endswith("/v1/chat/completions")
     body = seen["json"]
-    assert body["model"] == "qwen-macbook"
+    assert body["model"] == "mac-mini-default"
     assert body["stream"] is True
     assert body["max_tokens"] == 2048
     assert body["temperature"] == 0.4
@@ -202,7 +202,7 @@ async def test_prestream_503_maps_reason(monkeypatch):
         with pytest.raises(BackendUnavailable) as ei:
             await anext(stream)
         assert ei.value.reason == "macbook_unavailable"
-        assert ei.value.backend_name == "qwen-macbook"
+        assert ei.value.backend_name == "mac-mini-default"
     finally:
         await c.close()
 
@@ -253,7 +253,7 @@ async def test_prestream_400_raises_valueerror_failloud(monkeypatch):
     c = EidAIClient()
     try:
         stream = c.call_stream("deep", _MSG, "sys")
-        with pytest.raises(ValueError, match="router rejected alias='qwen-macbook'"):
+        with pytest.raises(ValueError, match="router rejected alias='mac-mini-default'"):
             await anext(stream)
     finally:
         await c.close()
@@ -290,7 +290,7 @@ async def test_stream_deadline_exceeded(monkeypatch):
             async for _ in stream:
                 pass
         assert ei.value.reason == "stream_deadline_exceeded"
-        assert ei.value.backend_name == "qwen-macbook"
+        assert ei.value.backend_name == "mac-mini-default"
     finally:
         await c.close()