feat: NanoClaude Phase 2 — EXAONE→Gemma 파이프라인, 큐, 상태 API

- ModelAdapter: 범용 OpenAI-compat 어댑터 (stream/complete/health)
- BackendRegistry: rewriter(EXAONE) + reasoner(Gemma4) 헬스체크 루프
- 2단계 파이프라인: EXAONE rewrite → Gemma reasoning (SSE rewrite 이벤트 노출)
- Fallback: 맥미니 다운 시 EXAONE 단독 모드, stream 중간 실패 시 자동 전환
- Cancel-safe: rewrite 전/후, streaming loop 내, fallback 경로 모두 체크
- Rewrite heartbeat: complete_chat 대기 중 2초 간격 processing 이벤트
- JobQueue: Semaphore(3) 기반 동시성 제한, 정확한 queue position
- GET /chat/{job_id}/status, GET /queue/stats 엔드포인트
- DB: rewrite_model, reasoning_model, rewritten_message 컬럼 추가

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hyungi Ahn
2026-04-06 12:04:15 +09:00
parent 8c41a5dead
commit c4c32170f1
14 changed files with 495 additions and 141 deletions

View File

@@ -0,0 +1,93 @@
"""BackendRegistry — 모델 어댑터 관리 + 헬스체크 루프."""
from __future__ import annotations
import asyncio
import logging
import time
from services.model_adapter import ModelAdapter
logger = logging.getLogger(__name__)
# System prompt for the rewrite stage (EXAONE): restructure the user's
# question to clarify intent and output ONLY the rewritten question —
# no explanations, no answering.
REWRITER_PROMPT = (
    "너는 질문 재구성 전문가다. "
    "사용자의 질문을 분석하여 의도를 명확히 하고, 구조화된 질문으로 재작성하라. "
    "재구성된 질문만 출력하라. 부연 설명이나 답변은 절대 하지 마라."
)
# System prompt for the reasoning stage (Gemma): produce a structured,
# accurate answer, with examples where helpful.
REASONER_PROMPT = (
    "너는 NanoClaude, 사용자의 질문에 구조화되고 정확한 답변을 제공하는 AI 어시스턴트다. "
    "논리적으로 사고하고, 명확하게 설명하며, 필요시 예시를 포함하라."
)
class BackendRegistry:
    """Holds the model adapter for each pipeline role and runs a background
    health-check loop over them.

    Roles:
        "rewriter" — EXAONE adapter (question rewriting)
        "reasoner" — Gemma adapter (final answer generation)
    """

    def __init__(self) -> None:
        self.rewriter: ModelAdapter | None = None
        self.reasoner: ModelAdapter | None = None
        # Last observed health/latency per role; start pessimistic (down, 0 ms).
        self._health: dict[str, bool] = {"rewriter": False, "reasoner": False}
        self._latency: dict[str, float] = {"rewriter": 0.0, "reasoner": 0.0}
        self._health_task: asyncio.Task | None = None

    def init_from_settings(self, settings) -> None:
        """Build both adapters from application settings."""
        self.rewriter = ModelAdapter(
            name="EXAONE",
            base_url=settings.exaone_base_url,
            model=settings.exaone_model,
            system_prompt=REWRITER_PROMPT,
            temperature=settings.exaone_temperature,
            timeout=settings.exaone_timeout,
        )
        self.reasoner = ModelAdapter(
            name="Gemma4",
            base_url=settings.reasoning_base_url,
            model=settings.reasoning_model,
            system_prompt=REASONER_PROMPT,
            temperature=settings.reasoning_temperature,
            timeout=settings.reasoning_timeout,
        )

    def start_health_loop(self, interval: float = 30.0) -> None:
        """Start the periodic health-check task.

        Fix: calling this twice previously leaked the first task; any
        still-running loop is now cancelled before a new one is created.
        """
        if self._health_task and not self._health_task.done():
            self._health_task.cancel()
        self._health_task = asyncio.create_task(self._health_loop(interval))

    def stop_health_loop(self) -> None:
        """Cancel the health-check task, if one is running, and drop the ref."""
        if self._health_task and not self._health_task.done():
            self._health_task.cancel()
        self._health_task = None

    async def _health_loop(self, interval: float) -> None:
        """Probe all adapters forever, sleeping *interval* seconds per round."""
        while True:
            await self._check_all()
            await asyncio.sleep(interval)

    async def _check_all(self) -> None:
        """Check each configured adapter, recording health and latency (ms).

        Logs only on state transitions (UP <-> DOWN) to avoid log spam.
        """
        for role, adapter in [("rewriter", self.rewriter), ("reasoner", self.reasoner)]:
            if not adapter:
                continue
            start = time.monotonic()
            healthy = await adapter.health_check()
            elapsed = round((time.monotonic() - start) * 1000, 1)
            prev = self._health[role]
            self._health[role] = healthy
            self._latency[role] = elapsed
            if prev != healthy:
                status = "UP" if healthy else "DOWN"
                logger.warning("%s (%s) → %s (%.0fms)", adapter.name, role, status, elapsed)

    def is_healthy(self, role: str) -> bool:
        """Last recorded health for *role*; unknown roles report False."""
        return self._health.get(role, False)

    def health_summary(self) -> dict:
        """Per-role {name, model, healthy, latency_ms} for configured adapters."""
        result = {}
        for role, adapter in [("rewriter", self.rewriter), ("reasoner", self.reasoner)]:
            if adapter:
                result[role] = {
                    "name": adapter.name,
                    "model": adapter.model,
                    "healthy": self._health[role],
                    "latency_ms": self._latency[role],
                }
        return result


# Module-level singleton shared by the app.
backend_registry = BackendRegistry()

View File

@@ -1,90 +0,0 @@
"""EXAONE Adapter — Ollama OpenAI-compat endpoint를 통한 EXAONE 호출."""
from __future__ import annotations
import logging
from collections.abc import AsyncGenerator
import httpx
from config import settings
logger = logging.getLogger(__name__)
# System prompt for the single-model EXAONE adapter: understand the
# question's intent and produce a structured, clear answer directly.
SYSTEM_PROMPT = (
    "너는 NanoClaude, 사용자의 질문을 이해하고 정리하여 명확한 답변을 제공하는 AI 어시스턴트다. "
    "사용자의 질문 의도를 파악하고, 문장을 정리하며, 구조화된 응답을 생성한다."
)
async def stream_chat(message: str) -> AsyncGenerator[str, None]:
    """Stream an EXAONE chat completion, yielding content deltas.

    Parses OpenAI-compatible SSE lines. Transport errors are reported by
    yielding a single "[Error] ..." string instead of raising, so the SSE
    consumer always receives some output.
    """
    # Fix: `import json` previously sat inside the per-chunk parse loop and
    # was re-executed for every SSE line; hoisted to the function top.
    import json

    payload = {
        "model": settings.exaone_model,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": message},
        ],
        "stream": True,
        "temperature": settings.exaone_temperature,
    }
    async with httpx.AsyncClient(timeout=settings.exaone_timeout) as client:
        try:
            async with client.stream(
                "POST",
                f"{settings.exaone_base_url}/v1/chat/completions",
                json=payload,
            ) as resp:
                if resp.status_code != 200:
                    body = await resp.aread()
                    logger.error("EXAONE error %d: %s", resp.status_code, body.decode())
                    yield f"[Error] EXAONE 응답 실패 ({resp.status_code})"
                    return
                async for line in resp.aiter_lines():
                    line = line.strip()
                    if not line or not line.startswith("data: "):
                        continue
                    payload_str = line[len("data: "):]
                    if payload_str == "[DONE]":
                        return
                    # Extract the content delta from the OpenAI-format chunk;
                    # malformed chunks are skipped rather than aborting the stream.
                    try:
                        chunk = json.loads(payload_str)
                        delta = chunk.get("choices", [{}])[0].get("delta", {})
                        content = delta.get("content", "")
                        if content:
                            yield content
                    except (json.JSONDecodeError, IndexError, KeyError):
                        continue
        except httpx.ConnectError:
            logger.error("EXAONE connection failed: %s", settings.exaone_base_url)
            yield "[Error] EXAONE 서버에 연결할 수 없습니다."
        except httpx.ReadTimeout:
            logger.error("EXAONE read timeout")
            yield "[Error] EXAONE 응답 시간이 초과되었습니다."
async def complete_chat(message: str) -> str:
    """Call EXAONE without streaming and return the full response text."""
    request_body = {
        "model": settings.exaone_model,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": message},
        ],
        "stream": False,
        "temperature": settings.exaone_temperature,
    }
    endpoint = f"{settings.exaone_base_url}/v1/chat/completions"
    async with httpx.AsyncClient(timeout=settings.exaone_timeout) as client:
        response = await client.post(endpoint, json=request_body)
        # Surface HTTP-level failures to the caller as httpx.HTTPStatusError.
        response.raise_for_status()
        parsed = response.json()
    return parsed["choices"][0]["message"]["content"]

View File

@@ -17,6 +17,8 @@ class Job:
status: JobStatus = JobStatus.queued
created_at: float = field(default_factory=time)
task: asyncio.Task | None = field(default=None, repr=False)
pipeline: bool = True
rewritten_message: str = ""
class JobManager:

View File

@@ -0,0 +1,57 @@
"""JobQueue — Semaphore 기반 동시성 제한."""
from __future__ import annotations
import asyncio
import logging
from services import worker
from services.job_manager import Job, job_manager
from services.state_stream import state_stream
logger = logging.getLogger(__name__)
class JobQueue:
    """Bounded-concurrency job runner.

    At most *max_concurrent* jobs execute at once; the rest wait in FIFO
    order and can report their 1-based queue position.
    """

    def __init__(self, max_concurrent: int = 3) -> None:
        self._semaphore = asyncio.Semaphore(max_concurrent)
        self._waiting: list[str] = []  # queued job_ids, FIFO (position source)
        self._active: set[str] = set()  # job_ids currently holding a slot

    async def submit(self, job: Job) -> asyncio.Task:
        """Schedule *job* and attach its task to the manager for cancellation."""
        task = asyncio.create_task(self._run_with_semaphore(job))
        job_manager.attach_task(job.id, task)
        return task

    async def _run_with_semaphore(self, job: Job) -> None:
        self._waiting.append(job.id)
        pos = self.position(job.id)
        if pos and pos > 0:
            await state_stream.push(job.id, "queued", {"position": pos})
        try:
            async with self._semaphore:
                self._waiting.remove(job.id)
                self._active.add(job.id)
                await worker.run(job)
        finally:
            # Fix: if this task is cancelled while awaiting the semaphore,
            # job.id previously stayed in _waiting forever, inflating every
            # later job's reported queue position.
            if job.id in self._waiting:
                self._waiting.remove(job.id)
            self._active.discard(job.id)

    def position(self, job_id: str) -> int | None:
        """1-based position of *job_id* in the wait list, or None if absent."""
        try:
            return self._waiting.index(job_id) + 1
        except ValueError:
            return None

    @property
    def stats(self) -> dict:
        """Snapshot of queue depth: pending (waiting) and active counts."""
        return {"pending": len(self._waiting), "active": len(self._active)}
# Module-level singleton; populated by init_queue() at application startup.
job_queue: JobQueue | None = None


def init_queue(max_concurrent: int = 3) -> JobQueue:
    """Create the global JobQueue singleton and return it."""
    global job_queue
    queue = JobQueue(max_concurrent)
    job_queue = queue
    return queue

View File

@@ -0,0 +1,109 @@
"""ModelAdapter — 범용 OpenAI-compat 모델 어댑터."""
from __future__ import annotations
import json
import logging
from collections.abc import AsyncGenerator
import httpx
logger = logging.getLogger(__name__)
class ModelAdapter:
    """Generic adapter for OpenAI-compatible /v1/chat/completions backends.

    Ollama, MLX, etc. are all driven through the same interface.
    """

    def __init__(
        self,
        name: str,
        base_url: str,
        model: str,
        system_prompt: str,
        temperature: float = 0.7,
        timeout: float = 120.0,
    ):
        self.name = name  # human-readable backend name, used in logs
        self.base_url = base_url  # server root, without the /v1/... path
        self.model = model
        self.system_prompt = system_prompt
        self.temperature = temperature
        self.timeout = timeout  # request timeout in seconds

    def _payload(self, message: str, stream: bool) -> dict:
        """Build the chat-completions request body.

        Shared by stream_chat() and complete_chat(), which previously
        duplicated this dict inline.
        """
        return {
            "model": self.model,
            "messages": [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": message},
            ],
            "stream": stream,
            "temperature": self.temperature,
        }

    async def stream_chat(self, message: str) -> AsyncGenerator[str, None]:
        """Streaming call; yields content chunks.

        Raises RuntimeError on a non-200 response and re-raises connect /
        read-timeout errors so callers can fall back to another backend.
        """
        payload = self._payload(message, stream=True)
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            try:
                async with client.stream(
                    "POST",
                    f"{self.base_url}/v1/chat/completions",
                    json=payload,
                ) as resp:
                    if resp.status_code != 200:
                        body = await resp.aread()
                        logger.error("%s error %d: %s", self.name, resp.status_code, body.decode())
                        raise RuntimeError(f"{self.name} 응답 실패 ({resp.status_code})")
                    async for line in resp.aiter_lines():
                        line = line.strip()
                        if not line or not line.startswith("data: "):
                            continue
                        payload_str = line[len("data: "):]
                        if payload_str == "[DONE]":
                            return
                        # Malformed chunks are skipped, not fatal.
                        try:
                            chunk = json.loads(payload_str)
                            delta = chunk.get("choices", [{}])[0].get("delta", {})
                            content = delta.get("content", "")
                            if content:
                                yield content
                        except (json.JSONDecodeError, IndexError, KeyError):
                            continue
            except httpx.ConnectError:
                logger.error("%s connection failed: %s", self.name, self.base_url)
                raise
            except httpx.ReadTimeout:
                logger.error("%s read timeout", self.name)
                raise

    async def complete_chat(self, message: str) -> str:
        """Non-streaming call; returns the full response text.

        Raises httpx.HTTPStatusError on non-2xx responses.
        """
        payload = self._payload(message, stream=False)
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/v1/chat/completions",
                json=payload,
            )
            resp.raise_for_status()
            data = resp.json()
            return data["choices"][0]["message"]["content"]

    async def health_check(self) -> bool:
        """GET /v1/models with a 3-second timeout.

        Any status below 500 counts as alive; connection errors and
        timeouts report unhealthy.
        """
        try:
            async with httpx.AsyncClient(timeout=3.0) as client:
                resp = await client.get(f"{self.base_url}/v1/models")
                return resp.status_code < 500
        except Exception:
            return False

View File

@@ -1,4 +1,4 @@
"""Worker — background task that drives EXAONE call and pushes SSE events."""
"""Worker — 2단계 파이프라인: EXAONE rewrite → Gemma reasoning (cancel-safe + fallback)."""
from __future__ import annotations
@@ -9,23 +9,66 @@ from time import time
from config import settings
from db.database import log_completion, log_request
from models.schemas import JobStatus
from services.exaone_adapter import stream_chat
from services.backend_registry import backend_registry
from services.job_manager import Job, job_manager
from services.state_stream import state_stream
logger = logging.getLogger(__name__)
# 무응답 방지: 3~5초 간격으로 processing heartbeat
HEARTBEAT_INTERVAL = 4.0
REWRITE_HEARTBEAT = 2.0
MAX_REWRITE_LENGTH = 1000
async def _complete_with_heartbeat(adapter, message: str, job_id: str) -> str:
"""complete_chat + heartbeat 병행. rewrite 대기 중 사용자 체감 멈춤 방지."""
result_holder: dict[str, str] = {}
exc_holder: list[Exception] = []
async def call():
try:
result_holder["text"] = await adapter.complete_chat(message)
except Exception as e:
exc_holder.append(e)
task = asyncio.create_task(call())
while not task.done():
await asyncio.sleep(REWRITE_HEARTBEAT)
if not task.done():
await state_stream.push(job_id, "processing", {"message": "질문을 재구성하고 있습니다..."})
if exc_holder:
raise exc_holder[0]
return result_holder.get("text", "")
async def _stream_with_cancel(adapter, message: str, job: Job, collected: list[str]) -> bool:
"""스트리밍 + cancel 체크. 정상 완료 시 True, cancel 시 False."""
last_heartbeat = asyncio.get_event_loop().time()
async for chunk in adapter.stream_chat(message):
if job.status == JobStatus.cancelled:
return False
collected.append(chunk)
await state_stream.push(job.id, "result", {"content": chunk})
now = asyncio.get_event_loop().time()
if now - last_heartbeat >= HEARTBEAT_INTERVAL:
await state_stream.push(job.id, "processing", {"message": "응답 생성 중..."})
last_heartbeat = now
return True
async def run(job: Job) -> None:
"""EXAONE 호출 → SSE 이벤트 발행."""
"""EXAONE rewrite → Gemma reasoning 파이프라인 (fallback + cancel-safe)."""
start_time = time()
rewrite_model = None
reasoning_model = None
rewritten_message = ""
# DB 로깅: 요청 기록
try:
await log_request(job.id, job.message, settings.exaone_model, job.created_at)
await log_request(job.id, job.message, "pipeline", job.created_at)
except Exception:
logger.warning("Failed to log request for job %s", job.id, exc_info=True)
@@ -34,49 +77,91 @@ async def run(job: Job) -> None:
await state_stream.push(job.id, "ack", {"message": "요청을 확인했습니다. 분석을 시작합니다."})
job_manager.set_status(job.id, JobStatus.processing)
# --- Processing + Streaming ---
await state_stream.push(job.id, "processing", {"message": "EXAONE 모델이 응답을 생성하고 있습니다..."})
# --- Cancel 체크 #1 ---
if job.status == JobStatus.cancelled:
return
use_pipeline = settings.pipeline_enabled and backend_registry.is_healthy("reasoner")
collected: list[str] = []
last_heartbeat = asyncio.get_event_loop().time()
async for chunk in stream_chat(job.message):
if not use_pipeline:
# === EXAONE 단독 모드 (Phase 1 fallback) ===
rewrite_model = backend_registry.rewriter.model
await state_stream.push(job.id, "processing", {"message": "EXAONE 모델이 응답을 생성하고 있습니다..."})
ok = await _stream_with_cancel(backend_registry.rewriter, job.message, job, collected)
if not ok:
return
else:
# === 파이프라인 모드: EXAONE rewrite → Gemma reasoning ===
rewrite_model = backend_registry.rewriter.model
reasoning_model = backend_registry.reasoner.model
# --- Rewrite ---
await state_stream.push(job.id, "processing", {"message": "질문을 재구성하고 있습니다..."})
rewrite_start = time()
try:
rewritten_message = await _complete_with_heartbeat(
backend_registry.rewriter, job.message, job.id
)
rewritten_message = rewritten_message[:MAX_REWRITE_LENGTH]
except Exception:
logger.warning("Rewrite failed for job %s, using original message", job.id)
rewritten_message = job.message
rewrite_latency = (time() - rewrite_start) * 1000
job.rewritten_message = rewritten_message
# --- Rewrite 결과 SSE 노출 ---
await state_stream.push(job.id, "rewrite", {"content": rewritten_message})
# --- Cancel 체크 #2 ---
if job.status == JobStatus.cancelled:
logger.info("Job %s cancelled during streaming", job.id)
await state_stream.push(job.id, "error", {"message": "작업이 취소되었습니다."})
latency_ms = (time() - start_time) * 1000
try:
await log_completion(job.id, "cancelled", len("".join(collected)), latency_ms, time())
except Exception:
pass
return
collected.append(chunk)
# --- Reasoning ---
await state_stream.push(job.id, "processing", {"message": "Gemma 4가 응답을 생성하고 있습니다..."})
# Stream partial result
await state_stream.push(job.id, "result", {"content": chunk})
try:
ok = await _stream_with_cancel(backend_registry.reasoner, rewritten_message, job, collected)
if not ok:
return
except Exception:
# Gemma streaming 중간 실패 → EXAONE fallback
logger.warning("Reasoner failed for job %s, falling back to rewriter", job.id, exc_info=True)
# Heartbeat: 긴 침묵 방지
now = asyncio.get_event_loop().time()
if now - last_heartbeat >= HEARTBEAT_INTERVAL:
await state_stream.push(job.id, "processing", {"message": "응답 생성 중..."})
last_heartbeat = now
if job.status == JobStatus.cancelled:
return
await state_stream.push(job.id, "processing", {"message": "모델 전환 중..."})
reasoning_model = rewrite_model # fallback 기록
ok = await _stream_with_cancel(backend_registry.rewriter, job.message, job, collected)
if not ok:
return
# --- Complete ---
if not collected:
job_manager.set_status(job.id, JobStatus.failed)
await state_stream.push(job.id, "error", {"message": "EXAONE으로부터 응답을 받지 못했습니다."})
await state_stream.push(job.id, "error", {"message": "응답을 받지 못했습니다."})
status = "failed"
else:
job_manager.set_status(job.id, JobStatus.completed)
await state_stream.push(job.id, "done", {"message": "완료"})
status = "completed"
# DB 로깅: 완료 기록
# --- DB 로깅 ---
latency_ms = (time() - start_time) * 1000
response_text = "".join(collected)
try:
await log_completion(job.id, status, len(response_text), latency_ms, time())
await log_completion(
job.id, status, len(response_text), latency_ms, time(),
rewrite_model=rewrite_model,
reasoning_model=reasoning_model,
rewritten_message=rewritten_message,
rewrite_latency_ms=rewrite_latency if use_pipeline else 0,
)
except Exception:
logger.warning("Failed to log completion for job %s", job.id, exc_info=True)