feat: NanoClaude Phase 2 — EXAONE→Gemma 파이프라인, 큐, 상태 API
- ModelAdapter: 범용 OpenAI-compat 어댑터 (stream/complete/health)
- BackendRegistry: rewriter(EXAONE) + reasoner(Gemma4) 헬스체크 루프
- 2단계 파이프라인: EXAONE rewrite → Gemma reasoning (SSE rewrite 이벤트 노출)
- Fallback: 맥미니 다운 시 EXAONE 단독 모드, stream 중간 실패 시 자동 전환
- Cancel-safe: rewrite 전/후, streaming loop 내, fallback 경로 모두 체크
- Rewrite heartbeat: complete_chat 대기 중 2초 간격 processing 이벤트
- JobQueue: Semaphore(3) 기반 동시성 제한, 정확한 queue position
- GET /chat/{job_id}/status, GET /queue/stats 엔드포인트
- DB: rewrite_model, reasoning_model, rewritten_message 컬럼 추가
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -17,4 +17,8 @@ DB_PATH=/app/data/gateway.db
|
||||
|
||||
# NanoClaude
|
||||
EXAONE_MODEL=exaone3.5:7.8b-instruct-q8_0
|
||||
REASONING_BASE_URL=http://192.168.1.122:8800
|
||||
REASONING_MODEL=mlx-community/gemma-4-26b-a4b-it-8bit
|
||||
PIPELINE_ENABLED=true
|
||||
MAX_CONCURRENT_JOBS=3
|
||||
NANOCLAUDE_API_KEY=
|
||||
|
||||
@@ -16,7 +16,8 @@ GPU 서버(RTX 4070 Ti Super)에서 운영하는 중앙 AI 라우팅 서비스.
|
||||
|
||||
- GPU Ollama: host.docker.internal:11434
|
||||
- 맥미니 Ollama: 100.115.153.119:11434
|
||||
- NanoClaude: localhost:8100 (비동기 job 기반 AI Gateway)
|
||||
- 맥미니 MLX: 192.168.1.122:8800 (Gemma 4)
|
||||
- NanoClaude: localhost:8100 (EXAONE → Gemma 파이프라인)
|
||||
|
||||
## 개발
|
||||
|
||||
@@ -41,7 +42,9 @@ OpenAI 호환: `/v1/chat/completions`, `/v1/models`, `/v1/embeddings`
|
||||
## NanoClaude API
|
||||
|
||||
비동기 job 기반: `POST /nano/chat` → `{ job_id }`, `GET /nano/chat/{job_id}/stream` → SSE
|
||||
상태: `GET /nano/chat/{job_id}/status`, 큐: `GET /nano/queue/stats`
|
||||
취소: `POST /nano/chat/{job_id}/cancel`
|
||||
파이프라인: EXAONE (rewrite) → Gemma 4 (reasoning), 맥미니 다운 시 EXAONE fallback
|
||||
|
||||
## 백엔드 설정
|
||||
|
||||
|
||||
@@ -54,6 +54,10 @@ services:
|
||||
environment:
|
||||
- EXAONE_BASE_URL=http://host.docker.internal:11434
|
||||
- EXAONE_MODEL=${EXAONE_MODEL:-exaone3.5:7.8b-instruct-q8_0}
|
||||
- REASONING_BASE_URL=${REASONING_BASE_URL:-http://192.168.1.122:8800}
|
||||
- REASONING_MODEL=${REASONING_MODEL:-mlx-community/gemma-4-26b-a4b-it-8bit}
|
||||
- PIPELINE_ENABLED=${PIPELINE_ENABLED:-true}
|
||||
- MAX_CONCURRENT_JOBS=${MAX_CONCURRENT_JOBS:-3}
|
||||
- DB_PATH=/app/data/nanoclaude.db
|
||||
- API_KEY=${NANOCLAUDE_API_KEY:-}
|
||||
volumes:
|
||||
|
||||
@@ -2,11 +2,26 @@ from pydantic_settings import BaseSettings
|
||||
|
||||
|
||||
class Settings(BaseSettings):
|
||||
# EXAONE via Ollama
|
||||
# EXAONE (rewriter) via Ollama
|
||||
exaone_base_url: str = "http://localhost:11434"
|
||||
exaone_model: str = "exaone3.5:7.8b-instruct-q8_0"
|
||||
exaone_temperature: float = 0.7
|
||||
exaone_timeout: float = 120.0
|
||||
exaone_timeout: float = 30.0 # rewrite는 짧아야 함
|
||||
|
||||
# Gemma 4 (reasoner) via MLX on Mac mini
|
||||
reasoning_base_url: str = "http://192.168.1.122:8800"
|
||||
reasoning_model: str = "mlx-community/gemma-4-26b-a4b-it-8bit"
|
||||
reasoning_temperature: float = 0.7
|
||||
reasoning_timeout: float = 180.0
|
||||
|
||||
# Pipeline
|
||||
pipeline_enabled: bool = True # False = EXAONE 단독 모드 (Phase 1 fallback)
|
||||
|
||||
# Queue
|
||||
max_concurrent_jobs: int = 3
|
||||
|
||||
# Health check
|
||||
health_check_interval: float = 30.0
|
||||
|
||||
# Server
|
||||
host: str = "0.0.0.0"
|
||||
|
||||
@@ -14,18 +14,35 @@ CREATE TABLE IF NOT EXISTS request_logs (
|
||||
response_chars INTEGER DEFAULT 0,
|
||||
latency_ms REAL DEFAULT 0,
|
||||
created_at REAL NOT NULL,
|
||||
completed_at REAL
|
||||
completed_at REAL,
|
||||
rewrite_model TEXT,
|
||||
reasoning_model TEXT,
|
||||
rewritten_message TEXT,
|
||||
rewrite_latency_ms REAL DEFAULT 0
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_logs_job ON request_logs(job_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_logs_created ON request_logs(created_at);
|
||||
"""
|
||||
|
||||
# Phase 1 → Phase 2 마이그레이션 (이미 존재하면 무시)
|
||||
MIGRATIONS = [
|
||||
"ALTER TABLE request_logs ADD COLUMN rewrite_model TEXT",
|
||||
"ALTER TABLE request_logs ADD COLUMN reasoning_model TEXT",
|
||||
"ALTER TABLE request_logs ADD COLUMN rewritten_message TEXT",
|
||||
"ALTER TABLE request_logs ADD COLUMN rewrite_latency_ms REAL DEFAULT 0",
|
||||
]
|
||||
|
||||
|
||||
async def init_db():
|
||||
async with aiosqlite.connect(settings.db_path) as db:
|
||||
await db.execute("PRAGMA journal_mode=WAL")
|
||||
await db.executescript(SCHEMA)
|
||||
for migration in MIGRATIONS:
|
||||
try:
|
||||
await db.execute(migration)
|
||||
except Exception:
|
||||
pass # 이미 존재하는 컬럼
|
||||
await db.commit()
|
||||
|
||||
|
||||
@@ -38,10 +55,26 @@ async def log_request(job_id: str, message: str, model: str, created_at: float):
|
||||
await db.commit()
|
||||
|
||||
|
||||
async def log_completion(job_id: str, status: str, response_chars: int, latency_ms: float, completed_at: float):
|
||||
async def log_completion(
|
||||
job_id: str,
|
||||
status: str,
|
||||
response_chars: int,
|
||||
latency_ms: float,
|
||||
completed_at: float,
|
||||
*,
|
||||
rewrite_model: str | None = None,
|
||||
reasoning_model: str | None = None,
|
||||
rewritten_message: str | None = None,
|
||||
rewrite_latency_ms: float = 0,
|
||||
):
|
||||
async with aiosqlite.connect(settings.db_path) as db:
|
||||
await db.execute(
|
||||
"UPDATE request_logs SET status=?, response_chars=?, latency_ms=?, completed_at=? WHERE job_id=?",
|
||||
(status, response_chars, latency_ms, completed_at, job_id),
|
||||
"""UPDATE request_logs
|
||||
SET status=?, response_chars=?, latency_ms=?, completed_at=?,
|
||||
rewrite_model=?, reasoning_model=?, rewritten_message=?, rewrite_latency_ms=?
|
||||
WHERE job_id=?""",
|
||||
(status, response_chars, latency_ms, completed_at,
|
||||
rewrite_model, reasoning_model, rewritten_message, rewrite_latency_ms,
|
||||
job_id),
|
||||
)
|
||||
await db.commit()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""NanoClaude — 비동기 job 기반 AI Gateway."""
|
||||
"""NanoClaude — 비동기 job 기반 AI Gateway (Phase 2: EXAONE → Gemma 파이프라인)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -12,22 +12,29 @@ from fastapi.responses import JSONResponse
|
||||
from config import settings
|
||||
from db.database import init_db
|
||||
from routers import chat
|
||||
from services.backend_registry import backend_registry
|
||||
from services.job_queue import init_queue, job_queue
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s %(name)s — %(message)s",
|
||||
)
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
await init_db()
|
||||
backend_registry.init_from_settings(settings)
|
||||
backend_registry.start_health_loop(settings.health_check_interval)
|
||||
init_queue(settings.max_concurrent_jobs)
|
||||
yield
|
||||
backend_registry.stop_health_loop()
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
title="NanoClaude",
|
||||
version="0.1.0",
|
||||
description="비동기 job 기반 AI Gateway — Phase 1",
|
||||
version="0.2.0",
|
||||
description="비동기 job 기반 AI Gateway — Phase 2 (EXAONE → Gemma 파이프라인)",
|
||||
lifespan=lifespan,
|
||||
)
|
||||
|
||||
@@ -54,9 +61,14 @@ app.include_router(chat.router)
|
||||
|
||||
@app.get("/")
|
||||
async def root():
|
||||
return {"service": "NanoClaude", "version": "0.1.0", "phase": 1}
|
||||
return {"service": "NanoClaude", "version": "0.2.0", "phase": 2}
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health():
|
||||
return {"status": "ok"}
|
||||
from services.job_queue import job_queue
|
||||
return {
|
||||
"status": "ok",
|
||||
"backends": backend_registry.health_summary(),
|
||||
"queue": job_queue.stats if job_queue else {},
|
||||
}
|
||||
|
||||
@@ -27,6 +27,14 @@ class CancelResponse(BaseModel):
|
||||
status: str
|
||||
|
||||
|
||||
class JobStatusResponse(BaseModel):
|
||||
job_id: str
|
||||
status: JobStatus
|
||||
created_at: float
|
||||
pipeline: bool
|
||||
queue_position: int | None = None
|
||||
|
||||
|
||||
class SSEEvent(BaseModel):
|
||||
event: str # ack | processing | result | error | done
|
||||
event: str # ack | processing | rewrite | result | error | done | queued
|
||||
data: dict
|
||||
|
||||
@@ -1,15 +1,13 @@
|
||||
"""Chat router — POST /chat, GET /chat/{job_id}/stream, POST /chat/{job_id}/cancel."""
|
||||
"""Chat router — POST /chat, GET /chat/{job_id}/stream, GET /chat/{job_id}/status, POST /chat/{job_id}/cancel."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from models.schemas import CancelResponse, ChatRequest, ChatResponse
|
||||
from services import worker
|
||||
from models.schemas import CancelResponse, ChatRequest, ChatResponse, JobStatusResponse
|
||||
from services.job_manager import job_manager
|
||||
from services.job_queue import job_queue
|
||||
from services.state_stream import state_stream
|
||||
|
||||
router = APIRouter(tags=["chat"])
|
||||
@@ -17,13 +15,10 @@ router = APIRouter(tags=["chat"])
|
||||
|
||||
@router.post("/chat", response_model=ChatResponse)
|
||||
async def create_chat(body: ChatRequest):
|
||||
"""job_id 즉시 반환 (ACK). 백그라운드에서 EXAONE 처리 시작."""
|
||||
"""job_id 즉시 반환 (ACK). 백그라운드에서 파이프라인 처리 시작."""
|
||||
job = job_manager.create(body.message)
|
||||
state_stream.create(job.id)
|
||||
|
||||
task = asyncio.create_task(worker.run(job))
|
||||
job_manager.attach_task(job.id, task)
|
||||
|
||||
await job_queue.submit(job)
|
||||
return ChatResponse(job_id=job.id)
|
||||
|
||||
|
||||
@@ -52,6 +47,22 @@ async def _stream_with_cleanup(job_id: str):
|
||||
state_stream.cleanup(job_id)
|
||||
|
||||
|
||||
@router.get("/chat/{job_id}/status", response_model=JobStatusResponse)
|
||||
async def job_status(job_id: str):
|
||||
"""job 상태 조회 (SSE 없이)."""
|
||||
job = job_manager.get(job_id)
|
||||
if not job:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
return JobStatusResponse(
|
||||
job_id=job.id,
|
||||
status=job.status,
|
||||
created_at=job.created_at,
|
||||
pipeline=job.pipeline,
|
||||
queue_position=job_queue.position(job.id) if job_queue else None,
|
||||
)
|
||||
|
||||
|
||||
@router.post("/chat/{job_id}/cancel", response_model=CancelResponse)
|
||||
async def cancel_chat(job_id: str):
|
||||
"""진행 중인 job 취소."""
|
||||
@@ -59,3 +70,11 @@ async def cancel_chat(job_id: str):
|
||||
if not success:
|
||||
raise HTTPException(status_code=404, detail="Job not found or already finished")
|
||||
return CancelResponse(status="cancelled")
|
||||
|
||||
|
||||
@router.get("/queue/stats")
|
||||
async def queue_stats():
|
||||
"""큐 통계."""
|
||||
if job_queue:
|
||||
return job_queue.stats
|
||||
return {"pending": 0, "active": 0}
|
||||
|
||||
93
nanoclaude/services/backend_registry.py
Normal file
93
nanoclaude/services/backend_registry.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""BackendRegistry — 모델 어댑터 관리 + 헬스체크 루프."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
|
||||
from services.model_adapter import ModelAdapter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
REWRITER_PROMPT = (
|
||||
"너는 질문 재구성 전문가다. "
|
||||
"사용자의 질문을 분석하여 의도를 명확히 하고, 구조화된 질문으로 재작성하라. "
|
||||
"재구성된 질문만 출력하라. 부연 설명이나 답변은 절대 하지 마라."
|
||||
)
|
||||
|
||||
REASONER_PROMPT = (
|
||||
"너는 NanoClaude, 사용자의 질문에 구조화되고 정확한 답변을 제공하는 AI 어시스턴트다. "
|
||||
"논리적으로 사고하고, 명확하게 설명하며, 필요시 예시를 포함하라."
|
||||
)
|
||||
|
||||
|
||||
class BackendRegistry:
|
||||
def __init__(self) -> None:
|
||||
self.rewriter: ModelAdapter | None = None
|
||||
self.reasoner: ModelAdapter | None = None
|
||||
self._health: dict[str, bool] = {"rewriter": False, "reasoner": False}
|
||||
self._latency: dict[str, float] = {"rewriter": 0.0, "reasoner": 0.0}
|
||||
self._health_task: asyncio.Task | None = None
|
||||
|
||||
def init_from_settings(self, settings) -> None:
|
||||
self.rewriter = ModelAdapter(
|
||||
name="EXAONE",
|
||||
base_url=settings.exaone_base_url,
|
||||
model=settings.exaone_model,
|
||||
system_prompt=REWRITER_PROMPT,
|
||||
temperature=settings.exaone_temperature,
|
||||
timeout=settings.exaone_timeout,
|
||||
)
|
||||
self.reasoner = ModelAdapter(
|
||||
name="Gemma4",
|
||||
base_url=settings.reasoning_base_url,
|
||||
model=settings.reasoning_model,
|
||||
system_prompt=REASONER_PROMPT,
|
||||
temperature=settings.reasoning_temperature,
|
||||
timeout=settings.reasoning_timeout,
|
||||
)
|
||||
|
||||
def start_health_loop(self, interval: float = 30.0) -> None:
|
||||
self._health_task = asyncio.create_task(self._health_loop(interval))
|
||||
|
||||
def stop_health_loop(self) -> None:
|
||||
if self._health_task and not self._health_task.done():
|
||||
self._health_task.cancel()
|
||||
|
||||
async def _health_loop(self, interval: float) -> None:
|
||||
while True:
|
||||
await self._check_all()
|
||||
await asyncio.sleep(interval)
|
||||
|
||||
async def _check_all(self) -> None:
|
||||
for role, adapter in [("rewriter", self.rewriter), ("reasoner", self.reasoner)]:
|
||||
if not adapter:
|
||||
continue
|
||||
start = time.monotonic()
|
||||
healthy = await adapter.health_check()
|
||||
elapsed = round((time.monotonic() - start) * 1000, 1)
|
||||
prev = self._health[role]
|
||||
self._health[role] = healthy
|
||||
self._latency[role] = elapsed
|
||||
if prev != healthy:
|
||||
status = "UP" if healthy else "DOWN"
|
||||
logger.warning("%s (%s) → %s (%.0fms)", adapter.name, role, status, elapsed)
|
||||
|
||||
def is_healthy(self, role: str) -> bool:
|
||||
return self._health.get(role, False)
|
||||
|
||||
def health_summary(self) -> dict:
|
||||
result = {}
|
||||
for role, adapter in [("rewriter", self.rewriter), ("reasoner", self.reasoner)]:
|
||||
if adapter:
|
||||
result[role] = {
|
||||
"name": adapter.name,
|
||||
"model": adapter.model,
|
||||
"healthy": self._health[role],
|
||||
"latency_ms": self._latency[role],
|
||||
}
|
||||
return result
|
||||
|
||||
|
||||
backend_registry = BackendRegistry()
|
||||
@@ -1,90 +0,0 @@
|
||||
"""EXAONE Adapter — Ollama OpenAI-compat endpoint를 통한 EXAONE 호출."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
import httpx
|
||||
|
||||
from config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
SYSTEM_PROMPT = (
|
||||
"너는 NanoClaude, 사용자의 질문을 이해하고 정리하여 명확한 답변을 제공하는 AI 어시스턴트다. "
|
||||
"사용자의 질문 의도를 파악하고, 문장을 정리하며, 구조화된 응답을 생성한다."
|
||||
)
|
||||
|
||||
|
||||
async def stream_chat(message: str) -> AsyncGenerator[str, None]:
|
||||
"""EXAONE 스트리밍 호출. OpenAI-compat SSE를 chunk 단위로 yield."""
|
||||
payload = {
|
||||
"model": settings.exaone_model,
|
||||
"messages": [
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
{"role": "user", "content": message},
|
||||
],
|
||||
"stream": True,
|
||||
"temperature": settings.exaone_temperature,
|
||||
}
|
||||
|
||||
async with httpx.AsyncClient(timeout=settings.exaone_timeout) as client:
|
||||
try:
|
||||
async with client.stream(
|
||||
"POST",
|
||||
f"{settings.exaone_base_url}/v1/chat/completions",
|
||||
json=payload,
|
||||
) as resp:
|
||||
if resp.status_code != 200:
|
||||
body = await resp.aread()
|
||||
logger.error("EXAONE error %d: %s", resp.status_code, body.decode())
|
||||
yield f"[Error] EXAONE 응답 실패 ({resp.status_code})"
|
||||
return
|
||||
|
||||
async for line in resp.aiter_lines():
|
||||
line = line.strip()
|
||||
if not line or not line.startswith("data: "):
|
||||
continue
|
||||
payload_str = line[len("data: "):]
|
||||
if payload_str == "[DONE]":
|
||||
return
|
||||
# Extract content delta from OpenAI-format chunk
|
||||
try:
|
||||
import json
|
||||
chunk = json.loads(payload_str)
|
||||
delta = chunk.get("choices", [{}])[0].get("delta", {})
|
||||
content = delta.get("content", "")
|
||||
if content:
|
||||
yield content
|
||||
except (json.JSONDecodeError, IndexError, KeyError):
|
||||
continue
|
||||
|
||||
except httpx.ConnectError:
|
||||
logger.error("EXAONE connection failed: %s", settings.exaone_base_url)
|
||||
yield "[Error] EXAONE 서버에 연결할 수 없습니다."
|
||||
except httpx.ReadTimeout:
|
||||
logger.error("EXAONE read timeout")
|
||||
yield "[Error] EXAONE 응답 시간이 초과되었습니다."
|
||||
|
||||
|
||||
async def complete_chat(message: str) -> str:
|
||||
"""EXAONE 비스트리밍 호출. 전체 응답 텍스트 반환."""
|
||||
payload = {
|
||||
"model": settings.exaone_model,
|
||||
"messages": [
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
{"role": "user", "content": message},
|
||||
],
|
||||
"stream": False,
|
||||
"temperature": settings.exaone_temperature,
|
||||
}
|
||||
|
||||
async with httpx.AsyncClient(timeout=settings.exaone_timeout) as client:
|
||||
resp = await client.post(
|
||||
f"{settings.exaone_base_url}/v1/chat/completions",
|
||||
json=payload,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
return data["choices"][0]["message"]["content"]
|
||||
@@ -17,6 +17,8 @@ class Job:
|
||||
status: JobStatus = JobStatus.queued
|
||||
created_at: float = field(default_factory=time)
|
||||
task: asyncio.Task | None = field(default=None, repr=False)
|
||||
pipeline: bool = True
|
||||
rewritten_message: str = ""
|
||||
|
||||
|
||||
class JobManager:
|
||||
|
||||
57
nanoclaude/services/job_queue.py
Normal file
57
nanoclaude/services/job_queue.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""JobQueue — Semaphore 기반 동시성 제한."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
from services import worker
|
||||
from services.job_manager import Job, job_manager
|
||||
from services.state_stream import state_stream
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class JobQueue:
|
||||
def __init__(self, max_concurrent: int = 3) -> None:
|
||||
self._semaphore = asyncio.Semaphore(max_concurrent)
|
||||
self._waiting: list[str] = [] # 대기 중 job_id (순서 보장)
|
||||
self._active: set[str] = set()
|
||||
|
||||
async def submit(self, job: Job) -> asyncio.Task:
|
||||
task = asyncio.create_task(self._run_with_semaphore(job))
|
||||
job_manager.attach_task(job.id, task)
|
||||
return task
|
||||
|
||||
async def _run_with_semaphore(self, job: Job) -> None:
|
||||
self._waiting.append(job.id)
|
||||
pos = self.position(job.id)
|
||||
if pos and pos > 0:
|
||||
await state_stream.push(job.id, "queued", {"position": pos})
|
||||
|
||||
try:
|
||||
async with self._semaphore:
|
||||
self._waiting.remove(job.id)
|
||||
self._active.add(job.id)
|
||||
await worker.run(job)
|
||||
finally:
|
||||
self._active.discard(job.id)
|
||||
|
||||
def position(self, job_id: str) -> int | None:
|
||||
try:
|
||||
return self._waiting.index(job_id) + 1
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
@property
|
||||
def stats(self) -> dict:
|
||||
return {"pending": len(self._waiting), "active": len(self._active)}
|
||||
|
||||
|
||||
job_queue: JobQueue | None = None
|
||||
|
||||
|
||||
def init_queue(max_concurrent: int = 3) -> JobQueue:
|
||||
global job_queue
|
||||
job_queue = JobQueue(max_concurrent)
|
||||
return job_queue
|
||||
109
nanoclaude/services/model_adapter.py
Normal file
109
nanoclaude/services/model_adapter.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""ModelAdapter — 범용 OpenAI-compat 모델 어댑터."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ModelAdapter:
|
||||
"""OpenAI-compatible /v1/chat/completions 백엔드 범용 어댑터.
|
||||
Ollama, MLX 등 모두 동일 인터페이스로 호출."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
base_url: str,
|
||||
model: str,
|
||||
system_prompt: str,
|
||||
temperature: float = 0.7,
|
||||
timeout: float = 120.0,
|
||||
):
|
||||
self.name = name
|
||||
self.base_url = base_url
|
||||
self.model = model
|
||||
self.system_prompt = system_prompt
|
||||
self.temperature = temperature
|
||||
self.timeout = timeout
|
||||
|
||||
async def stream_chat(self, message: str) -> AsyncGenerator[str, None]:
|
||||
"""스트리밍 호출. content chunk를 yield."""
|
||||
payload = {
|
||||
"model": self.model,
|
||||
"messages": [
|
||||
{"role": "system", "content": self.system_prompt},
|
||||
{"role": "user", "content": message},
|
||||
],
|
||||
"stream": True,
|
||||
"temperature": self.temperature,
|
||||
}
|
||||
|
||||
async with httpx.AsyncClient(timeout=self.timeout) as client:
|
||||
try:
|
||||
async with client.stream(
|
||||
"POST",
|
||||
f"{self.base_url}/v1/chat/completions",
|
||||
json=payload,
|
||||
) as resp:
|
||||
if resp.status_code != 200:
|
||||
body = await resp.aread()
|
||||
logger.error("%s error %d: %s", self.name, resp.status_code, body.decode())
|
||||
raise RuntimeError(f"{self.name} 응답 실패 ({resp.status_code})")
|
||||
|
||||
async for line in resp.aiter_lines():
|
||||
line = line.strip()
|
||||
if not line or not line.startswith("data: "):
|
||||
continue
|
||||
payload_str = line[len("data: "):]
|
||||
if payload_str == "[DONE]":
|
||||
return
|
||||
try:
|
||||
chunk = json.loads(payload_str)
|
||||
delta = chunk.get("choices", [{}])[0].get("delta", {})
|
||||
content = delta.get("content", "")
|
||||
if content:
|
||||
yield content
|
||||
except (json.JSONDecodeError, IndexError, KeyError):
|
||||
continue
|
||||
|
||||
except httpx.ConnectError:
|
||||
logger.error("%s connection failed: %s", self.name, self.base_url)
|
||||
raise
|
||||
except httpx.ReadTimeout:
|
||||
logger.error("%s read timeout", self.name)
|
||||
raise
|
||||
|
||||
async def complete_chat(self, message: str) -> str:
|
||||
"""비스트리밍 호출. 전체 응답 텍스트 반환."""
|
||||
payload = {
|
||||
"model": self.model,
|
||||
"messages": [
|
||||
{"role": "system", "content": self.system_prompt},
|
||||
{"role": "user", "content": message},
|
||||
],
|
||||
"stream": False,
|
||||
"temperature": self.temperature,
|
||||
}
|
||||
|
||||
async with httpx.AsyncClient(timeout=self.timeout) as client:
|
||||
resp = await client.post(
|
||||
f"{self.base_url}/v1/chat/completions",
|
||||
json=payload,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
return data["choices"][0]["message"]["content"]
|
||||
|
||||
async def health_check(self) -> bool:
|
||||
"""GET /v1/models — 3초 timeout."""
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=3.0) as client:
|
||||
resp = await client.get(f"{self.base_url}/v1/models")
|
||||
return resp.status_code < 500
|
||||
except Exception:
|
||||
return False
|
||||
@@ -1,4 +1,4 @@
|
||||
"""Worker — background task that drives EXAONE call and pushes SSE events."""
|
||||
"""Worker — 2단계 파이프라인: EXAONE rewrite → Gemma reasoning (cancel-safe + fallback)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -9,23 +9,66 @@ from time import time
|
||||
from config import settings
|
||||
from db.database import log_completion, log_request
|
||||
from models.schemas import JobStatus
|
||||
from services.exaone_adapter import stream_chat
|
||||
from services.backend_registry import backend_registry
|
||||
from services.job_manager import Job, job_manager
|
||||
from services.state_stream import state_stream
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 무응답 방지: 3~5초 간격으로 processing heartbeat
|
||||
HEARTBEAT_INTERVAL = 4.0
|
||||
REWRITE_HEARTBEAT = 2.0
|
||||
MAX_REWRITE_LENGTH = 1000
|
||||
|
||||
|
||||
async def _complete_with_heartbeat(adapter, message: str, job_id: str) -> str:
|
||||
"""complete_chat + heartbeat 병행. rewrite 대기 중 사용자 체감 멈춤 방지."""
|
||||
result_holder: dict[str, str] = {}
|
||||
exc_holder: list[Exception] = []
|
||||
|
||||
async def call():
|
||||
try:
|
||||
result_holder["text"] = await adapter.complete_chat(message)
|
||||
except Exception as e:
|
||||
exc_holder.append(e)
|
||||
|
||||
task = asyncio.create_task(call())
|
||||
while not task.done():
|
||||
await asyncio.sleep(REWRITE_HEARTBEAT)
|
||||
if not task.done():
|
||||
await state_stream.push(job_id, "processing", {"message": "질문을 재구성하고 있습니다..."})
|
||||
|
||||
if exc_holder:
|
||||
raise exc_holder[0]
|
||||
return result_holder.get("text", "")
|
||||
|
||||
|
||||
async def _stream_with_cancel(adapter, message: str, job: Job, collected: list[str]) -> bool:
|
||||
"""스트리밍 + cancel 체크. 정상 완료 시 True, cancel 시 False."""
|
||||
last_heartbeat = asyncio.get_event_loop().time()
|
||||
|
||||
async for chunk in adapter.stream_chat(message):
|
||||
if job.status == JobStatus.cancelled:
|
||||
return False
|
||||
collected.append(chunk)
|
||||
await state_stream.push(job.id, "result", {"content": chunk})
|
||||
|
||||
now = asyncio.get_event_loop().time()
|
||||
if now - last_heartbeat >= HEARTBEAT_INTERVAL:
|
||||
await state_stream.push(job.id, "processing", {"message": "응답 생성 중..."})
|
||||
last_heartbeat = now
|
||||
|
||||
return True
|
||||
|
||||
|
||||
async def run(job: Job) -> None:
|
||||
"""EXAONE 호출 → SSE 이벤트 발행."""
|
||||
"""EXAONE rewrite → Gemma reasoning 파이프라인 (fallback + cancel-safe)."""
|
||||
start_time = time()
|
||||
rewrite_model = None
|
||||
reasoning_model = None
|
||||
rewritten_message = ""
|
||||
|
||||
# DB 로깅: 요청 기록
|
||||
try:
|
||||
await log_request(job.id, job.message, settings.exaone_model, job.created_at)
|
||||
await log_request(job.id, job.message, "pipeline", job.created_at)
|
||||
except Exception:
|
||||
logger.warning("Failed to log request for job %s", job.id, exc_info=True)
|
||||
|
||||
@@ -34,49 +77,91 @@ async def run(job: Job) -> None:
|
||||
await state_stream.push(job.id, "ack", {"message": "요청을 확인했습니다. 분석을 시작합니다."})
|
||||
job_manager.set_status(job.id, JobStatus.processing)
|
||||
|
||||
# --- Processing + Streaming ---
|
||||
await state_stream.push(job.id, "processing", {"message": "EXAONE 모델이 응답을 생성하고 있습니다..."})
|
||||
# --- Cancel 체크 #1 ---
|
||||
if job.status == JobStatus.cancelled:
|
||||
return
|
||||
|
||||
use_pipeline = settings.pipeline_enabled and backend_registry.is_healthy("reasoner")
|
||||
collected: list[str] = []
|
||||
last_heartbeat = asyncio.get_event_loop().time()
|
||||
|
||||
async for chunk in stream_chat(job.message):
|
||||
if not use_pipeline:
|
||||
# === EXAONE 단독 모드 (Phase 1 fallback) ===
|
||||
rewrite_model = backend_registry.rewriter.model
|
||||
await state_stream.push(job.id, "processing", {"message": "EXAONE 모델이 응답을 생성하고 있습니다..."})
|
||||
|
||||
ok = await _stream_with_cancel(backend_registry.rewriter, job.message, job, collected)
|
||||
if not ok:
|
||||
return
|
||||
else:
|
||||
# === 파이프라인 모드: EXAONE rewrite → Gemma reasoning ===
|
||||
rewrite_model = backend_registry.rewriter.model
|
||||
reasoning_model = backend_registry.reasoner.model
|
||||
|
||||
# --- Rewrite ---
|
||||
await state_stream.push(job.id, "processing", {"message": "질문을 재구성하고 있습니다..."})
|
||||
rewrite_start = time()
|
||||
|
||||
try:
|
||||
rewritten_message = await _complete_with_heartbeat(
|
||||
backend_registry.rewriter, job.message, job.id
|
||||
)
|
||||
rewritten_message = rewritten_message[:MAX_REWRITE_LENGTH]
|
||||
except Exception:
|
||||
logger.warning("Rewrite failed for job %s, using original message", job.id)
|
||||
rewritten_message = job.message
|
||||
|
||||
rewrite_latency = (time() - rewrite_start) * 1000
|
||||
job.rewritten_message = rewritten_message
|
||||
|
||||
# --- Rewrite 결과 SSE 노출 ---
|
||||
await state_stream.push(job.id, "rewrite", {"content": rewritten_message})
|
||||
|
||||
# --- Cancel 체크 #2 ---
|
||||
if job.status == JobStatus.cancelled:
|
||||
logger.info("Job %s cancelled during streaming", job.id)
|
||||
await state_stream.push(job.id, "error", {"message": "작업이 취소되었습니다."})
|
||||
latency_ms = (time() - start_time) * 1000
|
||||
try:
|
||||
await log_completion(job.id, "cancelled", len("".join(collected)), latency_ms, time())
|
||||
except Exception:
|
||||
pass
|
||||
return
|
||||
|
||||
collected.append(chunk)
|
||||
# --- Reasoning ---
|
||||
await state_stream.push(job.id, "processing", {"message": "Gemma 4가 응답을 생성하고 있습니다..."})
|
||||
|
||||
# Stream partial result
|
||||
await state_stream.push(job.id, "result", {"content": chunk})
|
||||
try:
|
||||
ok = await _stream_with_cancel(backend_registry.reasoner, rewritten_message, job, collected)
|
||||
if not ok:
|
||||
return
|
||||
except Exception:
|
||||
# Gemma streaming 중간 실패 → EXAONE fallback
|
||||
logger.warning("Reasoner failed for job %s, falling back to rewriter", job.id, exc_info=True)
|
||||
|
||||
# Heartbeat: 긴 침묵 방지
|
||||
now = asyncio.get_event_loop().time()
|
||||
if now - last_heartbeat >= HEARTBEAT_INTERVAL:
|
||||
await state_stream.push(job.id, "processing", {"message": "응답 생성 중..."})
|
||||
last_heartbeat = now
|
||||
if job.status == JobStatus.cancelled:
|
||||
return
|
||||
|
||||
await state_stream.push(job.id, "processing", {"message": "모델 전환 중..."})
|
||||
reasoning_model = rewrite_model # fallback 기록
|
||||
|
||||
ok = await _stream_with_cancel(backend_registry.rewriter, job.message, job, collected)
|
||||
if not ok:
|
||||
return
|
||||
|
||||
# --- Complete ---
|
||||
if not collected:
|
||||
job_manager.set_status(job.id, JobStatus.failed)
|
||||
await state_stream.push(job.id, "error", {"message": "EXAONE으로부터 응답을 받지 못했습니다."})
|
||||
await state_stream.push(job.id, "error", {"message": "응답을 받지 못했습니다."})
|
||||
status = "failed"
|
||||
else:
|
||||
job_manager.set_status(job.id, JobStatus.completed)
|
||||
await state_stream.push(job.id, "done", {"message": "완료"})
|
||||
status = "completed"
|
||||
|
||||
# DB 로깅: 완료 기록
|
||||
# --- DB 로깅 ---
|
||||
latency_ms = (time() - start_time) * 1000
|
||||
response_text = "".join(collected)
|
||||
try:
|
||||
await log_completion(job.id, status, len(response_text), latency_ms, time())
|
||||
await log_completion(
|
||||
job.id, status, len(response_text), latency_ms, time(),
|
||||
rewrite_model=rewrite_model,
|
||||
reasoning_model=reasoning_model,
|
||||
rewritten_message=rewritten_message,
|
||||
rewrite_latency_ms=rewrite_latency if use_pipeline else 0,
|
||||
)
|
||||
except Exception:
|
||||
logger.warning("Failed to log completion for job %s", job.id, exc_info=True)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user