diff --git a/ai-service/config.py b/ai-service/config.py index 5c574b5..79aef0c 100644 --- a/ai-service/config.py +++ b/ai-service/config.py @@ -2,8 +2,8 @@ from pydantic_settings import BaseSettings class Settings(BaseSettings): - OLLAMA_BASE_URL: str = "http://100.111.160.84:11434" - OLLAMA_TEXT_MODEL: str = "qwen3:8b" + OLLAMA_BASE_URL: str = "https://gpu.hyungi.net" + OLLAMA_TEXT_MODEL: str = "qwen3.5:9b-q8_0" OLLAMA_EMBED_MODEL: str = "bge-m3" OLLAMA_TIMEOUT: int = 120 diff --git a/ai-service/routers/health.py b/ai-service/routers/health.py index 99b4201..bbee30e 100644 --- a/ai-service/routers/health.py +++ b/ai-service/routers/health.py @@ -7,12 +7,24 @@ router = APIRouter(tags=["health"]) @router.get("/health") async def health_check(): - ollama_status = await ollama_client.check_health() + backends = await ollama_client.check_health() + stats = vector_store.stats() + + # Determine the main text model name (Ollama primary, MLX fallback) + model_name = None + ollama_models = backends.get("ollama", {}).get("models", []) + if ollama_models: + model_name = ollama_models[0] + if not model_name and backends.get("mlx", {}).get("status") == "connected": + model_name = backends["mlx"].get("model") + return { "status": "ok", "service": "tk-ai-service", - "ollama": ollama_status, - "embeddings": vector_store.stats(), + "model": model_name, + "ollama": backends.get("ollama", {}), + "mlx": backends.get("mlx", {}), + "embeddings": stats, } diff --git a/ai-service/services/ollama_client.py b/ai-service/services/ollama_client.py index ae7adfb..a897e1d 100644 --- a/ai-service/services/ollama_client.py +++ b/ai-service/services/ollama_client.py @@ -43,7 +43,21 @@ class OllamaClient: messages.append({"role": "system", "content": system}) messages.append({"role": "user", "content": prompt}) client = await self._get_client() + # Custom-built PC Ollama primary, MLX fallback try: + response = await client.post( + f"{self.base_url}/api/chat", + json={ + "model": settings.OLLAMA_TEXT_MODEL, + "messages": messages, + 
"stream": False, + "think": False, + "options": {"temperature": 0.3, "num_predict": 2048}, + }, + ) + response.raise_for_status() + return response.json()["message"]["content"] + except Exception: response = await client.post( f"{settings.MLX_BASE_URL}/chat/completions", json={ @@ -55,31 +69,20 @@ class OllamaClient: ) response.raise_for_status() return response.json()["choices"][0]["message"]["content"] - except Exception: - response = await client.post( - f"{self.base_url}/api/chat", - json={ - "model": settings.OLLAMA_TEXT_MODEL, - "messages": messages, - "stream": False, - "options": {"temperature": 0.3, "num_predict": 2048}, - }, - ) - response.raise_for_status() - return response.json()["message"]["content"] async def check_health(self) -> dict: result = {} + short_timeout = httpx.Timeout(5.0, connect=3.0) try: - client = await self._get_client() - response = await client.get(f"{self.base_url}/api/tags") + async with httpx.AsyncClient(timeout=short_timeout) as c: + response = await c.get(f"{self.base_url}/api/tags") models = response.json().get("models", []) result["ollama"] = {"status": "connected", "models": [m["name"] for m in models]} except Exception: result["ollama"] = {"status": "disconnected"} try: - client = await self._get_client() - response = await client.get(f"{settings.MLX_BASE_URL}/health") + async with httpx.AsyncClient(timeout=short_timeout) as c: + response = await c.get(f"{settings.MLX_BASE_URL}/health") result["mlx"] = {"status": "connected", "model": settings.MLX_TEXT_MODEL} except Exception: result["mlx"] = {"status": "disconnected"} diff --git a/docker-compose.yml b/docker-compose.yml index cb1d14a..bb01b20 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -298,8 +298,8 @@ services: ports: - "30400:8000" environment: - - OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-http://100.111.160.84:11434} - - OLLAMA_TEXT_MODEL=${OLLAMA_TEXT_MODEL:-qwen3:8b} + - OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-https://gpu.hyungi.net} + - 
OLLAMA_TEXT_MODEL=${OLLAMA_TEXT_MODEL:-qwen3.5:9b-q8_0} - OLLAMA_EMBED_MODEL=${OLLAMA_EMBED_MODEL:-bge-m3} - OLLAMA_TIMEOUT=${OLLAMA_TIMEOUT:-120} - MLX_BASE_URL=${MLX_BASE_URL:-https://llm.hyungi.net}