feat: AI 서비스 MLX 듀얼 백엔드 및 모델 최적화

- MLX(맥미니 27B) 우선 → Ollama(조립컴 9B) fallback 구조
- pydantic-settings 기반 config 전환
- health check에 MLX 상태 추가
- 텍스트 모델 qwen3:8b → qwen3.5:9b-q8_0 변경

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Hyungi Ahn
2026-03-06 23:17:50 +09:00
parent cad662473b
commit 2f7e083db0
14 changed files with 231 additions and 140 deletions

View File

@@ -1,5 +1,6 @@
import json
from services.ollama_client import ollama_client
from services.utils import load_prompt, parse_json_response
from config import settings
@@ -7,13 +8,8 @@ CLASSIFY_PROMPT_PATH = "prompts/classify_issue.txt"
SUMMARIZE_PROMPT_PATH = "prompts/summarize_issue.txt"
def _load_prompt(path: str) -> str:
with open(path, "r", encoding="utf-8") as f:
return f.read()
async def classify_issue(description: str, detail_notes: str = "") -> dict:
template = _load_prompt(CLASSIFY_PROMPT_PATH)
template = load_prompt(CLASSIFY_PROMPT_PATH)
prompt = template.format(
description=description or "",
detail_notes=detail_notes or "",
@@ -32,7 +28,7 @@ async def classify_issue(description: str, detail_notes: str = "") -> dict:
async def summarize_issue(
description: str, detail_notes: str = "", solution: str = ""
) -> dict:
template = _load_prompt(SUMMARIZE_PROMPT_PATH)
template = load_prompt(SUMMARIZE_PROMPT_PATH)
prompt = template.format(
description=description or "",
detail_notes=detail_notes or "",

View File

@@ -37,26 +37,46 @@ def build_metadata(issue: dict) -> dict:
return meta
async def sync_all_issues() -> dict:
issues = get_all_issues()
BATCH_SIZE = 10
async def _sync_issues_batch(issues: list[dict]) -> tuple[int, int]:
"""배치 단위로 임베딩 생성 후 벡터 스토어에 저장"""
synced = 0
skipped = 0
# 유효한 이슈와 텍스트 준비
valid = []
for issue in issues:
doc_text = build_document_text(issue)
if not doc_text.strip():
skipped += 1
continue
valid.append((issue, doc_text))
# 배치 단위로 임베딩 생성
for i in range(0, len(valid), BATCH_SIZE):
batch = valid[i:i + BATCH_SIZE]
texts = [doc_text for _, doc_text in batch]
try:
embedding = await ollama_client.generate_embedding(doc_text)
vector_store.upsert(
doc_id=f"issue_{issue['id']}",
document=doc_text,
embedding=embedding,
metadata=build_metadata(issue),
)
synced += 1
except Exception as e:
skipped += 1
embeddings = await ollama_client.batch_embeddings(texts)
for (issue, doc_text), embedding in zip(batch, embeddings):
vector_store.upsert(
doc_id=f"issue_{issue['id']}",
document=doc_text,
embedding=embedding,
metadata=build_metadata(issue),
)
synced += 1
except Exception:
skipped += len(batch)
return synced, skipped
async def sync_all_issues() -> dict:
issues = get_all_issues()
synced, skipped = await _sync_issues_batch(issues)
if issues:
max_id = max(i["id"] for i in issues)
metadata_store.set_last_synced_id(max_id)
@@ -83,26 +103,11 @@ async def sync_single_issue(issue_id: int) -> dict:
async def sync_incremental() -> dict:
last_id = metadata_store.get_last_synced_id()
issues = get_issues_since(last_id)
synced = 0
for issue in issues:
doc_text = build_document_text(issue)
if not doc_text.strip():
continue
try:
embedding = await ollama_client.generate_embedding(doc_text)
vector_store.upsert(
doc_id=f"issue_{issue['id']}",
document=doc_text,
embedding=embedding,
metadata=build_metadata(issue),
)
synced += 1
except Exception:
pass
synced, skipped = await _sync_issues_batch(issues)
if issues:
max_id = max(i["id"] for i in issues)
metadata_store.set_last_synced_id(max_id)
return {"synced": synced, "new_issues": len(issues)}
return {"synced": synced, "skipped": skipped, "new_issues": len(issues)}
async def search_similar_by_id(issue_id: int, n_results: int = 5) -> list[dict]:

View File

@@ -1,3 +1,4 @@
import asyncio
import httpx
from config import settings
@@ -6,29 +7,55 @@ class OllamaClient:
def __init__(self):
self.base_url = settings.OLLAMA_BASE_URL
self.timeout = httpx.Timeout(float(settings.OLLAMA_TIMEOUT), connect=10.0)
self._client: httpx.AsyncClient | None = None
async def _get_client(self) -> httpx.AsyncClient:
if self._client is None or self._client.is_closed:
self._client = httpx.AsyncClient(timeout=self.timeout)
return self._client
async def close(self):
if self._client and not self._client.is_closed:
await self._client.aclose()
self._client = None
async def generate_embedding(self, text: str) -> list[float]:
async with httpx.AsyncClient(timeout=self.timeout) as client:
response = await client.post(
f"{self.base_url}/api/embeddings",
json={"model": settings.OLLAMA_EMBED_MODEL, "prompt": text},
)
response.raise_for_status()
return response.json()["embedding"]
client = await self._get_client()
response = await client.post(
f"{self.base_url}/api/embeddings",
json={"model": settings.OLLAMA_EMBED_MODEL, "prompt": text},
)
response.raise_for_status()
return response.json()["embedding"]
async def batch_embeddings(self, texts: list[str]) -> list[list[float]]:
results = []
for text in texts:
emb = await self.generate_embedding(text)
results.append(emb)
return results
async def batch_embeddings(self, texts: list[str], concurrency: int = 5) -> list[list[float]]:
    """Embed every string in *texts*, keeping at most *concurrency* requests in flight.

    Results come back in the same order as *texts* (``asyncio.gather``
    preserves argument order). Any failure from an individual
    ``generate_embedding`` call propagates out of the gather.
    """
    gate = asyncio.Semaphore(concurrency)

    async def _bounded_embed(item: str) -> list[float]:
        # The semaphore caps concurrent calls so the Ollama backend
        # is not flooded with simultaneous embedding requests.
        async with gate:
            return await self.generate_embedding(item)

    pending = [_bounded_embed(t) for t in texts]
    return await asyncio.gather(*pending)
async def generate_text(self, prompt: str, system: str = None) -> str:
messages = []
if system:
messages.append({"role": "system", "content": system})
messages.append({"role": "user", "content": prompt})
async with httpx.AsyncClient(timeout=self.timeout) as client:
client = await self._get_client()
try:
response = await client.post(
f"{settings.MLX_BASE_URL}/chat/completions",
json={
"model": settings.MLX_TEXT_MODEL,
"messages": messages,
"max_tokens": 2048,
"temperature": 0.3,
},
)
response.raise_for_status()
return response.json()["choices"][0]["message"]["content"]
except Exception:
response = await client.post(
f"{self.base_url}/api/chat",
json={
@@ -42,16 +69,21 @@ class OllamaClient:
return response.json()["message"]["content"]
async def check_health(self) -> dict:
result = {}
try:
async with httpx.AsyncClient(timeout=httpx.Timeout(5.0)) as client:
response = await client.get(f"{self.base_url}/api/tags")
models = response.json().get("models", [])
return {
"status": "connected",
"models": [m["name"] for m in models],
}
client = await self._get_client()
response = await client.get(f"{self.base_url}/api/tags")
models = response.json().get("models", [])
result["ollama"] = {"status": "connected", "models": [m["name"] for m in models]}
except Exception:
return {"status": "disconnected"}
result["ollama"] = {"status": "disconnected"}
try:
client = await self._get_client()
response = await client.get(f"{settings.MLX_BASE_URL}/health")
result["mlx"] = {"status": "connected", "model": settings.MLX_TEXT_MODEL}
except Exception:
result["mlx"] = {"status": "disconnected"}
return result
ollama_client = OllamaClient()

View File

@@ -1,11 +1,7 @@
from services.ollama_client import ollama_client
from services.embedding_service import search_similar_by_text, build_document_text
from services.db_client import get_issue_by_id
def _load_prompt(path: str) -> str:
with open(path, "r", encoding="utf-8") as f:
return f.read()
from services.utils import load_prompt
def _format_retrieved_issues(results: list[dict]) -> str:
@@ -55,7 +51,7 @@ async def rag_suggest_solution(issue_id: int) -> dict:
break
context = _format_retrieved_issues(similar)
template = _load_prompt("prompts/rag_suggest_solution.txt")
template = load_prompt("prompts/rag_suggest_solution.txt")
prompt = template.format(
description=issue.get("description", ""),
detail_notes=issue.get("detail_notes", ""),
@@ -87,7 +83,7 @@ async def rag_ask(question: str, project_id: int = None) -> dict:
)
context = _format_retrieved_issues(results)
template = _load_prompt("prompts/rag_qa.txt")
template = load_prompt("prompts/rag_qa.txt")
prompt = template.format(
question=question,
retrieved_cases=context,
@@ -113,7 +109,7 @@ async def rag_analyze_pattern(description: str, n_results: int = 10) -> dict:
results = await search_similar_by_text(description, n_results=n_results)
context = _format_retrieved_issues(results)
template = _load_prompt("prompts/rag_pattern.txt")
template = load_prompt("prompts/rag_pattern.txt")
prompt = template.format(
description=description,
retrieved_cases=context,
@@ -142,7 +138,7 @@ async def rag_classify_with_context(description: str, detail_notes: str = "") ->
similar = await search_similar_by_text(query, n_results=5)
context = _format_retrieved_issues(similar)
template = _load_prompt("prompts/rag_classify.txt")
template = load_prompt("prompts/rag_classify.txt")
prompt = template.format(
description=description,
detail_notes=detail_notes,

View File

@@ -1,58 +1,38 @@
import asyncio
import httpx
from services.ollama_client import ollama_client
from services.db_client import get_daily_qc_stats, get_issues_for_date
from services.utils import load_prompt
from config import settings
REPORT_PROMPT_PATH = "prompts/daily_report.txt"
def _load_prompt(path: str) -> str:
with open(path, "r", encoding="utf-8") as f:
return f.read()
async def _fetch_one(client: httpx.AsyncClient, url: str, params: dict, headers: dict):
    """GET *url* and return the decoded JSON body, or ``None`` on any failure.

    Best-effort helper for the daily-report fan-out: non-200 responses,
    connection/timeout errors, and JSON decode errors all collapse to
    ``None`` so one failing endpoint does not abort the other fetches.
    """
    try:
        response = await client.get(url, params=params, headers=headers)
        if response.status_code == 200:
            return response.json()
    except Exception:
        # Deliberate broad catch: callers treat None as "data unavailable".
        pass
    return None
async def _fetch_system1_data(date_str: str, token: str) -> dict:
headers = {"Authorization": f"Bearer {token}"}
data = {"attendance": None, "work_reports": None, "patrol": None}
params = {"date": date_str}
base = settings.SYSTEM1_API_URL
try:
async with httpx.AsyncClient(timeout=15.0) as client:
# 근태
try:
r = await client.get(
f"{settings.SYSTEM1_API_URL}/api/attendance/daily-status",
params={"date": date_str},
headers=headers,
)
if r.status_code == 200:
data["attendance"] = r.json()
except Exception:
pass
# 작업보고
try:
r = await client.get(
f"{settings.SYSTEM1_API_URL}/api/daily-work-reports/summary",
params={"date": date_str},
headers=headers,
)
if r.status_code == 200:
data["work_reports"] = r.json()
except Exception:
pass
# 순회점검
try:
r = await client.get(
f"{settings.SYSTEM1_API_URL}/api/patrol/today-status",
params={"date": date_str},
headers=headers,
)
if r.status_code == 200:
data["patrol"] = r.json()
except Exception:
pass
attendance, work_reports, patrol = await asyncio.gather(
_fetch_one(client, f"{base}/api/attendance/daily-status", params, headers),
_fetch_one(client, f"{base}/api/daily-work-reports/summary", params, headers),
_fetch_one(client, f"{base}/api/patrol/today-status", params, headers),
)
except Exception:
pass
return data
attendance = work_reports = patrol = None
return {"attendance": attendance, "work_reports": work_reports, "patrol": patrol}
def _format_attendance(data) -> str:
@@ -102,7 +82,7 @@ async def generate_daily_report(
qc_stats = get_daily_qc_stats(date_str)
qc_issues = get_issues_for_date(date_str)
template = _load_prompt(REPORT_PROMPT_PATH)
template = load_prompt(REPORT_PROMPT_PATH)
prompt = template.format(
date=date_str,
attendance_data=_format_attendance(system1_data["attendance"]),

View File

@@ -0,0 +1,22 @@
import json
import os
# Project root: two directory levels above this module (services/ -> repo root).
_BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


def load_prompt(path: str) -> str:
    """Read a prompt template file and return its full text.

    *path* is resolved against the project root (``_BASE_DIR``), so callers
    can pass repo-relative paths such as ``"prompts/classify_issue.txt"``.
    An absolute *path* is used as-is (``os.path.join`` drops the prefix
    when the second component is absolute).
    """
    resolved = os.path.join(_BASE_DIR, path)
    with open(resolved, "r", encoding="utf-8") as handle:
        text = handle.read()
    return text
def parse_json_response(raw: str) -> dict:
    """Extract a JSON object embedded in an LLM response string.

    Takes the span from the first ``{`` to the last ``}`` in *raw* and
    attempts to parse it. Returns ``{}`` when either brace is missing or
    the span is not valid JSON — callers always get a dict back.
    """
    left = raw.find("{")
    right = raw.rfind("}")
    if left == -1 or right == -1:
        return {}
    candidate = raw[left:right + 1]
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        return {}
    return parsed