feat: AI 서비스 MLX 듀얼 백엔드 및 모델 최적화

- MLX(맥미니 27B) 우선 → Ollama(조립컴 9B) fallback 구조
- pydantic-settings 기반 config 전환
- health check에 MLX 상태 추가
- 텍스트 모델 qwen3:8b → qwen3.5:9b-q8_0 변경

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Hyungi Ahn
2026-03-06 23:17:50 +09:00
parent cad662473b
commit 2f7e083db0
14 changed files with 231 additions and 140 deletions

View File

@@ -1,24 +1,30 @@
import os
from pydantic_settings import BaseSettings
class Settings:
OLLAMA_BASE_URL: str = os.getenv("OLLAMA_BASE_URL", "http://100.111.160.84:11434")
OLLAMA_TEXT_MODEL: str = os.getenv("OLLAMA_TEXT_MODEL", "qwen2.5:14b-instruct-q4_K_M")
OLLAMA_EMBED_MODEL: str = os.getenv("OLLAMA_EMBED_MODEL", "bge-m3")
OLLAMA_TIMEOUT: int = int(os.getenv("OLLAMA_TIMEOUT", "120"))
class Settings(BaseSettings):
OLLAMA_BASE_URL: str = "http://100.111.160.84:11434"
OLLAMA_TEXT_MODEL: str = "qwen3:8b"
OLLAMA_EMBED_MODEL: str = "bge-m3"
OLLAMA_TIMEOUT: int = 120
DB_HOST: str = os.getenv("DB_HOST", "mariadb")
DB_PORT: int = int(os.getenv("DB_PORT", "3306"))
DB_USER: str = os.getenv("DB_USER", "hyungi_user")
DB_PASSWORD: str = os.getenv("DB_PASSWORD", "")
DB_NAME: str = os.getenv("DB_NAME", "hyungi")
MLX_BASE_URL: str = "https://llm.hyungi.net"
MLX_TEXT_MODEL: str = "/Users/hyungi/mlx-models/Qwen3.5-27B-4bit"
SECRET_KEY: str = os.getenv("SECRET_KEY", "")
DB_HOST: str = "mariadb"
DB_PORT: int = 3306
DB_USER: str = "hyungi_user"
DB_PASSWORD: str = ""
DB_NAME: str = "hyungi"
SECRET_KEY: str = ""
ALGORITHM: str = "HS256"
SYSTEM1_API_URL: str = os.getenv("SYSTEM1_API_URL", "http://system1-api:3005")
CHROMA_PERSIST_DIR: str = os.getenv("CHROMA_PERSIST_DIR", "/app/data/chroma")
METADATA_DB_PATH: str = os.getenv("METADATA_DB_PATH", "/app/data/metadata.db")
SYSTEM1_API_URL: str = "http://system1-api:3005"
CHROMA_PERSIST_DIR: str = "/app/data/chroma"
METADATA_DB_PATH: str = "/app/data/metadata.db"
class Config:
env_file = ".env"
settings = Settings()

View File

@@ -1,10 +1,28 @@
import os
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.responses import JSONResponse
from routers import health, embeddings, classification, daily_report, rag
from db.vector_store import vector_store
from db.metadata_store import metadata_store
from services.ollama_client import ollama_client
from middlewares.auth import verify_token
PUBLIC_PATHS = {"/", "/api/ai/health", "/api/ai/models"}
class AuthMiddleware(BaseHTTPMiddleware):
async def dispatch(self, request: Request, call_next):
if request.method == "OPTIONS" or request.url.path in PUBLIC_PATHS:
return await call_next(request)
try:
request.state.user = await verify_token(request)
except Exception as e:
return JSONResponse(status_code=401, content={"detail": str(e.detail) if hasattr(e, "detail") else "인증 실패"})
return await call_next(request)
@asynccontextmanager
@@ -12,6 +30,7 @@ async def lifespan(app: FastAPI):
vector_store.initialize()
metadata_store.initialize()
yield
await ollama_client.close()
app = FastAPI(
@@ -21,14 +40,25 @@ app = FastAPI(
lifespan=lifespan,
)
ALLOWED_ORIGINS = [
"https://tkfb.technicalkorea.net",
"https://tkreport.technicalkorea.net",
"https://tkqc.technicalkorea.net",
"https://tkuser.technicalkorea.net",
]
if os.getenv("ENV", "production") == "development":
ALLOWED_ORIGINS += ["http://localhost:30080", "http://localhost:30180", "http://localhost:30280"]
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=False,
allow_origins=ALLOWED_ORIGINS,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
app.add_middleware(AuthMiddleware)
app.include_router(health.router, prefix="/api/ai")
app.include_router(embeddings.router, prefix="/api/ai")
app.include_router(classification.router, prefix="/api/ai")

View File

View File

@@ -0,0 +1,24 @@
from fastapi import Request, HTTPException, status
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from jose import jwt, JWTError, ExpiredSignatureError
from config import settings
security = HTTPBearer(auto_error=False)
async def verify_token(request: Request) -> dict:
"""JWT 토큰 검증. SSO 서비스와 동일한 시크릿 사용."""
auth: HTTPAuthorizationCredentials = await security(request)
if not auth:
raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Authorization 헤더가 필요합니다")
if not settings.SECRET_KEY:
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="서버 인증 설정 오류")
try:
payload = jwt.decode(auth.credentials, settings.SECRET_KEY, algorithms=[settings.ALGORITHM])
return payload
except ExpiredSignatureError:
raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="토큰이 만료되었습니다")
except JWTError:
raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="유효하지 않은 토큰입니다")

View File

@@ -1,4 +1,4 @@
from fastapi import APIRouter
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from services.classification_service import (
classify_issue,
@@ -26,7 +26,7 @@ async def classify(req: ClassifyRequest):
result = await classify_issue(req.description, req.detail_notes)
return {"available": True, **result}
except Exception as e:
return {"available": False, "error": str(e)}
raise HTTPException(status_code=500, detail="AI 서비스 처리 중 오류가 발생했습니다")
@router.post("/summarize")
@@ -35,7 +35,7 @@ async def summarize(req: SummarizeRequest):
result = await summarize_issue(req.description, req.detail_notes, req.solution)
return {"available": True, **result}
except Exception as e:
return {"available": False, "error": str(e)}
raise HTTPException(status_code=500, detail="AI 서비스 처리 중 오류가 발생했습니다")
@router.post("/classify-and-summarize")
@@ -44,4 +44,4 @@ async def classify_and_summarize_endpoint(req: ClassifyRequest):
result = await classify_and_summarize(req.description, req.detail_notes)
return {"available": True, **result}
except Exception as e:
return {"available": False, "error": str(e)}
raise HTTPException(status_code=500, detail="AI 서비스 처리 중 오류가 발생했습니다")

View File

@@ -1,4 +1,4 @@
from fastapi import APIRouter, Request
from fastapi import APIRouter, HTTPException, Request
from pydantic import BaseModel
from services.report_service import generate_daily_report
from datetime import date
@@ -19,7 +19,7 @@ async def daily_report(req: DailyReportRequest, request: Request):
result = await generate_daily_report(report_date, req.project_id, token)
return {"available": True, **result}
except Exception as e:
return {"available": False, "error": str(e)}
raise HTTPException(status_code=500, detail="AI 서비스 처리 중 오류가 발생했습니다")
@router.post("/report/preview")
@@ -30,4 +30,4 @@ async def report_preview(req: DailyReportRequest, request: Request):
result = await generate_daily_report(report_date, req.project_id, token)
return {"available": True, "preview": True, **result}
except Exception as e:
return {"available": False, "error": str(e)}
raise HTTPException(status_code=500, detail="AI 서비스 처리 중 오류가 발생했습니다")

View File

@@ -1,4 +1,4 @@
from fastapi import APIRouter, BackgroundTasks, Query
from fastapi import APIRouter, BackgroundTasks, HTTPException, Query
from pydantic import BaseModel
from services.embedding_service import (
sync_all_issues,
@@ -53,7 +53,7 @@ async def get_similar(issue_id: int, n_results: int = Query(default=5, le=20)):
results = await search_similar_by_id(issue_id, n_results)
return {"available": True, "results": results, "query_issue_id": issue_id}
except Exception as e:
return {"available": False, "results": [], "error": str(e)}
raise HTTPException(status_code=500, detail="AI 서비스 처리 중 오류가 발생했습니다")
@router.post("/similar/search")
@@ -69,7 +69,7 @@ async def search_similar(req: SearchRequest):
)
return {"available": True, "results": results}
except Exception as e:
return {"available": False, "results": [], "error": str(e)}
raise HTTPException(status_code=500, detail="AI 서비스 처리 중 오류가 발생했습니다")
@router.get("/embeddings/stats")

View File

@@ -1,4 +1,4 @@
from fastapi import APIRouter
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from services.rag_service import (
rag_suggest_solution,
@@ -30,7 +30,7 @@ async def suggest_solution(issue_id: int):
try:
return await rag_suggest_solution(issue_id)
except Exception as e:
return {"available": False, "error": str(e)}
raise HTTPException(status_code=500, detail="AI 서비스 처리 중 오류가 발생했습니다")
@router.post("/rag/ask")
@@ -38,7 +38,7 @@ async def ask_question(req: AskRequest):
try:
return await rag_ask(req.question, req.project_id)
except Exception as e:
return {"available": False, "error": str(e)}
raise HTTPException(status_code=500, detail="AI 서비스 처리 중 오류가 발생했습니다")
@router.post("/rag/pattern")
@@ -46,7 +46,7 @@ async def analyze_pattern(req: PatternRequest):
try:
return await rag_analyze_pattern(req.description, req.n_results)
except Exception as e:
return {"available": False, "error": str(e)}
raise HTTPException(status_code=500, detail="AI 서비스 처리 중 오류가 발생했습니다")
@router.post("/rag/classify")
@@ -54,4 +54,4 @@ async def classify_with_rag(req: ClassifyRequest):
try:
return await rag_classify_with_context(req.description, req.detail_notes)
except Exception as e:
return {"available": False, "error": str(e)}
raise HTTPException(status_code=500, detail="AI 서비스 처리 중 오류가 발생했습니다")

View File

@@ -1,5 +1,6 @@
import json
from services.ollama_client import ollama_client
from services.utils import load_prompt, parse_json_response
from config import settings
@@ -7,13 +8,8 @@ CLASSIFY_PROMPT_PATH = "prompts/classify_issue.txt"
SUMMARIZE_PROMPT_PATH = "prompts/summarize_issue.txt"
def _load_prompt(path: str) -> str:
with open(path, "r", encoding="utf-8") as f:
return f.read()
async def classify_issue(description: str, detail_notes: str = "") -> dict:
template = _load_prompt(CLASSIFY_PROMPT_PATH)
template = load_prompt(CLASSIFY_PROMPT_PATH)
prompt = template.format(
description=description or "",
detail_notes=detail_notes or "",
@@ -32,7 +28,7 @@ async def classify_issue(description: str, detail_notes: str = "") -> dict:
async def summarize_issue(
description: str, detail_notes: str = "", solution: str = ""
) -> dict:
template = _load_prompt(SUMMARIZE_PROMPT_PATH)
template = load_prompt(SUMMARIZE_PROMPT_PATH)
prompt = template.format(
description=description or "",
detail_notes=detail_notes or "",

View File

@@ -37,26 +37,46 @@ def build_metadata(issue: dict) -> dict:
return meta
async def sync_all_issues() -> dict:
issues = get_all_issues()
BATCH_SIZE = 10
async def _sync_issues_batch(issues: list[dict]) -> tuple[int, int]:
"""배치 단위로 임베딩 생성 후 벡터 스토어에 저장"""
synced = 0
skipped = 0
# 유효한 이슈와 텍스트 준비
valid = []
for issue in issues:
doc_text = build_document_text(issue)
if not doc_text.strip():
skipped += 1
continue
valid.append((issue, doc_text))
# 배치 단위로 임베딩 생성
for i in range(0, len(valid), BATCH_SIZE):
batch = valid[i:i + BATCH_SIZE]
texts = [doc_text for _, doc_text in batch]
try:
embedding = await ollama_client.generate_embedding(doc_text)
vector_store.upsert(
doc_id=f"issue_{issue['id']}",
document=doc_text,
embedding=embedding,
metadata=build_metadata(issue),
)
synced += 1
except Exception as e:
skipped += 1
embeddings = await ollama_client.batch_embeddings(texts)
for (issue, doc_text), embedding in zip(batch, embeddings):
vector_store.upsert(
doc_id=f"issue_{issue['id']}",
document=doc_text,
embedding=embedding,
metadata=build_metadata(issue),
)
synced += 1
except Exception:
skipped += len(batch)
return synced, skipped
async def sync_all_issues() -> dict:
issues = get_all_issues()
synced, skipped = await _sync_issues_batch(issues)
if issues:
max_id = max(i["id"] for i in issues)
metadata_store.set_last_synced_id(max_id)
@@ -83,26 +103,11 @@ async def sync_single_issue(issue_id: int) -> dict:
async def sync_incremental() -> dict:
last_id = metadata_store.get_last_synced_id()
issues = get_issues_since(last_id)
synced = 0
for issue in issues:
doc_text = build_document_text(issue)
if not doc_text.strip():
continue
try:
embedding = await ollama_client.generate_embedding(doc_text)
vector_store.upsert(
doc_id=f"issue_{issue['id']}",
document=doc_text,
embedding=embedding,
metadata=build_metadata(issue),
)
synced += 1
except Exception:
pass
synced, skipped = await _sync_issues_batch(issues)
if issues:
max_id = max(i["id"] for i in issues)
metadata_store.set_last_synced_id(max_id)
return {"synced": synced, "new_issues": len(issues)}
return {"synced": synced, "skipped": skipped, "new_issues": len(issues)}
async def search_similar_by_id(issue_id: int, n_results: int = 5) -> list[dict]:

View File

@@ -1,3 +1,4 @@
import asyncio
import httpx
from config import settings
@@ -6,29 +7,55 @@ class OllamaClient:
def __init__(self):
self.base_url = settings.OLLAMA_BASE_URL
self.timeout = httpx.Timeout(float(settings.OLLAMA_TIMEOUT), connect=10.0)
self._client: httpx.AsyncClient | None = None
async def _get_client(self) -> httpx.AsyncClient:
if self._client is None or self._client.is_closed:
self._client = httpx.AsyncClient(timeout=self.timeout)
return self._client
async def close(self):
if self._client and not self._client.is_closed:
await self._client.aclose()
self._client = None
async def generate_embedding(self, text: str) -> list[float]:
async with httpx.AsyncClient(timeout=self.timeout) as client:
response = await client.post(
f"{self.base_url}/api/embeddings",
json={"model": settings.OLLAMA_EMBED_MODEL, "prompt": text},
)
response.raise_for_status()
return response.json()["embedding"]
client = await self._get_client()
response = await client.post(
f"{self.base_url}/api/embeddings",
json={"model": settings.OLLAMA_EMBED_MODEL, "prompt": text},
)
response.raise_for_status()
return response.json()["embedding"]
async def batch_embeddings(self, texts: list[str]) -> list[list[float]]:
results = []
for text in texts:
emb = await self.generate_embedding(text)
results.append(emb)
return results
async def batch_embeddings(self, texts: list[str], concurrency: int = 5) -> list[list[float]]:
semaphore = asyncio.Semaphore(concurrency)
async def _embed(text: str) -> list[float]:
async with semaphore:
return await self.generate_embedding(text)
return await asyncio.gather(*[_embed(t) for t in texts])
async def generate_text(self, prompt: str, system: str = None) -> str:
messages = []
if system:
messages.append({"role": "system", "content": system})
messages.append({"role": "user", "content": prompt})
async with httpx.AsyncClient(timeout=self.timeout) as client:
client = await self._get_client()
try:
response = await client.post(
f"{settings.MLX_BASE_URL}/chat/completions",
json={
"model": settings.MLX_TEXT_MODEL,
"messages": messages,
"max_tokens": 2048,
"temperature": 0.3,
},
)
response.raise_for_status()
return response.json()["choices"][0]["message"]["content"]
except Exception:
response = await client.post(
f"{self.base_url}/api/chat",
json={
@@ -42,16 +69,21 @@ class OllamaClient:
return response.json()["message"]["content"]
async def check_health(self) -> dict:
result = {}
try:
async with httpx.AsyncClient(timeout=httpx.Timeout(5.0)) as client:
response = await client.get(f"{self.base_url}/api/tags")
models = response.json().get("models", [])
return {
"status": "connected",
"models": [m["name"] for m in models],
}
client = await self._get_client()
response = await client.get(f"{self.base_url}/api/tags")
models = response.json().get("models", [])
result["ollama"] = {"status": "connected", "models": [m["name"] for m in models]}
except Exception:
return {"status": "disconnected"}
result["ollama"] = {"status": "disconnected"}
try:
client = await self._get_client()
response = await client.get(f"{settings.MLX_BASE_URL}/health")
result["mlx"] = {"status": "connected", "model": settings.MLX_TEXT_MODEL}
except Exception:
result["mlx"] = {"status": "disconnected"}
return result
ollama_client = OllamaClient()

View File

@@ -1,11 +1,7 @@
from services.ollama_client import ollama_client
from services.embedding_service import search_similar_by_text, build_document_text
from services.db_client import get_issue_by_id
def _load_prompt(path: str) -> str:
with open(path, "r", encoding="utf-8") as f:
return f.read()
from services.utils import load_prompt
def _format_retrieved_issues(results: list[dict]) -> str:
@@ -55,7 +51,7 @@ async def rag_suggest_solution(issue_id: int) -> dict:
break
context = _format_retrieved_issues(similar)
template = _load_prompt("prompts/rag_suggest_solution.txt")
template = load_prompt("prompts/rag_suggest_solution.txt")
prompt = template.format(
description=issue.get("description", ""),
detail_notes=issue.get("detail_notes", ""),
@@ -87,7 +83,7 @@ async def rag_ask(question: str, project_id: int = None) -> dict:
)
context = _format_retrieved_issues(results)
template = _load_prompt("prompts/rag_qa.txt")
template = load_prompt("prompts/rag_qa.txt")
prompt = template.format(
question=question,
retrieved_cases=context,
@@ -113,7 +109,7 @@ async def rag_analyze_pattern(description: str, n_results: int = 10) -> dict:
results = await search_similar_by_text(description, n_results=n_results)
context = _format_retrieved_issues(results)
template = _load_prompt("prompts/rag_pattern.txt")
template = load_prompt("prompts/rag_pattern.txt")
prompt = template.format(
description=description,
retrieved_cases=context,
@@ -142,7 +138,7 @@ async def rag_classify_with_context(description: str, detail_notes: str = "") ->
similar = await search_similar_by_text(query, n_results=5)
context = _format_retrieved_issues(similar)
template = _load_prompt("prompts/rag_classify.txt")
template = load_prompt("prompts/rag_classify.txt")
prompt = template.format(
description=description,
detail_notes=detail_notes,

View File

@@ -1,58 +1,38 @@
import asyncio
import httpx
from services.ollama_client import ollama_client
from services.db_client import get_daily_qc_stats, get_issues_for_date
from services.utils import load_prompt
from config import settings
REPORT_PROMPT_PATH = "prompts/daily_report.txt"
def _load_prompt(path: str) -> str:
with open(path, "r", encoding="utf-8") as f:
return f.read()
async def _fetch_one(client: httpx.AsyncClient, url: str, params: dict, headers: dict):
try:
r = await client.get(url, params=params, headers=headers)
if r.status_code == 200:
return r.json()
except Exception:
pass
return None
async def _fetch_system1_data(date_str: str, token: str) -> dict:
headers = {"Authorization": f"Bearer {token}"}
data = {"attendance": None, "work_reports": None, "patrol": None}
params = {"date": date_str}
base = settings.SYSTEM1_API_URL
try:
async with httpx.AsyncClient(timeout=15.0) as client:
# 근태
try:
r = await client.get(
f"{settings.SYSTEM1_API_URL}/api/attendance/daily-status",
params={"date": date_str},
headers=headers,
)
if r.status_code == 200:
data["attendance"] = r.json()
except Exception:
pass
# 작업보고
try:
r = await client.get(
f"{settings.SYSTEM1_API_URL}/api/daily-work-reports/summary",
params={"date": date_str},
headers=headers,
)
if r.status_code == 200:
data["work_reports"] = r.json()
except Exception:
pass
# 순회점검
try:
r = await client.get(
f"{settings.SYSTEM1_API_URL}/api/patrol/today-status",
params={"date": date_str},
headers=headers,
)
if r.status_code == 200:
data["patrol"] = r.json()
except Exception:
pass
attendance, work_reports, patrol = await asyncio.gather(
_fetch_one(client, f"{base}/api/attendance/daily-status", params, headers),
_fetch_one(client, f"{base}/api/daily-work-reports/summary", params, headers),
_fetch_one(client, f"{base}/api/patrol/today-status", params, headers),
)
except Exception:
pass
return data
attendance = work_reports = patrol = None
return {"attendance": attendance, "work_reports": work_reports, "patrol": patrol}
def _format_attendance(data) -> str:
@@ -102,7 +82,7 @@ async def generate_daily_report(
qc_stats = get_daily_qc_stats(date_str)
qc_issues = get_issues_for_date(date_str)
template = _load_prompt(REPORT_PROMPT_PATH)
template = load_prompt(REPORT_PROMPT_PATH)
prompt = template.format(
date=date_str,
attendance_data=_format_attendance(system1_data["attendance"]),

View File

@@ -0,0 +1,22 @@
import json
import os
_BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
def load_prompt(path: str) -> str:
full_path = os.path.join(_BASE_DIR, path)
with open(full_path, "r", encoding="utf-8") as f:
return f.read()
def parse_json_response(raw: str) -> dict:
"""LLM 응답에서 JSON을 추출합니다."""
start = raw.find("{")
end = raw.rfind("}") + 1
if start == -1 or end == 0:
return {}
try:
return json.loads(raw[start:end])
except json.JSONDecodeError:
return {}