feat: mlx-proxy server + separate LLM/embedding URLs in n8n workflows

Add an ollama-compatible proxy server backed by mlx-vlm (port 11435).
Inject a callLLM wrapper into the six n8n GEN nodes (health check + ollama fallback).
Embeddings/reranker are split off to ollama (LOCAL_EMBED_URL).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Author: Hyungi Ahn
Date: 2026-03-19 10:00:00 +09:00
parent a050f2e7d5
commit 1137754964
5 changed files with 162 additions and 14 deletions


@@ -28,6 +28,12 @@ LOCAL_OLLAMA_URL=http://host.docker.internal:11434
 # Ollama (GPU server — RTX 4070Ti Super, default model: id-9b:latest)
 GPU_OLLAMA_URL=http://192.168.1.186:11434
+# mlx-proxy (Mac mini — LLM generation, ollama-compatible, default model: qwen3.5:27b)
+LOCAL_LLM_URL=http://host.docker.internal:11435
+# Embeddings only (ollama — bge-m3, bge-reranker)
+LOCAL_EMBED_URL=http://host.docker.internal:11434
 # Qdrant (reached from inside Docker)
 QDRANT_URL=http://host.docker.internal:6333
@@ -63,3 +69,4 @@ CHAT_BRIDGE_URL=http://host.docker.internal:8091
 CALDAV_BRIDGE_URL=http://host.docker.internal:8092
 DEVONTHINK_BRIDGE_URL=http://host.docker.internal:8093
 MAIL_BRIDGE_URL=http://host.docker.internal:8094
+KB_WRITER_URL=http://host.docker.internal:8095

com.mlx-proxy.plist (new file, 27 lines)

@@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Label</key>
<string>com.mlx-proxy</string>
<key>ProgramArguments</key>
<array>
<string>/Users/hyungi/mlx-env/bin/uvicorn</string>
<string>mlx_proxy:app</string>
<string>--host</string>
<string>0.0.0.0</string>
<string>--port</string>
<string>11435</string>
</array>
<key>WorkingDirectory</key>
<string>/Users/hyungi/Documents/code/syn-chat-bot</string>
<key>RunAtLoad</key>
<true/>
<key>KeepAlive</key>
<true/>
<key>StandardOutPath</key>
<string>/tmp/mlx-proxy.log</string>
<key>StandardErrorPath</key>
<string>/tmp/mlx-proxy.err</string>
</dict>
</plist>
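
Once this plist is copied into ~/Library/LaunchAgents, `launchctl load ~/Library/LaunchAgents/com.mlx-proxy.plist` starts the agent: RunAtLoad brings uvicorn up at login, KeepAlive restarts it if the process exits, and stdout/stderr land in the /tmp log paths above.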


@@ -11,6 +11,7 @@ SERVICES=(
   "com.syn-chat-bot.mail-bridge"
   "com.syn-chat-bot.inbox-processor"
   "com.syn-chat-bot.news-digest"
+  "com.mlx-proxy"
 )
 PLIST_DIR="$HOME/Library/LaunchAgents"

mlx_proxy.py (new file, 113 lines)

@@ -0,0 +1,113 @@
"""mlx-vlm proxy — ollama 호환 API for mlx-vlm inference (port 11435)
ollama 호환 /api/generate 엔드포인트 제공. n8n에서 투명하게 사용 가능.
Qwen3.5-27B-4bit를 mlx-vlm으로 서빙, thinking 자동 비활성화 (prefill 방식).
"""
import asyncio
import logging
import time
from functools import partial
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("mlx_proxy")
MODEL_PATH = "mlx-community/Qwen3.5-27B-4bit"
DISPLAY_NAME = "qwen3.5:27b"
app = FastAPI()
# Global model state — loaded once at startup
model = None
processor = None
@app.on_event("startup")
async def startup():
global model, processor
logger.info(f"Loading {MODEL_PATH} ...")
from mlx_vlm import load
model, processor = load(MODEL_PATH)
logger.info("Model ready")
def _generate_sync(
prompt: str, system: str, max_tokens: int, temperature: float, is_json: bool
) -> dict:
"""Blocking generation — runs in thread pool executor."""
from mlx_vlm import generate as mlx_generate
from mlx_vlm.utils import apply_chat_template
messages = []
# System prompt — strip /no_think (thinking handled by prefill)
sys_text = (system or "").replace("/no_think", "").strip()
if is_json and sys_text and "JSON" not in sys_text:
sys_text += "\nJSON으로만 응답하세요."
elif is_json and not sys_text:
sys_text = "JSON으로만 응답하세요."
if sys_text:
messages.append({"role": "system", "content": sys_text})
messages.append({"role": "user", "content": prompt})
# Prefill: thinking 비활성화 — 모델이 <think> 단계 완료 상태에서 시작
messages.append({"role": "assistant", "content": "<think>\n</think>\n\n"})
formatted = apply_chat_template(processor, model.config, messages)
t0 = time.perf_counter()
output = mlx_generate(
model, processor, formatted, max_tokens=max_tokens, temp=temperature
)
elapsed = time.perf_counter() - t0
output = output.strip()
# Token count (approximate via tokenizer)
try:
n_tokens = len(processor.tokenizer.encode(output))
except Exception:
n_tokens = max(1, len(output) // 3)
return {"text": output, "n_tokens": n_tokens, "elapsed": elapsed}
@app.post("/api/generate")
async def api_generate(request: Request):
body = await request.json()
prompt = body.get("prompt", "")
system = body.get("system", "")
opts = body.get("options", {})
max_tokens = opts.get("num_predict", 2048)
temperature = opts.get("temperature", 0.7)
is_json = body.get("format") == "json"
if not prompt:
return JSONResponse({"error": "prompt required"}, status_code=400)
try:
loop = asyncio.get_event_loop()
result = await loop.run_in_executor(
None,
partial(_generate_sync, prompt, system, max_tokens, temperature, is_json),
)
except Exception as e:
logger.error(f"Generation failed: {e}")
return JSONResponse({"error": str(e)}, status_code=500)
return {
"model": DISPLAY_NAME,
"response": result["text"],
"done": True,
"eval_count": result["n_tokens"],
"eval_duration": int(result["elapsed"] * 1e9),
}
@app.get("/health")
async def health():
return {"status": "ok" if model is not None else "loading", "model": MODEL_PATH}

File diff suppressed because one or more lines are too long
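
The suppressed diff is presumably the n8n workflow JSON whose six GEN nodes receive the callLLM wrapper mentioned in the commit message; the wrapper itself ships inside the workflow. The following is only an illustrative Python sketch of the described health-check-plus-fallback routing; every name below is hypothetical rather than taken from the commit.

# call_llm.py — illustrative sketch of the callLLM routing (hypothetical names)
import os
import requests

LOCAL_LLM_URL = os.environ.get("LOCAL_LLM_URL", "http://host.docker.internal:11435")
GPU_OLLAMA_URL = os.environ.get("GPU_OLLAMA_URL", "http://192.168.1.186:11434")


def call_llm(prompt: str, system: str = "", **options) -> str:
    """Prefer mlx-proxy; fall back to the ollama GPU server if it is down."""
    payload = {"prompt": prompt, "system": system, "options": options, "stream": False}
    try:
        # health check: mlx-proxy reports "ok" only once the model is loaded
        health = requests.get(f"{LOCAL_LLM_URL}/health", timeout=2).json()
        if health.get("status") == "ok":
            r = requests.post(f"{LOCAL_LLM_URL}/api/generate", json=payload, timeout=300)
            r.raise_for_status()
            return r.json()["response"]
    except requests.RequestException:
        pass  # proxy unreachable or mid-request failure: fall through to ollama

    payload["model"] = "id-9b:latest"  # ollama requires an explicit model name
    r = requests.post(f"{GPU_OLLAMA_URL}/api/generate", json=payload, timeout=300)
    r.raise_for_status()
    return r.json()["response"]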