feat: RAG 파이프라인 — pkm_api_server.py에 검색/임베딩 엔드포인트 추가

- POST /rag/query: 질문 → GPU bge-m3 임베딩 → Qdrant 검색 → MLX 35B 답변 생성
  - DEVONthink 링크(x-devonthink-item://UUID) 포함 응답
- POST /devonthink/embed: 단일 문서 UUID → Qdrant 임베딩 트리거
- POST /devonthink/embed-batch: 배치 문서 임베딩
- docstring 범위 갱신: DEVONthink + OmniFocus + RAG 검색

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
hyungi
2026-03-30 13:32:49 +09:00
parent 5fc23e0dbd
commit 5db2f4f6fa

View File

@@ -1,17 +1,22 @@
#!/usr/bin/env python3
"""
PKM Host API Server
DEVONthink + OmniFocus AppleScript 중계 경량 HTTP 서버.
DEVONthink + OmniFocus AppleScript 중계 + RAG 검색 경량 HTTP 서버.
NanoClaw 컨테이너에서 호출. LaunchAgent(GUI 세션)로 실행 필수.
범위: DEVONthink + OmniFocus 전용. 이 이상 확장하지 않을 것.
범위: DEVONthink + OmniFocus + RAG 검색.
"""
import json
import os
import subprocess
import sys
from pathlib import Path
from flask import Flask, request, jsonify
# Make this script's own directory importable so pkm_utils resolves when the
# server is launched from an arbitrary cwd (e.g. by a LaunchAgent).
sys.path.insert(0, str(Path(__file__).parent))
from pkm_utils import load_credentials
app = Flask(__name__)  # WSGI application instance; routes are registered below.
@@ -225,11 +230,161 @@ def omnifocus_today():
return jsonify(success=False, error=str(e)), 500
# --- RAG ---
def _get_gpu_ip():
    """Look up the GPU server's IP address from stored credentials.

    Returns None when GPU_SERVER_IP is not configured.
    """
    return load_credentials().get("GPU_SERVER_IP")
def _embed_text(text: str, gpu_ip: str) -> list[float] | None:
    """Embed *text* with bge-m3 on the GPU server's Ollama endpoint.

    Best-effort: returns the embedding vector, or None on any network or
    API failure (the caller maps a missing vector to an error response).
    """
    import requests as req
    endpoint = f"http://{gpu_ip}:11434/api/embed"
    # Text is truncated to 8000 chars — presumably to cap request size /
    # fit the embedding model's input window; confirm against model docs.
    body = {"model": "bge-m3", "input": [text[:8000]]}
    try:
        response = req.post(endpoint, json=body, timeout=60)
        response.raise_for_status()
        vectors = response.json().get("embeddings", [[]])
        return vectors[0]
    except Exception:
        # Deliberate broad catch: embedding failure is signalled via None.
        return None
def _search_qdrant(vector: list[float], limit: int = 20) -> list[dict]:
    """Similarity-search the local Qdrant `pkm_documents` collection.

    Returns the raw result list (each item carries score + payload).
    Raises on network errors or non-2xx responses.
    """
    import requests as req
    search_body = {"vector": vector, "limit": limit, "with_payload": True}
    response = req.post(
        "http://localhost:6333/collections/pkm_documents/points/search",
        json=search_body,
        timeout=10,
    )
    response.raise_for_status()
    return response.json().get("result", [])
def _llm_generate(prompt: str) -> str:
    """Generate an answer via the local MLX chat-completions server (Mac Mini).

    Raises on network errors or non-2xx responses.
    """
    import requests as req
    chat_request = {
        "model": "mlx-community/Qwen3.5-35B-A3B-4bit",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3,
        "max_tokens": 2048,
    }
    response = req.post(
        "http://localhost:8800/v1/chat/completions",
        json=chat_request,
        timeout=120,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
@app.route('/rag/query', methods=['POST'])
def rag_query():
    """RAG query endpoint: embed the question, search Qdrant, answer via LLM.

    JSON body: {"q": <question>, "limit": <search size, default 10>}.
    Response sources carry DEVONthink x-devonthink-item:// links when a
    document UUID is present in the Qdrant payload.
    """
    body = request.get_json(silent=True) or {}
    question = body.get('q', '')
    search_limit = body.get('limit', 10)
    if not question:
        return jsonify(success=False, error='q parameter required'), 400
    gpu_ip = _get_gpu_ip()
    if not gpu_ip:
        return jsonify(success=False, error='GPU_SERVER_IP not configured'), 500
    try:
        # 1. Embed the query text on the GPU server.
        query_vector = _embed_text(question, gpu_ip)
        if not query_vector:
            return jsonify(success=False, error='embedding failed'), 500
        # 2. Similarity search in Qdrant.
        hits = _search_qdrant(query_vector, limit=search_limit)
        if not hits:
            return jsonify(success=True, answer="관련 문서를 찾지 못했습니다.", sources=[])
        # 3. Build the context and source list from the top 5 hits.
        sources = []
        snippets = []
        for hit in hits[:5]:
            meta = hit.get("payload", {})
            doc_title = meta.get("title", "")
            doc_preview = meta.get("text_preview", "")
            doc_uuid = meta.get("uuid", "")
            sources.append({
                "title": doc_title,
                "uuid": doc_uuid,
                "score": round(hit.get("score", 0), 3),
                "link": f"x-devonthink-item://{doc_uuid}" if doc_uuid else None,
            })
            snippets.append(f"[{doc_title}]\n{doc_preview}")
        context = "\n\n---\n\n".join(snippets)
        # 4. Generate the final answer with the local LLM.
        prompt = f"""다음 문서들을 참고하여 질문에 답변해주세요.
## 참고 문서
{context}
## 질문
{question}
답변은 한국어로, 참고한 문서 제목을 언급해주세요."""
        answer = _llm_generate(prompt)
        return jsonify(success=True, answer=answer, sources=sources, query=question)
    except Exception as e:
        return jsonify(success=False, error=str(e)), 500
@app.route('/devonthink/embed', methods=['POST'])
def devonthink_embed():
    """Trigger embedding of one DEVONthink document into Qdrant.

    JSON body: {"uuid": <document UUID>}. Runs embed_to_qdrant.py through
    the sibling venv's interpreter; on failure the subprocess stderr is
    returned with a 500.
    """
    payload = request.get_json(silent=True) or {}
    doc_uuid = payload.get('uuid', '')
    if not doc_uuid:
        return jsonify(success=False, error='uuid parameter required'), 400
    try:
        here = Path(__file__).parent
        interpreter = str(here.parent / "venv" / "bin" / "python3")
        script = str(here / "embed_to_qdrant.py")
        proc = subprocess.run(
            [interpreter, script, doc_uuid],
            capture_output=True, text=True, timeout=120,
        )
        if proc.returncode == 0:
            return jsonify(success=True, uuid=doc_uuid)
        return jsonify(success=False, error=proc.stderr.strip()), 500
    except Exception as e:
        return jsonify(success=False, error=str(e)), 500
@app.route('/devonthink/embed-batch', methods=['POST'])
def devonthink_embed_batch():
    """Trigger embedding for a batch of DEVONthink document UUIDs.

    JSON body: {"uuids": [<uuid>, ...]}. Each UUID is embedded via its own
    embed_to_qdrant.py subprocess; a per-document failure does not abort
    the batch. Response: {success, total, succeeded,
    results: [{uuid, success[, error]}]}.
    """
    data = request.get_json(silent=True) or {}
    uuids = data.get('uuids', [])
    if not uuids:
        return jsonify(success=False, error='uuids array required'), 400
    results = []
    # Interpreter/script paths are loop-invariant; compute once.
    venv_python = str(Path(__file__).parent.parent / "venv" / "bin" / "python3")
    embed_script = str(Path(__file__).parent / "embed_to_qdrant.py")
    for doc_uuid in uuids:
        try:
            result = subprocess.run(
                [venv_python, embed_script, doc_uuid],
                capture_output=True, text=True, timeout=120
            )
            entry = {"uuid": doc_uuid, "success": result.returncode == 0}
            # Fix: surface the subprocess stderr on failure instead of
            # dropping it, consistent with the single /devonthink/embed route.
            if result.returncode != 0:
                entry["error"] = result.stderr.strip()
            results.append(entry)
        except Exception as e:
            results.append({"uuid": doc_uuid, "success": False, "error": str(e)})
    succeeded = sum(1 for r in results if r["success"])
    return jsonify(success=True, total=len(uuids), succeeded=succeeded, results=results)
@app.route('/health')
def health():
    """Health check: report the service name and its available endpoints.

    Fix: the diff residue kept both the old endpoint line and its
    replacement; the old line also lacked a trailing comma, so
    '/omnifocus/today' was implicitly concatenated with the next string
    literal. Emit only the corrected, complete endpoint list.
    """
    return jsonify(success=True, service='pkm-api', endpoints=[
        '/devonthink/stats', '/devonthink/search?q=',
        '/devonthink/inbox-count', '/devonthink/embed', '/devonthink/embed-batch',
        '/omnifocus/stats', '/omnifocus/overdue', '/omnifocus/today',
        '/rag/query',
    ])