feat: RAG 파이프라인 — pkm_api_server.py에 검색/임베딩 엔드포인트 추가

- POST /rag/query: 질문 → GPU bge-m3 임베딩 → Qdrant 검색 → MLX 35B 답변 생성
  - DEVONthink 링크(x-devonthink-item://UUID) 포함 응답
- POST /devonthink/embed: 단일 문서 UUID → Qdrant 임베딩 트리거
- POST /devonthink/embed-batch: 배치 문서 임베딩
- docstring 범위 갱신: DEVONthink + OmniFocus + RAG 검색

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
hyungi
2026-03-30 13:32:49 +09:00
parent 5fc23e0dbd
commit 5db2f4f6fa

View File

@@ -1,17 +1,22 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
PKM Host API Server PKM Host API Server
DEVONthink + OmniFocus AppleScript 중계 경량 HTTP 서버. DEVONthink + OmniFocus AppleScript 중계 + RAG 검색 경량 HTTP 서버.
NanoClaw 컨테이너에서 호출. LaunchAgent(GUI 세션)로 실행 필수. NanoClaw 컨테이너에서 호출. LaunchAgent(GUI 세션)로 실행 필수.
범위: DEVONthink + OmniFocus 전용. 이 이상 확장하지 않을 것. 범위: DEVONthink + OmniFocus + RAG 검색.
""" """
import json import json
import os
import subprocess import subprocess
import sys import sys
from pathlib import Path
from flask import Flask, request, jsonify from flask import Flask, request, jsonify
sys.path.insert(0, str(Path(__file__).parent))
from pkm_utils import load_credentials
app = Flask(__name__) app = Flask(__name__)
@@ -225,11 +230,161 @@ def omnifocus_today():
return jsonify(success=False, error=str(e)), 500 return jsonify(success=False, error=str(e)), 500
# --- RAG ---
def _get_gpu_ip():
    """Return the GPU server IP from stored credentials (None/missing if unset)."""
    return load_credentials().get("GPU_SERVER_IP")
def _embed_text(text: str, gpu_ip: str) -> list[float] | None:
    """Embed *text* with the bge-m3 model served on the GPU host.

    Returns the embedding vector, or None on any failure — callers treat
    None as "embedding failed". Input is truncated to 8000 characters,
    presumably to fit the model's input limit (TODO confirm).
    """
    import requests as req
    url = f"http://{gpu_ip}:11434/api/embed"
    body = {"model": "bge-m3", "input": [text[:8000]]}
    try:
        resp = req.post(url, json=body, timeout=60)
        resp.raise_for_status()
        vectors = resp.json().get("embeddings", [[]])
        return vectors[0]
    except Exception:
        # Best-effort: swallow network/HTTP/JSON errors and signal via None.
        return None
def _search_qdrant(vector: list[float], limit: int = 20) -> list[dict]:
    """Similarity-search the local Qdrant `pkm_documents` collection.

    Returns the raw result list (each hit includes its payload).
    Raises on HTTP errors — callers handle exceptions.
    """
    import requests as req
    search_body = {"vector": vector, "limit": limit, "with_payload": True}
    resp = req.post(
        "http://localhost:6333/collections/pkm_documents/points/search",
        json=search_body,
        timeout=10,
    )
    resp.raise_for_status()
    return resp.json().get("result", [])
def _llm_generate(prompt: str) -> str:
    """Generate an answer via the local MLX chat-completions server.

    Sends a single-turn user prompt; raises on HTTP errors.
    """
    import requests as req
    payload = {
        "model": "mlx-community/Qwen3.5-35B-A3B-4bit",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3,
        "max_tokens": 2048,
    }
    resp = req.post("http://localhost:8800/v1/chat/completions",
                    json=payload, timeout=120)
    resp.raise_for_status()
    data = resp.json()
    return data["choices"][0]["message"]["content"]
@app.route('/rag/query', methods=['POST'])
def rag_query():
    """RAG endpoint: embed the question, search Qdrant, answer with the LLM.

    Body: {"q": <question>, "limit": <search hit count, default 10>}.
    Responds with the generated answer plus source documents, each with a
    DEVONthink deep link when a UUID is present in the payload.
    """
    body = request.get_json(silent=True) or {}
    question = body.get('q', '')
    limit = body.get('limit', 10)
    if not question:
        return jsonify(success=False, error='q parameter required'), 400
    gpu_ip = _get_gpu_ip()
    if not gpu_ip:
        return jsonify(success=False, error='GPU_SERVER_IP not configured'), 500
    try:
        # Step 1: embed the query on the GPU server.
        query_vec = _embed_text(question, gpu_ip)
        if not query_vec:
            return jsonify(success=False, error='embedding failed'), 500
        # Step 2: vector search.
        hits = _search_qdrant(query_vec, limit=limit)
        if not hits:
            return jsonify(success=True, answer="관련 문서를 찾지 못했습니다.", sources=[])
        # Step 3: collect sources and assemble LLM context from the top 5 hits.
        sources = []
        context_parts = []
        for hit in hits[:5]:
            payload = hit.get("payload", {})
            title = payload.get("title", "")
            doc_uuid = payload.get("uuid", "")
            sources.append({
                "title": title,
                "uuid": doc_uuid,
                "score": round(hit.get("score", 0), 3),
                "link": f"x-devonthink-item://{doc_uuid}" if doc_uuid else None,
            })
            context_parts.append(f"[{title}]\n{payload.get('text_preview', '')}")
        context = "\n\n---\n\n".join(context_parts)
        # Step 4: generate the answer.
        prompt = f"""다음 문서들을 참고하여 질문에 답변해주세요.
## 참고 문서
{context}
## 질문
{question}
답변은 한국어로, 참고한 문서 제목을 언급해주세요."""
        answer = _llm_generate(prompt)
        return jsonify(success=True, answer=answer, sources=sources, query=question)
    except Exception as e:
        return jsonify(success=False, error=str(e)), 500
@app.route('/devonthink/embed', methods=['POST'])
def devonthink_embed():
    """Trigger embedding of one DEVONthink document into Qdrant.

    Body: {"uuid": <document UUID>}. Runs embed_to_qdrant.py with the
    project's venv interpreter as a subprocess.
    """
    body = request.get_json(silent=True) or {}
    doc_uuid = body.get('uuid', '')
    if not doc_uuid:
        return jsonify(success=False, error='uuid parameter required'), 400
    try:
        here = Path(__file__).parent
        venv_python = str(here.parent / "venv" / "bin" / "python3")
        embed_script = str(here / "embed_to_qdrant.py")
        proc = subprocess.run(
            [venv_python, embed_script, doc_uuid],
            capture_output=True, text=True, timeout=120
        )
        if proc.returncode == 0:
            return jsonify(success=True, uuid=doc_uuid)
        return jsonify(success=False, error=proc.stderr.strip()), 500
    except Exception as e:
        return jsonify(success=False, error=str(e)), 500
@app.route('/devonthink/embed-batch', methods=['POST'])
def devonthink_embed_batch():
    """Trigger embedding for a batch of DEVONthink document UUIDs.

    Body: {"uuids": [<UUID>, ...]}. Each UUID is embedded by a separate
    subprocess run of embed_to_qdrant.py; one failure does not abort the
    batch. Returns totals plus a per-UUID result list.
    """
    data = request.get_json(silent=True) or {}
    uuids = data.get('uuids', [])
    if not uuids:
        return jsonify(success=False, error='uuids array required'), 400
    results = []
    # Resolve interpreter/script paths once, outside the loop.
    venv_python = str(Path(__file__).parent.parent / "venv" / "bin" / "python3")
    embed_script = str(Path(__file__).parent / "embed_to_qdrant.py")
    for doc_uuid in uuids:
        try:
            result = subprocess.run(
                [venv_python, embed_script, doc_uuid],
                capture_output=True, text=True, timeout=120
            )
            entry = {"uuid": doc_uuid, "success": result.returncode == 0}
            if result.returncode != 0:
                # Fix: surface the failure reason — previously stderr was
                # discarded, leaving failed items undiagnosable.
                entry["error"] = result.stderr.strip()
            results.append(entry)
        except Exception as e:
            results.append({"uuid": doc_uuid, "success": False, "error": str(e)})
    succeeded = sum(1 for r in results if r["success"])
    return jsonify(success=True, total=len(uuids), succeeded=succeeded, results=results)
@app.route('/health') @app.route('/health')
def health(): def health():
return jsonify(success=True, service='pkm-api', endpoints=[ return jsonify(success=True, service='pkm-api', endpoints=[
'/devonthink/stats', '/devonthink/search?q=', '/devonthink/stats', '/devonthink/search?q=',
'/devonthink/inbox-count', '/omnifocus/stats', '/omnifocus/overdue', '/omnifocus/today' '/devonthink/inbox-count', '/devonthink/embed', '/devonthink/embed-batch',
'/omnifocus/stats', '/omnifocus/overdue', '/omnifocus/today',
'/rag/query',
]) ])