From 5db2f4f6fa7cd60143879ebc16fb2c2c2ca86e00 Mon Sep 17 00:00:00 2001
From: hyungi <hyungi@Hyungiui-Macmini.local>
Date: Mon, 30 Mar 2026 13:32:49 +0900
Subject: [PATCH] =?UTF-8?q?feat:=20RAG=20=ED=8C=8C=EC=9D=B4=ED=94=84?=
 =?UTF-8?q?=EB=9D=BC=EC=9D=B8=20=E2=80=94=20pkm=5Fapi=5Fserver.py=EC=97=90?=
 =?UTF-8?q?=20=EA=B2=80=EC=83=89/=EC=9E=84=EB=B2=A0=EB=94=A9=20=EC=97=94?=
 =?UTF-8?q?=EB=93=9C=ED=8F=AC=EC=9D=B8=ED=8A=B8=20=EC=B6=94=EA=B0=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- POST /rag/query: 질문 → GPU bge-m3 임베딩 → Qdrant 검색 → MLX 35B 답변 생성
  - DEVONthink 링크(x-devonthink-item://UUID) 포함 응답
- POST /devonthink/embed: 단일 문서 UUID → Qdrant 임베딩 트리거
- POST /devonthink/embed-batch: 배치 문서 임베딩
- docstring 범위 갱신: DEVONthink + OmniFocus + RAG 검색

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 scripts/pkm_api_server.py | 161 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 158 insertions(+), 3 deletions(-)

diff --git a/scripts/pkm_api_server.py b/scripts/pkm_api_server.py
index 95c7136..8a301f5 100644
--- a/scripts/pkm_api_server.py
+++ b/scripts/pkm_api_server.py
@@ -1,17 +1,22 @@
 #!/usr/bin/env python3
 """
 PKM Host API Server
-DEVONthink + OmniFocus AppleScript 중계용 경량 HTTP 서버.
+DEVONthink + OmniFocus AppleScript 중계 + RAG 검색 경량 HTTP 서버.
 NanoClaw 컨테이너에서 호출. LaunchAgent(GUI 세션)로 실행 필수.
 
-범위: DEVONthink + OmniFocus 전용. 이 이상 확장하지 않을 것.
+범위: DEVONthink + OmniFocus + RAG 검색.
 """
 
 import json
+import os
 import subprocess
 import sys
+from pathlib import Path
 from flask import Flask, request, jsonify
 
+sys.path.insert(0, str(Path(__file__).parent))  # make this script's directory importable (for pkm_utils, below)
+from pkm_utils import load_credentials
+
 app = Flask(__name__)
 
 
@@ -225,11 +230,161 @@ def omnifocus_today():
         return jsonify(success=False, error=str(e)), 500
 
 
+# --- RAG ---
+
+def _get_gpu_ip():
+    """Look up GPU_SERVER_IP in the credentials returned by load_credentials()."""
+    return load_credentials().get("GPU_SERVER_IP")
+
+
+def _embed_text(text: str, gpu_ip: str) -> list[float] | None:
+    """Embed text (first 8000 chars) with bge-m3 on the GPU server; None on any error."""
+    import requests as req
+    try:
+        body = {"model": "bge-m3", "input": [text[:8000]]}
+        reply = req.post(f"http://{gpu_ip}:11434/api/embed", json=body, timeout=60)
+        reply.raise_for_status()
+        return reply.json().get("embeddings", [[]])[0]
+    except Exception:
+        return None
+
+
+def _search_qdrant(vector: list[float], limit: int = 20) -> list[dict]:
+    """Similarity-search the local Qdrant pkm_documents collection; return the hits."""
+    import requests as req
+    url = "http://localhost:6333/collections/pkm_documents/points/search"
+    reply = req.post(url, json={"vector": vector, "limit": limit, "with_payload": True}, timeout=10)
+    reply.raise_for_status()
+    return reply.json().get("result", [])
+
+
+def _llm_generate(prompt: str) -> str:
+    """Send the prompt to the local MLX chat-completions server; return the reply text."""
+    import requests as req
+    body = {
+        "model": "mlx-community/Qwen3.5-35B-A3B-4bit",
+        "messages": [{"role": "user", "content": prompt}],
+        "temperature": 0.3, "max_tokens": 2048,
+    }
+    reply = req.post("http://localhost:8800/v1/chat/completions", json=body, timeout=120)
+    reply.raise_for_status()
+    return reply.json()["choices"][0]["message"]["content"]
+
+
+@app.route('/rag/query', methods=['POST'])
+def rag_query():
+    """Answer a question via RAG: embed the query, search Qdrant, ask the LLM.
+
+    JSON body: 'q' (required text), 'limit' (positive int, default 10); 400 if invalid.
+    """
+    body = request.get_json(silent=True)
+    data = body if isinstance(body, dict) else {}  # a JSON array/scalar has no .get()
+    q = data.get('q', '')
+    limit = data.get('limit', 10)
+    if not isinstance(q, str) or not q.strip():
+        return jsonify(success=False, error='q parameter required'), 400
+    if isinstance(limit, bool) or not isinstance(limit, int) or limit < 1:
+        return jsonify(success=False, error='limit must be a positive integer'), 400
+
+    gpu_ip = _get_gpu_ip()
+    if not gpu_ip:
+        return jsonify(success=False, error='GPU_SERVER_IP not configured'), 500
+
+    try:
+        # 1. Embed the query
+        query_vec = _embed_text(q, gpu_ip)
+        if not query_vec:
+            return jsonify(success=False, error='embedding failed'), 500
+
+        # 2. Search Qdrant
+        results = _search_qdrant(query_vec, limit=limit)
+        if not results:
+            return jsonify(success=True, answer="관련 문서를 찾지 못했습니다.", sources=[])
+
+        # 3. Build the source list and LLM context from the top five hits
+        sources, context_parts = [], []
+        for r in results[:5]:
+            payload = r.get("payload") or {}  # tolerate a hit whose payload is null
+            title = payload.get("title", "")
+            doc_uuid = payload.get("uuid", "")
+            sources.append({
+                "title": title,
+                "uuid": doc_uuid,
+                "score": round(r.get("score", 0), 3),
+                "link": f"x-devonthink-item://{doc_uuid}" if doc_uuid else None,
+            })
+            context_parts.append(f"[{title}]\n{payload.get('text_preview', '')}")
+        context = "\n\n---\n\n".join(context_parts)
+
+        # 4. Generate the answer with the LLM
+        prompt = f"""다음 문서들을 참고하여 질문에 답변해주세요.
+
+## 참고 문서
+{context}
+
+## 질문
+{q}
+
+답변은 한국어로, 참고한 문서 제목을 언급해주세요."""
+        return jsonify(success=True, answer=_llm_generate(prompt), sources=sources, query=q)
+    except Exception as e:
+        return jsonify(success=False, error=str(e)), 500
+
+
+@app.route('/devonthink/embed', methods=['POST'])
+def devonthink_embed():
+    """Embed one document by running embed_to_qdrant.py on its UUID.
+
+    JSON body: {'uuid': str}; 400 if missing or not a string, 500 if the script fails.
+    """
+    body = request.get_json(silent=True)
+    doc_uuid = body.get('uuid', '') if isinstance(body, dict) else ''
+    if not isinstance(doc_uuid, str) or not doc_uuid:
+        return jsonify(success=False, error='uuid parameter required'), 400
+    try:
+        venv_python = str(Path(__file__).parent.parent / "venv" / "bin" / "python3")
+        embed_script = str(Path(__file__).parent / "embed_to_qdrant.py")
+        result = subprocess.run([venv_python, embed_script, doc_uuid],
+                                capture_output=True, text=True, timeout=120)
+        if result.returncode != 0:
+            return jsonify(success=False, error=result.stderr.strip()), 500
+        return jsonify(success=True, uuid=doc_uuid)
+    except Exception as e:
+        return jsonify(success=False, error=str(e)), 500
+
+
+@app.route('/devonthink/embed-batch', methods=['POST'])
+def devonthink_embed_batch():
+    """Embed several documents by running embed_to_qdrant.py once per UUID.
+
+    JSON body: {'uuids': [...]}; 400 unless 'uuids' is a non-empty list.
+    """
+    body = request.get_json(silent=True)
+    uuids = body.get('uuids') if isinstance(body, dict) else None
+    if not isinstance(uuids, list) or not uuids:  # a bare string would be iterated per character
+        return jsonify(success=False, error='uuids array required'), 400
+
+    results = []
+    venv_python = str(Path(__file__).parent.parent / "venv" / "bin" / "python3")
+    embed_script = str(Path(__file__).parent / "embed_to_qdrant.py")
+    for doc_uuid in uuids:
+        try:
+            result = subprocess.run([venv_python, embed_script, doc_uuid],
+                                    capture_output=True, text=True, timeout=120)
+            results.append({"uuid": doc_uuid, "success": result.returncode == 0})
+        except Exception as e:
+            results.append({"uuid": doc_uuid, "success": False, "error": str(e)})
+    succeeded = sum(1 for r in results if r["success"])
+    return jsonify(success=True, total=len(uuids), succeeded=succeeded, results=results)
+
+
 @app.route('/health')
 def health():
     return jsonify(success=True, service='pkm-api', endpoints=[
         '/devonthink/stats', '/devonthink/search?q=',
-        '/devonthink/inbox-count', '/omnifocus/stats', '/omnifocus/overdue', '/omnifocus/today'
+        '/devonthink/inbox-count', '/devonthink/embed', '/devonthink/embed-batch',  # embed routes accept POST only
+        '/omnifocus/stats', '/omnifocus/overdue', '/omnifocus/today',
+        '/rag/query',  # accepts POST only
     ])