Files
hyungi_document_server/scripts/embed_to_qdrant.py
hyungi 45cabc9aea refactor: GPU 서버 재구성 + ChromaDB→Qdrant 마이그레이션
- embed_to_chroma.py → embed_to_qdrant.py 리라이트 (bge-m3 + Qdrant REST API)
- auto_classify.scpt: embed_to_qdrant.py 경로 변경 + sourceChannel 덮어쓰기 버그 수정
- requirements.txt: chromadb/schedule 제거, qdrant-client/flask/gunicorn 추가
- credentials.env.example: GPU_SERVER_IP 항목 추가
- GPU 서버 재구성 계획서 (docs/gpu-restructure.md) + dev-roadmap/commands 통합
- CLAUDE.md, README.md, deploy.md 현행화

GPU 서버 변경사항 (이미 적용됨):
  - Ollama: qwen3.5:9b, id-9b 제거 → bge-m3 + bge-reranker-v2-m3
  - Surya OCR 서비스 (:8400, systemd)
  - Docker + NFS + Komga 이전 (:25600)
  - tk-ai-service: Ollama API → OpenAI API 전환 (MLX 35B)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-30 13:19:31 +09:00

115 lines
3.3 KiB
Python

#!/usr/bin/env python3
"""
벡터 임베딩 스크립트
- DEVONthink 문서 UUID로 텍스트 추출
- GPU 서버(bge-m3)로 임베딩 생성
- Qdrant에 저장
"""
import sys
import uuid as uuid_mod
import requests
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from pkm_utils import setup_logger, load_credentials, run_applescript_inline
# Module logger, created through pkm_utils.setup_logger under the name "embed".
logger = setup_logger("embed")
# Base URL of the Qdrant HTTP API that store_in_qdrant sends points to.
QDRANT_URL = "http://localhost:6333"
# Name of the Qdrant collection that receives the document points.
COLLECTION = "pkm_documents"
def get_document_text(doc_uuid: str) -> tuple[str, str]:
    """Fetch the title and plain text of a DEVONthink record by its UUID.

    An AppleScript snippet asks DEVONthink (application id "DNtp") for the
    record's name and plain text, joined with a ``|||`` separator, and the
    result is split back apart here.

    Args:
        doc_uuid: UUID of the DEVONthink record to read.

    Returns:
        A ``(title, text)`` tuple. ``text`` is an empty string when the
        script output contains no ``|||`` separator.
    """
    # Escape backslashes first, then double quotes, so the value cannot
    # terminate the AppleScript string literal it is embedded in and change
    # the meaning of the generated script.
    safe_uuid = doc_uuid.replace("\\", "\\\\").replace('"', '\\"')
    script = f'''
tell application id "DNtp"
set theRecord to get record with uuid "{safe_uuid}"
set docText to plain text of theRecord
set docTitle to name of theRecord
return docTitle & "|||" & docText
end tell
'''
    result = run_applescript_inline(script)
    # Split on the first separator only: everything before it is the title,
    # everything after it (including any later "|||") is the document text.
    # NOTE(review): a title that itself contains "|||" would still be split
    # early; confirm whether that can occur in practice.
    title, _, text = result.partition("|||")
    return title, text
def get_embedding(text: str, gpu_server_ip: str) -> list[float] | None:
    """Request a bge-m3 embedding for *text* from the GPU server.

    The text is truncated to its first 8000 characters and posted to the
    ``/api/embed`` endpoint on port 11434 of the given host.

    Args:
        text: Document text to embed.
        gpu_server_ip: Host name or IP address of the GPU server.

    Returns:
        The first vector in the response's ``embeddings`` field, or ``None``
        when the field is missing/empty or any error occurs (the error is
        logged rather than raised).
    """
    endpoint = f"http://{gpu_server_ip}:11434/api/embed"
    try:
        request_body = {
            "model": "bge-m3",
            "input": [text[:8000]],
        }
        response = requests.post(endpoint, json=request_body, timeout=60)
        response.raise_for_status()
        vectors = response.json().get("embeddings")
        if not vectors:
            return None
        return vectors[0]
    except Exception as e:
        # Best-effort: any failure is reported to the caller as "no embedding".
        logger.error(f"임베딩 생성 실패: {e}")
        return None
def store_in_qdrant(doc_uuid: str, title: str, text: str, embedding: list[float]):
    """Write one document point to the Qdrant collection.

    Args:
        doc_uuid: DEVONthink UUID of the document; stored in the payload and
            used to derive the point ID.
        title: Document title stored in the payload.
        text: Document text; only the first 500 characters are stored as
            ``text_preview``.
        embedding: Vector to store for the point.

    Raises:
        requests.HTTPError: If Qdrant answers with an error status
            (via ``raise_for_status``).
    """
    # Deterministic integer point ID: the upper 64 bits of the UUIDv5 derived
    # from the document UUID string, so the same document always maps to the
    # same ID.
    derived_uuid = uuid_mod.uuid5(uuid_mod.NAMESPACE_URL, doc_uuid)
    point_id = derived_uuid.int >> 64

    point = {
        "id": point_id,
        "vector": embedding,
        "payload": {
            "uuid": doc_uuid,
            "title": title,
            "text_preview": text[:500],
            "source": "devonthink",
        },
    }

    endpoint = f"{QDRANT_URL}/collections/{COLLECTION}/points"
    response = requests.put(endpoint, json={"points": [point]}, timeout=30)
    response.raise_for_status()
    logger.info(f"Qdrant 저장: {doc_uuid} ({title[:30]})")
def run(doc_uuid: str):
    """Embed a single DEVONthink document and store the result in Qdrant.

    Steps: load credentials, read the document's title and text, request an
    embedding from the GPU server, and store the point. Processing is skipped
    with a warning when ``GPU_SERVER_IP`` is not configured or the text is
    shorter than 10 characters. Exceptions raised while processing are logged
    with a traceback and not re-raised.

    Args:
        doc_uuid: UUID of the DEVONthink record to process.
    """
    logger.info(f"임베딩 처리 시작: {doc_uuid}")

    gpu_ip = load_credentials().get("GPU_SERVER_IP")
    if not gpu_ip:
        logger.warning("GPU_SERVER_IP 미설정 — 임베딩 건너뜀")
        return

    try:
        title, text = get_document_text(doc_uuid)

        # Too little text to produce a meaningful embedding.
        if not text or len(text) < 10:
            logger.warning(f"텍스트 부족 [{doc_uuid}]: {len(text)}")
            return

        embedding = get_embedding(text, gpu_ip)
        if not embedding:
            logger.error(f"임베딩 실패: {doc_uuid}")
            return

        store_in_qdrant(doc_uuid, title, text, embedding)
        logger.info(f"임베딩 완료: {doc_uuid}")
    except Exception as e:
        logger.error(f"임베딩 처리 에러 [{doc_uuid}]: {e}", exc_info=True)
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the DEVONthink
    # UUID of the document to embed; any further arguments are ignored.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("사용법: python3 embed_to_qdrant.py <DEVONthink_UUID>")
        sys.exit(1)
    run(cli_args[0])