#!/usr/bin/env python3
"""Vector-embedding script.

Processes a single DEVONthink document:

- extracts the document's title and plain text from DEVONthink by UUID,
- generates an embedding with the bge-m3 model on the GPU server,
- stores the vector and a small payload in Qdrant.
"""

import sys
import uuid as uuid_mod
from pathlib import Path

import requests

# Make the sibling ``pkm_utils`` module importable when this file is run
# directly as a script; this must happen before the import below.
sys.path.insert(0, str(Path(__file__).parent))
from pkm_utils import setup_logger, load_credentials, run_applescript_inline

logger = setup_logger("embed")

QDRANT_URL = "http://localhost:6333"
COLLECTION = "pkm_documents"

# Embedding request settings.
EMBED_MODEL = "bge-m3"
EMBED_PORT = 11434
EMBED_MAX_CHARS = 8000  # input is truncated to this many characters
EMBED_TIMEOUT = 60  # seconds

# Qdrant request / payload settings.
QDRANT_TIMEOUT = 30  # seconds
PREVIEW_CHARS = 500  # length of the text preview stored in the payload

# Documents with less text than this are skipped.
MIN_TEXT_CHARS = 10

# Separator the AppleScript puts between the title and the text.
# NOTE(review): a title that itself contains this sequence would be split in
# the wrong place.
_FIELD_SEPARATOR = "|||"


def _applescript_quote(value: str) -> str:
    """Escape *value* for use inside a double-quoted AppleScript string."""
    # Backslashes first, so the backslashes added for quotes are not doubled.
    return value.replace("\\", "\\\\").replace('"', '\\"')


def get_document_text(doc_uuid: str) -> tuple[str, str]:
    """Fetch a document's title and plain text from DEVONthink by UUID.

    Args:
        doc_uuid: UUID of the DEVONthink record.

    Returns:
        A ``(title, text)`` tuple. ``text`` is an empty string when the
        script output does not contain the field separator.
    """
    # The UUID is interpolated into an AppleScript string literal, so quotes
    # and backslashes must be escaped to keep the script well-formed.
    safe_uuid = _applescript_quote(doc_uuid)
    script = f'''
    tell application id "DNtp"
        set theRecord to get record with uuid "{safe_uuid}"
        set docText to plain text of theRecord
        set docTitle to name of theRecord
        return docTitle & "{_FIELD_SEPARATOR}" & docText
    end tell
    '''
    result = run_applescript_inline(script)
    # Split on the first separator only; the text may contain it again.
    title, _, text = result.partition(_FIELD_SEPARATOR)
    return title, text


def get_embedding(text: str, gpu_server_ip: str) -> list[float] | None:
    """Generate an embedding for *text* with bge-m3 on the GPU server.

    Args:
        text: Document text; only the first ``EMBED_MAX_CHARS`` characters
            are sent.
        gpu_server_ip: Host name or IP address of the GPU server.

    Returns:
        The embedding vector, or ``None`` when the request fails or the
        response contains no embeddings. Failures are logged, not raised.
    """
    url = f"http://{gpu_server_ip}:{EMBED_PORT}/api/embed"
    request_body = {
        "model": EMBED_MODEL,
        "input": [text[:EMBED_MAX_CHARS]],
    }
    try:
        resp = requests.post(url, json=request_body, timeout=EMBED_TIMEOUT)
        resp.raise_for_status()
        embeddings = resp.json().get("embeddings")
        return embeddings[0] if embeddings else None
    except Exception as e:
        # Deliberately broad: callers rely on "None means failure" rather
        # than on exceptions from this function.
        logger.error(f"임베딩 생성 실패: {e}")
        return None


def store_in_qdrant(
    doc_uuid: str, title: str, text: str, embedding: list[float]
) -> None:
    """Store one document's embedding and payload in Qdrant.

    Args:
        doc_uuid: UUID of the DEVONthink record; stored in the payload and
            used to derive the point id.
        title: Document title.
        text: Document text; only the first ``PREVIEW_CHARS`` characters are
            stored as a preview.
        embedding: Embedding vector for the document.

    Raises:
        requests.HTTPError: If Qdrant answers with an error status.
        requests.RequestException: If the request itself fails.
    """
    # Derive a deterministic integer point id from the UUID string: uuid5 is
    # 128 bits wide, and the upper 64 bits are kept. The same document
    # therefore always maps to the same id.
    point_id = uuid_mod.uuid5(uuid_mod.NAMESPACE_URL, doc_uuid).int >> 64
    point = {
        "id": point_id,
        "vector": embedding,
        "payload": {
            "uuid": doc_uuid,
            "title": title,
            "text_preview": text[:PREVIEW_CHARS],
            "source": "devonthink",
        },
    }
    resp = requests.put(
        f"{QDRANT_URL}/collections/{COLLECTION}/points",
        json={"points": [point]},
        timeout=QDRANT_TIMEOUT,
    )
    resp.raise_for_status()
    logger.info(f"Qdrant 저장: {doc_uuid} ({title[:30]})")


def run(doc_uuid: str):
    """Embed a single document and store the result in Qdrant.

    Every failure is logged instead of raised, so this function returns
    normally whether or not the document was embedded.

    Args:
        doc_uuid: UUID of the DEVONthink record to process.
    """
    logger.info(f"임베딩 처리 시작: {doc_uuid}")

    creds = load_credentials()
    gpu_ip = creds.get("GPU_SERVER_IP")
    if not gpu_ip:
        logger.warning("GPU_SERVER_IP 미설정 — 임베딩 건너뜀")
        return

    try:
        title, text = get_document_text(doc_uuid)
        if len(text) < MIN_TEXT_CHARS:
            logger.warning(f"텍스트 부족 [{doc_uuid}]: {len(text)}자")
            return

        embedding = get_embedding(text, gpu_ip)
        if not embedding:
            logger.error(f"임베딩 실패: {doc_uuid}")
            return

        store_in_qdrant(doc_uuid, title, text, embedding)
        logger.info(f"임베딩 완료: {doc_uuid}")
    except Exception as e:
        # Top-level boundary for one document: log with traceback and return.
        logger.error(f"임베딩 처리 에러 [{doc_uuid}]: {e}", exc_info=True)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # The argument placeholder was missing from the usage message.
        print("사용법: python3 embed_to_qdrant.py <uuid>")
        sys.exit(1)
    run(sys.argv[1])