# hyungi_document_server/scripts/embed_to_qdrant.py

#!/usr/bin/env python3
"""
벡터 임베딩 스크립트
- DEVONthink 문서 UUID로 텍스트 추출
- GPU 서버(bge-m3)로 임베딩 생성
- Qdrant에 저장
"""

import sys
import uuid as uuid_mod
import requests
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))
from pkm_utils import setup_logger, load_credentials, run_applescript_inline

# Module-level logger created by the project's setup_logger helper under the name "embed".
logger = setup_logger("embed")

# Base URL of the Qdrant HTTP API that store_in_qdrant sends its upsert request to.
QDRANT_URL = "http://localhost:6333"
# Name of the Qdrant collection that receives the document points.
COLLECTION = "pkm_documents"


def get_document_text(doc_uuid: str) -> tuple[str, str]:
    """Fetch a DEVONthink record's title and plain text by UUID.

    Args:
        doc_uuid: UUID of the DEVONthink record to read.

    Returns:
        A ``(title, text)`` tuple. ``text`` is an empty string when the
        AppleScript output does not contain the ``|||`` separator.

    Note:
        The title and body are joined with ``|||`` on the AppleScript side and
        split on the first occurrence here, so a title that itself contains
        ``|||`` is cut at that point and the remainder lands in ``text``.
    """
    # Escape backslashes first, then double quotes, so the UUID cannot close
    # the AppleScript string literal and smuggle in extra statements.
    # Well-formed UUIDs contain neither character and pass through unchanged.
    safe_uuid = doc_uuid.replace("\\", "\\\\").replace('"', '\\"')
    script = f'''
    tell application id "DNtp"
        set theRecord to get record with uuid "{safe_uuid}"
        set docText to plain text of theRecord
        set docTitle to name of theRecord
        return docTitle & "|||" & docText
    end tell
    '''
    result = run_applescript_inline(script)
    # partition() splits on the first separator only and yields "" for the
    # body when the separator is absent.
    title, _, text = result.partition("|||")
    return title, text


def get_embedding(text: str, gpu_server_ip: str) -> list[float] | None:
    """Request a bge-m3 embedding for *text* from the GPU server.

    Posts to ``http://<gpu_server_ip>:11434/api/embed`` (presumably an
    Ollama-compatible endpoint; confirm against the server setup) with a
    60-second timeout. Only the first 8000 characters of *text* are sent.

    Args:
        text: Document text to embed.
        gpu_server_ip: Host or IP address of the embedding server.

    Returns:
        The first vector of the response's ``embeddings`` field, or ``None``
        when that field is missing/empty or when any exception occurs (the
        exception is logged, not raised).
    """
    endpoint = f"http://{gpu_server_ip}:11434/api/embed"
    try:
        # Built inside the try so that a failure while preparing the request
        # is logged and reported as None as well.
        request_body = {"model": "bge-m3", "input": [text[:8000]]}
        response = requests.post(endpoint, json=request_body, timeout=60)
        response.raise_for_status()
        vectors = response.json().get("embeddings")
        if not vectors:
            return None
        return vectors[0]
    except Exception as e:
        logger.error(f"임베딩 생성 실패: {e}")
        return None


def store_in_qdrant(doc_uuid: str, title: str, text: str, embedding: list[float]):
    """Upsert a single document point into the Qdrant collection.

    Sends a PUT to ``{QDRANT_URL}/collections/{COLLECTION}/points`` with a
    30-second timeout and logs the stored UUID and a title prefix on success.

    Args:
        doc_uuid: DEVONthink UUID of the document; stored in the payload and
            used to derive the point ID.
        title: Document title stored in the payload.
        text: Document text; only its first 500 characters are stored, as
            ``text_preview``.
        embedding: Vector stored for the point.

    Raises:
        requests.HTTPError: If Qdrant answers with an error status.
    """
    # Deterministic integer point ID: the upper 64 bits of the name-based
    # UUIDv5 of the document UUID, so the same document always maps to the
    # same point.
    derived_uuid = uuid_mod.uuid5(uuid_mod.NAMESPACE_URL, doc_uuid)
    point = {
        "id": derived_uuid.int >> 64,
        "vector": embedding,
        "payload": {
            "uuid": doc_uuid,
            "title": title,
            "text_preview": text[:500],
            "source": "devonthink",
        },
    }

    endpoint = f"{QDRANT_URL}/collections/{COLLECTION}/points"
    response = requests.put(endpoint, json={"points": [point]}, timeout=30)
    response.raise_for_status()
    logger.info(f"Qdrant 저장: {doc_uuid} ({title[:30]})")


def run(doc_uuid: str):
    """Embed one DEVONthink document and store the result in Qdrant.

    Steps: read ``GPU_SERVER_IP`` from the loaded credentials, fetch the
    document's title and text, request an embedding, and upsert it. The
    function returns early (after logging) when the GPU server address is not
    configured, when the text is empty or shorter than 10 characters, or when
    no embedding was produced. Exceptions raised during the fetch/embed/store
    steps are logged with a traceback and not propagated.

    Args:
        doc_uuid: UUID of the DEVONthink record to process.
    """
    logger.info(f"임베딩 처리 시작: {doc_uuid}")

    gpu_ip = load_credentials().get("GPU_SERVER_IP")
    if not gpu_ip:
        logger.warning("GPU_SERVER_IP 미설정 — 임베딩 건너뜀")
        return

    try:
        title, body = get_document_text(doc_uuid)
        # Skip documents with no usable text.
        has_enough_text = bool(body) and len(body) >= 10
        if not has_enough_text:
            logger.warning(f"텍스트 부족 [{doc_uuid}]: {len(body)}자")
            return

        vector = get_embedding(body, gpu_ip)
        if not vector:
            logger.error(f"임베딩 실패: {doc_uuid}")
            return

        store_in_qdrant(doc_uuid, title, body, vector)
        logger.info(f"임베딩 완료: {doc_uuid}")
    except Exception as e:
        logger.error(f"임베딩 처리 에러 [{doc_uuid}]: {e}", exc_info=True)


if __name__ == "__main__":
    # The first command-line argument is the DEVONthink UUID; without it,
    # print the usage line and exit with status 1.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("사용법: python3 embed_to_qdrant.py <DEVONthink_UUID>")
        sys.exit(1)
    run(cli_args[0])