#!/usr/bin/env python3 """ DEVONthink 전체 문서 배치 임베딩 - DB별 순차 처리, 500건씩 AppleScript 배치 텍스트 추출 - GPU bge-m3 배치 임베딩 (32건/호출) - Qdrant 배치 upsert (100건/호출) - --sync: 삭제된 문서 Qdrant 정리 - --force: 전체 재임베딩 - --db: 특정 DB만 처리 사용법: python3 batch_embed.py # 신규 문서만 python3 batch_embed.py --sync # 신규 + 삭제 동기화 python3 batch_embed.py --force # 전체 재임베딩 python3 batch_embed.py --db "04_Industrial safety" """ import argparse import sys import uuid as uuid_mod import time import requests from pathlib import Path from datetime import datetime sys.path.insert(0, str(Path(__file__).parent)) from pkm_utils import setup_logger, load_credentials, run_applescript_inline logger = setup_logger("batch_embed") QDRANT_URL = "http://localhost:6333" COLLECTION = "pkm_documents" EMBED_BATCH_SIZE = 32 QDRANT_BATCH_SIZE = 100 APPLESCRIPT_CHUNK = 500 # --- GPU 헬스체크 --- def check_gpu_health(gpu_ip: str) -> bool: """GPU bge-m3 API ping""" try: resp = requests.post( f"http://{gpu_ip}:11434/api/embed", json={"model": "bge-m3", "input": ["test"]}, timeout=10, ) return resp.status_code == 200 except Exception: return False # --- Qdrant --- def get_existing_uuids_from_qdrant() -> set[str]: """Qdrant에 이미 저장된 UUID 집합 조회""" uuids = set() offset = None while True: body = {"limit": 1000, "with_payload": {"include": ["uuid"]}} if offset: body["offset"] = offset resp = requests.post( f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll", json=body, timeout=30, ) resp.raise_for_status() result = resp.json()["result"] points = result.get("points", []) for p in points: uuid_val = p.get("payload", {}).get("uuid") if uuid_val: uuids.add(uuid_val) offset = result.get("next_page_offset") if not offset or not points: break return uuids def delete_from_qdrant(point_ids: list[int]): """Qdrant에서 포인트 삭제""" if not point_ids: return resp = requests.post( f"{QDRANT_URL}/collections/{COLLECTION}/points/delete", json={"points": point_ids}, timeout=30, ) resp.raise_for_status() def uuid_to_point_id(doc_uuid: str) -> int: return uuid_mod.uuid5(uuid_mod.NAMESPACE_URL, doc_uuid).int >> 64 def store_batch_in_qdrant(docs: list[dict]): """Qdrant 배치 upsert""" if not docs: return points = [] for doc in docs: points.append({ "id": uuid_to_point_id(doc["uuid"]), "vector": doc["embedding"], "payload": { "uuid": doc["uuid"], "title": doc["title"], "db_name": doc.get("db_name", ""), "text_preview": doc.get("text", "")[:200], "source": "devonthink", "embedded_at": datetime.now().isoformat(), }, }) for i in range(0, len(points), QDRANT_BATCH_SIZE): batch = points[i:i + QDRANT_BATCH_SIZE] resp = requests.put( f"{QDRANT_URL}/collections/{COLLECTION}/points", json={"points": batch}, timeout=60, ) resp.raise_for_status() # --- GPU 임베딩 --- def get_embeddings_batch(texts: list[str], gpu_ip: str) -> list[list[float]]: """GPU bge-m3 배치 임베딩 (4000자 제한 — bge-m3 토큰 한도 고려)""" truncated = [t[:4000] for t in texts] resp = requests.post( f"http://{gpu_ip}:11434/api/embed", json={"model": "bge-m3", "input": truncated}, timeout=120, ) resp.raise_for_status() return resp.json().get("embeddings", []) # --- DEVONthink 텍스트 추출 --- def get_db_names() -> list[str]: """DEVONthink DB 이름 목록""" script = ''' tell application id "DNtp" set dbNames to {} repeat with db in databases set end of dbNames to name of db end repeat set AppleScript's text item delimiters to linefeed return dbNames as text end tell ''' result = run_applescript_inline(script) return [n.strip() for n in result.split("\n") if n.strip()] def get_db_document_uuids(db_name: str) -> list[str]: """특정 DB의 임베딩 대상 UUID 목록 (그룹 제외, 텍스트 10자 이상)""" script = f''' tell application id "DNtp" set theDB to database "{db_name}" set allDocs to contents of theDB set output to {{}} repeat with rec in allDocs try set recType to type of rec as string if recType is not "group" then set recText to plain text of rec if length of recText > 10 then set end of output to uuid of rec end if end if end try end repeat set AppleScript's text item delimiters to linefeed return output as text end tell ''' try: result = run_applescript_inline(script) return [u.strip() for u in result.split("\n") if u.strip()] except Exception as e: logger.error(f"UUID 수집 실패 [{db_name}]: {e}") return [] def get_documents_batch(uuids: list[str]) -> list[dict]: """UUID 리스트로 배치 텍스트 추출 (AppleScript 1회 호출)""" if not uuids: return [] # UUID를 AppleScript 리스트로 변환 uuid_list = ", ".join(f'"{u}"' for u in uuids) script = f''' tell application id "DNtp" set uuidList to {{{uuid_list}}} set output to {{}} repeat with u in uuidList try set theRecord to get record with uuid u set recText to plain text of theRecord set recTitle to name of theRecord set recDB to name of database of theRecord if length of recText > 8000 then set recText to text 1 thru 8000 of recText end if set end of output to u & "|||" & recTitle & "|||" & recDB & "|||" & recText on error set end of output to u & "|||ERROR|||||||" end try end repeat set AppleScript's text item delimiters to linefeed & "<<<>>>" & linefeed return output as text end tell ''' try: result = run_applescript_inline(script) except Exception as e: logger.error(f"배치 텍스트 추출 실패: {e}") return [] docs = [] for entry in result.split("\n<<<>>>\n"): entry = entry.strip() if not entry or "|||ERROR|||" in entry: continue parts = entry.split("|||", 3) if len(parts) >= 4: text = parts[3].strip() if len(text) >= 10: docs.append({ "uuid": parts[0].strip(), "title": parts[1].strip(), "db_name": parts[2].strip(), "text": text, }) return docs # --- 메인 배치 --- def run_batch(gpu_ip: str, target_db: str = None, force: bool = False, sync: bool = False): """배치 임베딩 실행""" # GPU 헬스체크 if not check_gpu_health(gpu_ip): logger.error(f"GPU 서버 연결 실패 ({gpu_ip}) — 종료") sys.exit(1) logger.info(f"GPU 서버 연결 확인: {gpu_ip}") # 기존 임베딩 UUID 조회 existing_uuids = set() if not force: existing_uuids = get_existing_uuids_from_qdrant() logger.info(f"Qdrant 기존 임베딩: {len(existing_uuids)}건") # DB 목록 db_names = [target_db] if target_db else get_db_names() logger.info(f"처리 대상 DB: {db_names}") total_embedded = 0 total_skipped = 0 total_failed = 0 all_dt_uuids = set() for db_name in db_names: logger.info(f"--- DB: {db_name} ---") # UUID 수집 uuids = get_db_document_uuids(db_name) all_dt_uuids.update(uuids) logger.info(f" 문서: {len(uuids)}건") # 기존 스킵 if not force: new_uuids = [u for u in uuids if u not in existing_uuids] skipped = len(uuids) - len(new_uuids) total_skipped += skipped if skipped > 0: logger.info(f" 스킵: {skipped}건 (이미 임베딩)") uuids = new_uuids if not uuids: continue # 500건씩 AppleScript 배치 텍스트 추출 for chunk_start in range(0, len(uuids), APPLESCRIPT_CHUNK): chunk_uuids = uuids[chunk_start:chunk_start + APPLESCRIPT_CHUNK] docs = get_documents_batch(chunk_uuids) if not docs: continue # 32건씩 GPU 임베딩 for batch_start in range(0, len(docs), EMBED_BATCH_SIZE): batch = docs[batch_start:batch_start + EMBED_BATCH_SIZE] texts = [d["text"] for d in batch] try: embeddings = get_embeddings_batch(texts, gpu_ip) if len(embeddings) != len(batch): logger.warning(f"임베딩 수 불일치: {len(embeddings)} != {len(batch)}") total_failed += len(batch) continue for doc, emb in zip(batch, embeddings): doc["embedding"] = emb store_batch_in_qdrant(batch) total_embedded += len(batch) except Exception as e: logger.error(f"배치 임베딩 실패: {e}") total_failed += len(batch) progress = chunk_start + len(chunk_uuids) logger.info(f" 진행: {progress}/{len(uuids)}") # --sync: 고아 포인트 삭제 orphan_deleted = 0 if sync and all_dt_uuids: orphan_uuids = existing_uuids - all_dt_uuids if orphan_uuids: orphan_ids = [uuid_to_point_id(u) for u in orphan_uuids] delete_from_qdrant(orphan_ids) orphan_deleted = len(orphan_uuids) logger.info(f"고아 포인트 삭제: {orphan_deleted}건") # 통계 logger.info("=== 배치 임베딩 완료 ===") logger.info(f" 임베딩: {total_embedded}건") logger.info(f" 스킵: {total_skipped}건") logger.info(f" 실패: {total_failed}건") if orphan_deleted: logger.info(f" 고아 삭제: {orphan_deleted}건") # Qdrant 최종 카운트 try: resp = requests.get(f"{QDRANT_URL}/collections/{COLLECTION}", timeout=10) count = resp.json()["result"]["points_count"] logger.info(f" Qdrant 총 포인트: {count}건") except Exception: pass if __name__ == "__main__": parser = argparse.ArgumentParser(description="DEVONthink 배치 임베딩") parser.add_argument("--force", action="store_true", help="전체 재임베딩") parser.add_argument("--sync", action="store_true", help="삭제 동기화 포함") parser.add_argument("--db", type=str, help="특정 DB만 처리") args = parser.parse_args() creds = load_credentials() gpu_ip = creds.get("GPU_SERVER_IP") if not gpu_ip: logger.error("GPU_SERVER_IP 미설정") sys.exit(1) run_batch(gpu_ip, target_db=args.db, force=args.force, sync=args.sync)