- Implement kordoc /parse endpoint (HWP/HWPX/PDF via kordoc lib, text files direct read, images flagged for OCR) - Add queue consumer with APScheduler (1min interval, stage chaining extract→classify→embed, stale item recovery, retry logic) - Add extract worker (kordoc HTTP call + direct text read) - Add classify worker (Qwen3.5 AI classification with think-tag stripping and robust JSON extraction from AI responses) - Add embed worker (GPU server nomic-embed-text, graceful failure) - Add DEVONthink migration script with folder mapping for 16 DBs, dry-run mode, batch commits, and idempotent file_path UNIQUE - Enhance ai/client.py with strip_thinking() and parse_json_response() Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
45 lines
1.5 KiB
Python
45 lines
1.5 KiB
Python
"""벡터 임베딩 워커 — GPU 서버 nomic-embed-text 호출"""
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from ai.client import AIClient
|
|
from core.utils import setup_logger
|
|
from models.document import Document
|
|
|
|
logger = setup_logger("embed_worker")
|
|
|
|
# Maximum length of the text used for embedding (nomic-embed-text: 8192 tokens).
# Applied as a character-count slice of the document's extracted text.
|
|
MAX_EMBED_TEXT = 6000
|
|
# Model version label recorded on a document when its embedding is stored.
EMBED_MODEL_VERSION = "nomic-embed-text-v1.5"
|
|
|
|
|
|
async def process(document_id: int, session: AsyncSession) -> None:
    """Generate and store the vector embedding for one document.

    The document is loaded through ``session``. The embedding input is the
    document title followed by the first ``MAX_EMBED_TEXT`` characters of
    its extracted text. On success the vector, the model version label and
    a UTC timestamp are written onto the document object.

    This function does not commit the session itself.
    NOTE(review): presumably the caller commits — confirm.

    Args:
        document_id: Primary key of the document to embed.
        session: Async SQLAlchemy session used to load the document.

    Raises:
        ValueError: If the document does not exist or has no extracted text.
    """
    document = await session.get(Document, document_id)
    if not document:
        raise ValueError(f"문서 ID {document_id}를 찾을 수 없음")
    if not document.extracted_text:
        raise ValueError(f"문서 ID {document_id}: extracted_text가 비어있음")

    # Title first, then the leading slice of the body; a missing title
    # contributes an empty string and the surrounding whitespace is stripped.
    payload = (
        f"{document.title or ''}\n\n{document.extracted_text[:MAX_EMBED_TEXT]}"
    ).strip()

    # Whitespace-only text with no title leaves nothing to embed: warn and skip.
    if not payload:
        logger.warning(f"[임베딩] document_id={document_id}: 빈 텍스트, 스킵")
        return

    ai_client = AIClient()
    try:
        embedding = await ai_client.embed(payload)
        document.embedding = embedding
        document.embed_model_version = EMBED_MODEL_VERSION
        document.embedded_at = datetime.now(timezone.utc)
        logger.info(f"[임베딩] document_id={document_id}: {len(embedding)}차원 벡터 생성")
    finally:
        # Always release the client, whether or not the embed call succeeded.
        await ai_client.close()
|