Files
hyungi_document_server/app/workers/embed_worker.py
Hyungi Ahn 299fac3904 feat: implement Phase 1 data pipeline and migration
- Implement kordoc /parse endpoint (HWP/HWPX/PDF via kordoc lib,
  text files direct read, images flagged for OCR)
- Add queue consumer with APScheduler (1min interval, stage chaining
  extract→classify→embed, stale item recovery, retry logic)
- Add extract worker (kordoc HTTP call + direct text read)
- Add classify worker (Qwen3.5 AI classification with think-tag
  stripping and robust JSON extraction from AI responses)
- Add embed worker (GPU server nomic-embed-text, graceful failure)
- Add DEVONthink migration script with folder mapping for 16 DBs,
  dry-run mode, batch commits, and idempotent file_path UNIQUE
- Enhance ai/client.py with strip_thinking() and parse_json_response()

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 14:35:36 +09:00

45 lines
1.5 KiB
Python

"""벡터 임베딩 워커 — GPU 서버 nomic-embed-text 호출"""
from datetime import datetime, timezone
from sqlalchemy.ext.asyncio import AsyncSession
from ai.client import AIClient
from core.utils import setup_logger
from models.document import Document
logger = setup_logger("embed_worker")
# 임베딩용 텍스트 최대 길이 (nomic-embed-text: 8192 토큰)
MAX_EMBED_TEXT = 6000
EMBED_MODEL_VERSION = "nomic-embed-text-v1.5"
async def process(document_id: int, session: AsyncSession) -> None:
"""문서 벡터 임베딩 생성"""
doc = await session.get(Document, document_id)
if not doc:
raise ValueError(f"문서 ID {document_id}를 찾을 수 없음")
if not doc.extracted_text:
raise ValueError(f"문서 ID {document_id}: extracted_text가 비어있음")
# title + 본문 앞부분을 결합하여 임베딩 입력 생성
title_part = doc.title or ""
text_part = doc.extracted_text[:MAX_EMBED_TEXT]
embed_input = f"{title_part}\n\n{text_part}".strip()
if not embed_input:
logger.warning(f"[임베딩] document_id={document_id}: 빈 텍스트, 스킵")
return
client = AIClient()
try:
vector = await client.embed(embed_input)
doc.embedding = vector
doc.embed_model_version = EMBED_MODEL_VERSION
doc.embedded_at = datetime.now(timezone.utc)
logger.info(f"[임베딩] document_id={document_id}: {len(vector)}차원 벡터 생성")
finally:
await client.close()