feat: implement Phase 1 data pipeline and migration
- Implement kordoc /parse endpoint (HWP/HWPX/PDF via kordoc lib, text files direct read, images flagged for OCR) - Add queue consumer with APScheduler (1min interval, stage chaining extract→classify→embed, stale item recovery, retry logic) - Add extract worker (kordoc HTTP call + direct text read) - Add classify worker (Qwen3.5 AI classification with think-tag stripping and robust JSON extraction from AI responses) - Add embed worker (GPU server nomic-embed-text, graceful failure) - Add DEVONthink migration script with folder mapping for 16 DBs, dry-run mode, batch commits, and idempotent file_path UNIQUE - Enhance ai/client.py with strip_thinking() and parse_json_response() Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
44
app/workers/embed_worker.py
Normal file
44
app/workers/embed_worker.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""벡터 임베딩 워커 — GPU 서버 nomic-embed-text 호출"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from ai.client import AIClient
|
||||
from core.utils import setup_logger
|
||||
from models.document import Document
|
||||
|
||||
logger = setup_logger("embed_worker")
|
||||
|
||||
# 임베딩용 텍스트 최대 길이 (nomic-embed-text: 8192 토큰)
|
||||
MAX_EMBED_TEXT = 6000
|
||||
EMBED_MODEL_VERSION = "nomic-embed-text-v1.5"
|
||||
|
||||
|
||||
async def process(document_id: int, session: AsyncSession) -> None:
|
||||
"""문서 벡터 임베딩 생성"""
|
||||
doc = await session.get(Document, document_id)
|
||||
if not doc:
|
||||
raise ValueError(f"문서 ID {document_id}를 찾을 수 없음")
|
||||
|
||||
if not doc.extracted_text:
|
||||
raise ValueError(f"문서 ID {document_id}: extracted_text가 비어있음")
|
||||
|
||||
# title + 본문 앞부분을 결합하여 임베딩 입력 생성
|
||||
title_part = doc.title or ""
|
||||
text_part = doc.extracted_text[:MAX_EMBED_TEXT]
|
||||
embed_input = f"{title_part}\n\n{text_part}".strip()
|
||||
|
||||
if not embed_input:
|
||||
logger.warning(f"[임베딩] document_id={document_id}: 빈 텍스트, 스킵")
|
||||
return
|
||||
|
||||
client = AIClient()
|
||||
try:
|
||||
vector = await client.embed(embed_input)
|
||||
doc.embedding = vector
|
||||
doc.embed_model_version = EMBED_MODEL_VERSION
|
||||
doc.embedded_at = datetime.now(timezone.utc)
|
||||
logger.info(f"[임베딩] document_id={document_id}: {len(vector)}차원 벡터 생성")
|
||||
finally:
|
||||
await client.close()
|
||||
Reference in New Issue
Block a user