feat: implement Phase 1 data pipeline and migration

- Implement kordoc /parse endpoint (HWP/HWPX/PDF via kordoc lib, text files direct read, images flagged for OCR)
- Add queue consumer with APScheduler (1min interval, stage chaining extract→classify→embed, stale item recovery, retry logic)
- Add extract worker (kordoc HTTP call + direct text read)
- Add classify worker (Qwen3.5 AI classification with think-tag stripping and robust JSON extraction from AI responses)
- Add embed worker (GPU server nomic-embed-text, graceful failure)
- Add DEVONthink migration script with folder mapping for 16 DBs, dry-run mode, batch commits, and idempotent file_path UNIQUE
- Enhance ai/client.py with strip_thinking() and parse_json_response()

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 14:35:36 +09:00
parent 23ee055357
commit 299fac3904
9 changed files with 682 additions and 13 deletions
--- a/app/workers/extract_worker.py
+++ b/app/workers/extract_worker.py
@@ -0,0 +1,80 @@
+"""Text extraction worker — calls kordoc or reads the file directly."""
+
+import asyncio
+from datetime import datetime, timezone
+from pathlib import Path
+
+import httpx
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from core.config import settings
+from core.utils import setup_logger
+from models.document import Document
+
+logger = setup_logger("extract_worker")
+
+# Formats that are handed to the kordoc parser.
+KORDOC_FORMATS = {
+    "hwp",
+    "hwpx",
+    "pdf",
+}
+# Text formats whose content can be read directly from the file.
+TEXT_FORMATS = {
+    "csv",
+    "html",
+    "json",
+    "md",
+    "txt",
+    "xml",
+}
+# Image formats that need OCR (Phase 2).
+IMAGE_FORMATS = {
+    "bmp",
+    "gif",
+    "jpeg",
+    "jpg",
+    "png",
+    "tif",
+    "tiff",
+}
+
+# Extractor tag recorded on documents parsed through kordoc.
+EXTRACTOR_VERSION = "kordoc@1.7"
+
+
+def _store_extraction(doc: Document, text: str, version: str) -> None:
+    """Record an extraction result on *doc*.
+
+    Sets the extracted text, the extraction time (current UTC time) and the
+    extractor tag. Only the in-memory object is modified.
+    """
+    doc.extracted_text = text
+    doc.extracted_at = datetime.now(timezone.utc)
+    doc.extractor_version = version
+
+
+def _read_text_file(path: Path) -> str:
+    """Read *path* as UTF-8 text, replacing undecodable bytes.
+
+    Raises:
+        FileNotFoundError: If the file does not exist.
+    """
+    # EAFP: attempt the read instead of checking exists() first, so a file
+    # that disappears between the check and the read cannot slip through.
+    try:
+        return path.read_text(encoding="utf-8", errors="replace")
+    except FileNotFoundError:
+        raise FileNotFoundError(f"파일 없음: {path}") from None
+
+
+def _kordoc_error_detail(resp: httpx.Response) -> str:
+    """Return the ``error`` field of a kordoc error reply, or ``"unknown"``.
+
+    Falls back to ``"unknown"`` when the body is not JSON or is not a JSON
+    object, so the caller can always report the parse failure itself.
+    """
+    try:
+        payload = resp.json()
+    except ValueError:  # json.JSONDecodeError is a ValueError subclass
+        return "unknown"
+    if not isinstance(payload, dict):
+        return "unknown"
+    return str(payload.get("error", "unknown"))
+
+
+async def process(document_id: int, session: AsyncSession) -> None:
+    """Extract the text of one document and store it on its ``Document`` object.
+
+    The strategy is selected by the lower-cased ``doc.file_format``:
+
+    * ``TEXT_FORMATS``: the file below ``settings.nas_mount_path`` is read
+      directly as UTF-8 (undecodable bytes are replaced).
+    * ``IMAGE_FORMATS``: nothing is extracted; an empty text is stored with
+      the tag ``skip_image`` (OCR is planned for Phase 2).
+    * ``KORDOC_FORMATS``: ``{settings.kordoc_endpoint}/parse`` is called with
+      ``{"filePath": "/documents/<file_path>"}`` and the ``markdown`` field of
+      the JSON reply is stored (a missing or null field becomes ``""``).
+    * anything else: an empty text is stored with ``unsupported_<format>``.
+
+    Only attributes of the loaded object are set here; this function does not
+    flush or commit the session.
+
+    Args:
+        document_id: Primary key of the ``Document`` to process.
+        session: Async SQLAlchemy session used to load the document.
+
+    Raises:
+        ValueError: If the document does not exist, or kordoc answers 422.
+        FileNotFoundError: If the text file is missing, or kordoc answers 404.
+        httpx.HTTPStatusError: For any other error status, raised by
+            ``raise_for_status()``.
+    """
+    doc = await session.get(Document, document_id)
+    if not doc:
+        raise ValueError(f"문서 ID {document_id}를 찾을 수 없음")
+
+    fmt = doc.file_format.lower()
+
+    # Text files: read directly.
+    if fmt in TEXT_FORMATS:
+        full_path = Path(settings.nas_mount_path) / doc.file_path
+        # Filesystem access is blocking; run it in the default executor so
+        # the event loop is not stalled while the file is read.
+        loop = asyncio.get_running_loop()
+        text = await loop.run_in_executor(None, _read_text_file, full_path)
+        _store_extraction(doc, text, "direct_read")
+        logger.info(f"[텍스트] {doc.file_path} ({len(text)}자)")
+        return
+
+    # Images: skipped until OCR is implemented (Phase 2).
+    if fmt in IMAGE_FORMATS:
+        _store_extraction(doc, "", "skip_image")
+        logger.info(f"[이미지] {doc.file_path} — OCR 미구현, 스킵")
+        return
+
+    # HWP/HWPX/PDF: parsed by kordoc.
+    if fmt in KORDOC_FORMATS:
+        # Path as seen inside the container: /documents/{file_path}
+        container_path = f"/documents/{doc.file_path}"
+        async with httpx.AsyncClient(timeout=60) as client:
+            resp = await client.post(
+                f"{settings.kordoc_endpoint}/parse",
+                json={"filePath": container_path},
+            )
+
+        if resp.status_code == 404:
+            raise FileNotFoundError(f"kordoc: 파일 없음 — {container_path}")
+        if resp.status_code == 422:
+            raise ValueError(f"kordoc: 파싱 실패 — {_kordoc_error_detail(resp)}")
+        resp.raise_for_status()
+
+        # "markdown" may be absent or null; normalise to "" so the stored
+        # text is always a string and len() below cannot fail.
+        markdown = resp.json().get("markdown") or ""
+        _store_extraction(doc, markdown, EXTRACTOR_VERSION)
+        logger.info(f"[kordoc] {doc.file_path} ({len(markdown)}자)")
+        return
+
+    # Unsupported format: store an empty text and tag the format.
+    _store_extraction(doc, "", f"unsupported_{fmt}")
+    logger.warning(f"[미지원] {doc.file_path} (format={fmt})")