feat: implement Phase 1 data pipeline and migration
- Implement kordoc /parse endpoint (HWP/HWPX/PDF via kordoc lib, text files direct read, images flagged for OCR) - Add queue consumer with APScheduler (1min interval, stage chaining extract→classify→embed, stale item recovery, retry logic) - Add extract worker (kordoc HTTP call + direct text read) - Add classify worker (Qwen3.5 AI classification with think-tag stripping and robust JSON extraction from AI responses) - Add embed worker (GPU server nomic-embed-text, graceful failure) - Add DEVONthink migration script with folder mapping for 16 DBs, dry-run mode, batch commits, and idempotent file_path UNIQUE - Enhance ai/client.py with strip_thinking() and parse_json_response() Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,11 +1,39 @@
|
||||
"""AI 추상화 레이어 — 통합 클라이언트. 기본값은 항상 Qwen3.5."""
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
||||
from core.config import settings
|
||||
|
||||
|
||||
def strip_thinking(text: str) -> str:
|
||||
"""Qwen3.5의 <think>...</think> 블록 제거"""
|
||||
return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
|
||||
|
||||
|
||||
def parse_json_response(raw: str) -> dict | None:
|
||||
"""AI 응답에서 JSON 객체 추출 (think 태그, 코드블록 등 제거)"""
|
||||
cleaned = strip_thinking(raw)
|
||||
# 코드블록 내부 JSON 추출
|
||||
code_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", cleaned, re.DOTALL)
|
||||
if code_match:
|
||||
cleaned = code_match.group(1)
|
||||
# 마지막 유효 JSON 객체 찾기
|
||||
matches = list(re.finditer(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", cleaned, re.DOTALL))
|
||||
for m in reversed(matches):
|
||||
try:
|
||||
return json.loads(m.group())
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
# 최후 시도: 전체 텍스트를 JSON으로
|
||||
try:
|
||||
return json.loads(cleaned)
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
|
||||
# 프롬프트 로딩
|
||||
PROMPTS_DIR = Path(__file__).parent.parent / "prompts"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user