feat: DEVONthink OCR 연동 — Surya OCR 전처리 + Smart Rule Step 0

- ocr_preprocess.py: DEVONthink UUID → 파일 추출 → GPU Surya OCR(:8400) 호출 → 텍스트 반환
- auto_classify.scpt: Step 0 OCR 감지 추가 (텍스트 없는 PDF/이미지 → Surya OCR → 본문 병합)
- 이미지/스캔 PDF 자동 감지: docType이 PDF/JPEG/PNG/TIFF이고 텍스트가 비어있는 경우
- OCR 실패 시 로그 기록 후 분류 진행 (graceful degradation)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-30 13:31:22 +09:00
parent 45cabc9aea
commit 5fc23e0dbd
2 changed files with 100 additions and 3 deletions
--- a/applescript/auto_classify.scpt
+++ b/applescript/auto_classify.scpt
@@ -1,21 +1,39 @@
 -- DEVONthink 4 Smart Rule: AI 자동 분류
-- Inbox DB 새 문서 → Ollama 분류 → 태그 + 메타데이터 + 도메인 DB 이동
+-- Inbox DB 새 문서 → OCR 전처리 → MLX 분류 → 태그 + 메타데이터 + 도메인 DB 이동 → Qdrant 임베딩
 -- Smart Rule 설정: Event = On Import, 조건 = Tags is empty

 on performSmartRule(theRecords)
 	tell application id "DNtp"
 		repeat with theRecord in theRecords
 			try
-				-- 1. 문서 텍스트 추출 (최대 4000자)
+				-- 0. OCR 전처리: 텍스트 없는 PDF/이미지 → Surya OCR
 				set docText to plain text of theRecord
 				set docUUID to uuid of theRecord
+				set docType to type of theRecord as string

+				if docText is "" then
+					if docType is in {"PDF Document", "JPEG image", "PNG image", "TIFF image"} then
+						set ocrScript to (POSIX path of (path to home folder)) & "Documents/code/DEVONThink_my server/venv/bin/python3"
+						set ocrPy to (POSIX path of (path to home folder)) & "Documents/code/DEVONThink_my server/scripts/ocr_preprocess.py"
+						try
+							set ocrText to do shell script ocrScript & " " & quoted form of ocrPy & " " & quoted form of docUUID
+							if length of ocrText > 0 then
+								set plain text of theRecord to ocrText
+								set docText to ocrText
+							end if
+						on error ocrErr
+							do shell script "echo '[OCR ERROR] " & ocrErr & "' >> ~/Documents/code/DEVONThink_my\\ server/logs/auto_classify.log"
+						end try
+					end if
+				end if
+
+				-- 1. 문서 텍스트 추출 (최대 4000자)
 				if length of docText > 4000 then
 					set docText to text 1 thru 4000 of docText
 				end if

 				if length of docText < 10 then
-					-- 텍스트가 너무 짧으면 건너뜀
+					-- OCR 후에도 텍스트가 부족하면 검토필요 태그
 					set tags of theRecord to {"@상태/검토필요"}
 					continue repeat
 				end if
--- a/scripts/ocr_preprocess.py
+++ b/scripts/ocr_preprocess.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+"""
+OCR 전처리 스크립트
+- DEVONthink 문서 UUID로 파일 경로 추출
+- GPU 서버 Surya OCR API 호출
+- OCR 텍스트 반환 (auto_classify.scpt에서 호출)
+"""
+
+import sys
+import requests
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent))
+from pkm_utils import setup_logger, load_credentials, run_applescript_inline
+
+logger = setup_logger("ocr")
+
+
+def get_document_path(doc_uuid: str) -> str | None:
+    """Look up the file path of a DEVONthink record by its UUID.
+
+    The UUID is embedded in an AppleScript string literal, so backslashes and
+    double quotes are escaped first; otherwise a malformed or hostile value
+    could terminate the literal early and inject arbitrary AppleScript.
+
+    Args:
+        doc_uuid: UUID of the DEVONthink record.
+
+    Returns:
+        Whatever ``run_applescript_inline`` returns for the script (the
+        script asks DEVONthink for the record's POSIX path), or ``None``
+        when the call raises.
+    """
+    # Escape the backslash before the quote so the backslash introduced by
+    # the quote escape is not doubled afterwards.
+    safe_uuid = doc_uuid.replace("\\", "\\\\").replace('"', '\\"')
+    script = f'''
+    tell application id "DNtp"
+        set theRecord to get record with uuid "{safe_uuid}"
+        return POSIX path of (path of theRecord as POSIX file)
+    end tell
+    '''
+    try:
+        return run_applescript_inline(script)
+    except Exception as e:
+        # Best-effort lookup: log and let the caller skip OCR for this record.
+        logger.error(f"파일 경로 추출 실패 [{doc_uuid}]: {e}")
+        return None
+
+
+def run_ocr(file_path: str, gpu_server_ip: str, langs: str = "ko,en,ja") -> str | None:
+    """Upload a file to the Surya OCR endpoint on the GPU server.
+
+    The file is sent as a multipart upload to ``http://<gpu_server_ip>:8400/ocr``
+    together with the ``langs`` form field, with a 300 second timeout.
+
+    Args:
+        file_path: Path of the file to upload.
+        gpu_server_ip: Host part of the OCR endpoint URL.
+        langs: Value sent as the ``langs`` form field.
+
+    Returns:
+        The ``text`` field of the JSON response (empty string when the field
+        is absent), or ``None`` when any step raises.
+    """
+    endpoint = f"http://{gpu_server_ip}:8400/ocr"
+    try:
+        with open(file_path, "rb") as handle:
+            upload = {"file": (Path(file_path).name, handle)}
+            response = requests.post(
+                endpoint,
+                files=upload,
+                data={"langs": langs},
+                timeout=300,
+            )
+        response.raise_for_status()
+        payload = response.json()
+        ocr_text = payload.get("text", "")
+        page_count = payload.get("total_pages", 0)
+        logger.info(f"OCR 완료: {page_count}페이지, {len(ocr_text)}자")
+    except Exception as e:
+        # Any failure (file access, HTTP, JSON shape) is logged and reported
+        # to the caller as None.
+        logger.error(f"OCR 실패 [{file_path}]: {e}")
+        return None
+    else:
+        return ocr_text
+
+
+def run(doc_uuid: str) -> str:
+    """Run OCR for a single DEVONthink document and return the text.
+
+    Returns an empty string when ``GPU_SERVER_IP`` is missing from the loaded
+    credentials, when the document path cannot be resolved, or when OCR
+    produces no text.
+    """
+    logger.info(f"OCR 처리 시작: {doc_uuid}")
+
+    server_ip = load_credentials().get("GPU_SERVER_IP")
+    if not server_ip:
+        logger.warning("GPU_SERVER_IP 미설정 — OCR 건너뜀")
+        return ""
+
+    document_path = get_document_path(doc_uuid)
+    if document_path:
+        # run_ocr reports failure as None; normalise that to an empty string.
+        return run_ocr(document_path, server_ip) or ""
+    return ""
+
+
+if __name__ == "__main__":
+    # stdout is the data channel: the calling AppleScript stores this script's
+    # output as the document text, so diagnostics go to stderr instead.
+    if len(sys.argv) < 2:
+        print("사용법: python3 ocr_preprocess.py <DEVONthink_UUID>", file=sys.stderr)
+        sys.exit(1)
+    result = run(sys.argv[1])
+    print(result)