feat: DEVONthink OCR 연동 — Surya OCR 전처리 + Smart Rule Step 0

- ocr_preprocess.py: DEVONthink UUID → 파일 추출 → GPU Surya OCR(:8400) 호출 → 텍스트 반환
- auto_classify.scpt: Step 0 OCR 감지 추가 (텍스트 없는 PDF/이미지 → Surya OCR → 본문 병합)
- 이미지/스캔 PDF 자동 감지: docType이 PDF/JPEG/PNG/TIFF이고 텍스트가 비어있는 경우
- OCR 실패 시 로그 기록 후 분류 진행 (graceful degradation)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-30 13:31:22 +09:00
parent 45cabc9aea
commit 5fc23e0dbd
2 changed files with 100 additions and 3 deletions
--- a/applescript/auto_classify.scpt
+++ b/applescript/auto_classify.scpt
@@ -1,21 +1,39 @@
 -- DEVONthink 4 Smart Rule: AI 자동 분류
-- Inbox DB 새 문서 → Ollama 분류 → 태그 + 메타데이터 + 도메인 DB 이동
+-- Inbox DB 새 문서 → OCR 전처리 → MLX 분류 → 태그 + 메타데이터 + 도메인 DB 이동 → Qdrant 임베딩
 -- Smart Rule 설정: Event = On Import, 조건 = Tags is empty
 on performSmartRule(theRecords)
 	tell application id "DNtp"
 		repeat with theRecord in theRecords
 			try
-				-- 1. 문서 텍스트 추출 (최대 4000자)
+				-- 0. OCR 전처리: 텍스트 없는 PDF/이미지 → Surya OCR
 				set docText to plain text of theRecord
 				set docUUID to uuid of theRecord
 				set docType to type of theRecord as string
 				if docText is "" then
 					if docType is in {"PDF Document", "JPEG image", "PNG image", "TIFF image"} then
 						set ocrScript to (POSIX path of (path to home folder)) & "Documents/code/DEVONThink_my server/venv/bin/python3"
 						set ocrPy to (POSIX path of (path to home folder)) & "Documents/code/DEVONThink_my server/scripts/ocr_preprocess.py"
 						try
 							set ocrText to do shell script ocrScript & " " & quoted form of ocrPy & " " & quoted form of docUUID
 							if length of ocrText > 0 then
 								set plain text of theRecord to ocrText
 								set docText to ocrText
 							end if
 						on error ocrErr
 							do shell script "echo '[OCR ERROR] " & ocrErr & "' >> ~/Documents/code/DEVONThink_my\\ server/logs/auto_classify.log"
 						end try
 					end if
 				end if
 				-- 1. 문서 텍스트 추출 (최대 4000자)
 				if length of docText > 4000 then
 					set docText to text 1 thru 4000 of docText
 				end if
 				if length of docText < 10 then
-					-- 텍스트가 너무 짧으면 건너뜀
+					-- OCR 후에도 텍스트가 부족하면 검토필요 태그
 					set tags of theRecord to {"@상태/검토필요"}
 					continue repeat
 				end if
--- a/scripts/ocr_preprocess.py
+++ b/scripts/ocr_preprocess.py
@@ -0,0 +1,79 @@
 #!/usr/bin/env python3
 """
 OCR preprocessing script.
 - Resolves a DEVONthink document UUID to its file path
 - Calls the Surya OCR API on the GPU server
 - Returns the OCR text (invoked from auto_classify.scpt)
 """
 import sys
 import requests
 from pathlib import Path
 # Put this script's own directory first on sys.path before importing
 # pkm_utils (presumably pkm_utils lives next to this file — confirm), so the
 # import resolves regardless of the caller's working directory.
 sys.path.insert(0, str(Path(__file__).parent))
 from pkm_utils import setup_logger, load_credentials, run_applescript_inline
 # Module-wide logger created by pkm_utils.setup_logger under the name "ocr".
 logger = setup_logger("ocr")
 def get_document_path(doc_uuid: str) -> str | None:
    """Look up the file path of a DEVONthink record by its UUID.

    The UUID is embedded in an AppleScript string literal, so backslashes and
    double quotes are escaped first; otherwise a stray quote in the argument
    would terminate the literal and change the script that gets executed.

    Args:
        doc_uuid: UUID of the DEVONthink record.

    Returns:
        Whatever ``run_applescript_inline`` returns for the lookup script
        (expected to be the record's POSIX path), or ``None`` if running the
        script raises.
    """
    # Escape the backslash first so the backslashes added for quotes are not
    # themselves doubled.
    safe_uuid = str(doc_uuid).replace("\\", "\\\\").replace('"', '\\"')
    script = f'''
    tell application id "DNtp"
        set theRecord to get record with uuid "{safe_uuid}"
        return POSIX path of (path of theRecord as POSIX file)
    end tell
    '''
    try:
        return run_applescript_inline(script)
    except Exception as e:
        # Best-effort lookup: log and let the caller treat it as "no file".
        logger.error(f"파일 경로 추출 실패 [{doc_uuid}]: {e}")
        return None
 def run_ocr(file_path: str, gpu_server_ip: str, langs: str = "ko,en,ja") -> str | None:
    """Upload a file to the Surya OCR endpoint on the GPU server.

    Args:
        file_path: Path of the file to upload.
        gpu_server_ip: Host of the OCR server; the request goes to port 8400.
        langs: Language codes sent as the ``langs`` form field.

    Returns:
        The ``text`` field of the JSON response (``""`` when the field is
        absent), or ``None`` if opening the file, the request, or response
        handling raises.
    """
    endpoint = f"http://{gpu_server_ip}:8400/ocr"
    try:
        with open(file_path, "rb") as handle:
            upload = {"file": (Path(file_path).name, handle)}
            form_fields = {"langs": langs}
            response = requests.post(
                endpoint,
                files=upload,
                data=form_fields,
                timeout=300,
            )
        # Treat HTTP error statuses as failures before parsing the body.
        response.raise_for_status()
        payload = response.json()
        ocr_text = payload.get("text", "")
        page_count = payload.get("total_pages", 0)
        logger.info(f"OCR 완료: {page_count}페이지, {len(ocr_text)}자")
    except Exception as exc:
        # Any failure is logged and reported to the caller as None.
        logger.error(f"OCR 실패 [{file_path}]: {exc}")
        return None
    return ocr_text
 def run(doc_uuid: str) -> str:
    """Run OCR for a single DEVONthink document and return its text.

    Args:
        doc_uuid: UUID of the DEVONthink record to process.

    Returns:
        The OCR text, or ``""`` when ``GPU_SERVER_IP`` is not configured, the
        file path cannot be resolved, or OCR produces nothing.
    """
    logger.info(f"OCR 처리 시작: {doc_uuid}")
    gpu_ip = load_credentials().get("GPU_SERVER_IP")
    if not gpu_ip:
        logger.warning("GPU_SERVER_IP 미설정 — OCR 건너뜀")
        return ""
    document_path = get_document_path(doc_uuid)
    if document_path:
        ocr_text = run_ocr(document_path, gpu_ip)
        if ocr_text:
            return ocr_text
    # Every remaining case (no path, failed or empty OCR) yields empty text.
    return ""
 if __name__ == "__main__":
    # Script entry point: the DEVONthink UUID is the first CLI argument.
    if len(sys.argv) < 2:
        # Usage goes to stderr so that stdout only ever carries OCR text,
        # which is what the calling AppleScript captures.
        print("사용법: python3 ocr_preprocess.py <DEVONthink_UUID>", file=sys.stderr)
        sys.exit(1)
    # Emit the OCR text (possibly empty) on stdout for the caller.
    result = run(sys.argv[1])
    print(result)