From 5fc23e0dbd8642d8e18befad163c96b6e8069d98 Mon Sep 17 00:00:00 2001
From: hyungi
Date: Mon, 30 Mar 2026 13:31:22 +0900
Subject: [PATCH] =?UTF-8?q?feat:=20DEVONthink=20OCR=20=EC=97=B0=EB=8F=99?=
 =?UTF-8?q?=20=E2=80=94=20Surya=20OCR=20=EC=A0=84=EC=B2=98=EB=A6=AC=20+=20?=
 =?UTF-8?q?Smart=20Rule=20Step=200?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- ocr_preprocess.py: DEVONthink UUID → 파일 추출 → GPU Surya OCR(:8400) 호출 → 텍스트 반환
- auto_classify.scpt: Step 0 OCR 감지 추가 (텍스트 없는 PDF/이미지 → Surya OCR → 본문 병합)
- 이미지/스캔 PDF 자동 감지: docType이 PDF/JPEG/PNG/TIFF이고 텍스트가 비어있는 경우
- OCR 실패 시 로그 기록 후 분류 진행 (graceful degradation)

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 applescript/auto_classify.scpt | 26 ++++++++-
 scripts/ocr_preprocess.py      | 93 ++++++++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+), 3 deletions(-)
 create mode 100644 scripts/ocr_preprocess.py

diff --git a/applescript/auto_classify.scpt b/applescript/auto_classify.scpt
index 057a977..49c2820 100644
--- a/applescript/auto_classify.scpt
+++ b/applescript/auto_classify.scpt
@@ -1,21 +1,41 @@
 -- DEVONthink 4 Smart Rule: AI 자동 분류
--- Inbox DB 새 문서 → Ollama 분류 → 태그 + 메타데이터 + 도메인 DB 이동
+-- Inbox DB new document → OCR preprocessing → MLX classification → tags + metadata + move to domain DB → Qdrant embedding
 -- Smart Rule 설정: Event = On Import, 조건 = Tags is empty
 
 on performSmartRule(theRecords)
     tell application id "DNtp"
         repeat with theRecord in theRecords
             try
-                -- 1. 문서 텍스트 추출 (최대 4000자)
+                -- 0. OCR preprocessing: PDF/image with no text → Surya OCR
                 set docText to plain text of theRecord
                 set docUUID to uuid of theRecord
+                set docType to type of theRecord as string
+                if docText is "" then
+                    if docType is in {"PDF Document", "JPEG image", "PNG image", "TIFF image"} then
+                        set ocrScript to (POSIX path of (path to home folder)) & "Documents/code/DEVONThink_my server/venv/bin/python3"
+                        set ocrPy to (POSIX path of (path to home folder)) & "Documents/code/DEVONThink_my server/scripts/ocr_preprocess.py"
+                        try
+                            -- Quote the interpreter path as well: the project directory name contains a space
+                            set ocrText to do shell script quoted form of ocrScript & " " & quoted form of ocrPy & " " & quoted form of docUUID
+                            if length of ocrText > 0 then
+                                set plain text of theRecord to ocrText
+                                set docText to ocrText
+                            end if
+                        on error ocrErr
+                            -- quoted form keeps quotes inside the error message from breaking the shell command
+                            do shell script "echo " & quoted form of ("[OCR ERROR] " & ocrErr) & " >> ~/Documents/code/DEVONThink_my\\ server/logs/auto_classify.log"
+                        end try
+                    end if
+                end if
+
+                -- 1. Extract document text (up to 4000 characters)
 
                 if length of docText > 4000 then
                     set docText to text 1 thru 4000 of docText
                 end if
 
                 if length of docText < 10 then
-                    -- 텍스트가 너무 짧으면 건너뜀
+                    -- Still too little text even after OCR: tag the record for review
                     set tags of theRecord to {"@상태/검토필요"}
                     continue repeat
                 end if
diff --git a/scripts/ocr_preprocess.py b/scripts/ocr_preprocess.py
new file mode 100644
index 0000000..63cb4d4
--- /dev/null
+++ b/scripts/ocr_preprocess.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+"""
+OCR preprocessing script.
+
+- Resolve a DEVONthink document UUID to the document's file path
+- Call the Surya OCR API on the GPU server
+- Print the OCR text to stdout (invoked from auto_classify.scpt)
+"""
+
+import sys
+import requests
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent))
+from pkm_utils import setup_logger, load_credentials, run_applescript_inline
+
+logger = setup_logger("ocr")
+
+
+def get_document_path(doc_uuid: str) -> str | None:
+    """Ask DEVONthink for the file path of the record with this UUID.
+
+    Returns None, after logging the error, when the AppleScript call raises.
+    """
+    # Escape backslashes and double quotes so the UUID cannot terminate the
+    # AppleScript string literal it is interpolated into.
+    safe_uuid = doc_uuid.replace("\\", "\\\\").replace('"', '\\"')
+    script = f'''
+    tell application id "DNtp"
+        set theRecord to get record with uuid "{safe_uuid}"
+        return POSIX path of (path of theRecord as POSIX file)
+    end tell
+    '''
+    try:
+        return run_applescript_inline(script)
+    except Exception as e:
+        logger.error(f"파일 경로 추출 실패 [{doc_uuid}]: {e}")
+        return None
+
+
+def run_ocr(file_path: str, gpu_server_ip: str, langs: str = "ko,en,ja") -> str | None:
+    """Upload the file to the Surya OCR API and return the recognized text.
+
+    The request is an HTTP POST to port 8400 of gpu_server_ip, with langs sent
+    as a form field. Returns None, after logging the error, on any failure.
+    """
+    url = f"http://{gpu_server_ip}:8400/ocr"
+    try:
+        with open(file_path, "rb") as f:
+            resp = requests.post(
+                url,
+                files={"file": (Path(file_path).name, f)},
+                data={"langs": langs},
+                timeout=300,
+            )
+        resp.raise_for_status()
+        result = resp.json()
+        # A JSON null for "text" would make len() below raise; treat it as empty.
+        text = result.get("text") or ""
+        pages = result.get("total_pages", 0)
+        logger.info(f"OCR 완료: {pages}페이지, {len(text)}자")
+        return text
+    except Exception as e:
+        # Deliberately broad: OCR is best-effort and run() falls back to "".
+        logger.error(f"OCR 실패 [{file_path}]: {e}")
+        return None
+
+
+def run(doc_uuid: str) -> str:
+    """Run OCR for a single document; return its text or an empty string."""
+    logger.info(f"OCR 처리 시작: {doc_uuid}")
+
+    creds = load_credentials()
+    gpu_ip = creds.get("GPU_SERVER_IP")
+    if not gpu_ip:
+        logger.warning("GPU_SERVER_IP 미설정 — OCR 건너뜀")
+        return ""
+
+    file_path = get_document_path(doc_uuid)
+    if not file_path:
+        return ""
+
+    text = run_ocr(file_path, gpu_ip)
+    return text or ""
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        # stdout is consumed by the caller as OCR text, so usage goes to stderr.
+        print("사용법: python3 ocr_preprocess.py DOC_UUID", file=sys.stderr)
+        sys.exit(1)
+    result = run(sys.argv[1])
+    print(result)