From f8f72ceae20e6ddb85282e575c839a1b1cff151e Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Thu, 23 Apr 2026 13:52:19 +0900 Subject: [PATCH] fix(ocr): Surya 0.17 API + NFC/NFD path normalize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - services/ocr/server.py: surya 0.17.x predictors 기반으로 재작성 (구 `from surya.ocr import run_ocr` 제거됨 → import error → 빈 텍스트 반환) - NFC(DB 경로) vs NFD(NFS 파일시스템) 한글 정규화 mismatch 보정 - surya-ocr 버전 0.17.1 고정 (0.6~1.0 범위는 breaking change 노출) - AIClient.ocr() NotImplementedError 제거 (호출처 0건, extract_worker 가 ocr-service HTTP 호출을 직접 사용) Co-Authored-By: Claude Opus 4.7 (1M context) --- app/ai/client.py | 5 -- services/ocr/requirements.txt | 2 +- services/ocr/server.py | 111 +++++++++++++++++++--------------- 3 files changed, 63 insertions(+), 55 deletions(-) diff --git a/app/ai/client.py b/app/ai/client.py index 049b3c7..6864e13 100644 --- a/app/ai/client.py +++ b/app/ai/client.py @@ -79,11 +79,6 @@ class AIClient: response.raise_for_status() return response.json()["embedding"] - async def ocr(self, image_bytes: bytes) -> str: - """이미지 OCR — GPU 서버 전용""" - # TODO: Qwen2.5-VL-7B 비전 모델 호출 구현 - raise NotImplementedError("OCR는 Phase 1에서 구현") - async def rerank(self, query: str, texts: list[str]) -> list[dict]: """TEI bge-reranker-v2-m3 호출 (Phase 1.3). diff --git a/services/ocr/requirements.txt b/services/ocr/requirements.txt index 4167404..2c78ef2 100644 --- a/services/ocr/requirements.txt +++ b/services/ocr/requirements.txt @@ -1,4 +1,4 @@ -surya-ocr>=0.6.0,<1.0.0 +surya-ocr==0.17.1 pymupdf>=1.24.0,<2.0.0 fastapi>=0.110.0,<1.0.0 uvicorn[standard]>=0.27.0,<1.0.0 diff --git a/services/ocr/server.py b/services/ocr/server.py index 198bad8..b099482 100644 --- a/services/ocr/server.py +++ b/services/ocr/server.py @@ -1,9 +1,10 @@ -"""OCR 마이크로서비스 — Surya OCR (GPU) + PyMuPDF (PDF→이미지) +"""OCR 마이크로서비스 — Surya OCR 0.17.x (GPU) + PyMuPDF (PDF→이미지) 페이지 단위 스트리밍으로 대형 PDF도 메모리 피크 억제. 모델은 첫 요청 시 lazy loading. """ +import unicodedata from pathlib import Path import fitz @@ -13,31 +14,55 @@ from PIL import Image app = FastAPI() -# 모델 lazy loading _models = None IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".gif", ".webp"} +def _resolve_path(file_path: str) -> Path | None: + """NFC(DB) vs NFD(NFS) 한글 경로 정규화 차이 흡수.""" + candidates = [file_path, + unicodedata.normalize("NFD", file_path), + unicodedata.normalize("NFC", file_path)] + for c in candidates: + p = Path(c) + if p.exists(): + return p + # 마지막 fallback: parent 디렉토리에서 이름을 NFC 로 매칭 + parent = Path(file_path).parent + if parent.exists(): + target = unicodedata.normalize("NFC", Path(file_path).name) + for child in parent.iterdir(): + if unicodedata.normalize("NFC", child.name) == target: + return child + return None + + def _load_models(): - """Surya OCR 모델 lazy loading — 첫 호출 시만""" + """Surya 0.17 predictors lazy loading — 첫 호출 시만""" global _models if _models is not None: return _models - from surya.model.detection.model import load_det_processor, load_det_model - from surya.model.recognition.model import load_rec_model - from surya.model.recognition.processor import load_rec_processor + from surya.detection import DetectionPredictor + from surya.recognition import FoundationPredictor, RecognitionPredictor + foundation = FoundationPredictor() _models = { - "det_processor": load_det_processor(), - "det_model": load_det_model(), - "rec_model": load_rec_model(), - "rec_processor": load_rec_processor(), + "detection": DetectionPredictor(), + "recognition": RecognitionPredictor(foundation), } return _models +def _ocr_image(image: Image.Image) -> str: + m = _load_models() + results = m["recognition"]([image], det_predictor=m["detection"]) + if not results: + return "" + return "\n".join(line.text for line in results[0].text_lines) + + @app.get("/health") def health(): """Liveness — Docker healthcheck용, 프로세스 생존 확인""" @@ -60,52 +85,40 @@ def ready(): @app.post("/ocr") async def ocr_endpoint(body: dict): """PDF/이미지 OCR — 페이지 단위 처리 (전체 일괄 로드 금지)""" - file_path = body["filePath"] - langs = body.get("langs", ["ko", "en"]) + raw_path = body["filePath"] max_pages = body.get("maxPages", 200) - if not Path(file_path).exists(): - return {"error": f"파일 없음: {file_path}", "text": "", "pages": 0, "chars": 0} + resolved = _resolve_path(raw_path) + if resolved is None: + return {"error": f"파일 없음: {raw_path}", "text": "", "pages": 0, "chars": 0} - from surya.ocr import run_ocr + ext = resolved.suffix.lower() - m = _load_models() - ext = Path(file_path).suffix.lower() - - # 이미지 파일 → 단일 이미지 OCR if ext in IMAGE_EXTS: - img = Image.open(file_path).convert("RGB") - predictions = run_ocr( - [img], [langs], - m["det_model"], m["det_processor"], - m["rec_model"], m["rec_processor"], - ) - text = "\n".join(line.text for line in predictions[0].text_lines) - del img + img = Image.open(resolved).convert("RGB") + try: + text = _ocr_image(img) + finally: + del img return {"text": text, "pages": 1, "chars": len(text)} - # PDF → 페이지 단위 렌더 + OCR - doc = fitz.open(file_path) - page_count = len(doc) - process_pages = min(page_count, max_pages) - all_text = [] + doc = fitz.open(str(resolved)) + try: + page_count = len(doc) + process_pages = min(page_count, max_pages) + all_text = [] + for i in range(process_pages): + pix = doc[i].get_pixmap(dpi=200) + img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) + del pix + try: + page_text = _ocr_image(img) + finally: + del img + if page_text.strip(): + all_text.append(page_text) + finally: + doc.close() - for i in range(process_pages): - page = doc[i] - pix = page.get_pixmap(dpi=200) - img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) - del pix # 렌더링 메모리 즉시 해제 - - predictions = run_ocr( - [img], [langs], - m["det_model"], m["det_processor"], - m["rec_model"], m["rec_processor"], - ) - page_text = "\n".join(line.text for line in predictions[0].text_lines) - if page_text.strip(): - all_text.append(page_text) - del img # 이미지 메모리 즉시 해제 - - doc.close() combined = "\n\n".join(all_text) return {"text": combined, "pages": page_count, "chars": len(combined)}