"""OCR 마이크로서비스 — Surya OCR 0.17.x (GPU) + PyMuPDF (PDF→이미지) 페이지 단위 스트리밍으로 대형 PDF도 메모리 피크 억제. 모델은 첫 요청 시 lazy loading. """ import unicodedata from pathlib import Path import fitz import torch from fastapi import FastAPI from PIL import Image app = FastAPI() _models = None IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".gif", ".webp"} def _resolve_path(file_path: str) -> Path | None: """NFC(DB) vs NFD(NFS) 한글 경로 정규화 차이 흡수.""" candidates = [file_path, unicodedata.normalize("NFD", file_path), unicodedata.normalize("NFC", file_path)] for c in candidates: p = Path(c) if p.exists(): return p # 마지막 fallback: parent 디렉토리에서 이름을 NFC 로 매칭 parent = Path(file_path).parent if parent.exists(): target = unicodedata.normalize("NFC", Path(file_path).name) for child in parent.iterdir(): if unicodedata.normalize("NFC", child.name) == target: return child return None def _load_models(): """Surya 0.17 predictors lazy loading — 첫 호출 시만""" global _models if _models is not None: return _models from surya.detection import DetectionPredictor from surya.recognition import FoundationPredictor, RecognitionPredictor foundation = FoundationPredictor() _models = { "detection": DetectionPredictor(), "recognition": RecognitionPredictor(foundation), } return _models def _ocr_image(image: Image.Image) -> str: m = _load_models() results = m["recognition"]([image], det_predictor=m["detection"]) if not results: return "" return "\n".join(line.text for line in results[0].text_lines) @app.get("/health") def health(): """Liveness — Docker healthcheck용, 프로세스 생존 확인""" return {"status": "ok", "service": "ocr-surya"} @app.get("/ready") def ready(): """Readiness — 배포 검증용, CUDA + 모델 상태""" cuda_ok = torch.cuda.is_available() models_loaded = _models is not None return { "ready": cuda_ok and models_loaded, "cuda": cuda_ok, "models_loaded": models_loaded, "gpu_name": torch.cuda.get_device_name(0) if cuda_ok else None, } @app.post("/ocr") async def ocr_endpoint(body: dict): """PDF/이미지 OCR — 페이지 단위 처리 (전체 일괄 로드 금지)""" raw_path = body["filePath"] max_pages = body.get("maxPages", 200) resolved = _resolve_path(raw_path) if resolved is None: return {"error": f"파일 없음: {raw_path}", "text": "", "pages": 0, "chars": 0} ext = resolved.suffix.lower() if ext in IMAGE_EXTS: img = Image.open(resolved).convert("RGB") try: text = _ocr_image(img) finally: del img return {"text": text, "pages": 1, "chars": len(text)} doc = fitz.open(str(resolved)) try: page_count = len(doc) process_pages = min(page_count, max_pages) all_text = [] for i in range(process_pages): pix = doc[i].get_pixmap(dpi=200) img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) del pix try: page_text = _ocr_image(img) finally: del img if page_text.strip(): all_text.append(page_text) finally: doc.close() combined = "\n\n".join(all_text) return {"text": combined, "pages": page_count, "chars": len(combined)}