"""OCR 마이크로서비스 — Surya OCR 0.17.x (GPU) + PyMuPDF (PDF→이미지) 페이지 단위 스트리밍으로 대형 PDF도 메모리 피크 억제. 모델은 첫 요청 시 lazy loading. """ import asyncio import time import unicodedata from pathlib import Path import fitz import torch from fastapi import FastAPI from fastapi.responses import JSONResponse from PIL import Image, ImageDraw app = FastAPI() _models = None IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".gif", ".webp"} def _resolve_path(file_path: str) -> Path | None: """NFC(DB) vs NFD(NFS) 한글 경로 정규화 차이 흡수.""" candidates = [file_path, unicodedata.normalize("NFD", file_path), unicodedata.normalize("NFC", file_path)] for c in candidates: p = Path(c) if p.exists(): return p # 마지막 fallback: parent 디렉토리에서 이름을 NFC 로 매칭 parent = Path(file_path).parent if parent.exists(): target = unicodedata.normalize("NFC", Path(file_path).name) for child in parent.iterdir(): if unicodedata.normalize("NFC", child.name) == target: return child return None def _load_models(): """Surya 0.17 predictors lazy loading — 첫 호출 시만""" global _models if _models is not None: return _models from surya.detection import DetectionPredictor from surya.recognition import FoundationPredictor, RecognitionPredictor foundation = FoundationPredictor() _models = { "detection": DetectionPredictor(), "recognition": RecognitionPredictor(foundation), } return _models def _ocr_image(image: Image.Image) -> str: m = _load_models() results = m["recognition"]([image], det_predictor=m["detection"]) if not results: return "" return "\n".join(line.text for line in results[0].text_lines) @app.get("/health") def health(): """Liveness — Docker healthcheck용, 프로세스 생존 확인""" return {"status": "ok", "service": "ocr-surya"} @app.get("/ready") def ready(): """Readiness — 배포 검증용, CUDA + 모델 상태""" cuda_ok = torch.cuda.is_available() models_loaded = _models is not None return { "ready": cuda_ok and models_loaded, "cuda": cuda_ok, "models_loaded": models_loaded, "gpu_name": torch.cuda.get_device_name(0) if cuda_ok else None, } 
@app.get("/smoke")
async def smoke():
    """Operational check that an OCR round trip completes without raising.

    Renders a tiny synthetic image, runs it through OCR in the default
    executor with a 20-second timeout, and reports the elapsed time.
    Not used by the Docker healthcheck.

    Returns:
        A status dict on success, or a 503 ``JSONResponse`` whose ``reason``
        is ``"timeout"`` or the exception class name.
    """
    start = time.monotonic()
    img = Image.new("RGB", (160, 60), color="white")
    draw = ImageDraw.Draw(img)
    draw.text((30, 20), "OK", fill="black")
    try:
        loop = asyncio.get_running_loop()
        await asyncio.wait_for(
            loop.run_in_executor(None, _ocr_image, img),
            timeout=20.0,
        )
    except asyncio.TimeoutError:
        return JSONResponse(status_code=503, content={"status": "degraded", "reason": "timeout"})
    except Exception as exc:
        return JSONResponse(
            status_code=503,
            content={"status": "degraded", "reason": exc.__class__.__name__},
        )
    elapsed_ms = int((time.monotonic() - start) * 1000)
    return {"status": "ok", "service": "ocr-service", "inference": "ok", "elapsed_ms": elapsed_ms}


# Keeps /ocr jobs one at a time now that they run off the event loop, so
# concurrent requests do not run inference side by side.
_ocr_lock = asyncio.Lock()


def _ocr_file(resolved, max_pages):
    """Blocking OCR of one image or PDF; intended to run in a worker thread.

    Args:
        resolved: Existing path of the file to process.
        max_pages: Upper bound on the number of PDF pages to OCR.

    Returns:
        A dict with ``text``, ``pages`` and ``chars``. For PDFs ``pages`` is
        the document's total page count, not the number actually processed.
    """
    if resolved.suffix.lower() in IMAGE_EXTS:
        # Context manager closes the source file handle once pixels are copied.
        with Image.open(resolved) as source:
            img = source.convert("RGB")
        text = _ocr_image(img)
        return {"text": text, "pages": 1, "chars": len(text)}

    doc = fitz.open(str(resolved))
    try:
        page_count = len(doc)
        pages_text = []
        for index in range(min(page_count, max_pages)):
            pix = doc[index].get_pixmap(dpi=200)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            # Drop the pixmap before inference to keep the memory peak low.
            del pix
            try:
                page_text = _ocr_image(img)
            finally:
                # Release the page image before the next page is rendered.
                del img
            if page_text.strip():
                pages_text.append(page_text)
    finally:
        doc.close()

    combined = "\n\n".join(pages_text)
    return {"text": combined, "pages": page_count, "chars": len(combined)}


@app.post("/ocr")
async def ocr_endpoint(body: dict):
    """PDF/image OCR — processed page by page (never the whole file at once).

    Request body keys:
        filePath: Path of the file to OCR (required, non-empty string).
        maxPages: Maximum number of PDF pages to process (default 200).

    Path resolution and OCR run in the default executor so the event loop
    stays free to serve other endpoints while a document is processed.

    NOTE(review): ``filePath`` is used as received; presumably only trusted
    callers can reach this service — confirm.

    Returns:
        A dict with ``text``, ``pages`` and ``chars``. When the file cannot
        be found, the same shape with an added ``error`` key. A missing or
        invalid ``filePath`` yields a 422 response with that error shape.
    """
    raw_path = body.get("filePath")
    if not isinstance(raw_path, str) or not raw_path:
        # Previously surfaced as an unhandled KeyError (bare 500).
        return JSONResponse(
            status_code=422,
            content={"error": "filePath 누락", "text": "", "pages": 0, "chars": 0},
        )
    max_pages = body.get("maxPages", 200)

    loop = asyncio.get_running_loop()
    resolved = await loop.run_in_executor(None, _resolve_path, raw_path)
    if resolved is None:
        return {"error": f"파일 없음: {raw_path}", "text": "", "pages": 0, "chars": 0}

    async with _ocr_lock:
        return await loop.run_in_executor(None, _ocr_file, resolved, max_pages)