# hyungi_document_server/services/ocr/server.py

"""OCR 마이크로서비스 — Surya OCR 0.17.x (GPU) + PyMuPDF (PDF→이미지)

페이지 단위 스트리밍으로 대형 PDF도 메모리 피크 억제.
모델은 첫 요청 시 lazy loading.
"""

import unicodedata
from pathlib import Path

import fitz
import torch
from fastapi import FastAPI
from PIL import Image

# ASGI application object; the route decorators in this module register on it.
app = FastAPI()

# Cache for the Surya predictors. Stays None until _load_models() fills it,
# which is also what the /ready endpoint reports as "models_loaded".
_models = None

# Lower-cased file suffixes that /ocr treats as a single image; any other
# suffix is handed to PyMuPDF (fitz.open) instead.
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".gif", ".webp"}


def _resolve_path(file_path: str) -> Path | None:
    """NFC(DB) vs NFD(NFS) 한글 경로 정규화 차이 흡수."""
    candidates = [file_path,
                  unicodedata.normalize("NFD", file_path),
                  unicodedata.normalize("NFC", file_path)]
    for c in candidates:
        p = Path(c)
        if p.exists():
            return p
    # 마지막 fallback: parent 디렉토리에서 이름을 NFC 로 매칭
    parent = Path(file_path).parent
    if parent.exists():
        target = unicodedata.normalize("NFC", Path(file_path).name)
        for child in parent.iterdir():
            if unicodedata.normalize("NFC", child.name) == target:
                return child
    return None


def _load_models():
    """Return the cached Surya predictors, building them on the first call.

    The Surya imports live inside the function so they are only executed
    when the predictors are actually needed. The result is a dict with the
    keys ``"detection"`` and ``"recognition"``, stored in the module-level
    ``_models`` so later calls reuse the same objects.
    """
    global _models
    if _models is None:
        from surya.detection import DetectionPredictor
        from surya.recognition import FoundationPredictor, RecognitionPredictor

        # The recognition predictor is constructed on top of the foundation one.
        base = FoundationPredictor()
        detector = DetectionPredictor()
        recognizer = RecognitionPredictor(base)
        # Assign only after every predictor was built, so a failed
        # construction leaves the cache empty.
        _models = {"detection": detector, "recognition": recognizer}
    return _models


def _ocr_image(image: Image.Image) -> str:
    """OCR a single image and return its text, one recognized line per row.

    The image is passed to the recognition predictor as a one-element batch
    together with the detection predictor. An empty result yields ``""``.
    """
    predictors = _load_models()
    recognize = predictors["recognition"]
    detector = predictors["detection"]
    predictions = recognize([image], det_predictor=detector)
    if predictions:
        # Only one image was submitted, so only the first result is relevant.
        first = predictions[0]
        return "\n".join(line.text for line in first.text_lines)
    return ""


@app.get("/health")
def health():
    """Liveness probe (intended for the Docker healthcheck).

    Returns a fixed payload and does not inspect CUDA or model state.
    """
    payload = {"status": "ok", "service": "ocr-surya"}
    return payload


@app.get("/ready")
def ready():
    """Readiness probe reporting CUDA availability and model-cache state.

    ``ready`` is true only when CUDA is available and the predictor cache
    has been populated; ``models_loaded`` stays false until
    ``_load_models()`` has run. ``gpu_name`` is the name of device 0, or
    ``None`` without CUDA.
    """
    has_cuda = torch.cuda.is_available()
    has_models = _models is not None
    if has_cuda:
        device_name = torch.cuda.get_device_name(0)
    else:
        device_name = None
    return {
        "ready": has_cuda and has_models,
        "cuda": has_cuda,
        "models_loaded": has_models,
        "gpu_name": device_name,
    }


@app.post("/ocr")
async def ocr_endpoint(body: dict):
    """OCR a PDF or image file, one page at a time (never a bulk load).

    Request body keys:
        filePath: Path of the file to read. It is resolved through
            ``_resolve_path`` to absorb NFC/NFD naming differences.
        maxPages: Optional cap on the number of PDF pages to OCR
            (default 200; ``null`` also means the default). Not used for
            image files.

    Files whose suffix is in ``IMAGE_EXTS`` are OCR'd as a single image;
    every other file is opened with ``fitz.open``.

    Returns:
        ``{"text", "pages", "chars"}`` on success. For images ``pages`` is
        1; for PDFs it is the document's total page count, even when
        ``maxPages`` limited how many pages were processed. Pages whose
        text is blank are skipped and the rest are joined with a blank
        line. For an invalid request or a missing file the same keys are
        returned empty together with an ``"error"`` message.

    NOTE(review): this handler is ``async def`` but every step below is
    blocking, so the event loop is presumably busy for the whole request -
    confirm that this serialization is intended.
    """
    raw_path = body.get("filePath")
    if not isinstance(raw_path, str) or not raw_path:
        # A missing/invalid key used to surface as an unhandled exception;
        # answer in the same shape as the "file not found" case instead.
        return {"error": "filePath 누락", "text": "", "pages": 0, "chars": 0}

    raw_max = body.get("maxPages")
    try:
        # None covers both an absent key and an explicit JSON null.
        max_pages = 200 if raw_max is None else int(raw_max)
    except (TypeError, ValueError, OverflowError):
        return {
            "error": f"maxPages 값 오류: {raw_max!r}",
            "text": "",
            "pages": 0,
            "chars": 0,
        }

    resolved = _resolve_path(raw_path)
    if resolved is None:
        return {"error": f"파일 없음: {raw_path}", "text": "", "pages": 0, "chars": 0}

    ext = resolved.suffix.lower()

    if ext in IMAGE_EXTS:
        # convert() returns an independent image, so the source file can be
        # closed right away instead of waiting for garbage collection.
        with Image.open(resolved) as source:
            img = source.convert("RGB")
        try:
            text = _ocr_image(img)
        finally:
            del img
        return {"text": text, "pages": 1, "chars": len(text)}

    page_texts = []
    with fitz.open(str(resolved)) as doc:
        page_count = len(doc)
        for index in range(min(page_count, max_pages)):
            # get_pixmap is called with its defaults; the "RGB" mode below
            # assumes those yield 3-channel samples without alpha.
            pix = doc[index].get_pixmap(dpi=200)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            # Drop the raster as soon as it is copied to keep peak memory low.
            del pix
            try:
                page_text = _ocr_image(img)
            finally:
                del img
            if page_text.strip():
                page_texts.append(page_text)

    combined = "\n\n".join(page_texts)
    return {"text": combined, "pages": page_count, "chars": len(combined)}