Files
Hyungi Ahn f8f72ceae2 fix(ocr): Surya 0.17 API + NFC/NFD path normalize
- services/ocr/server.py: surya 0.17.x predictors 기반으로 재작성
  (구 `from surya.ocr import run_ocr` 제거됨 → import error → 빈 텍스트 반환)
- NFC(DB 경로) vs NFD(NFS 파일시스템) 한글 정규화 mismatch 보정
- surya-ocr 버전 0.17.1 고정 (0.6~1.0 범위는 breaking change 노출)
- AIClient.ocr() NotImplementedError 제거 (호출처 0건, extract_worker 가
  ocr-service HTTP 호출을 직접 사용)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 13:52:19 +09:00

125 lines
3.6 KiB
Python

"""OCR 마이크로서비스 — Surya OCR 0.17.x (GPU) + PyMuPDF (PDF→이미지)
페이지 단위 스트리밍으로 대형 PDF도 메모리 피크 억제.
모델은 첫 요청 시 lazy loading.
"""
import unicodedata
from pathlib import Path
import fitz
import torch
from fastapi import FastAPI
from PIL import Image
# ASGI application object; the route decorators in this module register on it.
app = FastAPI()
# Cache slot for the Surya predictors; stays None until _load_models() fills it.
_models = None
# Lower-case file suffixes that /ocr treats as a single image instead of
# opening the file with PyMuPDF.
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".gif", ".webp"}
def _resolve_path(file_path: str) -> Path | None:
"""NFC(DB) vs NFD(NFS) 한글 경로 정규화 차이 흡수."""
candidates = [file_path,
unicodedata.normalize("NFD", file_path),
unicodedata.normalize("NFC", file_path)]
for c in candidates:
p = Path(c)
if p.exists():
return p
# 마지막 fallback: parent 디렉토리에서 이름을 NFC 로 매칭
parent = Path(file_path).parent
if parent.exists():
target = unicodedata.normalize("NFC", Path(file_path).name)
for child in parent.iterdir():
if unicodedata.normalize("NFC", child.name) == target:
return child
return None
def _load_models():
    """Return the shared Surya 0.17 predictors, building them on first call.

    The result is cached in the module-level ``_models`` dict with the keys
    ``"detection"`` and ``"recognition"``; later calls return that same dict.
    """
    global _models
    if _models is None:
        # The surya imports are deferred to the first call rather than done
        # at module import time.
        from surya.detection import DetectionPredictor
        from surya.recognition import FoundationPredictor, RecognitionPredictor

        base = FoundationPredictor()
        detector = DetectionPredictor()
        # The recognition predictor is constructed on top of the foundation
        # predictor.
        recognizer = RecognitionPredictor(base)
        _models = {"detection": detector, "recognition": recognizer}
    return _models
def _ocr_image(image: Image.Image) -> str:
    """OCR a single image and return its text lines joined by newlines.

    The recognition predictor is called with a one-element batch and the
    detection predictor passed as ``det_predictor``. An empty result list
    yields an empty string.
    """
    predictors = _load_models()
    recognize = predictors["recognition"]
    detector = predictors["detection"]
    predictions = recognize([image], det_predictor=detector)
    if predictions:
        # Only one image was submitted, so only the first result is relevant.
        return "\n".join(entry.text for entry in predictions[0].text_lines)
    return ""
@app.get("/health")
def health():
    """Liveness probe for the Docker healthcheck: confirms the process is up."""
    payload = dict(status="ok", service="ocr-surya")
    return payload
@app.get("/ready")
def ready():
    """Readiness probe for deploy verification: reports CUDA and model state.

    ``ready`` is true only when CUDA is available and the predictors have
    already been loaded; ``gpu_name`` is ``None`` without CUDA.
    """
    has_cuda = torch.cuda.is_available()
    loaded = _models is not None

    if has_cuda:
        device_name = torch.cuda.get_device_name(0)
    else:
        device_name = None

    status = {}
    status["ready"] = has_cuda and loaded
    status["cuda"] = has_cuda
    status["models_loaded"] = loaded
    status["gpu_name"] = device_name
    return status
@app.post("/ocr")
async def ocr_endpoint(body: dict):
    """OCR a PDF or image file one page at a time (never the whole file at once).

    Request body keys:
        filePath: Path of the file to OCR. Required; a missing key raises
            ``KeyError``.
        maxPages: Optional cap on the number of PDF pages to OCR. Defaults to
            200 when absent or ``null``.

    Returns:
        A dict with ``text``, ``pages`` and ``chars``. For an image ``pages``
        is 1. For a PDF ``pages`` is the document's total page count, even if
        fewer pages were OCR'd because of ``maxPages``. When the path cannot
        be resolved, the dict also carries an ``error`` message and empty
        results.
    """
    # NOTE(review): _ocr_image is a synchronous call inside an async handler,
    # so it presumably blocks the event loop (and /health) while a page is
    # processed — confirm this is acceptable for the healthcheck timeout.
    raw_path = body["filePath"]
    max_pages = body.get("maxPages")
    if max_pages is None:
        # An explicit JSON null would otherwise crash min() below.
        max_pages = 200

    resolved = _resolve_path(raw_path)
    if resolved is None:
        return {"error": f"파일 없음: {raw_path}", "text": "", "pages": 0, "chars": 0}

    if resolved.suffix.lower() in IMAGE_EXTS:
        # Image.open() is lazy and keeps the file open; the context manager
        # closes the handle as soon as the RGB copy has been made.
        with Image.open(resolved) as source:
            img = source.convert("RGB")
        try:
            text = _ocr_image(img)
        finally:
            del img
        return {"text": text, "pages": 1, "chars": len(text)}

    doc = fitz.open(str(resolved))
    try:
        page_count = len(doc)
        process_pages = min(page_count, max_pages)
        all_text = []
        for i in range(process_pages):
            pix = doc[i].get_pixmap(dpi=200)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            # Drop the pixmap before OCR so only one copy of the page stays
            # alive while the model runs.
            del pix
            try:
                page_text = _ocr_image(img)
            finally:
                del img
            # Pages that produced only whitespace are left out of the output.
            if page_text.strip():
                all_text.append(page_text)
    finally:
        doc.close()

    combined = "\n\n".join(all_text)
    return {"text": combined, "pages": page_count, "chars": len(combined)}