feat(ocr): Surya OCR 마이크로서비스 추가

GPU 가속 OCR (Surya, Apache 2.0) 별도 컨테이너로 추가.
스캔 PDF/이미지 파일의 텍스트 추출 지원.

- services/ocr: Dockerfile + server.py + requirements.txt
- /health (liveness) + /ready (readiness, CUDA+모델 상태)
- /ocr: 페이지 단위 스트리밍 처리 (메모리 피크 억제)
- docker-compose: ocr-service + GPU reservation + ocr_models 볼륨

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hyungi Ahn
2026-04-15 15:03:55 +09:00
parent 083aa3126a
commit 7883ac67b3
4 changed files with 156 additions and 0 deletions
+24
View File
@@ -32,6 +32,28 @@ services:
retries: 3
restart: unless-stopped
# OCR microservice; the image is built from ./services/ocr.
ocr-service:
build: ./services/ocr
# Port 3200 is exposed to other compose services only; it is not published to the host.
expose:
- "3200"
volumes:
# Document share, mounted read-only at /documents.
- ${NAS_NFS_PATH:-/mnt/nas/Document_Server}:/documents:ro
# Named volume persisting /root/.cache across restarts
# (presumably where the OCR model weights are downloaded — TODO confirm).
- ocr_models:/root/.cache
# Reserve one NVIDIA GPU for this container.
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
# Liveness check: requests /health with the Python interpreter inside the container.
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3200/health')"]
interval: 30s
timeout: 10s
retries: 3
# NOTE(review): failures during the first 180s do not count towards `retries`;
# confirm this grace period is still needed, since server.py loads models lazily on the first /ocr call.
start_period: 180s
restart: unless-stopped
ollama:
image: ollama/ollama
volumes:
@@ -102,6 +124,7 @@ services:
environment:
- DATABASE_URL=postgresql+asyncpg://pkm:${POSTGRES_PASSWORD}@postgres:5432/pkm
- KORDOC_ENDPOINT=http://kordoc-service:3100
- OCR_ENDPOINT=http://ocr-service:3200
restart: unless-stopped
frontend:
@@ -129,3 +152,4 @@ volumes:
caddy_data:
ollama_data:
reranker_cache:
ocr_models:
+16
View File
@@ -0,0 +1,16 @@
# Base image: PyTorch 2.5.1 runtime with CUDA 12.4 and cuDNN 9.
FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime
WORKDIR /app
# System dependencies for PyMuPDF/Pillow.
# The apt cache and package lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
libgl1 libglib2.0-0 && \
apt-get clean && rm -rf /var/lib/apt/lists/*
# Copy and install requirements before the application code so that the
# dependency layer stays cached when only server.py changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY server.py .
# The service listens on port 3200 on all interfaces.
EXPOSE 3200
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "3200"]
+5
View File
@@ -0,0 +1,5 @@
# NOTE(review): server.py imports surya.ocr.run_ocr and loader functions from
# surya.model.detection.model / surya.model.recognition.{model,processor}.
# Confirm that every release allowed by this range still ships those modules,
# or tighten the upper bound to the release the service was tested against.
surya-ocr>=0.6.0,<1.0.0
pymupdf>=1.24.0,<2.0.0
fastapi>=0.110.0,<1.0.0
uvicorn[standard]>=0.27.0,<1.0.0
pillow>=10.0.0,<12.0.0
+111
View File
@@ -0,0 +1,111 @@
"""OCR 마이크로서비스 — Surya OCR (GPU) + PyMuPDF (PDF→이미지)
페이지 단위 스트리밍으로 대형 PDF도 메모리 피크 억제.
모델은 첫 요청 시 lazy loading.
"""
from pathlib import Path
import fitz
import torch
from fastapi import FastAPI
from PIL import Image
# ASGI application object; uvicorn serves it as "server:app".
app = FastAPI()
# Cache for the lazily loaded Surya models: None until _load_models() has run,
# afterwards a dict of detection/recognition models and processors.
_models = None
# Lower-case file suffixes that /ocr treats as a single image; any other file is opened as a PDF.
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".gif", ".webp"}
def _load_models():
    """Load the Surya OCR models on first use and cache them in ``_models``.

    Returns:
        dict with the keys ``det_processor``, ``det_model``, ``rec_model``
        and ``rec_processor``. Later calls return the same cached dict.

    Raises:
        ImportError: if none of the expected surya loader functions can be
            imported from the installed surya package.
    """
    global _models
    if _models is not None:
        return _models

    # surya is imported here rather than at module level, so starting the
    # server does not import it.
    try:
        # Names as documented for surya 0.6.x: each module exports generic
        # load_model / load_processor functions that callers alias.
        from surya.model.detection.model import (
            load_model as load_det_model,
            load_processor as load_det_processor,
        )
        from surya.model.recognition.model import load_model as load_rec_model
        from surya.model.recognition.processor import (
            load_processor as load_rec_processor,
        )
    except ImportError:
        # Fallback to the names this service originally imported.
        # TODO(review): confirm against the installed surya version and
        # drop whichever branch is not needed.
        from surya.model.detection.model import load_det_processor, load_det_model
        from surya.model.recognition.model import load_rec_model
        from surya.model.recognition.processor import load_rec_processor

    # Assign the cache only after every loader succeeded, so a failed load
    # is retried on the next call instead of leaving a partial dict behind.
    _models = {
        "det_processor": load_det_processor(),
        "det_model": load_det_model(),
        "rec_model": load_rec_model(),
        "rec_processor": load_rec_processor(),
    }
    return _models
@app.get("/health")
def health():
    """Liveness probe: confirm that the process is up and serving requests.

    Returns a constant payload; it does not check CUDA or model state.
    """
    payload = dict(status="ok", service="ocr-surya")
    return payload
@app.get("/ready")
def ready():
    """Readiness probe: report CUDA availability and whether models are loaded.

    ``ready`` is true only when CUDA is available and the model cache has
    been populated. ``gpu_name`` is the name of device 0, or None when CUDA
    is not available.
    """
    has_cuda = torch.cuda.is_available()
    loaded = _models is not None

    device_name = None
    if has_cuda:
        device_name = torch.cuda.get_device_name(0)

    return {
        "ready": has_cuda and loaded,
        "cuda": has_cuda,
        "models_loaded": loaded,
        "gpu_name": device_name,
    }
def _ocr_error(message):
    """Build the error payload of /ocr: an ``error`` message plus empty result fields."""
    return {"error": message, "text": "", "pages": 0, "chars": 0}


def _ocr_image(img, langs, m):
    """Run Surya OCR on one PIL image and return its text lines joined by newlines."""
    from surya.ocr import run_ocr

    predictions = run_ocr(
        [img], [langs],
        m["det_model"], m["det_processor"],
        m["rec_model"], m["rec_processor"],
    )
    return "\n".join(line.text for line in predictions[0].text_lines)


@app.post("/ocr")
async def ocr_endpoint(body: dict):
    """OCR a PDF or image file, one page at a time.

    Request body keys:
        filePath: path of the file to process (required).
        langs: language codes passed to Surya; defaults to ``["ko", "en"]``.
        maxPages: maximum number of PDF pages to OCR; defaults to 200.

    Returns:
        ``{"text", "pages", "chars"}`` on success. For a PDF, ``pages`` is the
        document's total page count even when ``maxPages`` limited how many
        pages were actually processed. When ``filePath`` is missing or is not
        an existing file, the same keys are returned empty together with an
        ``error`` message.

    NOTE(review): this handler is ``async`` but every step in it is
    synchronous, so the event loop (and with it /health) does not run while a
    request is being processed. Consider offloading to a worker thread.

    NOTE(review): ``filePath`` is taken from the request and opened as-is;
    confirm that callers are trusted or restrict it to the documents mount.
    """
    file_path = body.get("filePath")
    if not file_path:
        # Previously a missing key surfaced as an unhandled KeyError (HTTP 500).
        return _ocr_error("filePath 누락")

    langs = body.get("langs", ["ko", "en"])
    max_pages = body.get("maxPages", 200)

    source = Path(file_path)
    # is_file() also rejects directories, which exists() would let through
    # to the PDF/image openers.
    if not source.is_file():
        return _ocr_error(f"파일 없음: {file_path}")

    m = _load_models()

    # Image file: single-image OCR.
    if source.suffix.lower() in IMAGE_EXTS:
        # The with-block closes the file handle; convert() returns an
        # independent in-memory image that remains usable afterwards.
        with Image.open(file_path) as raw:
            img = raw.convert("RGB")
        text = _ocr_image(img, langs, m)
        return {"text": text, "pages": 1, "chars": len(text)}

    # PDF: render and OCR page by page so only one page image is alive at a time.
    all_text = []
    # The with-block closes the document even if rendering or OCR raises.
    with fitz.open(file_path) as doc:
        page_count = len(doc)
        for i in range(min(page_count, max_pages)):
            pix = doc[i].get_pixmap(dpi=200)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            del pix  # release the rendered pixmap immediately
            page_text = _ocr_image(img, langs, m)
            del img  # release the page image immediately
            # Pages without any recognised text are left out of the result.
            if page_text.strip():
                all_text.append(page_text)

    combined = "\n\n".join(all_text)
    return {"text": combined, "pages": page_count, "chars": len(combined)}