From 7883ac67b3f82943a5e29fb6deb8acc7c8c7351a Mon Sep 17 00:00:00 2001
From: Hyungi Ahn
Date: Wed, 15 Apr 2026 15:03:55 +0900
Subject: [PATCH] =?UTF-8?q?feat(ocr):=20Surya=20OCR=20=EB=A7=88=EC=9D=B4?=
 =?UTF-8?q?=ED=81=AC=EB=A1=9C=EC=84=9C=EB=B9=84=EC=8A=A4=20=EC=B6=94?=
 =?UTF-8?q?=EA=B0=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GPU 가속 OCR (Surya, Apache 2.0) 별도 컨테이너로 추가.
스캔 PDF/이미지 파일의 텍스트 추출 지원.

- services/ocr: Dockerfile + server.py + requirements.txt
- /health (liveness) + /ready (readiness, CUDA+모델 상태)
- /ocr: 페이지 단위 스트리밍 처리 (메모리 피크 억제)
- docker-compose: ocr-service + GPU reservation + ocr_models 볼륨

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 docker-compose.yml            |  24 ++++++
 services/ocr/Dockerfile       |  16 ++++
 services/ocr/requirements.txt |   6 ++
 services/ocr/server.py        | 138 ++++++++++++++++++++++++++++++++++
 4 files changed, 184 insertions(+)
 create mode 100644 services/ocr/Dockerfile
 create mode 100644 services/ocr/requirements.txt
 create mode 100644 services/ocr/server.py

diff --git a/docker-compose.yml b/docker-compose.yml
index 918187c..b4d404d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -32,6 +32,28 @@ services:
       retries: 3
     restart: unless-stopped
 
+  ocr-service:
+    build: ./services/ocr
+    expose:
+      - "3200"
+    volumes:
+      - ${NAS_NFS_PATH:-/mnt/nas/Document_Server}:/documents:ro
+      - ocr_models:/root/.cache
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3200/health')"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 180s
+    restart: unless-stopped
+
   ollama:
     image: ollama/ollama
     volumes:
@@ -102,6 +124,7 @@ services:
     environment:
       - DATABASE_URL=postgresql+asyncpg://pkm:${POSTGRES_PASSWORD}@postgres:5432/pkm
       - KORDOC_ENDPOINT=http://kordoc-service:3100
+      - OCR_ENDPOINT=http://ocr-service:3200
     restart: unless-stopped
 
   frontend:
@@ -129,3 +152,4 @@ volumes:
   caddy_data:
   ollama_data:
   reranker_cache:
+  ocr_models:
diff --git a/services/ocr/Dockerfile b/services/ocr/Dockerfile
new file mode 100644
index 0000000..b694c61
--- /dev/null
+++ b/services/ocr/Dockerfile
@@ -0,0 +1,16 @@
+FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime
+
+WORKDIR /app
+
+# System libraries required by PyMuPDF/Pillow
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libgl1 libglib2.0-0 && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY server.py .
+
+EXPOSE 3200
+CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "3200"]
diff --git a/services/ocr/requirements.txt b/services/ocr/requirements.txt
new file mode 100644
index 0000000..4167404
--- /dev/null
+++ b/services/ocr/requirements.txt
@@ -0,0 +1,6 @@
+# <0.9: server.py relies on surya.ocr.run_ocr, replaced by predictor classes in later releases
+surya-ocr>=0.6.0,<0.9.0
+pymupdf>=1.24.0,<2.0.0
+fastapi>=0.110.0,<1.0.0
+uvicorn[standard]>=0.27.0,<1.0.0
+pillow>=10.0.0,<12.0.0
diff --git a/services/ocr/server.py b/services/ocr/server.py
new file mode 100644
index 0000000..198bad8
--- /dev/null
+++ b/services/ocr/server.py
@@ -0,0 +1,138 @@
+"""OCR microservice: Surya OCR (GPU) + PyMuPDF (PDF -> image).
+
+PDFs are rendered and recognised one page at a time so that large documents
+do not cause a memory spike. Models are loaded lazily on the first request.
+"""
+
+import threading
+from pathlib import Path
+
+import fitz
+import torch
+from fastapi import FastAPI
+from PIL import Image
+
+app = FastAPI()
+
+# Lazily populated model cache (see _load_models).
+_models = None
+
+# Serialises model loading and inference: /ocr runs in a worker thread, and
+# the lazily loaded models are shared by every request.
+_ocr_lock = threading.Lock()
+
+IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".gif", ".webp"}
+
+
+def _load_models():
+    """Load the Surya models on first use and return the cached dict.
+
+    Callers must hold ``_ocr_lock``.
+    """
+    global _models
+    if _models is not None:
+        return _models
+
+    # Surya names these loaders load_model/load_processor in each module, so
+    # they are aliased on import.
+    from surya.model.detection.model import load_model as load_det_model
+    from surya.model.detection.model import load_processor as load_det_processor
+    from surya.model.recognition.model import load_model as load_rec_model
+    from surya.model.recognition.processor import load_processor as load_rec_processor
+
+    _models = {
+        "det_processor": load_det_processor(),
+        "det_model": load_det_model(),
+        "rec_model": load_rec_model(),
+        "rec_processor": load_rec_processor(),
+    }
+    return _models
+
+
+def _ocr_image(img, langs, m):
+    """Run Surya OCR on one image and return its text lines joined by newlines."""
+    from surya.ocr import run_ocr
+
+    predictions = run_ocr(
+        [img], [langs],
+        m["det_model"], m["det_processor"],
+        m["rec_model"], m["rec_processor"],
+    )
+    return "\n".join(line.text for line in predictions[0].text_lines)
+
+
+@app.get("/health")
+def health():
+    """Liveness probe for the Docker healthcheck: confirms the process is up."""
+    return {"status": "ok", "service": "ocr-surya"}
+
+
+@app.get("/ready")
+def ready():
+    """Readiness probe for deployment checks: CUDA availability and model state."""
+    cuda_ok = torch.cuda.is_available()
+    models_loaded = _models is not None
+    return {
+        "ready": cuda_ok and models_loaded,
+        "cuda": cuda_ok,
+        "models_loaded": models_loaded,
+        "gpu_name": torch.cuda.get_device_name(0) if cuda_ok else None,
+    }
+
+
+@app.post("/ocr")
+def ocr_endpoint(body: dict):
+    """OCR a PDF or image file; PDFs are processed one page at a time.
+
+    Request body keys: ``filePath`` (required), ``langs`` (default
+    ``["ko", "en"]``) and ``maxPages`` (default 200, PDFs only).
+
+    Returns ``text``, ``pages`` and ``chars``. For PDFs ``pages`` is the
+    document's total page count even when ``maxPages`` limits how many are
+    read. An ``error`` key is added when the path is missing or not found.
+
+    This is a plain ``def`` so that FastAPI runs it in a worker thread: OCR is
+    long blocking work and would otherwise stall /health on the event loop.
+    """
+    file_path = body.get("filePath")
+    if not file_path:
+        return {"error": "filePath 누락", "text": "", "pages": 0, "chars": 0}
+
+    langs = body.get("langs", ["ko", "en"])
+    max_pages = body.get("maxPages", 200)
+
+    # NOTE(review): file_path is opened exactly as given; confirm that only
+    # trusted internal callers can reach this service.
+    if not Path(file_path).exists():
+        return {"error": f"파일 없음: {file_path}", "text": "", "pages": 0, "chars": 0}
+
+    ext = Path(file_path).suffix.lower()
+
+    with _ocr_lock:
+        m = _load_models()
+
+        # Image file: OCR it as a single page.
+        if ext in IMAGE_EXTS:
+            # Close the file handle as soon as the pixels are converted.
+            with Image.open(file_path) as src:
+                img = src.convert("RGB")
+            text = _ocr_image(img, langs, m)
+            return {"text": text, "pages": 1, "chars": len(text)}
+
+        # PDF: render and OCR page by page. The context manager closes the
+        # document even if rendering or recognition raises.
+        all_text = []
+        with fitz.open(file_path) as doc:
+            page_count = len(doc)
+            for i in range(min(page_count, max_pages)):
+                pix = doc[i].get_pixmap(dpi=200)
+                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+                del pix  # release the rendered pixmap right away
+
+                page_text = _ocr_image(img, langs, m)
+                if page_text.strip():
+                    all_text.append(page_text)
+                del img  # release the page image right away
+
+    combined = "\n\n".join(all_text)
+    return {"text": combined, "pages": page_count, "chars": len(combined)}