ops(gpu-health): GPU 서비스 health/smoke 표준화 + synthetic VRAM 피크 가드

PR-GPU-Health-1. 운영 준비성 표준화 PR (모델 성능 개선 아님).

- OCR /smoke endpoint 추가 (160x60 OK PNG in-memory, 200/503 분기, Docker healthcheck 미사용)
- marker /health endpoint 추가 (stt/ocr 동일 시그니처)
- reranker docker-compose healthcheck 추가 (TEI :80/health)
- scripts/gpu_service_smoke.sh: docker exec 표준 점검 (OCR/STT expose-only)
- scripts/gpu_vram_fixture.sh: Mode A sequential + Mode B light overlap + --stress 옵션
- tests/load/fixtures/: synthetic ocr_ok.png / sine_30s.wav / lorem_1p.pdf

OCR 빈 응답 false negative — root cause: ports 미매핑.
결정: ocr-service / stt-service 는 expose-only 유지, 운영 점검은 docker exec 내부 curl 표준.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 09:42:07 +09:00
parent f1399459c5
commit 98ee7dffe2
8 changed files with 266 additions and 1 deletions
@@ -149,6 +149,12 @@ services:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://localhost/health"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 120s
    restart: unless-stopped

  ai-gateway:
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+# GPU 미디어/검색 서비스 health/ready/smoke 점검 (PR-GPU-Health-1).
+# OCR/STT/reranker 는 expose-only 라 docker exec 내부 curl 표준 경로 사용.
+# marker 는 ports 매핑이 있지만 일관성을 위해 동일 패턴.
+set -uo pipefail
+
+OCR=hyungi_document_server-ocr-service-1
+MARKER=hyungi_document_server-marker-service-1
+RERANKER=hyungi_document_server-reranker-1
+STT=hyungi_document_server-stt-service-1
+
+PASS=0
+FAIL=0
+
+vram() {
+  # Print one "used=<N>MiB free=<N>MiB" line per row that nvidia-smi reports.
+  # nvidia-smi's stderr is discarded, so nothing is printed when the query fails.
+  local fields="memory.used,memory.free"
+  nvidia-smi --query-gpu="$fields" --format=csv,noheader,nounits 2>/dev/null |
+    awk -F, '{ printf("used=%dMiB free=%dMiB\n", $1, $2) }'
+}
+
+probe() {
+  local label="$1" container="$2" path="$3" timeout="${4:-5}"
+  printf "  %-22s " "$label"
+  if out=$(docker exec "$container" curl -fsS -m "$timeout" "$path" 2>&1); then
+    echo "OK  $(echo "$out" | head -c 120)"
+    PASS=$((PASS+1))
+  else
+    echo "FAIL  $(echo "$out" | head -c 120)"
+    FAIL=$((FAIL+1))
+  fi
+}
+
+probe_post() {
+  local label="$1" container="$2" url="$3" body="$4" timeout="${5:-30}" expect="${6:-}"
+  printf "  %-22s " "$label"
+  if out=$(docker exec "$container" curl -fsS -m "$timeout" -H 'Content-Type: application/json' -X POST -d "$body" "$url" 2>&1); then
+    if [[ -z "$expect" || "$out" == *"$expect"* ]]; then
+      echo "OK  $(echo "$out" | head -c 100)"
+      PASS=$((PASS+1))
+    else
+      echo "FAIL(unexpected body)  $(echo "$out" | head -c 100)"
+      FAIL=$((FAIL+1))
+    fi
+  else
+    echo "FAIL  $(echo "$out" | head -c 100)"
+    FAIL=$((FAIL+1))
+  fi
+}
+
+echo "=== nvidia-smi baseline ==="
+BASE=$(vram); echo "  $BASE"
+echo
+
+echo "=== health / ready ==="
+probe "OCR /health"      "$OCR"      "http://127.0.0.1:3200/health"  5
+probe "OCR /ready"       "$OCR"      "http://127.0.0.1:3200/ready"   5
+probe "marker /health"   "$MARKER"   "http://127.0.0.1:3300/health"  5
+probe "marker /ready"    "$MARKER"   "http://127.0.0.1:3300/ready"   5
+probe "reranker /health" "$RERANKER" "http://127.0.0.1:80/health"    5
+probe "stt /health"      "$STT"      "http://127.0.0.1:3300/health"  5
+probe "stt /ready"       "$STT"      "http://127.0.0.1:3300/ready"   5
+
+echo
+echo "=== smoke ==="
+probe "OCR /smoke"       "$OCR"      "http://127.0.0.1:3200/smoke"   30
+probe_post "bge-m3 embed" "$OCR" "http://ollama:11434/api/embeddings" '{"model":"bge-m3","prompt":"smoke test"}' 30 '"embedding"'
+
+echo
+echo "=== nvidia-smi after ==="
+AFTER=$(vram); echo "  $AFTER"
+echo
+echo "  baseline: $BASE"
+echo "  after   : $AFTER"
+echo
+echo "=== summary ==="
+echo "  pass=$PASS  fail=$FAIL"
+
+exit $FAIL
@@ -0,0 +1,150 @@
+#!/usr/bin/env bash
+# synthetic fixture 기반 GPU VRAM 피크 검증 (PR-GPU-Health-1).
+# Mode A (sequential) + Mode B (light overlap) 기본. --stress 옵션은 5개 동시 (기본 gate 미포함).
+set -uo pipefail
+
+OCR=hyungi_document_server-ocr-service-1
+MARKER=hyungi_document_server-marker-service-1
+RERANKER=hyungi_document_server-reranker-1
+STT=hyungi_document_server-stt-service-1
+
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+FIX="$REPO_ROOT/tests/load/fixtures"
+REPORT="$REPO_ROOT/reports/vram_fixture_$(date +%F).md"
+mkdir -p "$REPO_ROOT/reports"
+
+STRESS_MODE=0
+[[ "${1:-}" == "--stress" ]] && STRESS_MODE=1
+
+vram() {
+  # Emit the first memory.used row nvidia-smi reports, with spaces removed.
+  # nvidia-smi's stderr is discarded, so nothing is printed when the query fails.
+  nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null |
+    awk 'NR == 1 { gsub(/ /, ""); print; exit }'
+}
+
+copy_fixtures() {
+  docker cp "$FIX/ocr_ok.png"   $OCR:/tmp/ocr_ok.png      >/dev/null
+  docker cp "$FIX/lorem_1p.pdf" $MARKER:/tmp/lorem_1p.pdf >/dev/null
+  docker cp "$FIX/sine_30s.wav" $STT:/tmp/sine_30s.wav    >/dev/null
+}
+
+# post_json CONTAINER MAX_TIME BODY URL
+#   POST a JSON body with curl from inside CONTAINER. The response body is
+#   discarded; callers only look at the exit status.
+post_json() {
+  local container="$1" max_time="$2" body="$3" url="$4"
+  docker exec "$container" curl -fsS -m "$max_time" -X POST \
+    -H 'Content-Type: application/json' -d "$body" "$url" >/dev/null
+}
+
+# One wrapper per service; each sends that service's synthetic fixture request.
+call_ocr() {
+  post_json "$OCR" 60 '{"filePath":"/tmp/ocr_ok.png"}' http://127.0.0.1:3200/ocr
+}
+call_marker() {
+  post_json "$MARKER" 180 '{"file_path":"/tmp/lorem_1p.pdf"}' http://127.0.0.1:3300/convert
+}
+call_stt() {
+  post_json "$STT" 180 '{"filePath":"/tmp/sine_30s.wav","langs":["en"],"beamSize":1}' \
+    http://127.0.0.1:3300/transcribe
+}
+call_rerank() {
+  post_json "$RERANKER" 30 '{"query":"smoke","texts":["foo bar baz","alpha beta gamma"]}' \
+    http://127.0.0.1:80/rerank
+}
+# The embedding request is issued from inside the OCR container.
+call_embed() {
+  post_json "$OCR" 30 '{"model":"bge-m3","prompt":"smoke test"}' http://ollama:11434/api/embeddings
+}
+
+run_named() {
+  local name="$1"; local fn="$2"
+  local before=$(vram)
+  if $fn; then status="OK"; else status="FAIL"; fi
+  local after=$(vram)
+  printf "| %s | %s | %s | %s |\n" "$name" "$before" "$after" "$status" >> "$REPORT"
+  echo "  $name  before=$before after=$after $status"
+}
+
+run_overlap() {
+  local label="$1" fn_a="$2" fn_b="$3"
+  local before=$(vram)
+  $fn_a & pid_a=$!
+  $fn_b & pid_b=$!
+  wait $pid_a && sa="OK" || sa="FAIL"
+  wait $pid_b && sb="OK" || sb="FAIL"
+  local after=$(vram)
+  printf "| %s | %s | %s | %s+%s |\n" "$label" "$before" "$after" "$sa" "$sb" >> "$REPORT"
+  echo "  $label  before=$before after=$after $sa+$sb"
+}
+
+run_stress() {
+  local before=$(vram)
+  call_ocr & p1=$!
+  call_marker & p2=$!
+  call_stt & p3=$!
+  call_rerank & p4=$!
+  call_embed & p5=$!
+  wait $p1 && s1="OK" || s1="FAIL"
+  wait $p2 && s2="OK" || s2="FAIL"
+  wait $p3 && s3="OK" || s3="FAIL"
+  wait $p4 && s4="OK" || s4="FAIL"
+  wait $p5 && s5="OK" || s5="FAIL"
+  local after=$(vram)
+  printf "| stress (5 concurrent) | %s | %s | %s/%s/%s/%s/%s |\n" "$before" "$after" "$s1" "$s2" "$s3" "$s4" "$s5" >> "$REPORT"
+  echo "  stress  before=$before after=$after $s1/$s2/$s3/$s4/$s5"
+}
+
+# Card capacity and gate threshold in MiB. The defaults are the values this
+# script previously hard-coded; set the environment variables to run the same
+# check against a different GPU.
+VRAM_TOTAL_MIB="${VRAM_TOTAL_MIB:-16376}"
+VRAM_GATE_MIB="${VRAM_GATE_MIB:-14000}"
+
+# section_header TITLE FIRST_COLUMN
+#   Print a blank line, a "## TITLE" heading and the markdown table header
+#   shared by every report section.
+section_header() {
+  printf '\n## %s\n\n| %s | before (MiB) | after (MiB) | status |\n|---|---|---|---|\n' "$1" "$2"
+}
+
+copy_fixtures
+
+if [[ $STRESS_MODE -eq 1 ]]; then
+  stress_label=enabled
+else
+  stress_label=disabled
+fi
+
+# Start a fresh report (truncating any earlier one with the same name).
+{
+  echo "# VRAM fixture report — $(date '+%F %H:%M:%S')"
+  echo
+  echo "- baseline used = $(vram) MiB / total = ${VRAM_TOTAL_MIB} MiB"
+  echo "- stress mode: $stress_label"
+  section_header "Mode A — sequential smoke" "call"
+} > "$REPORT"
+
+echo "[mode A] sequential"
+run_named "OCR /ocr (ocr_ok.png)"    call_ocr
+run_named "STT /transcribe (sine30s)" call_stt
+run_named "marker /convert (lorem1p)" call_marker
+run_named "reranker /rerank"          call_rerank
+run_named "embed bge-m3"              call_embed
+
+section_header "Mode B — light overlap" "pair" >> "$REPORT"
+
+echo "[mode B] light overlap"
+run_overlap "OCR + embedding"   call_ocr call_embed
+run_overlap "marker + reranker" call_marker call_rerank
+run_overlap "STT + embedding"   call_stt call_embed
+
+if [[ $STRESS_MODE -eq 1 ]]; then
+  section_header "Stress (--stress) — 5 concurrent" "call" >> "$REPORT"
+  echo "[stress] 5 concurrent"
+  run_stress
+fi
+
+# Peak = largest "after" cell among the table rows whose status cell contains
+# OK or FAIL. These are readings taken once a call has returned, not samples
+# taken while it was running.
+PEAK=$(awk -F'|' '$0 ~ /^\|/ && $5 ~ /(OK|FAIL)/ {gsub(/ /,"",$4); if ($4+0 > max) max=$4+0} END {print max+0}' "$REPORT")
+
+# A peak of 0 means no usable reading was recorded, which also fails the gate.
+if [[ $PEAK -gt 0 && $PEAK -lt $VRAM_GATE_MIB ]]; then
+  GATE=PASS
+else
+  GATE=FAIL
+fi
+
+{
+  echo
+  echo "## Summary"
+  echo
+  echo "- peak after = $PEAK MiB"
+  echo "- safety margin (vs ${VRAM_TOTAL_MIB} MiB) = $((VRAM_TOTAL_MIB - PEAK)) MiB"
+  echo "- gate (peak < ${VRAM_GATE_MIB} MiB) = $GATE"
+} >> "$REPORT"
+
+echo
+echo "report: $REPORT"
+echo "peak=$PEAK gate=$GATE"
+
+if [[ "$GATE" == "PASS" ]]; then
+  exit 0
+fi
+exit 1
@@ -100,6 +100,11 @@ class ConvertResponse(BaseModel):
    images_truncated: bool = False


+@app.get("/health")
+def health():
+    return {"status": "ok", "service": "marker-service"}
+
+
@app.get("/ready")
 async def ready(response: Response):
    """Round 4 #1+#2: Response.status_code 명시 + warmup_error 노출."""
@@ -4,13 +4,16 @@
 모델은 첫 요청 시 lazy loading.
 """

+import asyncio
+import time
 import unicodedata
 from pathlib import Path

 import fitz
 import torch
 from fastapi import FastAPI
-from PIL import Image
+from fastapi.responses import JSONResponse
+from PIL import Image, ImageDraw

 app = FastAPI()

@@ -82,6 +85,30 @@ def ready():
    }


+@app.get("/smoke")
+async def smoke():
+    """OCR 라운드트립이 예외 없이 완료되는지 운영 verify. Docker healthcheck 미사용."""
+    start = time.monotonic()
+    img = Image.new("RGB", (160, 60), color="white")
+    draw = ImageDraw.Draw(img)
+    draw.text((30, 20), "OK", fill="black")
+    try:
+        loop = asyncio.get_running_loop()
+        await asyncio.wait_for(
+            loop.run_in_executor(None, _ocr_image, img),
+            timeout=20.0,
+        )
+    except asyncio.TimeoutError:
+        return JSONResponse(status_code=503, content={"status": "degraded", "reason": "timeout"})
+    except Exception as exc:
+        return JSONResponse(
+            status_code=503,
+            content={"status": "degraded", "reason": exc.__class__.__name__},
+        )
+    elapsed_ms = int((time.monotonic() - start) * 1000)
+    return {"status": "ok", "service": "ocr-service", "inference": "ok", "elapsed_ms": elapsed_ms}
+
+
@app.post("/ocr")
 async def ocr_endpoint(body: dict):
    """PDF/이미지 OCR — 페이지 단위 처리 (전체 일괄 로드 금지)"""