ops(gpu-health): GPU 서비스 health/smoke 표준화 + synthetic VRAM 피크 가드

PR-GPU-Health-1. 운영 준비성 표준화 PR (모델 성능 개선 아님).

- OCR /smoke endpoint 추가 (160x60 OK PNG in-memory, 200/503 분기, Docker healthcheck 미사용)
- marker /health endpoint 추가 (stt/ocr 동일 시그니처)
- reranker docker-compose healthcheck 추가 (TEI :80/health)
- scripts/gpu_service_smoke.sh: docker exec 표준 점검 (OCR/STT expose-only)
- scripts/gpu_vram_fixture.sh: Mode A sequential + Mode B light overlap + --stress 옵션
- tests/load/fixtures/: synthetic ocr_ok.png / sine_30s.wav / lorem_1p.pdf

OCR 빈 응답 false negative — root cause: ports 미매핑.
결정: ocr-service / stt-service 는 expose-only 유지, 운영 점검은 docker exec 내부 curl 표준.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hyungi Ahn
2026-05-14 09:42:07 +09:00
parent f1399459c5
commit 98ee7dffe2
8 changed files with 266 additions and 1 deletions
+6
View File
@@ -149,6 +149,12 @@ services:
- driver: nvidia
count: 1
capabilities: [gpu]
# Container healthcheck: curl the /health endpoint on the container's own port 80.
# -f makes curl exit non-zero on HTTP error statuses, so those count as unhealthy.
# NOTE(review): assumes the service image ships a `curl` binary — confirm, otherwise
# this check can never succeed.
healthcheck:
test: ["CMD", "curl", "-fsS", "http://localhost/health"]
interval: 30s
timeout: 5s
retries: 3
# Long start period — presumably to cover model load time at startup; verify against
# observed cold-start duration.
start_period: 120s
restart: unless-stopped
ai-gateway:
+77
View File
@@ -0,0 +1,77 @@
#!/usr/bin/env bash
# Health / ready / smoke checks for the GPU media and search services (PR-GPU-Health-1).
# OCR/STT/reranker are expose-only, so the standard path is curl run inside the
# container via `docker exec`.
# marker does have a ports mapping, but the same pattern is used for consistency.
# Note: -e is deliberately absent from the options below; failing probes are counted
# in FAIL instead of aborting the script.
set -uo pipefail
# Container names passed to `docker exec`.
OCR=hyungi_document_server-ocr-service-1
MARKER=hyungi_document_server-marker-service-1
RERANKER=hyungi_document_server-reranker-1
STT=hyungi_document_server-stt-service-1
# Running tallies updated by probe / probe_post.
PASS=0
FAIL=0
# Print one "used=<N>MiB free=<M>MiB" line per GPU reported by nvidia-smi.
# Outputs: nothing when nvidia-smi is unavailable or fails (its stderr is discarded).
vram() {
  local fields="memory.used,memory.free"
  nvidia-smi --query-gpu="$fields" --format=csv,noheader,nounits 2>/dev/null |
    awk 'BEGIN { FS = "," } { printf "used=%dMiB free=%dMiB\n", $1, $2 }'
}
#######################################
# Run an HTTP GET from inside a container and record the outcome.
# Globals:   PASS, FAIL (incremented)
# Arguments: $1 - label printed in the left column
#            $2 - container name passed to `docker exec`
#            $3 - URL to fetch with curl inside the container
#            $4 - curl max time in seconds (default 5)
# Outputs:   one "<label> OK|FAIL <first 120 chars of curl output>" line on stdout
# Returns:   always 0; failures are only reflected in FAIL
#######################################
probe() {
  local label="$1" container="$2" path="$3" timeout="${4:-5}"
  # Keep the captured response local so it does not leak into the caller's scope.
  local out
  printf " %-22s " "$label"
  # curl's stderr is merged in so the failure reason shows up on the FAIL line.
  if out=$(docker exec "$container" curl -fsS -m "$timeout" "$path" 2>&1); then
    # Substring expansion truncates by character, so a multi-byte response is never
    # cut in the middle of a character, and a body starting with "-n" is printed as-is.
    printf 'OK %s\n' "${out:0:120}"
    PASS=$((PASS + 1))
  else
    printf 'FAIL %s\n' "${out:0:120}"
    FAIL=$((FAIL + 1))
  fi
}
#######################################
# POST a JSON body from inside a container and record the outcome.
# Globals:   PASS, FAIL (incremented)
# Arguments: $1 - label printed in the left column
#            $2 - container name passed to `docker exec`
#            $3 - URL to POST to with curl inside the container
#            $4 - request body (sent with Content-Type: application/json)
#            $5 - curl max time in seconds (default 30)
#            $6 - optional substring the response must contain to count as OK
# Outputs:   one "<label> OK|FAIL... <first 100 chars of curl output>" line on stdout
# Returns:   always 0; failures are only reflected in FAIL
#######################################
probe_post() {
  local label="$1" container="$2" url="$3" body="$4" timeout="${5:-30}" expect="${6:-}"
  # Keep the captured response local so it does not leak into the caller's scope.
  local out
  printf " %-22s " "$label"
  if out=$(docker exec "$container" curl -fsS -m "$timeout" \
    -H 'Content-Type: application/json' -X POST -d "$body" "$url" 2>&1); then
    # A transport-level success still fails when the expected marker is missing.
    if [[ -z "$expect" || "$out" == *"$expect"* ]]; then
      # Truncate by character (not byte) so multi-byte output is never split.
      printf 'OK %s\n' "${out:0:100}"
      PASS=$((PASS + 1))
    else
      printf 'FAIL(unexpected body) %s\n' "${out:0:100}"
      FAIL=$((FAIL + 1))
    fi
  else
    printf 'FAIL %s\n' "${out:0:100}"
    FAIL=$((FAIL + 1))
  fi
}
# --- VRAM snapshot before any probe runs ---
printf '%s\n' "=== nvidia-smi baseline ==="
BASE=$(vram)
printf '%s\n' " $BASE"
printf '\n'

# --- Liveness / readiness probes, each with a 5 second curl timeout ---
printf '%s\n' "=== health / ready ==="
# Each entry is "label|container|url".
health_targets=(
  "OCR /health|$OCR|http://127.0.0.1:3200/health"
  "OCR /ready|$OCR|http://127.0.0.1:3200/ready"
  "marker /health|$MARKER|http://127.0.0.1:3300/health"
  "marker /ready|$MARKER|http://127.0.0.1:3300/ready"
  "reranker /health|$RERANKER|http://127.0.0.1:80/health"
  "stt /health|$STT|http://127.0.0.1:3300/health"
  "stt /ready|$STT|http://127.0.0.1:3300/ready"
)
for target in "${health_targets[@]}"; do
  IFS='|' read -r target_label target_container target_url <<<"$target"
  probe "$target_label" "$target_container" "$target_url" 5
done
printf '\n'

# --- Smoke calls that exercise real inference (30 second timeout) ---
printf '%s\n' "=== smoke ==="
probe "OCR /smoke" "$OCR" "http://127.0.0.1:3200/smoke" 30
# The embedding request is issued from inside the OCR container against the ollama host.
probe_post "bge-m3 embed" "$OCR" "http://ollama:11434/api/embeddings" \
  '{"model":"bge-m3","prompt":"smoke test"}' 30 '"embedding"'
printf '\n'

# --- VRAM snapshot after the probes, shown next to the baseline ---
printf '%s\n' "=== nvidia-smi after ==="
AFTER=$(vram)
printf '%s\n' " $AFTER"
printf '\n'
printf '%s\n' " baseline: $BASE" " after : $AFTER"
printf '\n'

# --- Totals; the exit status is the number of failed probes ---
printf '%s\n' "=== summary ===" " pass=$PASS fail=$FAIL"
exit "$FAIL"
+150
View File
@@ -0,0 +1,150 @@
#!/usr/bin/env bash
# GPU VRAM peak verification driven by synthetic fixtures (PR-GPU-Health-1).
# Runs Mode A (sequential) and Mode B (light overlap) by default. The --stress option
# adds 5 concurrent calls (not part of the default gate).
# Note: -e is deliberately absent below; failed calls are recorded in the report
# instead of aborting the script.
set -uo pipefail
# Container names used with `docker exec` / `docker cp`.
OCR=hyungi_document_server-ocr-service-1
MARKER=hyungi_document_server-marker-service-1
RERANKER=hyungi_document_server-reranker-1
STT=hyungi_document_server-stt-service-1
# Repository root = parent of the directory that contains this script.
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
# Directory holding the synthetic fixtures copied into the containers.
FIX="$REPO_ROOT/tests/load/fixtures"
# Date-stamped (YYYY-MM-DD) Markdown report path.
REPORT="$REPO_ROOT/reports/vram_fixture_$(date +%F).md"
mkdir -p "$REPO_ROOT/reports"
# Stress mode is enabled only when the first argument is exactly --stress.
STRESS_MODE=0
[[ "${1:-}" == "--stress" ]] && STRESS_MODE=1
# Print the used-memory figure that nvidia-smi reports on its first output line,
# with all spaces removed. Prints nothing when nvidia-smi is unavailable or fails.
vram() {
  nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null |
    awk 'NR == 1 { gsub(/ /, ""); print; exit }'
}
#######################################
# Copy the synthetic fixtures into /tmp of the containers that consume them.
# Globals:   FIX, OCR, MARKER, STT (read)
# Outputs:   nothing on stdout (docker's stdout is discarded; its stderr passes through)
# Returns:   0 when every copy succeeded, 1 when at least one failed
#######################################
copy_fixtures() {
  local rc=0
  # All three copies are always attempted; a failure of an earlier copy is remembered
  # instead of being masked by the status of the last one.
  docker cp "$FIX/ocr_ok.png" "${OCR}:/tmp/ocr_ok.png" >/dev/null || rc=1
  docker cp "$FIX/lorem_1p.pdf" "${MARKER}:/tmp/lorem_1p.pdf" >/dev/null || rc=1
  docker cp "$FIX/sine_30s.wav" "${STT}:/tmp/sine_30s.wav" >/dev/null || rc=1
  return "$rc"
}
# POST the OCR fixture path to the /ocr endpoint from inside the OCR container.
# The response body is discarded; the return status is that of `docker exec`.
call_ocr() {
  local payload='{"filePath":"/tmp/ocr_ok.png"}'
  local endpoint=http://127.0.0.1:3200/ocr
  local -a request=(
    curl -fsS -m 60 -X POST
    -H 'Content-Type: application/json'
    -d "$payload"
    "$endpoint"
  )
  docker exec "$OCR" "${request[@]}" >/dev/null
}
# POST the PDF fixture path to the /convert endpoint from inside the marker container.
# The response body is discarded; the return status is that of `docker exec`.
call_marker() {
  local payload='{"file_path":"/tmp/lorem_1p.pdf"}'
  local endpoint=http://127.0.0.1:3300/convert
  local -a request=(
    curl -fsS -m 180 -X POST
    -H 'Content-Type: application/json'
    -d "$payload"
    "$endpoint"
  )
  docker exec "$MARKER" "${request[@]}" >/dev/null
}
# POST the WAV fixture path to the /transcribe endpoint from inside the STT container.
# The response body is discarded; the return status is that of `docker exec`.
call_stt() {
  local payload='{"filePath":"/tmp/sine_30s.wav","langs":["en"],"beamSize":1}'
  local endpoint=http://127.0.0.1:3300/transcribe
  local -a request=(
    curl -fsS -m 180 -X POST
    -H 'Content-Type: application/json'
    -d "$payload"
    "$endpoint"
  )
  docker exec "$STT" "${request[@]}" >/dev/null
}
# POST a two-text rerank request to the /rerank endpoint from inside the reranker container.
# The response body is discarded; the return status is that of `docker exec`.
call_rerank() {
  local payload='{"query":"smoke","texts":["foo bar baz","alpha beta gamma"]}'
  local endpoint=http://127.0.0.1:80/rerank
  local -a request=(
    curl -fsS -m 30 -X POST
    -H 'Content-Type: application/json'
    -d "$payload"
    "$endpoint"
  )
  docker exec "$RERANKER" "${request[@]}" >/dev/null
}
# POST a bge-m3 embedding request to the ollama host, issued from inside the OCR container.
# The response body is discarded; the return status is that of `docker exec`.
call_embed() {
  local payload='{"model":"bge-m3","prompt":"smoke test"}'
  local endpoint=http://ollama:11434/api/embeddings
  local -a request=(
    curl -fsS -m 30 -X POST
    -H 'Content-Type: application/json'
    -d "$payload"
    "$endpoint"
  )
  docker exec "$OCR" "${request[@]}" >/dev/null
}
#######################################
# Run one call function and record VRAM usage before and after it.
# Globals:   REPORT (appended to)
# Arguments: $1 - row label
#            $2 - name of the function/command to run (a single word)
# Outputs:   one Markdown table row appended to REPORT, one summary line on stdout
# Returns:   0 (the call's own failure is recorded as FAIL, not propagated)
#######################################
run_named() {
  local name="$1" fn="$2"
  # Declared separately from assignment, and kept local so `status` does not leak
  # into (or clobber) the caller's scope.
  local before after status
  before=$(vram)
  if "$fn"; then
    status="OK"
  else
    status="FAIL"
  fi
  # Sampled once after the call returns; a transient peak during the call is not captured.
  after=$(vram)
  printf "| %s | %s | %s | %s |\n" "$name" "$before" "$after" "$status" >> "$REPORT"
  echo " $name before=$before after=$after $status"
}
#######################################
# Run two call functions concurrently and record VRAM usage before and after.
# Globals:   REPORT (appended to)
# Arguments: $1 - row label
#            $2 - name of the first function to run in the background
#            $3 - name of the second function to run in the background
# Outputs:   one Markdown table row appended to REPORT, one summary line on stdout
# Returns:   0 (each call's failure is recorded as FAIL, not propagated)
#######################################
run_overlap() {
  local label="$1" fn_a="$2" fn_b="$3"
  # All bookkeeping variables are local so nothing leaks into the caller's scope.
  local before after pid_a pid_b sa sb
  before=$(vram)
  "$fn_a" &
  pid_a=$!
  "$fn_b" &
  pid_b=$!
  # Explicit if/else: each status reflects only the exit code of its own job.
  if wait "$pid_a"; then sa="OK"; else sa="FAIL"; fi
  if wait "$pid_b"; then sb="OK"; else sb="FAIL"; fi
  # Sampled once after both jobs finished; a transient peak during the overlap
  # is not captured.
  after=$(vram)
  printf "| %s | %s | %s | %s+%s |\n" "$label" "$before" "$after" "$sa" "$sb" >> "$REPORT"
  echo " $label before=$before after=$after $sa+$sb"
}
#######################################
# Run all five call functions concurrently and record VRAM usage before and after.
# Globals:   REPORT (appended to)
# Outputs:   one Markdown table row appended to REPORT, one summary line on stdout.
#            Statuses are listed in launch order: ocr/marker/stt/rerank/embed.
# Returns:   0 (each call's failure is recorded as FAIL, not propagated)
#######################################
run_stress() {
  # Everything is local so no pid/status variables leak into the caller's scope.
  local before after fn pid joined
  local -a calls=(call_ocr call_marker call_stt call_rerank call_embed)
  local -a pids=() results=()
  before=$(vram)
  # Fan out: start every call in the background and remember its PID.
  for fn in "${calls[@]}"; do
    "$fn" &
    pids+=("$!")
  done
  # Barrier: reap each job in launch order so statuses line up with `calls`.
  for pid in "${pids[@]}"; do
    if wait "$pid"; then
      results+=("OK")
    else
      results+=("FAIL")
    fi
  done
  # Sampled once after all jobs finished; a transient peak during the run is not captured.
  after=$(vram)
  # Join the statuses with "/" inside a subshell so IFS is not changed for the caller.
  joined=$(IFS=/; printf '%s' "${results[*]}")
  printf "| stress (5 concurrent) | %s | %s | %s |\n" "$before" "$after" "$joined" >> "$REPORT"
  echo " stress before=$before after=$after $joined"
}
# Stage the synthetic fixtures inside the containers before any call is made.
copy_fixtures

# Start a fresh report (the file is truncated) with the header and the Mode A table head.
printf '%s\n' \
  "# VRAM fixture report — $(date '+%F %H:%M:%S')" \
  "" \
  "- baseline used = $(vram) MiB / total = 16376 MiB" \
  "- stress mode: $([[ $STRESS_MODE -eq 1 ]] && echo enabled || echo disabled)" \
  "" \
  "## Mode A — sequential smoke" \
  "" \
  "| call | before (MiB) | after (MiB) | status |" \
  "|---|---|---|---|" \
  > "$REPORT"

# Mode A: one call at a time. Each entry is "row label|call function".
printf '%s\n' "[mode A] sequential"
sequential_calls=(
  "OCR /ocr (ocr_ok.png)|call_ocr"
  "STT /transcribe (sine30s)|call_stt"
  "marker /convert (lorem1p)|call_marker"
  "reranker /rerank|call_rerank"
  "embed bge-m3|call_embed"
)
for sequential_call in "${sequential_calls[@]}"; do
  run_named "${sequential_call%|*}" "${sequential_call##*|}"
done
# Append the Mode B section heading and table head to the report.
printf '%s\n' \
  "" \
  "## Mode B — light overlap" \
  "" \
  "| pair | before (MiB) | after (MiB) | status |" \
  "|---|---|---|---|" \
  >> "$REPORT"

# Mode B: three pairs, each pair running its two calls concurrently.
printf '%s\n' "[mode B] light overlap"
run_overlap "OCR + embedding" call_ocr call_embed
run_overlap "marker + reranker" call_marker call_rerank
run_overlap "STT + embedding" call_stt call_embed
# Optional stress section: only runs when --stress was given on the command line.
if ((STRESS_MODE == 1)); then
  printf '%s\n' \
    "" \
    "## Stress (--stress) — 5 concurrent" \
    "" \
    "| call | before (MiB) | after (MiB) | status |" \
    "|---|---|---|---|" \
    >> "$REPORT"
  printf '%s\n' "[stress] 5 concurrent"
  run_stress
fi
# Peak = the highest "after (MiB)" value over every result row in the report.
# Result rows are table lines whose status column contains OK or FAIL; header and
# separator rows do not match and are skipped.
PEAK=$(awk -F'|' '$0 ~ /^\|/ && $5 ~ /(OK|FAIL)/ {gsub(/ /,"",$4); if ($4+0 > max) max=$4+0} END {print max+0}' "$REPORT")
# Fall back to 0 if awk produced nothing (e.g. unreadable report) so the arithmetic
# below stays valid; a peak of 0 fails the gate.
PEAK=${PEAK:-0}

# Number of result rows in which at least one call failed.
FAILED_ROWS=$(awk -F'|' '$0 ~ /^\|/ && $5 ~ /FAIL/ {n++} END {print n+0}' "$REPORT")
FAILED_ROWS=${FAILED_ROWS:-0}

# The gate passes only when a VRAM figure was actually measured, it stayed below the
# threshold, AND every call succeeded. Without the last condition a run in which all
# services are down would report PASS, because an idle GPU trivially stays under the limit.
if [[ $PEAK -gt 0 && $PEAK -lt 14000 && $FAILED_ROWS -eq 0 ]]; then
  GATE=PASS
else
  GATE=FAIL
fi

{
  echo
  echo "## Summary"
  echo
  echo "- peak after = $PEAK MiB"
  echo "- safety margin (vs 16376 MiB) = $((16376 - PEAK)) MiB"
  echo "- failed rows = $FAILED_ROWS"
  echo "- gate (peak < 14000 MiB, no failed calls) = $GATE"
} >> "$REPORT"

echo
echo "report: $REPORT"
echo "peak=$PEAK failed_rows=$FAILED_ROWS gate=$GATE"

# Exit status mirrors the gate: 0 on PASS, 1 on FAIL.
if [[ "$GATE" == "PASS" ]]; then
  exit 0
fi
exit 1
+5
View File
@@ -100,6 +100,11 @@ class ConvertResponse(BaseModel):
images_truncated: bool = False
@app.get("/health")
def health():
    """Liveness endpoint: returns a fixed payload naming this service."""
    payload = {"status": "ok", "service": "marker-service"}
    return payload
@app.get("/ready")
async def ready(response: Response):
"""Round 4 #1+#2: Response.status_code 명시 + warmup_error 노출."""
+28 -1
View File
@@ -4,13 +4,16 @@
모델은 첫 요청 시 lazy loading.
"""
import asyncio
import time
import unicodedata
from pathlib import Path
import fitz
import torch
from fastapi import FastAPI
from PIL import Image
from fastapi.responses import JSONResponse
from PIL import Image, ImageDraw
app = FastAPI()
@@ -82,6 +85,30 @@ def ready():
}
@app.get("/smoke")
async def smoke():
    """Operational check that one OCR round trip completes without raising.

    Not used as the Docker healthcheck. Responds 200 with the elapsed time on
    success, or 503 with a ``reason`` when the inference times out or raises.
    """
    started_at = time.monotonic()

    # Build a small white 160x60 image with the text "OK" entirely in memory.
    canvas = Image.new("RGB", (160, 60), color="white")
    ImageDraw.Draw(canvas).text((30, 20), "OK", fill="black")

    try:
        event_loop = asyncio.get_running_loop()
        # Run the OCR call in the default executor, off the event loop, and cap the wait at 20s.
        inference = event_loop.run_in_executor(None, _ocr_image, canvas)
        await asyncio.wait_for(inference, timeout=20.0)
    except asyncio.TimeoutError:
        failure_reason = "timeout"
    except Exception as exc:
        # Only the exception class name is exposed, not its message.
        failure_reason = type(exc).__name__
    else:
        failure_reason = None

    if failure_reason is not None:
        return JSONResponse(
            status_code=503,
            content={"status": "degraded", "reason": failure_reason},
        )

    elapsed_ms = int((time.monotonic() - started_at) * 1000)
    return {"status": "ok", "service": "ocr-service", "inference": "ok", "elapsed_ms": elapsed_ms}
@app.post("/ocr")
async def ocr_endpoint(body: dict):
"""PDF/이미지 OCR — 페이지 단위 처리 (전체 일괄 로드 금지)"""
Binary file not shown.
Binary file not shown.

After

Width:  |  Height:  |  Size: 518 B

Binary file not shown.