diff --git a/docker-compose.yml b/docker-compose.yml
index 40612b1..14af257 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -149,6 +149,12 @@ services:
             - driver: nvidia
               count: 1
               capabilities: [gpu]
+    healthcheck:  # container health probe: curl the service's own /health endpoint
+      test: ["CMD", "curl", "-fsS", "http://localhost/health"]  # -f: HTTP error status fails the check
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 120s  # startup grace period before failed checks count toward retries
     restart: unless-stopped
 
   ai-gateway:
diff --git a/scripts/gpu_service_smoke.sh b/scripts/gpu_service_smoke.sh
new file mode 100755
index 0000000..27de2db
--- /dev/null
+++ b/scripts/gpu_service_smoke.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+# Health/ready/smoke checks for the GPU media/search services (PR-GPU-Health-1).
+# OCR/STT/reranker are expose-only, so probes use the standard path: curl run inside the container via docker exec.
+# marker has a ports mapping, but the same pattern is used for consistency.
+set -uo pipefail  # no -e: a failing probe is counted, it does not abort the script
+
+OCR=hyungi_document_server-ocr-service-1
+MARKER=hyungi_document_server-marker-service-1
+RERANKER=hyungi_document_server-reranker-1
+STT=hyungi_document_server-stt-service-1
+
+PASS=0  # number of probes that succeeded
+FAIL=0  # number of probes that failed; also used as the exit status
+
+vram() {  # print "used=<n>MiB free=<n>MiB" per line of nvidia-smi output
+  nvidia-smi --query-gpu=memory.used,memory.free --format=csv,noheader,nounits 2>/dev/null \
+    | awk -F',' '{printf "used=%dMiB free=%dMiB\n", $1, $2}'
+}
+
+probe() {  # probe <label> <container> <url> [timeout=5]: GET via curl inside the container
+  local label="$1" container="$2" path="$3" timeout="${4:-5}"
+  printf " %-22s " "$label"
+  if out=$(docker exec "$container" curl -fsS -m "$timeout" "$path" 2>&1); then  # "out" is not declared local
+    echo "OK $(echo "$out" | head -c 120)"  # show at most the first 120 bytes of the response
+    PASS=$((PASS+1))
+  else
+    echo "FAIL $(echo "$out" | head -c 120)"
+    FAIL=$((FAIL+1))
+  fi
+}
+
+probe_post() {  # probe_post <label> <container> <url> <json-body> [timeout=30] [expect-substring]
+  local label="$1" container="$2" url="$3" body="$4" timeout="${5:-30}" expect="${6:-}"
+  printf " %-22s " "$label"
+  if out=$(docker exec "$container" curl -fsS -m "$timeout" -H 'Content-Type: application/json' -X POST -d "$body" "$url" 2>&1); then
+    if [[ -z "$expect" || "$out" == *"$expect"* ]]; then  # empty expect: any successful response passes
+      echo "OK $(echo "$out" | head -c 100)"
+      PASS=$((PASS+1))
+    else
+      echo "FAIL(unexpected body) $(echo "$out" | head -c 100)"
+      FAIL=$((FAIL+1))
+    fi
+  else
+    echo "FAIL $(echo "$out" | head -c 100)"
+    FAIL=$((FAIL+1))
+  fi
+}
+
+echo "=== nvidia-smi baseline ==="
+BASE=$(vram); echo " $BASE"  # VRAM snapshot taken before any probe runs
+echo
+
+echo "=== health / ready ==="
+probe "OCR /health" "$OCR" "http://127.0.0.1:3200/health" 5
+probe "OCR /ready" "$OCR" "http://127.0.0.1:3200/ready" 5
+probe "marker /health" "$MARKER" "http://127.0.0.1:3300/health" 5
+probe "marker /ready" "$MARKER" "http://127.0.0.1:3300/ready" 5
+probe "reranker /health" "$RERANKER" "http://127.0.0.1:80/health" 5
+probe "stt /health" "$STT" "http://127.0.0.1:3300/health" 5
+probe "stt /ready" "$STT" "http://127.0.0.1:3300/ready" 5
+
+echo
+echo "=== smoke ==="
+probe "OCR /smoke" "$OCR" "http://127.0.0.1:3200/smoke" 30
+probe_post "bge-m3 embed" "$OCR" "http://ollama:11434/api/embeddings" '{"model":"bge-m3","prompt":"smoke test"}' 30 '"embedding"'  # request is issued from inside the OCR container
+
+echo
+echo "=== nvidia-smi after ==="
+AFTER=$(vram); echo " $AFTER"  # VRAM snapshot taken after all probes
+echo
+echo " baseline: $BASE"
+echo " after : $AFTER"
+echo
+echo "=== summary ==="
+echo " pass=$PASS fail=$FAIL"
+
+exit $FAIL  # exit status is the number of failed probes (0 when every probe passed)
diff --git a/scripts/gpu_vram_fixture.sh b/scripts/gpu_vram_fixture.sh
new file mode 100755
index 0000000..aef8ba5
--- /dev/null
+++ b/scripts/gpu_vram_fixture.sh
@@ -0,0 +1,150 @@
+#!/usr/bin/env bash
+# GPU VRAM peak verification based on synthetic fixtures (PR-GPU-Health-1).
+# Mode A (sequential) + Mode B (light overlap) by default. The --stress option runs 5 calls concurrently (not included in the default gate).
+set -uo pipefail  # no -e: failed calls are recorded as FAIL in the report instead of aborting
+
+OCR=hyungi_document_server-ocr-service-1
+MARKER=hyungi_document_server-marker-service-1
+RERANKER=hyungi_document_server-reranker-1
+STT=hyungi_document_server-stt-service-1
+
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"  # parent directory of the directory holding this script
+FIX="$REPO_ROOT/tests/load/fixtures"
+REPORT="$REPO_ROOT/reports/vram_fixture_$(date +%F).md"  # one report file per calendar day
+mkdir -p "$REPO_ROOT/reports"
+
+STRESS_MODE=0
+[[ "${1:-}" == "--stress" ]] && STRESS_MODE=1  # only the first argument is inspected
+
+vram() {  # print memory.used from the first line of nvidia-smi output, spaces removed
+  nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d ' '
+}
+
+copy_fixtures() {  # copy each fixture into /tmp of the container that will read it
+  docker cp "$FIX/ocr_ok.png" $OCR:/tmp/ocr_ok.png >/dev/null
+  docker cp "$FIX/lorem_1p.pdf" $MARKER:/tmp/lorem_1p.pdf >/dev/null
+  docker cp "$FIX/sine_30s.wav" $STT:/tmp/sine_30s.wav >/dev/null
+}
+
+call_ocr() {  # POST the PNG fixture path to the OCR service (60 s curl limit)
+  docker exec "$OCR" curl -fsS -m 60 -X POST -H 'Content-Type: application/json' \
+    -d '{"filePath":"/tmp/ocr_ok.png"}' http://127.0.0.1:3200/ocr >/dev/null
+}
+call_marker() {  # POST the PDF fixture path to marker /convert (180 s curl limit)
+  docker exec "$MARKER" curl -fsS -m 180 -X POST -H 'Content-Type: application/json' \
+    -d '{"file_path":"/tmp/lorem_1p.pdf"}' http://127.0.0.1:3300/convert >/dev/null
+}
+call_stt() {  # POST the WAV fixture path to STT /transcribe (180 s curl limit)
+  docker exec "$STT" curl -fsS -m 180 -X POST -H 'Content-Type: application/json' \
+    -d '{"filePath":"/tmp/sine_30s.wav","langs":["en"],"beamSize":1}' http://127.0.0.1:3300/transcribe >/dev/null
+}
+call_rerank() {  # POST a two-text rerank request to the reranker (30 s curl limit)
+  docker exec "$RERANKER" curl -fsS -m 30 -X POST -H 'Content-Type: application/json' \
+    -d '{"query":"smoke","texts":["foo bar baz","alpha beta gamma"]}' http://127.0.0.1:80/rerank >/dev/null
+}
+call_embed() {  # POST a bge-m3 embedding request to ollama from inside the OCR container
+  docker exec "$OCR" curl -fsS -m 30 -X POST -H 'Content-Type: application/json' \
+    -d '{"model":"bge-m3","prompt":"smoke test"}' http://ollama:11434/api/embeddings >/dev/null
+}
+
+run_named() {  # run_named <row-name> <function>: run one call, append a report row
+  local name="$1"; local fn="$2"
+  local before=$(vram)  # VRAM sampled just before the call
+  if $fn; then status="OK"; else status="FAIL"; fi  # "status" is not declared local
+  local after=$(vram)  # VRAM sampled after the call returned, not while it was running
+  printf "| %s | %s | %s | %s |\n" "$name" "$before" "$after" "$status" >> "$REPORT"
+  echo " $name before=$before after=$after $status"
+}
+
+run_overlap() {  # run_overlap <row-label> <fn_a> <fn_b>: run two calls concurrently
+  local label="$1" fn_a="$2" fn_b="$3"
+  local before=$(vram)
+  $fn_a & pid_a=$!  # both calls are started in the background before either is awaited
+  $fn_b & pid_b=$!
+  wait $pid_a && sa="OK" || sa="FAIL"  # wait returns the exit status of that background job
+  wait $pid_b && sb="OK" || sb="FAIL"
+  local after=$(vram)
+  printf "| %s | %s | %s | %s+%s |\n" "$label" "$before" "$after" "$sa" "$sb" >> "$REPORT"
+  echo " $label before=$before after=$after $sa+$sb"
+}
+
+run_stress() {  # start all five calls in the background, then collect each status
+  local before=$(vram)
+  call_ocr & p1=$!
+  call_marker & p2=$!
+  call_stt & p3=$!
+  call_rerank & p4=$!
+  call_embed & p5=$!
+  wait $p1 && s1="OK" || s1="FAIL"
+  wait $p2 && s2="OK" || s2="FAIL"
+  wait $p3 && s3="OK" || s3="FAIL"
+  wait $p4 && s4="OK" || s4="FAIL"
+  wait $p5 && s5="OK" || s5="FAIL"
+  local after=$(vram)
+  printf "| stress (5 concurrent) | %s | %s | %s/%s/%s/%s/%s |\n" "$before" "$after" "$s1" "$s2" "$s3" "$s4" "$s5" >> "$REPORT"
+  echo " stress before=$before after=$after $s1/$s2/$s3/$s4/$s5"
+}
+
+copy_fixtures  # return status is not checked; a failed copy surfaces later as a FAIL row
+
+{
+  echo "# VRAM fixture report — $(date '+%F %H:%M:%S')"
+  echo
+  echo "- baseline used = $(vram) MiB / total = 16376 MiB"  # total is a hard-coded constant
+  echo "- stress mode: $([[ $STRESS_MODE -eq 1 ]] && echo enabled || echo disabled)"
+  echo
+  echo "## Mode A — sequential smoke"
+  echo
+  echo "| call | before (MiB) | after (MiB) | status |"
+  echo "|---|---|---|---|"
+} > "$REPORT"  # truncates any report already written for the same date
+
+echo "[mode A] sequential"
+run_named "OCR /ocr (ocr_ok.png)" call_ocr
+run_named "STT /transcribe (sine30s)" call_stt
+run_named "marker /convert (lorem1p)" call_marker
+run_named "reranker /rerank" call_rerank
+run_named "embed bge-m3" call_embed
+
+{
+  echo
+  echo "## Mode B — light overlap"
+  echo
+  echo "| pair | before (MiB) | after (MiB) | status |"
+  echo "|---|---|---|---|"
+} >> "$REPORT"
+
+echo "[mode B] light overlap"
+run_overlap "OCR + embedding" call_ocr call_embed
+run_overlap "marker + reranker" call_marker call_rerank
+run_overlap "STT + embedding" call_stt call_embed
+
+if [[ $STRESS_MODE -eq 1 ]]; then
+  {
+    echo
+    echo "## Stress (--stress) — 5 concurrent"
+    echo
+    echo "| call | before (MiB) | after (MiB) | status |"
+    echo "|---|---|---|---|"
+  } >> "$REPORT"
+  echo "[stress] 5 concurrent"
+  run_stress
+fi
+
+PEAK=$(awk -F'|' '$0 ~ /^\|/ && $5 ~ /(OK|FAIL)/ {gsub(/ /,"",$4); if ($4+0 > max) max=$4+0} END {print max+0}' "$REPORT")  # largest "after (MiB)" value among table rows whose status column contains OK or FAIL
+GATE=$([[ $PEAK -gt 0 && $PEAK -lt 14000 ]] && echo PASS || echo FAIL)  # NOTE(review): depends only on PEAK; FAIL statuses recorded in the rows do not affect the gate
+
+{
+  echo
+  echo "## Summary"
+  echo
+  echo "- peak after = $PEAK MiB"
+  echo "- safety margin (vs 16376 MiB) = $((16376 - PEAK)) MiB"
+  echo "- gate (peak < 14000 MiB) = $GATE"
+} >> "$REPORT"
+
+echo
+echo "report: $REPORT"
+echo "peak=$PEAK gate=$GATE"
+
+[[ "$GATE" == "PASS" ]] && exit 0 || exit 1  # exit 0 only when the VRAM gate passed
diff --git a/services/marker/server.py b/services/marker/server.py
index 39e35f9..62ba851 100644
--- a/services/marker/server.py
+++ b/services/marker/server.py
@@ -100,6 +100,11 @@ class ConvertResponse(BaseModel):
     images_truncated: bool = False
 
 
+@app.get("/health")
+def health():  # returns a fixed payload; nothing else is checked here
+    return {"status": "ok", "service": "marker-service"}
+
+
 @app.get("/ready")
 async def ready(response: Response):
     """Round 4 #1+#2: Response.status_code 명시 + warmup_error 노출."""
diff --git a/services/ocr/server.py b/services/ocr/server.py
index b099482..f8bdb5f 100644
--- a/services/ocr/server.py
+++ b/services/ocr/server.py
@@ -4,13 +4,16 @@
 모델은 첫 요청 시 lazy loading.
 """
 
+import asyncio
+import time
 import unicodedata
 from pathlib import Path
 
 import fitz
 import torch
 from fastapi import FastAPI
-from PIL import Image
+from fastapi.responses import JSONResponse
+from PIL import Image, ImageDraw
 
 app = FastAPI()
 
@@ -82,6 +85,30 @@ def ready():
     }
 
 
+@app.get("/smoke")
+async def smoke():
+    """Operational check that an OCR round trip completes without raising. Not used by the Docker healthcheck."""
+    start = time.monotonic()
+    img = Image.new("RGB", (160, 60), color="white")  # synthetic 160x60 white canvas, no file I/O
+    draw = ImageDraw.Draw(img)
+    draw.text((30, 20), "OK", fill="black")
+    try:
+        loop = asyncio.get_running_loop()
+        await asyncio.wait_for(
+            loop.run_in_executor(None, _ocr_image, img),  # None selects the loop's default executor
+            timeout=20.0,  # NOTE(review): presumably the worker thread keeps running after a timeout; confirm
+        )
+    except asyncio.TimeoutError:
+        return JSONResponse(status_code=503, content={"status": "degraded", "reason": "timeout"})
+    except Exception as exc:  # any other failure is reported as 503 with only the exception class name
+        return JSONResponse(
+            status_code=503,
+            content={"status": "degraded", "reason": exc.__class__.__name__},
+        )
+    elapsed_ms = int((time.monotonic() - start) * 1000)  # includes image construction time
+    return {"status": "ok", "service": "ocr-service", "inference": "ok", "elapsed_ms": elapsed_ms}
+
+
 @app.post("/ocr")
 async def ocr_endpoint(body: dict):
     """PDF/이미지 OCR — 페이지 단위 처리 (전체 일괄 로드 금지)"""
diff --git a/tests/load/fixtures/lorem_1p.pdf b/tests/load/fixtures/lorem_1p.pdf
new file mode 100644
index 0000000..b12b68e
Binary files /dev/null and b/tests/load/fixtures/lorem_1p.pdf differ
diff --git a/tests/load/fixtures/ocr_ok.png b/tests/load/fixtures/ocr_ok.png
new file mode 100644
index 0000000..90f0181
Binary files /dev/null and b/tests/load/fixtures/ocr_ok.png differ
diff --git a/tests/load/fixtures/sine_30s.wav b/tests/load/fixtures/sine_30s.wav
new file mode 100644
index 0000000..b2ac184
Binary files /dev/null and b/tests/load/fixtures/sine_30s.wav differ