fix(gpu-health): scripts 호출 도구를 host curl + container IP 로 통일
OCR/STT 컨테이너 안에 curl 미설치 (slim python image). docker exec curl 표준은 실측 OCI exec 실패. host curl + docker bridge IP (172.20.0.x) 로 변경 — host publish 추가 아니라 docker network 내부 검증이라 보안 표면 동일. reranker 만 curl 있고 OCR/marker/STT 는 python 만 있어 분기 발생을 회피. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,26 +1,37 @@
|
||||
#!/usr/bin/env bash
|
||||
# GPU 미디어/검색 서비스 health/ready/smoke 점검 (PR-GPU-Health-1).
|
||||
# OCR/STT/reranker 는 expose-only 라 docker exec 내부 curl 표준 경로 사용.
|
||||
# marker 는 ports 매핑이 있지만 일관성을 위해 동일 패턴.
|
||||
# OCR/STT 는 expose-only (host publish 없음). docker bridge IP 로 호스트에서 직접 호출 —
|
||||
# host publish 추가 아니라 docker network 내부 검증 (보안 표면 동일).
|
||||
set -uo pipefail
|
||||
|
||||
OCR=hyungi_document_server-ocr-service-1
|
||||
MARKER=hyungi_document_server-marker-service-1
|
||||
RERANKER=hyungi_document_server-reranker-1
|
||||
STT=hyungi_document_server-stt-service-1
|
||||
OLLAMA=ollama
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
|
||||
#######################################
# Print the IP address of a container as seen from the docker host.
#
# A container attached to several docker networks has one address per
# network. Emitting them back-to-back with no separator would produce an
# unusable string such as "172.20.0.2172.21.0.3", so the template prints one
# address per line and only the first non-empty one is returned. Go templates
# iterate maps in sorted key order, so the choice is stable between runs
# (first network by name).
#
# Arguments: $1 - container name or ID
# Outputs:   the selected IP address on stdout; nothing when the container
#            has no address or cannot be inspected
# Returns:   docker's exit status when inspect fails, 0 otherwise
#######################################
container_ip() {
  local addrs addr
  # Declaration and assignment are split so docker's exit status is not
  # masked by `local`. stderr is discarded on purpose: callers report a
  # missing IP themselves.
  addrs=$(docker inspect \
    -f '{{range .NetworkSettings.Networks}}{{println .IPAddress}}{{end}}' \
    "$1" 2>/dev/null) || return

  while IFS= read -r addr; do
    # Networks without an address (e.g. host mode) yield empty lines; skip.
    if [[ -n "$addr" ]]; then
      printf '%s\n' "$addr"
      return 0
    fi
  done <<<"$addrs"
  return 0
}
|
||||
|
||||
#######################################
# Report GPU memory usage as "used=<N>MiB free=<N>MiB".
#
# Queries nvidia-smi for the used and free memory columns in headerless,
# unit-less CSV form and reformats each output row with awk. nvidia-smi's
# stderr is discarded, so when the tool is unavailable nothing is printed.
#
# Arguments: none
# Outputs:   one formatted line per row reported by nvidia-smi
#######################################
vram() {
  local -a smi_args=(
    --query-gpu=memory.used,memory.free
    --format=csv,noheader,nounits
  )
  nvidia-smi "${smi_args[@]}" 2>/dev/null |
    awk -F',' '{printf "used=%dMiB free=%dMiB\n", $1, $2}'
}
|
||||
|
||||
probe() {
|
||||
local label="$1" container="$2" path="$3" timeout="${4:-5}"
|
||||
local label="$1" container="$2" port="$3" path="$4" timeout="${5:-5}"
|
||||
local ip=$(container_ip "$container")
|
||||
printf " %-22s " "$label"
|
||||
if out=$(docker exec "$container" curl -fsS -m "$timeout" "$path" 2>&1); then
|
||||
if [[ -z "$ip" ]]; then
|
||||
echo "FAIL (container IP 없음)"
|
||||
FAIL=$((FAIL+1))
|
||||
return
|
||||
fi
|
||||
if out=$(curl -fsS -m "$timeout" "http://$ip:$port$path" 2>&1); then
|
||||
echo "OK $(echo "$out" | head -c 120)"
|
||||
PASS=$((PASS+1))
|
||||
else
|
||||
@@ -30,9 +41,15 @@ probe() {
|
||||
}
|
||||
|
||||
probe_post() {
|
||||
local label="$1" container="$2" url="$3" body="$4" timeout="${5:-30}" expect="${6:-}"
|
||||
local label="$1" container="$2" port="$3" path="$4" body="$5" timeout="${6:-30}" expect="${7:-}"
|
||||
local ip=$(container_ip "$container")
|
||||
printf " %-22s " "$label"
|
||||
if out=$(docker exec "$container" curl -fsS -m "$timeout" -H 'Content-Type: application/json' -X POST -d "$body" "$url" 2>&1); then
|
||||
if [[ -z "$ip" ]]; then
|
||||
echo "FAIL (container IP 없음)"
|
||||
FAIL=$((FAIL+1))
|
||||
return
|
||||
fi
|
||||
if out=$(curl -fsS -m "$timeout" -H 'Content-Type: application/json' -X POST -d "$body" "http://$ip:$port$path" 2>&1); then
|
||||
if [[ -z "$expect" || "$out" == *"$expect"* ]]; then
|
||||
echo "OK $(echo "$out" | head -c 100)"
|
||||
PASS=$((PASS+1))
|
||||
@@ -51,18 +68,18 @@ BASE=$(vram); echo " $BASE"
|
||||
echo
|
||||
|
||||
echo "=== health / ready ==="
|
||||
probe "OCR /health" "$OCR" "http://127.0.0.1:3200/health" 5
|
||||
probe "OCR /ready" "$OCR" "http://127.0.0.1:3200/ready" 5
|
||||
probe "marker /health" "$MARKER" "http://127.0.0.1:3300/health" 5
|
||||
probe "marker /ready" "$MARKER" "http://127.0.0.1:3300/ready" 5
|
||||
probe "reranker /health" "$RERANKER" "http://127.0.0.1:80/health" 5
|
||||
probe "stt /health" "$STT" "http://127.0.0.1:3300/health" 5
|
||||
probe "stt /ready" "$STT" "http://127.0.0.1:3300/ready" 5
|
||||
probe "OCR /health" "$OCR" 3200 "/health" 5
|
||||
probe "OCR /ready" "$OCR" 3200 "/ready" 5
|
||||
probe "marker /health" "$MARKER" 3300 "/health" 5
|
||||
probe "marker /ready" "$MARKER" 3300 "/ready" 5
|
||||
probe "reranker /health" "$RERANKER" 80 "/health" 5
|
||||
probe "stt /health" "$STT" 3300 "/health" 5
|
||||
probe "stt /ready" "$STT" 3300 "/ready" 5
|
||||
|
||||
echo
|
||||
echo "=== smoke ==="
|
||||
probe "OCR /smoke" "$OCR" "http://127.0.0.1:3200/smoke" 30
|
||||
probe_post "bge-m3 embed" "$OCR" "http://ollama:11434/api/embeddings" '{"model":"bge-m3","prompt":"smoke test"}' 30 '"embedding"'
|
||||
probe "OCR /smoke" "$OCR" 3200 "/smoke" 30
|
||||
probe_post "bge-m3 embed" "$OLLAMA" 11434 "/api/embeddings" '{"model":"bge-m3","prompt":"smoke test"}' 30 '"embedding"'
|
||||
|
||||
echo
|
||||
echo "=== nvidia-smi after ==="
|
||||
|
||||
Reference in New Issue
Block a user