From e0f928f429565a011ea44cf1e694490ca7c3a964 Mon Sep 17 00:00:00 2001
From: Hyungi Ahn
Date: Wed, 8 Apr 2026 13:16:37 +0900
Subject: [PATCH] =?UTF-8?q?feat(deploy):=20Phase=201.3=20reranker=20(TEI?=
 =?UTF-8?q?=20bge-reranker-v2-m3)=20=EC=84=9C=EB=B9=84=EC=8A=A4=20?=
 =?UTF-8?q?=EC=B6=94=EA=B0=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

docker-compose.yml에 reranker 서비스 추가:
- image: ghcr.io/huggingface/text-embeddings-inference:89-1.5
  (Ada Lovelace / RTX 4000 series build)
- MODEL_ID=BAAI/bge-reranker-v2-m3
- MAX_BATCH_TOKENS=8192, MAX_CONCURRENT_REQUESTS=4
- GPU 1개 할당 (RTX 4070 Ti Super, CUDA 13.0)
- expose 80만 (host 노출 X, internal network 전용)
- reranker_cache volume으로 모델 영속화
- fastapi가 depends_on 안 함 → 단독 시작 가능, reranker 없어도 fastapi 동작
  (rerank_service가 RRF fallback)

다음 단계:
- GPU에서 docker pull로 호환성 확인
- docker compose up -d reranker → warmup
- config.yaml의 rerank.endpoint를 http://reranker:80/rerank로 갱신 (GPU 직접)
- fastapi rebuild + 평가셋 측정 (rerank=true)
---
 docker-compose.yml | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/docker-compose.yml b/docker-compose.yml
index b55599a..cf4e1f3 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -45,6 +45,28 @@ services:
       - "127.0.0.1:11434:11434"
     restart: unless-stopped
 
+  # Phase 1.3: bge-reranker-v2-m3 served by TEI; internal only, fastapi calls it at reranker:80.
+  # fastapi does not depend_on this service: it starts standalone, and fastapi works without it (rerank=false fallback).
+  reranker:
+    image: ghcr.io/huggingface/text-embeddings-inference:89-1.5  # 89-* = Ada Lovelace (RTX 4000 series) build
+    container_name: hyungi_document_server-reranker-1
+    expose:
+      - "80"  # container port only; not published to the host
+    environment:
+      - MODEL_ID=BAAI/bge-reranker-v2-m3
+      - MAX_BATCH_TOKENS=8192
+      - MAX_CONCURRENT_REQUESTS=4
+    volumes:
+      - reranker_cache:/data  # named volume so the model persists across restarts
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1  # reserve a single GPU
+              capabilities: [gpu]
+    restart: unless-stopped
+
   ai-gateway:
     build: ./gpu-server/services/ai-gateway
     ports:
@@ -103,3 +125,4 @@ volumes:
   pgdata:
   caddy_data:
   ollama_data:
+  reranker_cache: