Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 624b9d523d | |||
| 5cabf728e6 | |||
| cd694e7386 | |||
| 7247d242a2 | |||
| 9434017114 | |||
| 66f3287564 |
+1
-1
@@ -282,7 +282,7 @@ async def search(
|
|||||||
content={
|
content={
|
||||||
"error_reason": "unknown_reranker_backend",
|
"error_reason": "unknown_reranker_backend",
|
||||||
"backend_requested": reranker_backend,
|
"backend_requested": reranker_backend,
|
||||||
"allowed": ["baseline", "cand_gte_ml_base"],
|
"allowed": ["baseline"],
|
||||||
"detail": msg,
|
"detail": msg,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -44,11 +44,10 @@ RERANK_TIMEOUT = 5.0
|
|||||||
# server-side allowlist map. query parameter 가 raw endpoint URL 받지 않음.
|
# server-side allowlist map. query parameter 가 raw endpoint URL 받지 않음.
|
||||||
RERANKER_BACKEND_MAP: dict[str, dict[str, str] | None] = {
|
RERANKER_BACKEND_MAP: dict[str, dict[str, str] | None] = {
|
||||||
"baseline": None, # production reranker (config.yaml endpoint via AIClient.rerank)
|
"baseline": None, # production reranker (config.yaml endpoint via AIClient.rerank)
|
||||||
"cand_gte_ml_base": {
|
# Phase 2B 후보 reranker 전부 NO-GO 종결 (2026-06-18 teardown):
|
||||||
"endpoint": "http://rerank-cand-gte-ml-base:80/rerank",
|
# - cand_gte_ml_base : 컨테이너·DB 테이블(마이그 360)·override.rerank-cand.yml 제거됨
|
||||||
},
|
# - mxbai_large (deberta-v2 → TEI 1.7 미지원) / bge_v2_gemma_2b (1_Pooling 부재) 미진입
|
||||||
# mxbai_large 후보 (deberta-v2 → TEI 1.7 미지원) Phase 2B-Extended 이관
|
# dispatcher scaffold(_resolve_reranker)는 향후 후보 재진입 위해 보존.
|
||||||
# bge_v2_gemma_2b 후보 (LLM-based reranker, 1_Pooling/config.json 부재) Phase 2B-Extended 이관
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,6 @@
|
|||||||
# hyungi_Document_Server 설정
|
# hyungi_Document_Server 설정
|
||||||
|
|
||||||
ai:
|
ai:
|
||||||
gateway:
|
|
||||||
endpoint: "http://ai-gateway:8080"
|
|
||||||
|
|
||||||
models:
|
models:
|
||||||
# ─── 단일 generation 호스트 routing (2026-05-14 GPU LLM 제거) ───
|
# ─── 단일 generation 호스트 routing (2026-05-14 GPU LLM 제거) ───
|
||||||
|
|||||||
@@ -1,135 +0,0 @@
|
|||||||
# Phase 2A — Embedding candidate compose override (Diagnose only)
|
|
||||||
#
|
|
||||||
# Profile-isolated: `--profile embed-cand` 명시 opt-in. default up 시 미기동.
|
|
||||||
# production fastapi/postgres/reranker 에 영향 0.
|
|
||||||
# 본 PR 종료 시 별 chore (PR-2A-Chunks-Cand-Cleanup-1) 에서 제거.
|
|
||||||
#
|
|
||||||
# 후보 상태 (2026-05-23):
|
|
||||||
# - me5_large_inst : ✅ smoke PASS (dim 1024)
|
|
||||||
# - bge_mgemma2 : ❌ Phase 2A-Extended 별 PR 이관 (9B FP16 → VRAM OOM risk + 다운로드 cost)
|
|
||||||
# - me5_ko : ❌ 폐기 (401 Unauthorized, gated/모델명 부정확)
|
|
||||||
# - snowflake_l_v2 : 신규 추가 (Snowflake/snowflake-arctic-embed-l-v2.0, 2024-12, multilingual 강화)
|
|
||||||
#
|
|
||||||
# 사용:
|
|
||||||
# docker compose -f docker-compose.yml -f docker-compose.override.cand.yml \
|
|
||||||
# --profile embed-cand up -d embedding-cand-me5-inst
|
|
||||||
#
|
|
||||||
# 호출 (DS network 내부):
|
|
||||||
# http://embedding-cand-me5-inst:80/embed
|
|
||||||
# http://embedding-cand-snowflake-l-v2:80/embed
|
|
||||||
|
|
||||||
services:
|
|
||||||
embedding-cand-me5-inst:
|
|
||||||
image: ghcr.io/huggingface/text-embeddings-inference:1.7
|
|
||||||
restart: unless-stopped
|
|
||||||
container_name: hyungi_document_server-embedding-cand-me5-inst-1
|
|
||||||
expose:
|
|
||||||
- "80"
|
|
||||||
environment:
|
|
||||||
- MODEL_ID=intfloat/multilingual-e5-large-instruct
|
|
||||||
- MAX_BATCH_TOKENS=8192
|
|
||||||
- MAX_CONCURRENT_REQUESTS=4
|
|
||||||
volumes:
|
|
||||||
- embedding_cand_me5_inst_cache:/data
|
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- driver: nvidia
|
|
||||||
count: 1
|
|
||||||
capabilities: [gpu]
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "curl", "-fsS", "http://localhost/health"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 10s
|
|
||||||
retries: 5
|
|
||||||
start_period: 60s
|
|
||||||
profiles: ["embed-cand"]
|
|
||||||
|
|
||||||
embedding-cand-snowflake-l-v2:
|
|
||||||
image: ghcr.io/huggingface/text-embeddings-inference:1.7
|
|
||||||
restart: unless-stopped
|
|
||||||
container_name: hyungi_document_server-embedding-cand-snowflake-l-v2-1
|
|
||||||
expose:
|
|
||||||
- "80"
|
|
||||||
environment:
|
|
||||||
- MODEL_ID=Snowflake/snowflake-arctic-embed-l-v2.0
|
|
||||||
- MAX_BATCH_TOKENS=8192
|
|
||||||
- MAX_CONCURRENT_REQUESTS=4
|
|
||||||
volumes:
|
|
||||||
- embedding_cand_snowflake_l_v2_cache:/data
|
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- driver: nvidia
|
|
||||||
count: 1
|
|
||||||
capabilities: [gpu]
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "curl", "-fsS", "http://localhost/health"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 10s
|
|
||||||
retries: 5
|
|
||||||
start_period: 60s
|
|
||||||
profiles: ["embed-cand"]
|
|
||||||
|
|
||||||
# ===== 비활성 후보 (Phase 2A-Extended 별 PR 이관 또는 폐기) =====
|
|
||||||
# 진단 박제만 보존. 본 PR scope 외.
|
|
||||||
|
|
||||||
embedding-cand-bge-mgemma2:
|
|
||||||
image: ghcr.io/huggingface/text-embeddings-inference:1.7
|
|
||||||
container_name: hyungi_document_server-embedding-cand-bge-mgemma2-1
|
|
||||||
expose:
|
|
||||||
- "80"
|
|
||||||
environment:
|
|
||||||
- MODEL_ID=BAAI/bge-multilingual-gemma2
|
|
||||||
- MAX_BATCH_TOKENS=8192
|
|
||||||
- MAX_CONCURRENT_REQUESTS=4
|
|
||||||
volumes:
|
|
||||||
- embedding_cand_bge_mgemma2_cache:/data
|
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- driver: nvidia
|
|
||||||
count: 1
|
|
||||||
capabilities: [gpu]
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "curl", "-fsS", "http://localhost/health"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 10s
|
|
||||||
retries: 5
|
|
||||||
start_period: 300s
|
|
||||||
profiles: ["embed-cand-extended"] # 본 PR 미사용. extended 별 profile.
|
|
||||||
|
|
||||||
embedding-cand-me5-ko:
|
|
||||||
image: ghcr.io/huggingface/text-embeddings-inference:1.7
|
|
||||||
container_name: hyungi_document_server-embedding-cand-me5-ko-1
|
|
||||||
expose:
|
|
||||||
- "80"
|
|
||||||
environment:
|
|
||||||
- MODEL_ID=dragonkue/multilingual-e5-large-ko
|
|
||||||
- MAX_BATCH_TOKENS=8192
|
|
||||||
- MAX_CONCURRENT_REQUESTS=4
|
|
||||||
volumes:
|
|
||||||
- embedding_cand_me5_ko_cache:/data
|
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- driver: nvidia
|
|
||||||
count: 1
|
|
||||||
capabilities: [gpu]
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "curl", "-fsS", "http://localhost/health"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 10s
|
|
||||||
retries: 5
|
|
||||||
start_period: 60s
|
|
||||||
profiles: ["embed-cand-disabled"] # 401 fail. 사용 X.
|
|
||||||
|
|
||||||
volumes:
|
|
||||||
embedding_cand_me5_inst_cache:
|
|
||||||
embedding_cand_snowflake_l_v2_cache:
|
|
||||||
embedding_cand_bge_mgemma2_cache:
|
|
||||||
embedding_cand_me5_ko_cache:
|
|
||||||
@@ -1,101 +0,0 @@
|
|||||||
# Phase 2B — Reranker candidate compose override (Diagnose only)
|
|
||||||
#
|
|
||||||
# Profile-isolated: `--profile rerank-cand` 명시 opt-in. default up 시 미기동.
|
|
||||||
# production fastapi/postgres/reranker(bge-reranker-v2-m3) 에 영향 0.
|
|
||||||
# 본 PR 종료 후 별 chore (PR-2B-Rerank-Cand-Cleanup-1) 에서 제거.
|
|
||||||
#
|
|
||||||
# 후보 상태 (2026-05-23):
|
|
||||||
# - gte_ml_base : Apache 2.0, 305M, smoke 대기
|
|
||||||
# - mxbai_large : Apache 2.0, ~435M, safetensors 부재 — TEI smoke risk
|
|
||||||
# - bge_v2_gemma_2b : Gemma 라이센스, 2.5B FP16 ~5GB, smoke 대기
|
|
||||||
#
|
|
||||||
# 사용:
|
|
||||||
# docker compose -f docker-compose.yml -f docker-compose.override.rerank-cand.yml \
|
|
||||||
# --profile rerank-cand up -d rerank-cand-gte-ml-base
|
|
||||||
|
|
||||||
services:
|
|
||||||
rerank-cand-gte-ml-base:
|
|
||||||
image: ghcr.io/huggingface/text-embeddings-inference:1.7
|
|
||||||
restart: unless-stopped
|
|
||||||
container_name: hyungi_document_server-rerank-cand-gte-ml-base-1
|
|
||||||
expose:
|
|
||||||
- "80"
|
|
||||||
environment:
|
|
||||||
- MODEL_ID=Alibaba-NLP/gte-multilingual-reranker-base
|
|
||||||
- MAX_BATCH_TOKENS=8192
|
|
||||||
- MAX_CONCURRENT_REQUESTS=4
|
|
||||||
volumes:
|
|
||||||
- rerank_cand_gte_ml_base_cache:/data
|
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- driver: nvidia
|
|
||||||
count: 1
|
|
||||||
capabilities: [gpu]
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "curl", "-fsS", "http://localhost/health"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 10s
|
|
||||||
retries: 5
|
|
||||||
start_period: 60s
|
|
||||||
profiles: ["rerank-cand"]
|
|
||||||
|
|
||||||
rerank-cand-mxbai-large:
|
|
||||||
image: ghcr.io/huggingface/text-embeddings-inference:1.7
|
|
||||||
restart: unless-stopped
|
|
||||||
container_name: hyungi_document_server-rerank-cand-mxbai-large-1
|
|
||||||
expose:
|
|
||||||
- "80"
|
|
||||||
environment:
|
|
||||||
- MODEL_ID=mixedbread-ai/mxbai-rerank-large-v1
|
|
||||||
- MAX_BATCH_TOKENS=8192
|
|
||||||
- MAX_CONCURRENT_REQUESTS=4
|
|
||||||
volumes:
|
|
||||||
- rerank_cand_mxbai_large_cache:/data
|
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- driver: nvidia
|
|
||||||
count: 1
|
|
||||||
capabilities: [gpu]
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "curl", "-fsS", "http://localhost/health"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 10s
|
|
||||||
retries: 5
|
|
||||||
start_period: 60s
|
|
||||||
profiles: ["rerank-cand"]
|
|
||||||
|
|
||||||
rerank-cand-bge-v2-gemma-2b:
|
|
||||||
image: ghcr.io/huggingface/text-embeddings-inference:1.7
|
|
||||||
restart: unless-stopped
|
|
||||||
container_name: hyungi_document_server-rerank-cand-bge-v2-gemma-2b-1
|
|
||||||
expose:
|
|
||||||
- "80"
|
|
||||||
environment:
|
|
||||||
- MODEL_ID=BAAI/bge-reranker-v2-gemma
|
|
||||||
- MAX_BATCH_TOKENS=8192
|
|
||||||
- MAX_CONCURRENT_REQUESTS=2
|
|
||||||
volumes:
|
|
||||||
- rerank_cand_bge_v2_gemma_2b_cache:/data
|
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- driver: nvidia
|
|
||||||
count: 1
|
|
||||||
capabilities: [gpu]
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "curl", "-fsS", "http://localhost/health"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 10s
|
|
||||||
retries: 5
|
|
||||||
start_period: 120s
|
|
||||||
profiles: ["rerank-cand"]
|
|
||||||
|
|
||||||
volumes:
|
|
||||||
rerank_cand_gte_ml_base_cache:
|
|
||||||
rerank_cand_mxbai_large_cache:
|
|
||||||
rerank_cand_bge_v2_gemma_2b_cache:
|
|
||||||
+1
-14
@@ -149,7 +149,7 @@ services:
|
|||||||
# → 32 한도 초과 → 413. 64 로 늘림.
|
# → 32 한도 초과 → 413. 64 로 늘림.
|
||||||
# GPU VRAM free 6199MiB 충분. baseline path (MAX_RERANK_INPUT=200) 영향 0.
|
# GPU VRAM free 6199MiB 충분. baseline path (MAX_RERANK_INPUT=200) 영향 0.
|
||||||
- MAX_BATCH_TOKENS=16384
|
- MAX_BATCH_TOKENS=16384
|
||||||
- MAX_CLIENT_BATCH_SIZE=64
|
- MAX_CLIENT_BATCH_SIZE=256 # 2026-06-18 fix: 64→256, MAX_RERANK_INPUT=200 커버 (batch>64 ERROR=RRF silent fallback 해소; MAX_BATCH_TOKENS가 VRAM 상한이라 entries 증가는 VRAM 무관)
|
||||||
- MAX_CONCURRENT_REQUESTS=4
|
- MAX_CONCURRENT_REQUESTS=4
|
||||||
volumes:
|
volumes:
|
||||||
- reranker_cache:/data
|
- reranker_cache:/data
|
||||||
@@ -168,19 +168,6 @@ services:
|
|||||||
start_period: 120s
|
start_period: 120s
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
ai-gateway:
|
|
||||||
build: ./gpu-server/services/ai-gateway
|
|
||||||
ports:
|
|
||||||
- "127.0.0.1:8081:8080"
|
|
||||||
environment:
|
|
||||||
- PRIMARY_ENDPOINT=http://100.76.254.116:8801/v1/chat/completions
|
|
||||||
- FALLBACK_ENDPOINT=http://ollama:11434/v1/chat/completions
|
|
||||||
- CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
|
|
||||||
- DAILY_BUDGET_USD=${DAILY_BUDGET_USD:-5.00}
|
|
||||||
# depends_on: ollama 제거 (2026-06-08) — ollama 서비스가 standalone 으로 이관됨.
|
|
||||||
# FALLBACK_ENDPOINT 의 ollama:11434 는 standalone(동일 hostname, DS 망 부착)으로 해소.
|
|
||||||
restart: unless-stopped
|
|
||||||
|
|
||||||
fastapi:
|
fastapi:
|
||||||
build: ./app
|
build: ./app
|
||||||
ports:
|
ports:
|
||||||
|
|||||||
@@ -1394,7 +1394,7 @@ def main() -> int:
|
|||||||
"--reranker-backend",
|
"--reranker-backend",
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help="Phase 2B Diagnose reranker dispatcher slug (baseline | cand_gte_ml_base). 미지정 = production.",
|
help="Phase 2B Diagnose reranker dispatcher slug (baseline). 후보 cand_gte_ml_base = NO-GO 종결·teardown(2026-06-18). 미지정 = production.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--rewrite-backend",
|
"--rewrite-backend",
|
||||||
|
|||||||
Reference in New Issue
Block a user