Feat/two node endpoints #51

Merged
hyungi merged 3 commits from feat/two-node-endpoints into main 2026-07-02 14:31:28 +09:00
Showing only changes of commit d53fcc2b36 - Show all commits
+6 -2
View File
@@ -17,6 +17,7 @@ snippet 생성:
from __future__ import annotations
import asyncio
import os
import re
from typing import TYPE_CHECKING
@@ -33,8 +34,11 @@ logger = setup_logger("rerank")
# Limit on concurrent rerank calls (prevents GPU saturation).
RERANK_SEMAPHORE = asyncio.Semaphore(2)


def _env_int(name: str, default: int) -> int:
    """Return environment variable *name* parsed as an int.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or blank.

    Returns:
        The parsed integer, or *default* when the variable is unset, empty,
        or whitespace-only.

    Raises:
        ValueError: If the variable holds a non-blank value that is not a
            valid integer. The message names the variable.
    """
    raw = os.getenv(name)
    # A variable that is defined but empty (e.g. `NAME=` in an env file) is
    # treated as unset; int("") would otherwise raise at import time.
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(f"{name} must be an integer, got {raw!r}") from err


# Rerank input size cap (latency / VRAM hard cap).
# Two-node migration (2026-07-02): tunable via env MAX_RERANK_INPUT. The
# llama.cpp rerank on the Mac mini scales linearly with the candidate count
# (measured from the NAS: 50=0.60s / 100=0.95s / 200=1.89s), so 50 is
# recommended for NAS deployments.
# Default 200 = current behavior (GPU TEI), no regression.
# NOTE(review): non-positive values are passed through unchanged; confirm how
# the consumer of this cap handles 0 or negative numbers.
MAX_RERANK_INPUT = _env_int("MAX_RERANK_INPUT", 200)
MAX_CHUNKS_PER_DOC = 2
# Soft timeout (초)