diff --git a/app/services/search/rerank_service.py b/app/services/search/rerank_service.py
index 877926a..706fe79 100644
--- a/app/services/search/rerank_service.py
+++ b/app/services/search/rerank_service.py
@@ -17,6 +17,7 @@ snippet 생성:
 from __future__ import annotations
 
 import asyncio
+import os
 import re
 from typing import TYPE_CHECKING
 
@@ -33,8 +34,39 @@ logger = setup_logger("rerank")
 # 동시 rerank 호출 제한 (GPU saturation 방지)
 RERANK_SEMAPHORE = asyncio.Semaphore(2)
 
-# rerank input 크기 제한 (latency / VRAM hard cap)
-MAX_RERANK_INPUT = 200
+# Hard cap on rerank input size (bounds latency / VRAM).
+# Two-node migration (2026-07-02): tunable via the MAX_RERANK_INPUT env var. The Mac mini
+# llama.cpp reranker scales linearly with candidate count (measured from the NAS:
+# 50=0.60s / 100=0.95s / 200=1.89s), so 50 is recommended for NAS deployments.
+# The default of 200 keeps the current GPU TEI behaviour (no regression).
+_DEFAULT_MAX_RERANK_INPUT = 200
+
+
+def _read_max_rerank_input() -> int:
+    """Return the rerank input cap from the environment, or the default.
+
+    An unset, blank, non-integer, or non-positive MAX_RERANK_INPUT falls back to
+    the default (with a warning when a value was supplied) so that a bad
+    deployment setting can neither break module import nor disable the cap.
+    """
+    raw = os.getenv("MAX_RERANK_INPUT", "").strip()
+    if not raw:
+        return _DEFAULT_MAX_RERANK_INPUT
+    try:
+        value = int(raw)
+    except ValueError:
+        value = None
+    if value is None or value <= 0:
+        logger.warning(
+            "Ignoring invalid MAX_RERANK_INPUT=%r; using default %d",
+            raw,
+            _DEFAULT_MAX_RERANK_INPUT,
+        )
+        return _DEFAULT_MAX_RERANK_INPUT
+    return value
+
+
+MAX_RERANK_INPUT = _read_max_rerank_input()
 MAX_CHUNKS_PER_DOC = 2
 
 # Soft timeout (초)