Files
hyungi_document_server/docker-compose.yml
hyungi 5cabf728e6 fix(search): reranker MAX_CLIENT_BATCH_SIZE 64→256
rerank_service.py 가 후보를 MAX_RERANK_INPUT=200 까지 청크 없이 한 번에 TEI 로 POST → TEI 한도 64 초과(85) 시 HTTPError → RRF silent fallback(리랭크 누락=검색 품질 저하, 48h 4회). MAX_BATCH_TOKENS=16384 가 VRAM 상한이라 client batch entries 한도만 256(MAX_RERANK_INPUT 200 커버)으로 상향, reranker 만 재생성. 검증: 85-text rerank HTTP 200, batch 에러 0.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-17 23:35:43 +00:00

274 lines
11 KiB
YAML
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
services:
postgres:
image: pgvector/pgvector:pg16
volumes:
- pgdata:/var/lib/postgresql/data
- ./migrations:/docker-entrypoint-initdb.d
environment:
POSTGRES_DB: pkm
POSTGRES_USER: pkm
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
ports:
- "100.110.63.63:15432:5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -U pkm"]
interval: 5s
timeout: 5s
retries: 5
restart: unless-stopped
kordoc-service:
build: ./services/kordoc
ports:
- "127.0.0.1:3100:3100"
volumes:
- ${NAS_NFS_PATH:-/mnt/nas/Document_Server}:/documents:ro
mem_limit: 4g
memswap_limit: 4g
healthcheck:
test: ["CMD", "node", "-e", "fetch('http://localhost:3100/health').then(r=>{process.exit(r.ok?0:1)}).catch(()=>process.exit(1))"]
interval: 10s
timeout: 5s
retries: 3
restart: unless-stopped
ocr-service:
build: ./services/ocr
expose:
- "3200"
volumes:
- ${NAS_NFS_PATH:-/mnt/nas/Document_Server}:/documents:ro
- ocr_models:/root/.cache
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3200/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 180s
restart: unless-stopped
# Phase 1B (2026-05-01): PDF → markdown 변환. ocr-service 와 별도 컨테이너 (deps 충돌 회피).
marker-service:
build: ./services/marker
ports:
- "127.0.0.1:3300:3300"
expose:
- "3300"
environment:
- HF_HOME=/models/huggingface
- TORCH_HOME=/models/torch
# D-1 (crawl-24x7): idle-unload 전환 — 영구 점유(~3.5GB) 해제가 90% 봉투의 전제.
# /ready 는 idle 에서도 200 (fastapi depends_on service_healthy 유지).
# 롤백 = MARKER_PRELOAD=1 + MARKER_IDLE_UNLOAD_MINUTES=0.
- MARKER_PRELOAD=0
- MARKER_IDLE_UNLOAD_MINUTES=${MARKER_IDLE_UNLOAD_MINUTES:-30}
volumes:
- ${NAS_NFS_PATH:-/mnt/nas/Document_Server}:/documents:ro
- marker_models:/models
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:3300/ready"]
interval: 30s
timeout: 10s
retries: 3
start_period: 300s
restart: unless-stopped
stt-service:
# 2026-05-08 (D9 Track B revised): GPU is canonical STT owner.
# 정책: Mac mini = Gemma 26B 전용 우선이므로 STT/Whisper 는 호출량 무관 GPU 서버 소유.
# 이전 "Mac mini 이전본" 주석은 trace 오인 기반이었고 본 revised 결정으로 폐기.
# fastapi 의 STT_ENDPOINT 는 `http://stt-service:3300` (compose 내부 DNS) 사용.
build: ./services/stt
expose:
- "3300"
volumes:
- ${NAS_NFS_PATH:-/mnt/nas/Document_Server}:/documents:ro
- stt_models:/root/.cache
environment:
- WHISPER_MODEL=${WHISPER_MODEL:-large-v3}
- WHISPER_DEVICE=${WHISPER_DEVICE:-cuda}
- WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE:-float16}
# D-1 (crawl-24x7): idle-unload 전환 — 영구 점유(~4GB) 해제가 90% 봉투의 전제.
# 콜드로드 수초~수십 초는 배치 작업이라 무방 (stt_worker read=1800s 가 흡수).
# 롤백 = STT_PRELOAD=1 + STT_IDLE_UNLOAD_MINUTES=0.
- STT_PRELOAD=0
- STT_IDLE_UNLOAD_MINUTES=${STT_IDLE_UNLOAD_MINUTES:-30}
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
healthcheck:
# D-1: idle-unload 도입으로 '모델 적재' 는 더 이상 상시 상태가 아님 — cuda 가용성만
# healthy 기준. 모델 적재 여부는 /ready 의 models_loaded 필드로 관측(정보성).
test: ["CMD", "python3", "-c", "import json,urllib.request,sys; r=urllib.request.urlopen('http://localhost:3300/ready'); sys.exit(0 if json.load(r).get('cuda') else 1)"]
interval: 30s
timeout: 10s
retries: 3
start_period: 300s
restart: unless-stopped
# ── ollama 서비스 제거 (2026-06-08) ──
# 정본 ollama = standalone `~/ollama/docker-compose.yml`(container_name: ollama).
# 그 컨테이너가 hyungi_document_server_default 망(external) + 동일 볼륨
# hyungi_document_server_ollama_data(external, bge-m3) 부착으로 fastapi 의 `ollama:11434`
# 임베딩을 이미 서빙(재부팅에도 durable). 본 중복 서비스는 같은 host 127.0.0.1:11434 를
# 점유 다퉈, 재부팅 후 `docker compose up` 을 'port already allocated' 로 abort →
# 뒤 의존서비스(caddy·frontend) 미기동 = 웹 outage 유발 → 제거. (ollama_data 볼륨 def 는
# standalone 이 external 로 참조하므로 아래 volumes: 에 보존.)
# Phase 1.3: bge-reranker-v2-m3 (TEI) — internal only, fastapi에서 reranker:80으로 호출
# fastapi가 depends_on 안 함 → 단독 시작 가능, 없어도 fastapi 동작 (rerank=false fallback)
reranker:
image: ghcr.io/huggingface/text-embeddings-inference:1.7
container_name: hyungi_document_server-reranker-1
expose:
- "80"
environment:
- MODEL_ID=BAAI/bge-reranker-v2-m3
# PR-2Q-Rerank-Payload-Fix (2026-05-24): 2 env 변경 — 413 root cause 분리.
# (a) MAX_BATCH_TOKENS 8192 → 16384: 각 batch 의 token sum 한도
# (b) MAX_CLIENT_BATCH_SIZE 32 → 64: 1 request 안 entries 수 한도 (default 32).
# multi-query cap 60 chunks (×2 chunks_per_doc dedup) 가 batch entries 54~60
# → 32 한도 초과 → 413. 64 로 늘림.
# GPU VRAM free 6199MiB 충분. baseline path (MAX_RERANK_INPUT=200) 영향 0.
- MAX_BATCH_TOKENS=16384
- MAX_CLIENT_BATCH_SIZE=256 # 2026-06-18 fix: 64→256, MAX_RERANK_INPUT=200 커버 (batch>64 ERROR=RRF silent fallback 해소; MAX_BATCH_TOKENS가 VRAM 상한이라 entries 증가는 VRAM 무관)
- MAX_CONCURRENT_REQUESTS=4
volumes:
- reranker_cache:/data
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
healthcheck:
test: ["CMD", "curl", "-fsS", "http://localhost/health"]
interval: 30s
timeout: 5s
retries: 3
start_period: 120s
restart: unless-stopped
fastapi:
build: ./app
ports:
- "100.110.63.63:8000:8000"
volumes:
- ${NAS_NFS_PATH:-/mnt/nas/Document_Server}:/documents
- ./config.yaml:/app/config.yaml:ro
- ./domain_policy.yaml:/app/domain_policy.yaml:ro
- ./scripts:/app/scripts:ro
- ./logs:/app/logs
- ./migrations:/app/migrations:ro
depends_on:
postgres:
condition: service_healthy
kordoc-service:
condition: service_healthy
marker-service:
condition: service_healthy
env_file:
- credentials.env
environment:
- DATABASE_URL=postgresql+asyncpg://pkm:${POSTGRES_PASSWORD}@postgres:5432/pkm
- KORDOC_ENDPOINT=http://kordoc-service:3100
- OCR_ENDPOINT=http://ocr-service:3200
- MARKER_ENDPOINT=http://marker-service:3300
- MARKER_CONTAINER_PATH_PREFIX=/documents
# 2026-05-08 (D9 Track B revised): GPU stt-service 정식 승격, 내부 DNS 사용.
- STT_ENDPOINT=http://stt-service:3300
# KGS Code 등 외부 학습 자료 추가 스캔 경로 (host .env 에서 주입). 빈 값이면 비활성.
- ADDITIONAL_WATCH_TARGETS=${ADDITIONAL_WATCH_TARGETS:-}
# PR-MacMini-Derived-Worker-1
- STUDY_EXPLANATION_ENABLED=${STUDY_EXPLANATION_ENABLED:-true}
- INTERNAL_WORKER_TOKEN=${INTERNAL_WORKER_TOKEN}
# Voice Memo PoC v1 — bot 계정 한정 long-expiry access token. default false → 일반 운영 영향 0.
# 활성화: host .env 에 VOICE_MEMO_BOT_TOKEN_ENABLED=true. plan: rosy-launching-otter.md
- VOICE_MEMO_BOT_TOKEN_ENABLED=${VOICE_MEMO_BOT_TOKEN_ENABLED:-false}
- VOICE_MEMO_BOT_USERNAME=${VOICE_MEMO_BOT_USERNAME:-voice-memo-bot}
- VOICE_MEMO_BOT_TOKEN_EXPIRE_DAYS=${VOICE_MEMO_BOT_TOKEN_EXPIRE_DAYS:-365}
# PR-Notebook-Client-1 — notebook-client-bot long-expiry token (laptop-worker-bot wrapper 재활용,
# 1B Pilot dormant 후 username rename). default false → 운영 영향 0.
- LAPTOP_WORKER_BOT_TOKEN_ENABLED=${LAPTOP_WORKER_BOT_TOKEN_ENABLED:-false}
- LAPTOP_WORKER_BOT_USERNAME=${LAPTOP_WORKER_BOT_USERNAME:-laptop-worker-bot}
- LAPTOP_WORKER_BOT_TOKEN_EXPIRE_DAYS=${LAPTOP_WORKER_BOT_TOKEN_EXPIRE_DAYS:-365}
# PR-2 of DS AI routing policy (2026-05-23) — backends dispatcher via llm-router.
# router_url default points at Mac mini Tailscale interface :8890 (PR-1).
# DS_BACKENDS_VIA_ROUTER=false 로 legacy 직접 호출 path 즉시 복귀.
- LLM_ROUTER_URL=${LLM_ROUTER_URL:-http://100.76.254.116:8890}
- DS_BACKENDS_VIA_ROUTER=${DS_BACKENDS_VIA_ROUTER:-true}
restart: unless-stopped
frontend:
build: ./frontend
ports:
- "127.0.0.1:3000:3000"
depends_on:
- fastapi
restart: unless-stopped
# crawl-24x7 A-8 1차: 전 소스 헬스 패널 — 내부 전용 (읽기 전용 SELECT 만).
# '내부 전용' 성립 구현 = 별도 바인딩뿐 (r4 결정): Tailscale 인터페이스에만 publish.
# 기존 SvelteKit 라우트(vhost=Host 헤더 검사=앱 가드 환원)나 프록시 경로 차단(경로 가드
# 회귀)으로 옮기지 말 것. caddy/home-caddy 라우트 추가 금지. fastapi/postgres 바인딩 선례.
crawl-health:
build: ./services/crawl-health
ports:
- "100.110.63.63:8765:8765"
environment:
- CRAWL_HEALTH_DSN=postgresql://pkm:${POSTGRES_PASSWORD}@postgres:5432/pkm
depends_on:
postgres:
condition: service_healthy
restart: unless-stopped
# crawl-24x7 B-3: 구독 세션 Playwright fetch 격리 — internal-only (host 포트·caddy 라우트 금지).
# 브라우저 hang/크래시가 fastapi APScheduler 를 잠식하지 않게 별도 컨테이너 + mem cap.
# 세션 파일(쿠키=credential 등가물)은 repo 밖 호스트 경로 ro mount (600, gitignore 무관 영역).
playwright-fetcher:
build: ./services/playwright-fetcher
volumes:
- /home/hyungi/.local/share/crawl-auth:/auth:ro
mem_limit: 2g
restart: unless-stopped
caddy:
image: caddy:2
ports:
- "8080:80"
volumes:
- ./Caddyfile:/etc/caddy/Caddyfile
- caddy_data:/data
depends_on:
- fastapi
- frontend
restart: unless-stopped
volumes:
pgdata:
caddy_data:
ollama_data:
reranker_cache:
ocr_models:
stt_models:
marker_models: