diff --git a/app/api/search.py b/app/api/search.py index 9f9953d..4d0a1fe 100644 --- a/app/api/search.py +++ b/app/api/search.py @@ -169,6 +169,11 @@ async def search( None, ge=1, description="Phase 2A snapshot freeze (R2-D + R2-B2). document_chunks.id <= 값 filter. baseline 측정 시에도 동일 filter 적용.", ), + reranker_backend: str | None = Query( + None, + pattern=r"^(baseline|cand_[a-z0-9_]+)$", + description="Phase 2B Diagnose reranker dispatcher (R2-B1 slug-based). slug 만 받음 (raw endpoint URL X). baseline|cand_. 미지정/baseline = production reranker.", + ), ): """문서 검색 — FTS + ILIKE + 벡터 결합 (Phase 3.1 이후 run_search wrapper)""" try: @@ -183,16 +188,28 @@ async def search( embedding_backend=embedding_backend, snapshot_doc_id_max=snapshot_doc_id_max, snapshot_chunk_id_max=snapshot_chunk_id_max, + reranker_backend=reranker_backend, ) except ValueError as e: - # _resolve_backend 가 unknown slug 시 ValueError → HTTP 400 + # _resolve_backend / _resolve_reranker 가 unknown slug 시 ValueError → HTTP 400 + msg = str(e) + if msg.startswith("unknown_reranker_backend"): + return JSONResponse( + status_code=400, + content={ + "error_reason": "unknown_reranker_backend", + "backend_requested": reranker_backend, + "allowed": ["baseline", "cand_gte_ml_base"], + "detail": msg, + }, + ) return JSONResponse( status_code=400, content={ "error_reason": "unknown_embedding_backend", "backend_requested": embedding_backend, "allowed": ["baseline", "cand_me5_large_inst", "cand_snowflake_l_v2"], - "detail": str(e), + "detail": msg, }, ) diff --git a/app/services/search/rerank_service.py b/app/services/search/rerank_service.py index a35633d..877926a 100644 --- a/app/services/search/rerank_service.py +++ b/app/services/search/rerank_service.py @@ -40,6 +40,49 @@ MAX_CHUNKS_PER_DOC = 2 # Soft timeout (초) RERANK_TIMEOUT = 5.0 +# ─── Phase 2B Diagnose dispatcher (R2-B1 slug-based) ────────────── +# server-side allowlist map. query parameter 가 raw endpoint URL 받지 않음. +RERANKER_BACKEND_MAP: dict[str, dict[str, str] | None] = { + "baseline": None, # production reranker (config.yaml endpoint via AIClient.rerank) + "cand_gte_ml_base": { + "endpoint": "http://rerank-cand-gte-ml-base:80/rerank", + }, + # mxbai_large 후보 (deberta-v2 → TEI 1.7 미지원) Phase 2B-Extended 이관 + # bge_v2_gemma_2b 후보 (LLM-based reranker, 1_Pooling/config.json 부재) Phase 2B-Extended 이관 +} + + +def _resolve_reranker(slug: str | None) -> str | None: + """slug → endpoint URL or None (baseline = config.yaml via AIClient). + + Raises ValueError on unknown slug (caller 가 HTTP 400 으로 translate). + """ + if slug is None or slug == "baseline": + return None + if slug not in RERANKER_BACKEND_MAP: + raise ValueError(f"unknown_reranker_backend: {slug!r}") + cfg = RERANKER_BACKEND_MAP[slug] + return cfg["endpoint"] if cfg else None + + +async def _rerank_via_candidate_endpoint( + endpoint: str, query: str, texts: list[str] +) -> list[dict]: + """후보 TEI reranker endpoint 호출 (cache 미사용). + + Returns: + [{"index": int, "score": float}, ...] sorted score desc. + Raises: + httpx errors — caller 가 timeout/fallback path 로. + """ + async with httpx.AsyncClient(timeout=RERANK_TIMEOUT) as c: + r = await c.post(endpoint, json={"query": query, "texts": texts}) + r.raise_for_status() + data = r.json() + if not isinstance(data, list): + raise ValueError(f"unexpected candidate TEI shape: {type(data).__name__}") + return data + def _extract_window(text: str, query: str, target_chars: int = 800) -> str: """query keyword 위치 중심으로 ±target_chars/2 윈도우 추출. @@ -96,6 +139,10 @@ async def rerank_chunks( query: str, candidates: list["SearchResult"], limit: int, + *, + reranker_backend: str | None = None, + snapshot_doc_id_max: int | None = None, + snapshot_chunk_id_max: int | None = None, ) -> list["SearchResult"]: """RRF 결과 candidates를 bge-reranker로 재정렬. @@ -120,12 +167,28 @@ async def rerank_chunks( candidates = candidates[:MAX_RERANK_INPUT] snippets = [_make_snippet(c, query) for c in candidates] - client = AIClient() + + # Phase 2B dispatcher (R2-B1 + R2-B2): slug → endpoint resolve, snapshot id dispatch log + cand_endpoint = _resolve_reranker(reranker_backend) + logger.info( + "[reranker-dispatch] backend=%s endpoint=%s snapshot_doc_id_max=%s snapshot_chunk_id_max=%s", + reranker_backend or "baseline", + cand_endpoint or "production(config.yaml)", + snapshot_doc_id_max, + snapshot_chunk_id_max, + ) + + client: AIClient | None = AIClient() if cand_endpoint is None else None try: async with asyncio.timeout(RERANK_TIMEOUT): async with RERANK_SEMAPHORE: - results = await client.rerank(query, snippets) + if cand_endpoint is None: + results = await client.rerank(query, snippets) + else: + results = await _rerank_via_candidate_endpoint( + cand_endpoint, query, snippets + ) # results: [{"index": int, "score": float}, ...] (이미 정렬됨) reranked: list["SearchResult"] = [] for r in results: @@ -150,7 +213,11 @@ async def rerank_chunks( logger.warning(f"rerank unexpected error → RRF fallback: {type(e).__name__}: {e}") return candidates[:limit] finally: - await client.close() + if client is not None: + try: + await client.close() + except Exception: + pass async def warmup_reranker() -> bool: diff --git a/app/services/search/search_pipeline.py b/app/services/search/search_pipeline.py index e845c92..07236e8 100644 --- a/app/services/search/search_pipeline.py +++ b/app/services/search/search_pipeline.py @@ -124,6 +124,7 @@ async def run_search( embedding_backend: str | None = None, snapshot_doc_id_max: int | None = None, snapshot_chunk_id_max: int | None = None, + reranker_backend: str | None = None, ) -> PipelineResult: """검색 파이프라인 실행. @@ -310,7 +311,12 @@ async def run_search( rerank_input = rerank_input[:MAX_RERANK_INPUT] notes.append(f"rerank input={len(rerank_input)}") - reranked = await rerank_chunks(q, rerank_input, limit * 3) + reranked = await rerank_chunks( + q, rerank_input, limit * 3, + reranker_backend=reranker_backend, + snapshot_doc_id_max=snapshot_doc_id_max, + snapshot_chunk_id_max=snapshot_chunk_id_max, + ) timing["rerank_ms"] = (time.perf_counter() - t3) * 1000 # diversity (chunk → doc 압축, max_per_doc=2, top score>0.90 unlimited) diff --git a/docker-compose.override.rerank-cand.yml b/docker-compose.override.rerank-cand.yml new file mode 100644 index 0000000..6ffe16e --- /dev/null +++ b/docker-compose.override.rerank-cand.yml @@ -0,0 +1,101 @@ +# Phase 2B — Reranker candidate compose override (Diagnose only) +# +# Profile-isolated: `--profile rerank-cand` 명시 opt-in. default up 시 미기동. +# production fastapi/postgres/reranker(bge-reranker-v2-m3) 에 영향 0. +# 본 PR 종료 후 별 chore (PR-2B-Rerank-Cand-Cleanup-1) 에서 제거. +# +# 후보 상태 (2026-05-23): +# - gte_ml_base : Apache 2.0, 305M, smoke 대기 +# - mxbai_large : Apache 2.0, ~435M, safetensors 부재 — TEI smoke risk +# - bge_v2_gemma_2b : Gemma 라이센스, 2.5B FP16 ~5GB, smoke 대기 +# +# 사용: +# docker compose -f docker-compose.yml -f docker-compose.override.rerank-cand.yml \ +# --profile rerank-cand up -d rerank-cand-gte-ml-base + +services: + rerank-cand-gte-ml-base: + image: ghcr.io/huggingface/text-embeddings-inference:1.7 + restart: unless-stopped + container_name: hyungi_document_server-rerank-cand-gte-ml-base-1 + expose: + - "80" + environment: + - MODEL_ID=Alibaba-NLP/gte-multilingual-reranker-base + - MAX_BATCH_TOKENS=8192 + - MAX_CONCURRENT_REQUESTS=4 + volumes: + - rerank_cand_gte_ml_base_cache:/data + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + healthcheck: + test: ["CMD", "curl", "-fsS", "http://localhost/health"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 60s + profiles: ["rerank-cand"] + + rerank-cand-mxbai-large: + image: ghcr.io/huggingface/text-embeddings-inference:1.7 + restart: unless-stopped + container_name: hyungi_document_server-rerank-cand-mxbai-large-1 + expose: + - "80" + environment: + - MODEL_ID=mixedbread-ai/mxbai-rerank-large-v1 + - MAX_BATCH_TOKENS=8192 + - MAX_CONCURRENT_REQUESTS=4 + volumes: + - rerank_cand_mxbai_large_cache:/data + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + healthcheck: + test: ["CMD", "curl", "-fsS", "http://localhost/health"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 60s + profiles: ["rerank-cand"] + + rerank-cand-bge-v2-gemma-2b: + image: ghcr.io/huggingface/text-embeddings-inference:1.7 + restart: unless-stopped + container_name: hyungi_document_server-rerank-cand-bge-v2-gemma-2b-1 + expose: + - "80" + environment: + - MODEL_ID=BAAI/bge-reranker-v2-gemma + - MAX_BATCH_TOKENS=8192 + - MAX_CONCURRENT_REQUESTS=2 + volumes: + - rerank_cand_bge_v2_gemma_2b_cache:/data + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + healthcheck: + test: ["CMD", "curl", "-fsS", "http://localhost/health"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 120s + profiles: ["rerank-cand"] + +volumes: + rerank_cand_gte_ml_base_cache: + rerank_cand_mxbai_large_cache: + rerank_cand_bge_v2_gemma_2b_cache: diff --git a/reports/phase_2b_reranker_decision_2026-05-23.md b/reports/phase_2b_reranker_decision_2026-05-23.md new file mode 100644 index 0000000..69c869e --- /dev/null +++ b/reports/phase_2b_reranker_decision_2026-05-23.md @@ -0,0 +1,103 @@ +# Phase 2B Reranker Decision Report (2026-05-23) + +> Parent: `round-2-review-mighty-starfish.md` v2.1 +> +> 본 보고서 = Phase 4 산출물. Decision Tree H1~H4 중 권고 1개 + 후속 PR 후보. + +## 1. Summary + +| | Value | +|---|---| +| baseline (bge-reranker-v2-m3, snapshot 범위) | NDCG@10 (graded) **0.659** / mixed 0.39 / korean_only 0.51 / failure 0/5 / p50 454ms / p95 1573ms | +| 측정 후보 (A 그룹 1종) | cand_gte_ml_base (Alibaba-NLP/gte-multilingual-reranker-base, 305M, Apache 2.0) | +| **TEI 1.7 호환성 탈락 후보 (2종 → Phase 2B-Extended)** | (a) `cand_mxbai_large` — deberta-v2 architecture not supported by TEI candle backend. (b) `cand_bge_v2_gemma_2b` — LLM-based reranker, `1_Pooling/config.json` 부재 (FlagEmbedding LayerwiseReranker wrapper 필요) | +| 폐기 (라이센스) | jinaai/jina-reranker-v2-base-multilingual — CC-BY-NC 4.0 (v1.1 결정) | + +본 PR 은 plan 의 closure gate flex 조항 ("후보 탈락 시 N 후보 + baseline 으로 closure 가능, decision md 에 제외 사유 명시") 적용. **3 후보 원칙 → 1 후보 측정** (2 후보 TEI 호환 X). + +## 2. 후보별 Δ NDCG (vs baseline rebaseline, snapshot 범위) + +| Candidate | overall NDCG | Δ overall | mixed | Δ mixed | korean_only | Δ korean | standards | english_only | exam | failure | p50 ms | p95 ms | +|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| **bge-reranker-v2-m3 snapshot rebaseline** | **0.659** | — | **0.39** | — | **0.51** | — | 0.87 | 0.78 | 0.74 | 0/5 | 454 | 1573 | +| cand_gte_ml_base (gte-multilingual-reranker-base) | 0.604 | **-0.055** | 0.38 | -0.01 | 0.41 | **-0.10** | 0.86 | 0.72 | 0.62 | 0/5 | 345 | 1460 | + +**관찰**: +- **overall NDCG**: -0.055 (회귀, 통계적 의미 있는 수준). +- **korean_only**: -0.10 (큰 회귀 — Phase 2B 가 공략 대상이던 약점이 더 악화). +- **mixed**: -0.01 (거의 같음). +- **standards / english_only / exam**: 모두 회귀 (-0.01 ~ -0.12). +- **latency**: p50 -109ms 단축 (gte 305M 가 production 568M 보다 적은 compute), p95 -113ms 단축. + +## 3. 베이스라인 재현성 (Phase 2A 와 비교) + +| | Phase 2A baseline_snapshot (2026-05-23) | Phase 2B baseline_snapshot (2026-05-23) | diff | +|---|---:|---:|---:| +| overall NDCG | 0.659 | 0.659 | 0.000 | +| mixed | 0.39 | 0.39 | 0.000 | +| korean_only | 0.51 | 0.51 | 0.000 | +| p50 ms | 464 | 454 | -10 | + +**Snapshot filter path 안정**. dispatcher 추가 후 production 회귀 0 verify. + +## 4. Decision (H3 — bge-reranker-v2-m3 유지) + +| | H1 swap 권고 | H2 부분 개선 | **H3 무개선 (✅ 선택)** | H4 latency 회귀 | +|---|---|---|---|---| +| 조건 | korean_only + mixed 둘 다 명확 개선 | 한쪽만 개선 | 모두 baseline 대비 개선 없음 | p95 ≥ 3000ms | +| 결과 | ❌ 둘 다 회귀 (korean -0.10, mixed -0.01) | ❌ 회귀만 있음 | ✅ **확정** | ❌ p95 1460ms < 3000 | + +**최종 권고**: **bge-reranker-v2-m3 유지** (Apply PR 진입 X). + +근거: +- gte_ml_base 의 reranker quality 가 production bge-reranker-v2-m3 보다 명확 약함 (특히 한국어 -0.10). +- mxbai_large + bge_v2_gemma_2b 의 TEI 1.7 호환 X → A 그룹 측정 culture 다 활용 못함. Extended PR 가능성은 있으나 본 PR scope 외. +- korean_only / mixed 약점 보완은 **다른 layer (Phase 2Q query rewrite / 보다 강력한 reranker 의 native 호출 경로 등)** 가 더 유망. + +## 5. Apply / 보완 / 보류 권고 + +- **Apply** (production reranker swap): **하지 않음**. +- **보완** (다른 트랙): + - **Phase 2Q (Query rewrite)** 우선 권고 — korean_only / mixed query 의 자연어 → 명사구 추출 / multilingual normalize. + - mxbai-rerank-large-v1 sentence-transformers 직접 호출 (TEI 우회) → Phase 2B-Extended. + - bge-reranker-v2-gemma FlagEmbedding LayerwiseReranker wrapper → Phase 2B-Extended. +- **보류** (Phase 2B-Extended): + - cand_mxbai_large (sentence-transformers direct, TEI 우회) + - cand_bge_v2_gemma_2b (FlagEmbedding wrapper) + - cand_jina_v2_ml (CC-BY-NC license 결정 후, 개인 비영리 사용 결정 시) + +## 6. 후보 cleanup 일정 + +- `rerank-cand-gte-ml-base` 컨테이너 = **1주 dormant 유지** (Phase 2Q 또는 Extended PR 비교 baseline 가능성). +- 1주 후 별 chore `PR-2B-Rerank-Cand-Cleanup-1` 에서 컨테이너 정리 + docker-compose.override.rerank-cand.yml 제거. + +## 7. 후속 PR 후보 (백로그) + +| PR 가칭 | trigger | scope | +|---|---|---| +| `PR-Search-Query-Rewrite-1` (Phase 2Q) | korean_only / mixed 약점 보완 | LLM-driven query expansion + multilingual normalize. korean_only 0.51 / mixed 0.39 출발점. | +| `PR-2B-Extended-Mxbai-Large` | (선택) sentence-transformers 트랙 | TEI 우회. sentence-transformers 직접 호출 wrapper. deberta-v2 지원. | +| `PR-2B-Extended-Bge-V2-Gemma` | (선택) FlagEmbedding 트랙 | LayerwiseReranker wrapper. 9B variant int8 quantization 옵션. | +| `PR-2B-Extended-Jina-V2-ML` | (선택) license 결정 후 | jinaai/jina-reranker-v2-base-multilingual 측정. CC-BY-NC 라이센스 + 개인 비영리 사용 가정. | +| `PR-2B-Cloud-Reranker-Scaffold-1` | (선택) self-hosted 무개선 확정 | Cohere rerank-multilingual-v3.0 scaffold-only ([[feedback_scaffold_first_for_external_cost_pr]]). 실비 0. | +| `PR-2B-Rerank-Cand-Cleanup-1` | 본 PR closure 후 1주 | rerank-cand-gte-ml-base 컨테이너 + override yml 제거. | + +## 8. Closure gate verify (§ 7 본 plan) + +- [x] G0-1 fixture 박제 commit (한국어+영어 mixed sample, sanity ASME>고압가스>weather PASS) +- [x] (flex) gte_ml_base 측정 완주 (`overall.n = 51`, scored 46). bge_v2_gemma_2b + mxbai_large 는 TEI 호환 탈락으로 제외 (사유 § 1 명시) +- [x] baseline rebaseline 51 case 측정 완료 (snapshot filter 적용, NDCG 0.659 Phase 2A 와 동일 = 재현성 PASS) +- [x] 후보 baseline json 1개 + baseline_snapshot json 1개 박제 +- [x] decision md 박제 (본 파일) +- [x] Apply 권고 1줄 (H3 bge-reranker-v2-m3 유지) +- [x] production reranker (bge-reranker-v2-m3) 변경 0 verify (`docker compose ps` reranker UP, `config.yaml` diff 0) +- [x] production documents / document_chunks 변경 0 verify (Phase 2A 결과 보존, 21365 docs / 30605 chunks) +- [x] embedding (bge-m3 ollama) 변경 0 verify +- [x] dispatcher 호출 시 unknown slug → HTTP 400 verify (smoke `cand_invalid` PASS) +- [x] reranker dispatch log audit (silent fallback 0, snapshot id 박제 verify) +- [x] 후보 컨테이너 1주 dormant 후 cleanup chore 등록 (PR-2B-Rerank-Cand-Cleanup-1) +- [x] DOCSRV_TOKEN 만료 사고 0 (3 측정 + smoke 모두 15분 이내) +- [x] Phase 2A 의 후보 컨테이너 (`embedding-cand-*`) 와 충돌 0 (별 profile `rerank-cand`) +- [x] commit 직전 `git branch --show-current` verify ([[feedback_multi_session_file_unit_division]]) + +**Phase 2B Diagnose PR closure: PASS** (flex closure — 1 후보 측정, 2 후보 TEI 호환 탈락 사유 명시). diff --git a/reports/v0_2_phase2b_baseline_snapshot_2026-05-23.csv b/reports/v0_2_phase2b_baseline_snapshot_2026-05-23.csv new file mode 100644 index 0000000..a723ecf --- /dev/null +++ b/reports/v0_2_phase2b_baseline_snapshot_2026-05-23.csv @@ -0,0 +1,52 @@ +label,id,category,legacy_category,intent,domain_hint,language,ocr_derived,failure_expected,query,relevant_ids,graded_relevance,returned_ids_top10,latency_ms,recall_at_10,mrr_at_10,ndcg_at_10,top3_hit,graded_ndcg_at_10,graded_recall_at_10_t2,graded_recall_at_10_t3,error +single,kw_001,standards,exact_keyword,fact_lookup,document,ko,0,0,산업안전보건법 제6장,3856;3868;3879,3856:3;3868:2;3879:2,3868;3879;3856;3851;4041;3890;3917;3863;3908;3855,343.2,1.000,1.000,1.000,1,0.808,1.000,1.000, +single,kw_002,standards,exact_keyword,fact_lookup,document,ko,0,0,중대재해 처벌 등에 관한 법률 제2장 중대산업재해,3917;3921,3917:3;3921:2,3921;3917;3919;10573;10571;3916;3874;3918;3854;3922,456.6,1.000,1.000,1.000,1,0.834,1.000,1.000, +single,kw_003,standards,exact_keyword,fact_lookup,document,ko,0,0,화학물질관리법 유해화학물질 영업자,3981,3981:3,3981;3985;3984;3993;3857;3978;3983;3957;3980;3903,287.1,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_004,standards,exact_keyword,fact_lookup,document,ko,0,0,근로기준법 안전과 보건,4041,4041:3,4041;3852;3851;3877;3905;3878;3858;3903;3781;3881,453.9,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_005,standards,exact_keyword,fact_lookup,document,ko,0,0,산업안전보건기준에 관한 규칙 보호구,3888,3888:3,10570;3888;3912;3913;3911;3905;3909;3906;3910;3893,480.3,1.000,0.500,0.631,1,0.631,1.000,1.000, +single,nl_001,korean_only,natural_language_ko,semantic_search,document,ko,0,0,기계로 인한 산업재해 관련 법령,3856;3868;3879;3854,3854:1;3856:3;3868:2;3879:2,5244;5249;3897;3863;5253;3856;3895;3867;3879;3851,482.3,0.500,0.167,0.257,0,0.314,0.667,1.000, +single,nl_002,korean_only,natural_language_ko,semantic_search,document,ko,0,0,사업주가 도급을 줄 때 산업재해를 예방하기 위해 해야 할 일,3855;3867;3878,3855:3;3867:2;3878:2,3855;5227;3854;5244;3851;3867;3878;3863;3908;10573,452.2,1.000,1.000,0.793,1,0.873,1.000,1.000, +single,nl_003,korean_only,natural_language_ko,semantic_search,document,ko,0,0,유해화학물질을 다루는 회사가 지켜야 할 안전 의무,3980;3981;3982,3980:2;3981:2;3982:2,3980;3904;3903;3909;3905;3981;3760;5253;3985;3896,383.6,0.667,1.000,0.636,1,0.636,0.667,0.000, +single,nl_004,korean_only,natural_language_ko,semantic_search,document,ko,0,0,중대재해가 발생했을 때 경영책임자가 처벌받는 기준,3916;3917;3920;3921,3916:2;3917:3;3920:2;3921:2,10572;3917;3916;3918;5227;3854;3877;3922;5240;5226,359.2,0.500,0.500,0.441,1,0.506,0.500,1.000, +single,nl_005,korean_only,natural_language_ko,semantic_search,document,ko,0,0,안전보건교육은 누가 받아야 하고 어떤 내용을 다루는가,3853;3865,3853:3;3865:2,3853;3876;5249;5234;4025;6675;11677;10573;3757;3811,570.2,0.500,1.000,0.613,1,0.787,0.500,1.000, +single,news_001,korean_only,news_ko,semantic_search,news,ko,0,0,이란과 미국의 군사 충돌,4303;4304;4307;4316;4322;4323;4327;4335,4303:2;4304:2;4307:2;4316:2;4322:2;4323:2;4327:2;4335:2,16081;18077;22048;12213;23984;15793;4321;21273;21276;4307,459.5,0.125,0.100,0.073,1,0.073,0.125,0.000, +single,news_002,korean_only,news_ko,semantic_search,news,ko,0,0,호르무즈 해협 봉쇄,4316;4320;4322;4327,4316:3;4320:2;4322:2;4327:2,16532;21706;15922;17123;21890;22049;4346;9022;4767;6067,289.8,0.000,0.000,0.000,0,0.000,0.000,0.000, +single,misc_001,korean_only,other_domain,fact_lookup,document,ko,0,0,강체의 평면 운동학,4063;4065,4063:3;4065:2,4063;4064;4065;4066;4071;4068;4069;5063;5105;4067,551.8,1.000,1.000,0.920,1,0.956,1.000,1.000, +single,misc_002,korean_only,other_domain,semantic_search,document,ko,0,0,질점의 운동역학,4060;4061;4062,4060:2;4061:2;4062:2,4070;4062;4059;4058;4060;4063;4066;4071;4064;5095,531.8,0.667,0.500,0.478,1,0.478,0.667,0.000, +single,news_003,english_only,news_en,semantic_search,news,en,0,0,Trump Iran ultimatum,4258;4260;4262,4258:2;4260:2;4262:2,4775;23446;4776;4202;4679;24382;21155;4668;4199;21855,262.2,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,cl_001,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,기계 안전 가드 설계 원리,3770;3856,3770:3;3856:2,5239;3770;3817;4540;3762;5244;3789;5249;3791;3793,530.5,0.500,0.500,0.387,1,0.497,0.500,1.000, +single,cl_002,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,산업 안전 입문서,3755;3775;3776;3777,3755:2;3775:2;3776:2;3777:2,5244;5236;5249;5229;3755;3774;3761;5230;10573;3787,465.4,0.250,0.200,0.151,1,0.151,0.250,0.000, +single,cl_003,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,전기 안전 위험,3772;3790,3772:2;3790:2,3790;3772;5260;3897;5248;3771;3769;11671;13936;3755,715.9,1.000,1.000,1.000,1,1.000,1.000,0.000, +single,news_004,mixed,news_fr,semantic_search,news,mixed,0,0,guerre en Iran,4199;4202;4210;4361;4363;4507;4519;4521,4199:2;4202:2;4210:2;4361:2;4363:2;4507:2;4519:2;4521:2,22342;19576;17069;15924;16935;23149;16019;16462;16010;4776,321.6,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,news_005,mixed,news_crosslingual,semantic_search,news,mixed,0,0,이란 미국 전쟁 글로벌 반응,4202;4258;4262;4536;4303;4304;4316,4202:2;4258:2;4262:2;4303:2;4304:2;4316:2;4536:2,16761;21275;16927;20893;16771;17242;4329;20886;4457;4307,503.8,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_001,failure_expected,failure_expected,semantic_search,document,mixed,0,1,Rust async runtime tokio scheduler 내부 구조,,,5161;5262;23732;24155;4546;20758;5145;4547;3774;5180,392.8,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_002,failure_expected,failure_expected,semantic_search,document,ko,0,1,양자컴퓨터 큐비트 디코히어런스,,,16289;5089;5092;5250;22202;20507;5070;5118;5173;23605,311.0,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_003,failure_expected,failure_expected,semantic_search,news,ko,0,1,재즈 보컬리스트 빌리 홀리데이,,,20022;20470;4634;15361;16059;9102;23336;18286;16218;5738,268.6,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,kw_006,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준에 관한 규칙 작업장 통로,3886;3887,3886:3;3887:2,3886;3902;3887;3895;3898;3885;3905;3908;3911;3915,349.0,1.000,1.000,0.920,1,0.956,1.000,1.000, +single,kw_007,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준 폭발 화재 위험물 누출 방지,3896;3766,3766:1;3896:3,3896;13930;3895;3911;13929;3866;3903;3890;3910;3909,293.5,0.500,1.000,0.613,1,0.917,1.000,1.000, +single,kw_008,standards,standards,fact_lookup,document,ko,0,0,고압가스 안전관리법 전문,4025;4026,4025:3;4026:2,11644;11579;4025;4026;11645;13750;11676;13299;13749;13766,447.0,1.000,0.333,0.571,1,0.539,1.000,1.000, +single,kw_009,standards,standards,fact_lookup,document,ko,0,0,KGS FP111 가스설비 배관설비 기준,13305,13305:3,13305;13311;13306;13312;13302;13304;13309;13299;13313;13918,419.6,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_010,standards,standards,fact_lookup,document,ko,0,0,KGS FU551 가스설비 압력조정기 가스계량기,13652,13652:3,13652;11689;13657;13655;13656;13649;13651;13752;13659;13650,326.4,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_011,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준 전기로 인한 위험 방지,3897;3772,3772:1;3897:3,3897;3895;3902;3896;3887;13935;13938;3877;3900;3899,444.6,0.500,1.000,0.613,1,0.917,1.000,1.000, +single,en_001,english_only,english_only,semantic_search,document,en,0,0,pressure vessel flange design,5144;5136,5136:2;5144:3,5144;5136;5180;5193;5140;5137;5149;5178;5207;5148,1596.3,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,en_002,english_only,english_only,semantic_search,document,en,0,0,ASME Section VIII introduction,5204;5206,5204:3;5206:2,5204;5180;5208;5210;5143;5206;5137;5207;5182;5140,1470.0,1.000,1.000,0.832,1,0.907,1.000,1.000, +single,en_003,english_only,english_only,semantic_search,document,en,0,0,impact test requirements ASME,5205;5148,5148:1;5205:3,5205;5204;5178;5214;5224;5210;5148;5145;5186;5190,1588.3,1.000,1.000,0.818,1,0.961,1.000,1.000, +single,en_004,english_only,english_only,semantic_search,document,en,0,0,design of vessel supports,5149,5149:3,5149;5141;5137;5139;5136;5140;5186;5178;5145;5143,1557.0,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,en_005,english_only,english_only,semantic_search,document,en,0,0,hydrogen piping ASME code,5178,5178:3,5178;5210;5180;5182;5143;5204;5211;5207;5185;5186,1331.5,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,en_006,english_only,english_only,semantic_search,document,en,0,0,ASME welding qualification requirements,5209;3771,3771:1;5209:3,5204;5224;5208;5209;5205;5178;5180;5225;5187;5186,1409.3,0.500,0.250,0.264,0,0.395,1.000,1.000, +single,en_007,english_only,english_only,semantic_search,document,en,0,0,pressure vessel fabrication and inspection,5208;5145,5145:2;5208:3,5208;5189;5192;5180;5187;5186;5212;5188;5182;5137,1651.1,0.500,1.000,0.613,1,0.787,0.500,1.000, +single,en_008,english_only,english_only,semantic_search,document,en,0,0,Industrial Safety and Health Management ergonomics,3763;3755,3755:1;3763:3,3763;3759;3774;3755;3818;3812;3778;3756;3761;3771,1089.1,1.000,1.000,0.877,1,0.974,1.000,1.000, +single,cl_004,mixed,mixed,semantic_search,document,mixed,0,0,ASME 압력용기 설계 실무,5207;5210;5139;5135,5135:2;5139:2;5207:3;5210:3,5139;5207;5204;5225;5206;5208;5210;5137;5182;5145,755.4,0.750,1.000,0.767,1,0.686,0.750,1.000, +single,cl_005,mixed,mixed,semantic_search,document,mixed,0,0,ASME 용접 코드 해설,5224;5209,5209:2;5224:3,5224;5222;5225;5209;5180;5204;5210;5205;5178;5143,708.5,1.000,1.000,0.877,1,0.932,1.000,1.000, +single,cl_006,mixed,mixed,semantic_search,document,mixed,0,0,pressure vessel Korean industrial safety regulation,4025;3856;5136,3856:2;4025:2;5136:1,5210;5186;13913;5143;13760;13749;5145;5180;5240;5137,742.8,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,cl_007,mixed,mixed,semantic_search,document,mixed,0,0,KGS 코드 LPG safety standard,11647;11689;11645;4025,4025:1;11645:2;11647:3;11689:2,11647;13760;13674;13669;13774;13773;13675;13755;13924;13772,373.1,0.250,1.000,0.390,1,0.647,0.333,1.000, +single,cl_008,mixed,mixed,semantic_search,document,mixed,0,0,수소 가스 안전 기준 hydrogen safety,5178;5169,5169:2;5178:3,10575;11671;11649;11648;13915;5241;11563;5173;5177;11653,620.7,0.000,0.000,0.000,0,0.000,0.000,0.000, +single,exam_001,exam,exam,fact_lookup,document,ko,0,0,레이놀즈수 정의 공식,11504;11505,11504:3;11505:2,11504;11533;5081;11509;11476;11486;5064;3788;5134;5075,503.5,0.500,1.000,0.613,1,0.787,0.500,1.000, +single,exam_002,exam,exam,fact_lookup,document,ko,0,0,탱크 바닥 구멍 유체 유속 토리첼리,11500;11495;11496,11495:2;11496:2;11500:3,11500;11495;11501;5139;5090;5178;11515;5210;11493;11719,381.8,0.667,1.000,0.765,1,0.856,0.667,1.000, +single,exam_003,exam,exam,fact_lookup,document,ko,0,0,이상기체 음속 마하수,11514;11515,11514:3;11515:2,11514;11475;5090;5084;11531;11476;11473;5093;11479;5124,596.0,0.500,1.000,0.613,1,0.787,0.500,1.000, +single,exam_004,exam,exam,fact_lookup,document,ko,0,0,고압가스 용기 내압시험 영구증가량,11591;11644;11691,11591:3;11644:2;11691:2,11591;11664;13948;13660;5177;13652;11665;13917;11660;13752,351.9,0.333,1.000,0.469,1,0.674,0.333,1.000, +single,exam_005,exam,exam,fact_lookup,document,ko,0,0,도시가스 배관 매설 이격거리,11627;11625;11646,11625:2;11627:3;11646:1,11627;11658;11600;11625;11692;13918;13751;5177;13653;13753,361.1,0.667,1.000,0.671,1,0.883,1.000,1.000, +single,exam_006,exam,exam,fact_lookup,document,ko,0,0,LPG 저장탱크 안전거리 분말소화기,11617;11669;11620,11617:3;11620:1;11669:2,11595;11616;13669;11617;11649;11655;11690;11658;11653;11689,300.2,0.333,0.250,0.202,0,0.321,0.500,1.000, +single,exam_007,exam,exam,fact_lookup,document,ko,0,0,오리피스 차압식 유량계,11712;11711;11503,11503:2;11711:2;11712:3,11711;11712;11503;11500;11713;11714;13930;11717;11701;11502,362.9,1.000,1.000,1.000,1,0.858,1.000,1.000, +single,fail_004,failure_expected,failure_expected,fact_lookup,document,ko,0,1,KGS AC999 임의 가스 코드,,,11691;11693;11692;13665;13661;13664;13666;13670;13773;13934,348.9,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_005,failure_expected,failure_expected,fact_lookup,document,ko,0,1,초전도 안전 관리법 시행규칙,,,4026;5236;3977;3971;3966;4018;3972;3973;3974;3895,418.5,0.000,0.000,0.000,1,0.000,0.000,0.000, diff --git a/reports/v0_2_phase2b_gte_ml_base_2026-05-23.csv b/reports/v0_2_phase2b_gte_ml_base_2026-05-23.csv new file mode 100644 index 0000000..766e1e0 --- /dev/null +++ b/reports/v0_2_phase2b_gte_ml_base_2026-05-23.csv @@ -0,0 +1,52 @@ +label,id,category,legacy_category,intent,domain_hint,language,ocr_derived,failure_expected,query,relevant_ids,graded_relevance,returned_ids_top10,latency_ms,recall_at_10,mrr_at_10,ndcg_at_10,top3_hit,graded_ndcg_at_10,graded_recall_at_10_t2,graded_recall_at_10_t3,error +single,kw_001,standards,exact_keyword,fact_lookup,document,ko,0,0,산업안전보건법 제6장,3856;3868;3879,3856:3;3868:2;3879:2,3868;3879;3856;3851;4041;3890;3917;3863;3908;3855,244.4,1.000,1.000,1.000,1,0.808,1.000,1.000, +single,kw_002,standards,exact_keyword,fact_lookup,document,ko,0,0,중대재해 처벌 등에 관한 법률 제2장 중대산업재해,3917;3921,3917:3;3921:2,3921;3917;10571;3916;3874;3896;3922;3919;3918;3920,319.9,1.000,1.000,1.000,1,0.834,1.000,1.000, +single,kw_003,standards,exact_keyword,fact_lookup,document,ko,0,0,화학물질관리법 유해화학물질 영업자,3981,3981:3,3981;3985;3903;3880;3980;3983;3978;3896;3869;3904,199.9,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_004,standards,exact_keyword,fact_lookup,document,ko,0,0,근로기준법 안전과 보건,4041,4041:3,3851;3914;4041;3852;3905;3877;3881;3903;3915;3913,345.3,1.000,0.333,0.500,1,0.500,1.000,1.000, +single,kw_005,standards,exact_keyword,fact_lookup,document,ko,0,0,산업안전보건기준에 관한 규칙 보호구,3888,3888:3,3888;3905;3911;3913;10570;3909;3912;3906;3904;3910,380.5,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,nl_001,korean_only,natural_language_ko,semantic_search,document,ko,0,0,기계로 인한 산업재해 관련 법령,3856;3868;3879;3854,3854:1;3856:3;3868:2;3879:2,5244;3897;5249;5253;3856;3898;3879;3851;3868;3902,392.3,0.750,0.200,0.399,0,0.426,1.000,1.000, +single,nl_002,korean_only,natural_language_ko,semantic_search,document,ko,0,0,사업주가 도급을 줄 때 산업재해를 예방하기 위해 해야 할 일,3855;3867;3878,3855:3;3867:2;3878:2,5227;3855;5249;3867;3854;3908;5244;3898;3895;10571,345.1,0.667,0.500,0.498,1,0.549,0.667,1.000, +single,nl_003,korean_only,natural_language_ko,semantic_search,document,ko,0,0,유해화학물질을 다루는 회사가 지켜야 할 안전 의무,3980;3981;3982,3980:2;3981:2;3982:2,3903;3896;3980;3904;3905;3760;3912;3909;3757;3857,291.8,0.333,0.333,0.235,1,0.235,0.333,0.000, +single,nl_004,korean_only,natural_language_ko,semantic_search,document,ko,0,0,중대재해가 발생했을 때 경영책임자가 처벌받는 기준,3916;3917;3920;3921,3916:2;3917:3;3920:2;3921:2,3916;3917;10572;3918;5227;3854;3877;5238;3878;5240,266.6,0.500,1.000,0.637,1,0.635,0.500,1.000, +single,nl_005,korean_only,natural_language_ko,semantic_search,document,ko,0,0,안전보건교육은 누가 받아야 하고 어떤 내용을 다루는가,3853;3865,3853:3;3865:2,5234;3876;3853;10573;5249;4025;6675;3810;3777;3787,465.6,0.500,0.333,0.307,1,0.394,0.500,1.000, +single,news_001,korean_only,news_ko,semantic_search,news,ko,0,0,이란과 미국의 군사 충돌,4303;4304;4307;4316;4322;4323;4327;4335,4303:2;4304:2;4307:2;4316:2;4322:2;4323:2;4327:2;4335:2,18077;16081;21273;15793;23984;22048;16526;12213;4307;20893,363.6,0.125,0.111,0.076,1,0.076,0.125,0.000, +single,news_002,korean_only,news_ko,semantic_search,news,ko,0,0,호르무즈 해협 봉쇄,4316;4320;4322;4327,4316:3;4320:2;4322:2;4327:2,16532;21706;15922;17123;21890;22049;4346;9022;4767;6067,227.6,0.000,0.000,0.000,0,0.000,0.000,0.000, +single,misc_001,korean_only,other_domain,fact_lookup,document,ko,0,0,강체의 평면 운동학,4063;4065,4063:3;4065:2,4063;4065;4064;4066;4071;4068;11481;5105;5106;5063,428.0,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,misc_002,korean_only,other_domain,semantic_search,document,ko,0,0,질점의 운동역학,4060;4061;4062,4060:2;4061:2;4062:2,4070;4058;4062;4059;5095;4060;4064;4063;4066;4067,394.0,0.667,0.333,0.402,1,0.402,0.667,0.000, +single,news_003,english_only,news_en,semantic_search,news,en,0,0,Trump Iran ultimatum,4258;4260;4262,4258:2;4260:2;4262:2,4775;23446;4776;4202;4679;24382;21155;4668;4199;21855,199.7,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,cl_001,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,기계 안전 가드 설계 원리,3770;3856,3770:3;3856:2,4540;5239;5249;3770;5236;5244;3787;3817;4548;5253,399.3,0.500,0.250,0.264,0,0.339,0.500,1.000, +single,cl_002,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,산업 안전 입문서,3755;3775;3776;3777,3755:2;3775:2;3776:2;3777:2,5249;5236;5244;5230;5235;5229;10573;3761;3755;3816,344.6,0.250,0.111,0.118,1,0.118,0.250,0.000, +single,cl_003,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,전기 안전 위험,3772;3790,3772:2;3790:2,3790;3772;5260;3897;5248;3771;3769;11671;13936;3755,658.8,1.000,1.000,1.000,1,1.000,1.000,0.000, +single,news_004,mixed,news_fr,semantic_search,news,mixed,0,0,guerre en Iran,4199;4202;4210;4361;4363;4507;4519;4521,4199:2;4202:2;4210:2;4361:2;4363:2;4507:2;4519:2;4521:2,22342;19576;17069;15924;16935;23149;16019;16462;16010;4776,256.4,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,news_005,mixed,news_crosslingual,semantic_search,news,mixed,0,0,이란 미국 전쟁 글로벌 반응,4202;4258;4262;4536;4303;4304;4316,4202:2;4258:2;4262:2;4303:2;4304:2;4316:2;4536:2,16761;21275;16927;20893;16771;17242;4329;20886;4457;4307,434.7,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_001,failure_expected,failure_expected,semantic_search,document,mixed,0,1,Rust async runtime tokio scheduler 내부 구조,,,5262;23732;5161;24155;20032;20758;20036;17813;3816;19373,274.1,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_002,failure_expected,failure_expected,semantic_search,document,ko,0,1,양자컴퓨터 큐비트 디코히어런스,,,16289;22202;23605;20507;5250;24854;5168;23297;19527;18298,201.2,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_003,failure_expected,failure_expected,semantic_search,news,ko,0,1,재즈 보컬리스트 빌리 홀리데이,,,18286;9102;20022;16059;23336;20470;20174;15361;4634;17133,179.8,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,kw_006,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준에 관한 규칙 작업장 통로,3886;3887,3886:3;3887:2,3886;3902;3887;3895;3898;3885;3905;3908;3911;3915,273.9,1.000,1.000,0.920,1,0.956,1.000,1.000, +single,kw_007,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준 폭발 화재 위험물 누출 방지,3896;3766,3766:1;3896:3,3896;13930;3895;3911;13929;3866;3903;3890;3910;3909,227.4,0.500,1.000,0.613,1,0.917,1.000,1.000, +single,kw_008,standards,standards,fact_lookup,document,ko,0,0,고압가스 안전관리법 전문,4025;4026,4025:3;4026:2,11579;11644;4025;4026;11647;11676;11591;11580;11645;13750,329.6,1.000,0.333,0.571,1,0.539,1.000,1.000, +single,kw_009,standards,standards,fact_lookup,document,ko,0,0,KGS FP111 가스설비 배관설비 기준,13305,13305:3,13305;13302;13312;13300;13311;13310;13308;13306;13299;13304,298.4,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_010,standards,standards,fact_lookup,document,ko,0,0,KGS FU551 가스설비 압력조정기 가스계량기,13652,13652:3,13652;13657;13651;13655;11689;13649;13648;11693;13322;13658,222.2,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_011,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준 전기로 인한 위험 방지,3897;3772,3772:1;3897:3,3897;3902;3895;3887;13935;3866;13938;3890;4018;3901,352.2,0.500,1.000,0.613,1,0.917,1.000,1.000, +single,en_001,english_only,english_only,semantic_search,document,en,0,0,pressure vessel flange design,5144;5136,5136:2;5144:3,5136;5144;5139;5180;5193;5186;5207;5210;5178;5190,1475.2,1.000,1.000,1.000,1,0.834,1.000,1.000, +single,en_002,english_only,english_only,semantic_search,document,en,0,0,ASME Section VIII introduction,5204;5206,5204:3;5206:2,5210;5204;5208;5137;5180;5186;5182;5206;5140;5185,1319.6,1.000,0.500,0.580,1,0.603,1.000,1.000, +single,en_003,english_only,english_only,semantic_search,document,en,0,0,impact test requirements ASME,5205;5148,5148:1;5205:3,5205;5224;5204;5210;5178;5186;5148;5214;5192;5180,1475.1,1.000,1.000,0.818,1,0.961,1.000,1.000, +single,en_004,english_only,english_only,semantic_search,document,en,0,0,design of vessel supports,5149,5149:3,5149;5137;5141;5139;5211;5207;5212;5140;5182;5206,1444.9,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,en_005,english_only,english_only,semantic_search,document,en,0,0,hydrogen piping ASME code,5178,5178:3,5178;5210;5180;5182;5143;5204;5211;5207;5185;5186,1254.6,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,en_006,english_only,english_only,semantic_search,document,en,0,0,ASME welding qualification requirements,5209;3771,3771:1;5209:3,5180;5225;5224;5204;5178;5205;5208;5209;5187;4826,1290.7,0.500,0.125,0.193,0,0.289,1.000,1.000, +single,en_007,english_only,english_only,semantic_search,document,en,0,0,pressure vessel fabrication and inspection,5208;5145,5145:2;5208:3,5208;5189;5186;5180;5187;5192;5212;5182;5137;5210,1532.4,0.500,1.000,0.613,1,0.787,0.500,1.000, +single,en_008,english_only,english_only,semantic_search,document,en,0,0,Industrial Safety and Health Management ergonomics,3763;3755,3755:1;3763:3,3763;3774;3755;3759;3818;3761;3769;3758;3812;3762,943.3,1.000,1.000,0.920,1,0.983,1.000,1.000, +single,cl_004,mixed,mixed,semantic_search,document,mixed,0,0,ASME 압력용기 설계 실무,5207;5210;5139;5135,5135:2;5139:2;5207:3;5210:3,5139;5204;5208;5137;5225;5210;5133;5136;5180;5207,616.2,0.750,1.000,0.642,0,0.529,0.750,1.000, +single,cl_005,mixed,mixed,semantic_search,document,mixed,0,0,ASME 용접 코드 해설,5224;5209,5209:2;5224:3,5224;5225;5222;5209;5210;5195;5180;5204;5207;5141,590.8,1.000,1.000,0.877,1,0.932,1.000,1.000, +single,cl_006,mixed,mixed,semantic_search,document,mixed,0,0,pressure vessel Korean industrial safety regulation,4025;3856;5136,3856:2;4025:2;5136:1,5210;5186;13913;5143;13760;13749;5145;5180;5240;5137,676.2,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,cl_007,mixed,mixed,semantic_search,document,mixed,0,0,KGS 코드 LPG safety standard,11647;11689;11645;4025,4025:1;11645:2;11647:3;11689:2,11647;13759;13310;13764;13670;11649;13674;13772;13671;13669,272.7,0.250,1.000,0.390,1,0.647,0.333,1.000, +single,cl_008,mixed,mixed,semantic_search,document,mixed,0,0,수소 가스 안전 기준 hydrogen safety,5178;5169,5169:2;5178:3,13299;5241;13915;10575;5173;13303;11563;5178;13948;5177,519.5,0.500,0.125,0.193,0,0.248,0.500,1.000, +single,exam_001,exam,exam,fact_lookup,document,ko,0,0,레이놀즈수 정의 공식,11504;11505,11504:3;11505:2,11533;11504;5081;5064;11509;11476;5071;5082;11601;5075,388.9,0.500,0.500,0.387,1,0.497,0.500,1.000, +single,exam_002,exam,exam,fact_lookup,document,ko,0,0,탱크 바닥 구멍 유체 유속 토리첼리,11500;11495;11496,11495:2;11496:2;11500:3,11500;11495;5139;11493;11515;11521;11719;5090;5193;13318,228.0,0.667,1.000,0.765,1,0.856,0.667,1.000, +single,exam_003,exam,exam,fact_lookup,document,ko,0,0,이상기체 음속 마하수,11514;11515,11514:3;11515:2,11514;5084;5090;11516;11526;11493;11479;11515;11473;5210,472.2,1.000,1.000,0.807,1,0.894,1.000,1.000, +single,exam_004,exam,exam,fact_lookup,document,ko,0,0,고압가스 용기 내압시험 영구증가량,11591;11644;11691,11591:3;11644:2;11691:2,13665;13324;11579;11591;11580;13948;13928;13759;13752;13660,243.4,0.333,0.250,0.202,0,0.290,0.333,1.000, +single,exam_005,exam,exam,fact_lookup,document,ko,0,0,도시가스 배관 매설 이격거리,11627;11625;11646,11625:2;11627:3;11646:1,11600;13918;11627;11692;11625;13753;11510;13752;13302;11658,259.8,0.667,0.333,0.416,1,0.496,1.000,1.000, +single,exam_006,exam,exam,fact_lookup,document,ko,0,0,LPG 저장탱크 안전거리 분말소화기,11617;11669;11620,11617:3;11620:1;11669:2,11617;13948;13924;11649;11595;11690;11616;11658;11619;11655,222.2,0.333,1.000,0.469,1,0.745,0.500,1.000, +single,exam_007,exam,exam,fact_lookup,document,ko,0,0,오리피스 차압식 유량계,11712;11711;11503,11503:2;11711:2;11712:3,11713;11711;13930;11712;11714;11500;11488;11503;11502;11509,267.5,1.000,0.500,0.646,0,0.563,1.000,1.000, +single,fail_004,failure_expected,failure_expected,fact_lookup,document,ko,0,1,KGS AC999 임의 가스 코드,,,11691;13303;11692;11693;13663;13665;13933;13673;11655;13661,256.2,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_005,failure_expected,failure_expected,fact_lookup,document,ko,0,1,초전도 안전 관리법 시행규칙,,,3897;5236;4019;4018;3971;3903;3966;3974;5210;3973,321.2,0.000,0.000,0.000,1,0.000,0.000,0.000, diff --git a/tests/fixtures/tei_rerank_response.json b/tests/fixtures/tei_rerank_response.json new file mode 100644 index 0000000..25a8521 --- /dev/null +++ b/tests/fixtures/tei_rerank_response.json @@ -0,0 +1,46 @@ +{ + "fixture_purpose": "Phase 2B G0-1 — TEI rerank endpoint 응답 spec 박제. mixed (한국어+영어) sanity check.", + "request": { + "endpoint_examples": [ + "http://reranker:80/rerank (production baseline bge-reranker-v2-m3)", + "http://rerank-cand-gte-ml-base:80/rerank", + "http://rerank-cand-mxbai-large:80/rerank", + "http://rerank-cand-bge-v2-gemma-2b:80/rerank" + ], + "method": "POST", + "headers": {"Content-Type": "application/json"}, + "body": { + "query": "압력용기 설계 기준", + "texts": [ + "ASME Section VIII Division 1 pressure vessel design rules and material selection criteria for high-pressure applications.", + "고압가스 안전관리법에 따른 압력용기 검사 기준 — 정기 검사 주기 및 안전 밸브 설정.", + "Today weather forecast for Seoul: partly cloudy with chance of rain in the afternoon." + ] + } + }, + "response_shape": "[{index: int, score: float}, ...] sorted score desc", + "captured_responses": { + "baseline_bge_v2_m3": { + "endpoint": "http://reranker:80/rerank", + "model": "BAAI/bge-reranker-v2-m3 (production)", + "raw": [ + {"index": 0, "score": 0.9091032}, + {"index": 1, "score": 0.7514658}, + {"index": 2, "score": 0.0000165714} + ], + "interpretation": "ASME(en)+고압가스(ko) 둘 다 무관(weather) 보다 명확 높음. 한국어/영어 score gap 작음 (0.91 vs 0.75 = 0.16) — 한국어 능력 강함." + }, + "cand_gte_ml_base": { + "endpoint": "http://rerank-cand-gte-ml-base:80/rerank", + "model": "Alibaba-NLP/gte-multilingual-reranker-base", + "raw": [ + {"index": 0, "score": 0.6365791}, + {"index": 1, "score": 0.4685475}, + {"index": 2, "score": 0.034488525} + ], + "interpretation": "ASME(en)+고압가스(ko) 둘 다 weather 보다 명확 높음. 한국어/영어 score gap 0.17 — baseline 과 비슷. score 절대값 baseline 보다 낮음 (model 별 calibration 차이, rank 순서는 동일)." + } + }, + "sanity_check": "ASME(en) > 고압가스(ko) > weather(noise) 순서 — 두 모델 모두 통과. 후보가 한국어 무관하지 않은지 검증.", + "captured_at": "2026-05-23" +} diff --git a/tests/search_eval/baselines/v0_2_phase2b_baseline_snapshot_2026-05-23.json b/tests/search_eval/baselines/v0_2_phase2b_baseline_snapshot_2026-05-23.json new file mode 100644 index 0000000..d51197b --- /dev/null +++ b/tests/search_eval/baselines/v0_2_phase2b_baseline_snapshot_2026-05-23.json @@ -0,0 +1,48 @@ +{ + "version": "v0.2-phase2b", + "label": "baseline_snapshot", + "date": "2026-05-23", + "snapshot": { + "doc_id_max": 25180, + "chunk_id_max": 56526, + "documents_n": 21365, + "chunks_n": 30605 + }, + "eval_set": { + "total_cases": 51, + "scored_cases": 46, + "failure_expected_cases": 5 + }, + "model_config": { + "embedding": "BAAI/bge-m3 (production)", + "reranker": "BAAI/bge-reranker-v2-m3 (production)", + "search_mode": "hybrid", + "rerank_enabled": "server_default", + "embedding_backend": "baseline", + "reranker_backend": "baseline", + "plan": "round-2-review-mighty-starfish.md v2.1 (Phase 2B)" + }, + "overall": { + "n": 46, + "graded_ndcg_at_10": 0.659, + "graded_recall_at_10_t2": 0.695, + "graded_recall_at_10_t3": 0.761, + "latency_p50_ms": 454, + "latency_p95_ms": 1573, + "failure_correct": "0/5" + }, + "by_category": { + "english_only": { "n": 9, "recall_at_10": 0.78, "ndcg_at_10": 0.71, "graded_ndcg_at_10": 0.78 }, + "exam": { "n": 7, "recall_at_10": 0.57, "ndcg_at_10": 0.62, "graded_ndcg_at_10": 0.74 }, + "korean_only": { "n": 9, "recall_at_10": 0.55, "ndcg_at_10": 0.47, "graded_ndcg_at_10": 0.51 }, + "mixed": { "n": 10, "recall_at_10": 0.38, "ndcg_at_10": 0.36, "graded_ndcg_at_10": 0.39 }, + "standards": { "n": 11, "recall_at_10": 0.91, "ndcg_at_10": 0.85, "graded_ndcg_at_10": 0.87 } + }, + "by_language": { + "en": { "n": 9, "recall_at_10": 0.78, "graded_ndcg_at_10": 0.78 }, + "ko": { "n": 27, "recall_at_10": 0.70, "graded_ndcg_at_10": 0.72 }, + "mixed": { "n": 10, "recall_at_10": 0.38, "graded_ndcg_at_10": 0.39 } + }, + "raw_csv": "reports/v0_2_phase2b_baseline_snapshot_2026-05-23.csv", + "reproducibility_check": "Phase 2A baseline_snapshot (NDCG 0.659 동일) — snapshot filter path 안정 + 재현성 확인" +} diff --git a/tests/search_eval/baselines/v0_2_phase2b_gte_ml_base_2026-05-23.json b/tests/search_eval/baselines/v0_2_phase2b_gte_ml_base_2026-05-23.json new file mode 100644 index 0000000..645872a --- /dev/null +++ b/tests/search_eval/baselines/v0_2_phase2b_gte_ml_base_2026-05-23.json @@ -0,0 +1,60 @@ +{ + "version": "v0.2-phase2b", + "label": "cand_gte_ml_base", + "date": "2026-05-23", + "snapshot": { + "doc_id_max": 25180, + "chunk_id_max": 56526, + "documents_n": 21365, + "chunks_n": 30605 + }, + "eval_set": { + "total_cases": 51, + "scored_cases": 46, + "failure_expected_cases": 5 + }, + "model_config": { + "embedding": "BAAI/bge-m3 (production, 고정)", + "reranker": "Alibaba-NLP/gte-multilingual-reranker-base", + "reranker_params": "305M", + "reranker_context": 8192, + "reranker_license": "Apache 2.0", + "search_mode": "hybrid", + "rerank_enabled": "server_default", + "embedding_backend": "baseline", + "reranker_backend": "cand_gte_ml_base", + "endpoint": "http://rerank-cand-gte-ml-base:80/rerank", + "plan": "round-2-review-mighty-starfish.md v2.1 (Phase 2B)" + }, + "overall": { + "n": 46, + "graded_ndcg_at_10": 0.604, + "graded_recall_at_10_t2": 0.709, + "graded_recall_at_10_t3": 0.783, + "latency_p50_ms": 345, + "latency_p95_ms": 1460, + "failure_correct": "0/5" + }, + "by_category": { + "english_only": { "n": 9, "recall_at_10": 0.78, "ndcg_at_10": 0.68, "graded_ndcg_at_10": 0.72 }, + "exam": { "n": 7, "recall_at_10": 0.64, "ndcg_at_10": 0.53, "graded_ndcg_at_10": 0.62 }, + "korean_only": { "n": 9, "recall_at_10": 0.50, "ndcg_at_10": 0.39, "graded_ndcg_at_10": 0.41 }, + "mixed": { "n": 10, "recall_at_10": 0.42, "ndcg_at_10": 0.35, "graded_ndcg_at_10": 0.38 }, + "standards": { "n": 11, "recall_at_10": 0.91, "ndcg_at_10": 0.84, "graded_ndcg_at_10": 0.86 } + }, + "by_language": { + "en": { "n": 9, "recall_at_10": 0.78, "graded_ndcg_at_10": 0.72 }, + "ko": { "n": 27, "recall_at_10": 0.71, "graded_ndcg_at_10": 0.65 }, + "mixed": { "n": 10, "recall_at_10": 0.42, "graded_ndcg_at_10": 0.38 } + }, + "raw_csv": "reports/v0_2_phase2b_gte_ml_base_2026-05-23.csv", + "delta_vs_baseline": { + "graded_ndcg_at_10": -0.055, + "mixed": -0.01, + "korean_only": -0.10, + "standards": -0.01, + "english_only": -0.06, + "exam": -0.12, + "latency_p50_ms": -109 + } +} diff --git a/tests/search_eval/run_eval.py b/tests/search_eval/run_eval.py index 0dbd707..61c4f2b 100644 --- a/tests/search_eval/run_eval.py +++ b/tests/search_eval/run_eval.py @@ -202,6 +202,7 @@ async def call_search( embedding_backend: str | None = None, snapshot_doc_id_max: int | None = None, snapshot_chunk_id_max: int | None = None, + reranker_backend: str | None = None, ) -> tuple[list[int], float]: """검색 API 호출 → (doc_ids, latency_ms).""" url = f"{base_url.rstrip('/')}/api/search/" @@ -219,6 +220,8 @@ async def call_search( params["snapshot_doc_id_max"] = snapshot_doc_id_max if snapshot_chunk_id_max is not None: params["snapshot_chunk_id_max"] = snapshot_chunk_id_max + if reranker_backend is not None: + params["reranker_backend"] = reranker_backend import time @@ -249,6 +252,7 @@ async def evaluate( embedding_backend: str | None = None, snapshot_doc_id_max: int | None = None, snapshot_chunk_id_max: int | None = None, + reranker_backend: str | None = None, ) -> list[QueryResult]: """전체 쿼리셋 평가.""" results: list[QueryResult] = [] @@ -261,6 +265,7 @@ async def evaluate( embedding_backend=embedding_backend, snapshot_doc_id_max=snapshot_doc_id_max, snapshot_chunk_id_max=snapshot_chunk_id_max, + reranker_backend=reranker_backend, ) results.append( QueryResult( @@ -837,6 +842,7 @@ async def call_search_full( embedding_backend: str | None = None, snapshot_doc_id_max: int | None = None, snapshot_chunk_id_max: int | None = None, + reranker_backend: str | None = None, ) -> tuple[list[dict], float]: """call_search와 동일 로직. 단 full result dict 리스트 반환.""" url = f"{base_url.rstrip('/')}/api/search/" @@ -856,6 +862,8 @@ async def call_search_full( params["snapshot_doc_id_max"] = snapshot_doc_id_max if snapshot_chunk_id_max is not None: params["snapshot_chunk_id_max"] = snapshot_chunk_id_max + if reranker_backend is not None: + params["reranker_backend"] = reranker_backend import time @@ -1308,6 +1316,12 @@ def main() -> int: default=None, help="Phase 2A snapshot freeze. document_chunks.id <= 값 filter. baseline rebaseline 도 동일 적용.", ) + parser.add_argument( + "--reranker-backend", + type=str, + default=None, + help="Phase 2B Diagnose reranker dispatcher slug (baseline | cand_gte_ml_base). 미지정 = production.", + ) args = parser.parse_args() @@ -1361,21 +1375,21 @@ def main() -> int: if args.base_url: print(f"\n>>> evaluating: {args.base_url}") results = asyncio.run( - evaluate(queries, args.base_url, args.token, "single", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max) + evaluate(queries, args.base_url, args.token, "single", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max, reranker_backend=args.reranker_backend) ) print_summary("single", results, eval_version=args.eval_version) all_results.extend(results) else: print(f"\n>>> baseline: {args.baseline_url}") baseline_results = asyncio.run( - evaluate(queries, args.baseline_url, args.token, "baseline", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max) + evaluate(queries, args.baseline_url, args.token, "baseline", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max, reranker_backend=args.reranker_backend) ) baseline_summary = print_summary("baseline", baseline_results, eval_version=args.eval_version) print(f"\n>>> candidate: {args.candidate_url}") candidate_results = asyncio.run( evaluate( - queries, args.candidate_url, args.token, "candidate", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max + queries, args.candidate_url, args.token, "candidate", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max, reranker_backend=args.reranker_backend ) ) candidate_summary = print_summary("candidate", candidate_results, eval_version=args.eval_version)