From 3092e3009d1609aa5c1b53d55c2b1074c7635d6f Mon Sep 17 00:00:00 2001 From: hyungi Date: Sat, 23 May 2026 06:55:13 +0000 Subject: [PATCH] =?UTF-8?q?feat(eval):=20Phase=202A=20Diagnose=20Phase=203?= =?UTF-8?q?+4=20=E2=80=94=20dispatcher=20+=203=20=EC=B8=A1=EC=A0=95=20+=20?= =?UTF-8?q?decision=20(H3=20bge-m3=20=EC=9C=A0=EC=A7=80)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit phase-2a-embedding-diagnose.md v4 § 6 (dispatcher) + § 7 Phase 3 (51 case 측정) + § 7 Phase 4 (decision) Round 2 review: round-2-review-mighty-starfish.md (R2-2 + R2-B1 페어 invariant + slug-based resolve) 코드 변경: - app/services/search/retrieval_service.py: - CANDIDATE_BACKEND_MAP allowlist (baseline / cand_me5_large_inst / cand_snowflake_l_v2) - _resolve_backend(slug) → docs_table/chunks_table/embed_endpoint or None - _embed_query_via_tei() — candidate TEI 엔드포인트 호출 (cache 미사용) - _VALID_DOCS_TABLE + _VALID_CHUNKS_TABLE regex (R2-B1 2단계 gate) - _search_vector_docs / _search_vector_chunks: docs_table/chunks_table + snapshot_*_id_max 파라미터 - search_vector + search_vector_multilingual: embedding_backend + snapshot_*_id_max 파라미터 + dispatch log - app/services/search/search_pipeline.py: run_search() 시그니처 + 4 search_vector* 호출 threading - app/api/search.py: 3 Query parameter + ValueError → HTTP 400 (allowed list 응답) - tests/search_eval/run_eval.py: --embedding-backend + --snapshot-doc-id-max + --snapshot-chunk-id-max + call_search/call_search_full/evaluate threading + main 3 asyncio.run threading 측정 산출물 (51 case, scored=46, failure=5): - reports/v0_2_phase2a_baseline_snapshot_2026-05-23.csv (snapshot filter 적용 production path) - reports/v0_2_phase2a_me5_large_inst_2026-05-23.csv - reports/v0_2_phase2a_snowflake_l_v2_2026-05-23.csv - tests/search_eval/baselines/v0_2_phase2a_{baseline_snapshot,me5_large_inst,snowflake_l_v2}_2026-05-23.json (3개) 결과: | Candidate | NDCG | Δ vs baseline | mixed | korean_only | p50 ms | 
|------------------------------------|-----:|--------------:|------:|------------:|-------:| | bge-m3 (baseline snapshot) | 0.659| — | 0.39 | 0.51 | 464 | | cand_me5_large_inst | 0.477| -0.182 | 0.17 | 0.47 | 194 | | cand_snowflake_l_v2 | 0.616| -0.043 | 0.35 | 0.52 | 254 | Decision (H3): bge-m3 유지. 둘 다 net 회귀. - mE5-large-instruct: 전 카테고리 회귀 (-0.182). prefix 미적용 변수 — 별 PR PR-2A-mE5-Prefix-Retry 후보. - snowflake_l_v2: 가벼운 회귀 (-0.043). korean_only +0.01 미세 개선 신호. - korean_only/mixed 약점 보완은 Phase 2B (Reranker) 또는 Phase 2Q (Query rewrite) 권고. Decision report: reports/phase_2a_embedding_decision_2026-05-23.md (§ 1~8 포함, Closure gate 16 항목 모두 PASS). 후속 PR 백로그: - PR-2A-mE5-Prefix-Retry (별 PR) - PR-2A-Extended-Bge-Mgemma2 (별 PR, v3 결정) - PR-2A-Cloud-Embedding-Scaffold-1 (Cohere/Voyage scaffold-only, 선택) - PR-Search-Query-Rewrite-1 (Phase 2Q) - PR-Search-Reranker-V2-Diagnose (Phase 2B) - PR-2A-Chunks-Cand-Cleanup-1 (1주 후 cand 테이블 DROP) production 영향: - documents / document_chunks 컬럼/row 변경 0 - config.yaml 변경 0 (ollama bge-m3 unchanged) - 추가된 endpoint = query parameter opt-in (미지정 시 production path 회귀 0) - smoke 4건 PASS (baseline / baseline+snapshot / cand_me5 / cand_invalid → HTTP 400) - dispatch log 박제 verify (snapshot_doc/chunk_id_max 박제) Co-Authored-By: Claude Opus 4.7 (1M context) --- app/api/search.py | 46 ++- app/services/search/retrieval_service.py | 355 ++++++++++++------ app/services/search/search_pipeline.py | 31 +- .../phase_2a_embedding_decision_2026-05-23.md | 97 +++++ ...2_phase2a_baseline_snapshot_2026-05-23.csv | 52 +++ ...v0_2_phase2a_me5_large_inst_2026-05-23.csv | 52 +++ ...v0_2_phase2a_snowflake_l_v2_2026-05-23.csv | 52 +++ ..._phase2a_baseline_snapshot_2026-05-23.json | 46 +++ ...0_2_phase2a_me5_large_inst_2026-05-23.json | 60 +++ ...0_2_phase2a_snowflake_l_v2_2026-05-23.json | 59 +++ tests/search_eval/run_eval.py | 51 ++- 11 files changed, 774 insertions(+), 127 deletions(-) create mode 100644 reports/phase_2a_embedding_decision_2026-05-23.md create 
mode 100644 reports/v0_2_phase2a_baseline_snapshot_2026-05-23.csv create mode 100644 reports/v0_2_phase2a_me5_large_inst_2026-05-23.csv create mode 100644 reports/v0_2_phase2a_snowflake_l_v2_2026-05-23.csv create mode 100644 tests/search_eval/baselines/v0_2_phase2a_baseline_snapshot_2026-05-23.json create mode 100644 tests/search_eval/baselines/v0_2_phase2a_me5_large_inst_2026-05-23.json create mode 100644 tests/search_eval/baselines/v0_2_phase2a_snowflake_l_v2_2026-05-23.json diff --git a/app/api/search.py b/app/api/search.py index 4d0a37a..9f9953d 100644 --- a/app/api/search.py +++ b/app/api/search.py @@ -156,17 +156,45 @@ async def search( description="QueryAnalyzer 활성화 (Phase 2.1, LLM 호출). Phase 2.1은 debug 노출만, 검색 경로 영향 X", ), debug: bool = Query(False, description="단계별 candidates + timing 응답에 포함"), + embedding_backend: str | None = Query( + None, + pattern=r"^(baseline|cand_[a-z0-9_]+)$", + description="Phase 2A Diagnose dispatcher (R2-2 + R2-B1). slug 만 받음 (raw table name X). baseline|cand_. 미지정/baseline = production path.", + ), + snapshot_doc_id_max: int | None = Query( + None, ge=1, + description="Phase 2A snapshot freeze (R2-D + R2-B2). documents.id <= 값 filter. baseline 측정 시에도 동일 filter 적용.", + ), + snapshot_chunk_id_max: int | None = Query( + None, ge=1, + description="Phase 2A snapshot freeze (R2-D + R2-B2). document_chunks.id <= 값 filter. 
baseline 측정 시에도 동일 filter 적용.", + ), ): """문서 검색 — FTS + ILIKE + 벡터 결합 (Phase 3.1 이후 run_search wrapper)""" - pr = await run_search( - session, - q, - mode=mode, # type: ignore[arg-type] - limit=limit, - fusion=fusion, - rerank=rerank, - analyze=analyze, - ) + try: + pr = await run_search( + session, + q, + mode=mode, # type: ignore[arg-type] + limit=limit, + fusion=fusion, + rerank=rerank, + analyze=analyze, + embedding_backend=embedding_backend, + snapshot_doc_id_max=snapshot_doc_id_max, + snapshot_chunk_id_max=snapshot_chunk_id_max, + ) + except ValueError as e: + # _resolve_backend 가 unknown slug 시 ValueError → HTTP 400 + return JSONResponse( + status_code=400, + content={ + "error_reason": "unknown_embedding_backend", + "backend_requested": embedding_backend, + "allowed": ["baseline", "cand_me5_large_inst", "cand_snowflake_l_v2"], + "detail": str(e), + }, + ) # 사용자 feedback: 모든 단계 timing은 debug 응답과 별도로 항상 로그로 남긴다 timing_str = " ".join(f"{k}={v:.0f}" for k, v in pr.timing_ms.items()) diff --git a/app/services/search/retrieval_service.py b/app/services/search/retrieval_service.py index 631bf07..8fcf6d1 100644 --- a/app/services/search/retrieval_service.py +++ b/app/services/search/retrieval_service.py @@ -22,6 +22,7 @@ from __future__ import annotations import asyncio import hashlib +import re import time from typing import TYPE_CHECKING, Any @@ -48,6 +49,61 @@ _QUERY_EMBED_CACHE: dict[str, dict[str, Any]] = {} QUERY_EMBED_TTL = 86400 # 24h QUERY_EMBED_MAXSIZE = 500 +# ─── Phase 2A Diagnose dispatcher (R2-2 + R2-B1) ────────────── +# server-side allowlist map. query parameter 가 raw table name 받지 않음. 
+CANDIDATE_BACKEND_MAP: dict[str, dict[str, str] | None] = { + "baseline": None, + "cand_me5_large_inst": { + "docs_table": "documents_cand_me5_large_inst", + "chunks_table": "document_chunks_cand_me5_large_inst", + "embed_endpoint": "http://embedding-cand-me5-inst:80/embed", + }, + "cand_snowflake_l_v2": { + "docs_table": "documents_cand_snowflake_l_v2", + "chunks_table": "document_chunks_cand_snowflake_l_v2", + "embed_endpoint": "http://embedding-cand-snowflake-l-v2:80/embed", + }, +} + +# 2단계 gate (R2-B1) — SQL string interpolation 직전 final allowlist. +_VALID_DOCS_TABLE = re.compile(r"^(documents|documents_cand_[a-z0-9_]+)$") +_VALID_CHUNKS_TABLE = re.compile(r"^(document_chunks|document_chunks_cand_[a-z0-9_]+)$") + + +def _resolve_backend(slug: str | None) -> dict[str, str] | None: + """slug → (docs_table, chunks_table, embed_endpoint) | None (baseline). + + Raises ValueError on unknown slug (caller 가 HTTP 400 으로 translate). + """ + if slug is None or slug == "baseline": + return None + if slug not in CANDIDATE_BACKEND_MAP: + raise ValueError(f"unknown_embedding_backend: {slug!r}") + cfg = CANDIDATE_BACKEND_MAP[slug] + if cfg is None: + return None + if not all(k in cfg for k in ("docs_table", "chunks_table", "embed_endpoint")): + raise RuntimeError(f"candidate_table_pair_misconfigured: {slug}") + return cfg + + +async def _embed_query_via_tei(endpoint: str, text_: str) -> list[float] | None: + """후보 TEI endpoint 호출 (cache 미사용 — slug 별 다른 모델 분포).""" + if not text_: + return None + import httpx + try: + async with httpx.AsyncClient(timeout=30.0) as c: + r = await c.post(endpoint, json={"inputs": [text_], "truncate": True}) + r.raise_for_status() + data = r.json() + if not isinstance(data, list) or not data or not isinstance(data[0], list): + raise ValueError(f"unexpected TEI shape: {type(data).__name__}") + return data[0] + except Exception as exc: + logger.warning("candidate TEI embed failed endpoint=%s err=%r", endpoint, exc) + return None + def 
_query_embed_key(text_: str) -> str: return hashlib.sha256(f"{text_}|bge-m3".encode("utf-8")).hexdigest() @@ -183,53 +239,78 @@ async def search_text( async def search_vector( - session: AsyncSession, query: str, limit: int + session: AsyncSession, + query: str, + limit: int, + *, + embedding_backend: str | None = None, + snapshot_doc_id_max: int | None = None, + snapshot_chunk_id_max: int | None = None, ) -> list["SearchResult"]: """Hybrid 벡터 검색 — doc + chunks 동시 retrieval (Phase 1.2-G). - Phase 1.2-C 진단: - chunks-only는 segment 의미 손실로 자연어 query에서 catastrophic recall. - doc embedding은 전체 본문 평균 → recall robust. - → 두 retrieval 동시 사용이 정석. + Phase 2A v4 dispatcher (R2-2 + R2-B1): + embedding_backend=None|"baseline" → production (documents + document_chunks). + snapshot_*_id_max 지정 시 baseline 도 동일 filter (rebaseline measurement). + embedding_backend=cand_ → CANDIDATE_BACKEND_MAP 에서 페어 resolve. + cand 테이블 자체가 snapshot 범위로 INSERT → snapshot filter 무시 (dispatch log 만 박제). 데이터 흐름: - 1. query embedding 1번 (bge-m3) - 2. asyncio.gather로 두 SQL 동시 호출: - - _search_vector_docs: documents.embedding cosine top N - - _search_vector_chunks: document_chunks.embedding window partition (doc당 top 2) - 3. _merge_doc_and_chunk_vectors로 가중치 + dedup: - - chunk score * 1.2 (precision) - - doc score * 1.0 (recall) - - doc_id 기준 dedup, chunks 우선 - - Returns: - list[SearchResult] — doc_id 중복 제거됨. compress_chunks_to_docs는 그대로 동작. - chunks_by_doc은 search.py에서 group_by_doc으로 보존. + 1. query embedding 1번 (baseline=bge-m3 cache / cand=TEI endpoint no-cache) + 2. asyncio.gather 로 두 SQL 동시 호출: + - _search_vector_docs(docs_table, snapshot_doc_id_max) + - _search_vector_chunks(chunks_table, snapshot_chunk_id_max) + 3. _merge_doc_and_chunk_vectors 가중치 + dedup (chunk 1.2 / doc 1.0). 
""" - client = AIClient() - try: - query_embedding = await _get_query_embedding(client, query) - finally: + cfg = _resolve_backend(embedding_backend) + + if cfg is None: + docs_table = "documents" + chunks_table = "document_chunks" + client = AIClient() try: - await client.close() - except Exception: - pass + query_embedding = await _get_query_embedding(client, query) + finally: + try: + await client.close() + except Exception: + pass + else: + docs_table = cfg["docs_table"] + chunks_table = cfg["chunks_table"] + query_embedding = await _embed_query_via_tei(cfg["embed_endpoint"], query) + + logger.info( + "[embedding-dispatch] backend=%s docs_table=%s chunks_table=%s snapshot_doc_id_max=%s snapshot_chunk_id_max=%s", + embedding_backend or "baseline", + docs_table, + chunks_table, + snapshot_doc_id_max, + snapshot_chunk_id_max, + ) if query_embedding is None: return [] embedding_str = str(query_embedding) - # 두 SQL 병렬 호출 — 각각 별도 session 사용 (asyncpg connection은 statement 단위 직렬) Session = async_sessionmaker(engine) async def _docs_call() -> list["SearchResult"]: async with Session() as s: - return await _search_vector_docs(s, embedding_str, limit * 4) + return await _search_vector_docs( + s, embedding_str, limit * 4, + docs_table=docs_table, + snapshot_doc_id_max=snapshot_doc_id_max, + ) async def _chunks_call() -> list["SearchResult"]: async with Session() as s: - return await _search_vector_chunks(s, embedding_str, limit * 4) + return await _search_vector_chunks( + s, embedding_str, limit * 4, + chunks_table=chunks_table, + snapshot_chunk_id_max=snapshot_chunk_id_max, + ) doc_results, chunk_results = await asyncio.gather(_docs_call(), _chunks_call()) @@ -237,93 +318,116 @@ async def search_vector( async def _search_vector_docs( - session: AsyncSession, embedding_str: str, limit: int + session: AsyncSession, + embedding_str: str, + limit: int, + *, + docs_table: str = "documents", + snapshot_doc_id_max: int | None = None, ) -> list["SearchResult"]: - 
"""documents.embedding 직접 검색 — recall robust (자연어 매칭). + """documents (또는 documents_cand_).embedding 직접 검색. - chunks가 없는 doc도 매칭 가능. score는 cosine similarity (1 - distance). - chunk_id/chunk_index/section_title은 None. + docs_table = "documents": production path. snapshot_doc_id_max 지정 시 id <= max filter. + docs_table = "documents_cand_": 후보 path. cand 테이블이 이미 snapshot 범위로 INSERT됨 → + snapshot_doc_id_max 무시. metadata 는 production documents 와 JOIN. + + R2-B1 final gate: docs_table 은 _VALID_DOCS_TABLE allowlist 통과 후 SQL interpolation. """ from api.search import SearchResult # 순환 import 회피 - result = await session.execute( - text(""" - SELECT - id, - title, - ai_domain, - ai_summary, - file_format, - (1 - (embedding <=> cast(:embedding AS vector))) AS score, - left(extracted_text, 1200) AS snippet, - 'vector_doc' AS match_reason, - NULL::bigint AS chunk_id, - NULL::integer AS chunk_index, - NULL::text AS section_title + if not _VALID_DOCS_TABLE.match(docs_table): + raise RuntimeError(f"invalid_docs_table: {docs_table!r}") + + params: dict[str, Any] = {"embedding": embedding_str, "limit": limit} + + if docs_table == "documents": + snapshot_clause = "" + if snapshot_doc_id_max is not None: + snapshot_clause = " AND id <= :snapshot_doc_id_max" + params["snapshot_doc_id_max"] = snapshot_doc_id_max + sql = f""" + SELECT id, title, ai_domain, ai_summary, file_format, + (1 - (embedding <=> cast(:embedding AS vector))) AS score, + left(extracted_text, 1200) AS snippet, + 'vector_doc' AS match_reason, + NULL::bigint AS chunk_id, NULL::integer AS chunk_index, NULL::text AS section_title FROM documents - WHERE embedding IS NOT NULL AND deleted_at IS NULL + WHERE embedding IS NOT NULL AND deleted_at IS NULL{snapshot_clause} ORDER BY embedding <=> cast(:embedding AS vector) LIMIT :limit - """), - {"embedding": embedding_str, "limit": limit}, - ) + """ + else: + # candidate: docs_table 은 (doc_id, embed_input, embed_input_hash, embedding) 만 보유 → JOIN documents + sql = f""" + SELECT 
d.id, d.title, d.ai_domain, d.ai_summary, d.file_format, + (1 - (c.embedding <=> cast(:embedding AS vector))) AS score, + left(d.extracted_text, 1200) AS snippet, + 'vector_doc' AS match_reason, + NULL::bigint AS chunk_id, NULL::integer AS chunk_index, NULL::text AS section_title + FROM {docs_table} c + JOIN documents d ON d.id = c.doc_id + WHERE d.deleted_at IS NULL + ORDER BY c.embedding <=> cast(:embedding AS vector) + LIMIT :limit + """ + result = await session.execute(text(sql), params) return [SearchResult(**row._mapping) for row in result] async def _search_vector_chunks( - session: AsyncSession, embedding_str: str, limit: int + session: AsyncSession, + embedding_str: str, + limit: int, + *, + chunks_table: str = "document_chunks", + snapshot_chunk_id_max: int | None = None, ) -> list["SearchResult"]: - """document_chunks.embedding 검색 + window partition (doc당 top 2 chunks). + """document_chunks (또는 document_chunks_cand_).embedding window partition. - SQL 흐름: - 1. inner CTE topk: ivfflat 인덱스로 top-K chunks 추출 - 2. ranked CTE: doc_id PARTITION + ROW_NUMBER (score 내림차순) - 3. outer: rn <= 2 (doc당 max 2 chunks) + JOIN documents + chunks_table = "document_chunks": production path. snapshot_chunk_id_max 지정 시 c.id <= max filter. + chunks_table = "document_chunks_cand_": cand 테이블 (이미 snapshot 범위로 INSERT) → filter 무시. + + R2-B1 final gate: chunks_table 은 _VALID_CHUNKS_TABLE allowlist 통과 후 SQL interpolation. 
""" from api.search import SearchResult # 순환 import 회피 + if not _VALID_CHUNKS_TABLE.match(chunks_table): + raise RuntimeError(f"invalid_chunks_table: {chunks_table!r}") + inner_k = max(limit * 5, 500) - result = await session.execute( - text(""" - WITH topk AS ( - SELECT - c.id AS chunk_id, - c.doc_id, - c.chunk_index, - c.section_title, - c.text, - c.embedding <=> cast(:embedding AS vector) AS dist - FROM document_chunks c - WHERE c.embedding IS NOT NULL - ORDER BY c.embedding <=> cast(:embedding AS vector) - LIMIT :inner_k - ), - ranked AS ( - SELECT - chunk_id, doc_id, chunk_index, section_title, text, dist, - ROW_NUMBER() OVER (PARTITION BY doc_id ORDER BY dist ASC) AS rn - FROM topk - ) - SELECT - d.id AS id, - d.title AS title, - d.ai_domain AS ai_domain, - d.ai_summary AS ai_summary, - d.file_format AS file_format, - (1 - r.dist) AS score, - left(r.text, 1200) AS snippet, - 'vector_chunk' AS match_reason, - r.chunk_id AS chunk_id, - r.chunk_index AS chunk_index, - r.section_title AS section_title - FROM ranked r - JOIN documents d ON d.id = r.doc_id - WHERE r.rn <= 2 AND d.deleted_at IS NULL - ORDER BY r.dist - LIMIT :limit - """), - {"embedding": embedding_str, "inner_k": inner_k, "limit": limit}, - ) + params: dict[str, Any] = {"embedding": embedding_str, "inner_k": inner_k, "limit": limit} + + snapshot_clause = "" + if chunks_table == "document_chunks" and snapshot_chunk_id_max is not None: + snapshot_clause = " AND c.id <= :snapshot_chunk_id_max" + params["snapshot_chunk_id_max"] = snapshot_chunk_id_max + + sql = f""" + WITH topk AS ( + SELECT c.id AS chunk_id, c.doc_id, c.chunk_index, c.section_title, c.text, + c.embedding <=> cast(:embedding AS vector) AS dist + FROM {chunks_table} c + WHERE c.embedding IS NOT NULL{snapshot_clause} + ORDER BY c.embedding <=> cast(:embedding AS vector) + LIMIT :inner_k + ), + ranked AS ( + SELECT chunk_id, doc_id, chunk_index, section_title, text, dist, + ROW_NUMBER() OVER (PARTITION BY doc_id ORDER BY dist ASC) AS rn + 
FROM topk + ) + SELECT d.id AS id, d.title AS title, d.ai_domain AS ai_domain, + d.ai_summary AS ai_summary, d.file_format AS file_format, + (1 - r.dist) AS score, left(r.text, 1200) AS snippet, + 'vector_chunk' AS match_reason, + r.chunk_id AS chunk_id, r.chunk_index AS chunk_index, r.section_title AS section_title + FROM ranked r + JOIN documents d ON d.id = r.doc_id + WHERE r.rn <= 2 AND d.deleted_at IS NULL + ORDER BY r.dist + LIMIT :limit + """ + result = await session.execute(text(sql), params) return [SearchResult(**row._mapping) for row in result] @@ -369,6 +473,10 @@ async def search_vector_multilingual( session: AsyncSession, normalized_queries: list[dict], limit: int, + *, + embedding_backend: str | None = None, + snapshot_doc_id_max: int | None = None, + snapshot_chunk_id_max: int | None = None, ) -> list["SearchResult"]: """Phase 2.2 — 다국어 normalized_queries 배열로 vector retrieval. @@ -393,18 +501,24 @@ async def search_vector_multilingual( if not normalized_queries: return [] - # 1. 각 lang별 embedding 병렬 (cache hit 활용) - client = AIClient() - try: - embed_tasks = [ - _get_query_embedding(client, q["text"]) for q in normalized_queries - ] - embeddings = await asyncio.gather(*embed_tasks) - finally: + # 1. 
각 lang별 embedding 병렬 (baseline=AIClient.embed cache / cand=TEI endpoint no-cache) + _cfg_for_embed = _resolve_backend(embedding_backend) + if _cfg_for_embed is None: + client = AIClient() try: - await client.close() - except Exception: - pass + embed_tasks = [ + _get_query_embedding(client, q["text"]) for q in normalized_queries + ] + embeddings = await asyncio.gather(*embed_tasks) + finally: + try: + await client.close() + except Exception: + pass + else: + ep = _cfg_for_embed["embed_endpoint"] + embed_tasks = [_embed_query_via_tei(ep, q["text"]) for q in normalized_queries] + embeddings = await asyncio.gather(*embed_tasks) # embedding 실패한 query는 skip (weight 재정규화 없이 조용히 drop) per_query_plan: list[tuple[dict, str]] = [] @@ -417,17 +531,38 @@ async def search_vector_multilingual( if not per_query_plan: return [] - # 2. 각 embedding에 대해 doc + chunks 병렬 retrieval + # 2. multilingual dispatcher resolve (모든 lang query 가 동일 backend 사용) + cfg = _resolve_backend(embedding_backend) + docs_table = cfg["docs_table"] if cfg else "documents" + chunks_table = cfg["chunks_table"] if cfg else "document_chunks" + logger.info( + "[embedding-dispatch] backend=%s docs_table=%s chunks_table=%s snapshot_doc_id_max=%s snapshot_chunk_id_max=%s multilingual=true", + embedding_backend or "baseline", + docs_table, + chunks_table, + snapshot_doc_id_max, + snapshot_chunk_id_max, + ) + + # 3. 
각 embedding에 대해 doc + chunks 병렬 retrieval Session = async_sessionmaker(engine) async def _one_query(q_meta: dict, embedding_str: str) -> list["SearchResult"]: async def _docs() -> list["SearchResult"]: async with Session() as s: - return await _search_vector_docs(s, embedding_str, limit * 4) + return await _search_vector_docs( + s, embedding_str, limit * 4, + docs_table=docs_table, + snapshot_doc_id_max=snapshot_doc_id_max, + ) async def _chunks() -> list["SearchResult"]: async with Session() as s: - return await _search_vector_chunks(s, embedding_str, limit * 4) + return await _search_vector_chunks( + s, embedding_str, limit * 4, + chunks_table=chunks_table, + snapshot_chunk_id_max=snapshot_chunk_id_max, + ) doc_r, chunk_r = await asyncio.gather(_docs(), _chunks()) return _merge_doc_and_chunk_vectors(doc_r, chunk_r) diff --git a/app/services/search/search_pipeline.py b/app/services/search/search_pipeline.py index 29b7980..e845c92 100644 --- a/app/services/search/search_pipeline.py +++ b/app/services/search/search_pipeline.py @@ -121,6 +121,9 @@ async def run_search( fusion: str = DEFAULT_FUSION, rerank: bool = True, analyze: bool = False, + embedding_backend: str | None = None, + snapshot_doc_id_max: int | None = None, + snapshot_chunk_id_max: int | None = None, ) -> PipelineResult: """검색 파이프라인 실행. 
@@ -214,9 +217,19 @@ async def run_search( if mode == "vector": t0 = time.perf_counter() if use_multilingual: - raw_chunks = await search_vector_multilingual(session, normalized_queries, limit) + raw_chunks = await search_vector_multilingual( + session, normalized_queries, limit, + embedding_backend=embedding_backend, + snapshot_doc_id_max=snapshot_doc_id_max, + snapshot_chunk_id_max=snapshot_chunk_id_max, + ) else: - raw_chunks = await search_vector(session, q, limit) + raw_chunks = await search_vector( + session, q, limit, + embedding_backend=embedding_backend, + snapshot_doc_id_max=snapshot_doc_id_max, + snapshot_chunk_id_max=snapshot_chunk_id_max, + ) timing["vector_ms"] = (time.perf_counter() - t0) * 1000 if not raw_chunks: notes.append("vector_search_returned_empty (AI client error or no embeddings)") @@ -231,9 +244,19 @@ async def run_search( if mode == "hybrid": t1 = time.perf_counter() if use_multilingual: - raw_chunks = await search_vector_multilingual(session, normalized_queries, limit) + raw_chunks = await search_vector_multilingual( + session, normalized_queries, limit, + embedding_backend=embedding_backend, + snapshot_doc_id_max=snapshot_doc_id_max, + snapshot_chunk_id_max=snapshot_chunk_id_max, + ) else: - raw_chunks = await search_vector(session, q, limit) + raw_chunks = await search_vector( + session, q, limit, + embedding_backend=embedding_backend, + snapshot_doc_id_max=snapshot_doc_id_max, + snapshot_chunk_id_max=snapshot_chunk_id_max, + ) timing["vector_ms"] = (time.perf_counter() - t1) * 1000 # chunk-level → doc-level 압축 (raw chunks는 chunks_by_doc에 보존) diff --git a/reports/phase_2a_embedding_decision_2026-05-23.md b/reports/phase_2a_embedding_decision_2026-05-23.md new file mode 100644 index 0000000..6b38f9a --- /dev/null +++ b/reports/phase_2a_embedding_decision_2026-05-23.md @@ -0,0 +1,97 @@ +# Phase 2A Embedding Decision Report (2026-05-23) + +> Parent: `phase-2a-embedding-diagnose.md` v4 +> +> Round 2 review: 
`round-2-review-mighty-starfish.md` 채택 +> +> 본 보고서 = Phase 4 산출물. Decision Tree H1~H4 중 권고 1개 + 후속 PR 후보. + +## 1. Summary + +| | Value | +|---|---| +| baseline (bge-m3, snapshot 범위) | NDCG@10 (graded) **0.659** / mixed 0.39 / korean_only 0.51 / failure 0/5 / p50 464ms / p95 1582ms | +| baseline rebaseline (snapshot filter 적용) | 위와 동일 (snapshot 범위 = corpus 전부와 거의 동일, 측정 가능 확인) | +| 후보 2종 측정 완료 | me5_large_inst (mE5-instruct), snowflake_l_v2 (Snowflake Arctic L v2.0) | +| 이관 (별 PR) | bge_mgemma2 (9B FP16 → 16GB GPU OOM risk → PR-2A-Extended-Bge-Mgemma2) | +| 폐기 | ko_me5 (HF 401 Unauthorized) | + +## 2. 후보별 Δ NDCG (vs baseline rebaseline) + +| Candidate | overall NDCG | Δ overall | mixed | Δ mixed | korean_only | Δ korean | standards | english_only | exam | failure | p50 ms | p95 ms | +|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| **bge-m3 snapshot rebaseline** | **0.659** | — | **0.39** | — | **0.51** | — | 0.87 | 0.78 | 0.74 | 0/5 | 464 | 1582 | +| mE5-large-instruct | 0.477 | **-0.182** | 0.17 | **-0.22** | 0.47 | -0.04 | 0.54 | 0.63 | 0.62 | 0/5 | 194 | 1348 | +| snowflake-arctic-embed-l-v2.0 | 0.616 | -0.043 | 0.35 | -0.04 | 0.52 | +0.01 | 0.87 | 0.74 | 0.56 | 0/5 | 254 | 1412 | + +**관찰**: +- **mE5-large-instruct**: 전 카테고리 큰 회귀. Δ -0.182 overall. mixed 절반 회귀 (0.39 → 0.17). standards 도 큰 회귀 (0.87 → 0.54). 단 latency p50 270ms 단축 (mE5 의 512 context = 적은 compute). +- **snowflake_l_v2**: 가벼운 회귀 (Δ -0.043). standards / korean_only 거의 동일. mixed 약간 회귀. exam 명확 회귀 (-0.18). latency p50 210ms 단축. + +**ambiguous note** (LLM 단독 결정, [[feedback_user_block_minimize]]): +- mE5-instruct 는 query input 에 `Instruct: {task_description}\nQuery: {query}` prefix 권장 (intfloat 모델 카드). 본 PR 측정은 plain query → prefix 효과 미반영. prefix 적용 시 +0.05~0.15 회복 가능성 있으나 측정 외 — 별 PR 후보 `PR-2A-mE5-Prefix-Retry`. +- snowflake_l_v2 의 한국어 specific 벤치마크 공개 부재. 본 측정 = 사실상 한국어 specific 첫 audit. korean_only +0.01 미세 개선 신호 있으나 통계적 의미 없음 (n=9, 0.51 vs 0.52). + +## 3. 
Latency 영향 + +- mE5 (512 ctx): p50 464 → 194 (**−270ms**), p95 1582 → 1348 (−234ms). 빠름. +- snowflake (8192 ctx): p50 464 → 254 (**−210ms**), p95 1582 → 1412 (−170ms). 빠름. +- 둘 다 baseline 보다 빠르지만 quality 회귀. **trade-off favor quality** ([[feedback_quant_expectation_not_hard_gate]] 룰, 정량 hard gate 없으나 +0 NDCG 회복 시 latency 30% 단축 가치는 별 평가). + +## 4. Decision (H3 — bge-m3 유지) + +| | H1 swap 권고 | H2 query rewrite 보완 | **H3 bge-m3 유지 (✅ 선택)** | H4 latency 회귀 | +|---|---|---|---|---| +| 조건 | mixed + korean_only 둘 다 명확 개선 | korean_only 만 개선 / mixed 미개선 | 모든 후보 bge-m3 대비 개선 없음 | latency p95 ≥ 3000ms | +| 결과 | ❌ 둘 다 회귀 | ❌ korean_only 미세 개선 (+0.01) 만, mixed 회귀 | ✅ **확정** | ❌ 둘 다 baseline 보다 빠름 | + +**최종 권고**: **bge-m3 유지** (Apply PR 진입 안 함). + +근거: +- mE5 -0.182 / snowflake -0.043 — 둘 다 net 회귀. +- korean_only 약점 보완 도구로 embedding swap 보다 query rewrite (Phase 2Q) 또는 reranker 튜닝 (Phase 2B) 가 더 유망. +- mE5 prefix retry 는 별 PR 로 분리 — diagnose 본 PR scope 외. + +## 5. Apply / 보완 / 보류 권고 + +- **Apply** (production embedding swap): **하지 않음**. +- **보완** (다른 트랙): **Phase 2B (Reranker)** 또는 **Phase 2Q (Query rewrite)** 우선 — korean_only / mixed 약점 다른 layer 에서 공략. +- **보류** (Phase 2A-Extended): bge_mgemma2 (별 PR), mE5 prefix retry (별 PR), Cloud embedding (Cohere/Voyage) scaffold-only (별 PR). + +## 6. 후보 cleanup 일정 + +- 미선택 후보 4 테이블 (`documents_cand_me5_large_inst` / `document_chunks_cand_me5_large_inst` / `documents_cand_snowflake_l_v2` / `document_chunks_cand_snowflake_l_v2`) = **1주 dormant 유지** (mE5 prefix retry / Phase 2Q 비교 baseline 사용 가능성). +- 1주 후 별 chore `PR-2A-Chunks-Cand-Cleanup-1` 에서 DROP + 컨테이너 docker-compose.override 제거. + +## 7. 후속 PR 후보 (백로그) + +| PR 가칭 | trigger | scope | +|---|---|---| +| `PR-2A-mE5-Prefix-Retry` | 본 PR 결과 + ambiguous note | mE5-instruct query prefix 적용 후 재측정. 페어 reindex 재실행 + 51 case 재측정. 본 PR 의 dispatcher 재사용 (`CANDIDATE_BACKEND_MAP` 에 신규 slug 추가). 
| +| `PR-2A-Extended-Bge-Mgemma2` | v3 short-list swap 결정 | 9B FP16 OOM 회피 (quantization int8 또는 sentence-transformers). 별 컨테이너 + reindex + 측정. | +| `PR-2A-Cloud-Embedding-Scaffold-1` | (선택) self-hosted 무개선 확정 | Cohere / Voyage scaffold-only (`[[feedback_scaffold_first_for_external_cost_pr]]`). 실비 0. | +| `PR-Search-Query-Rewrite-1` (Phase 2Q) | korean_only / mixed 약점 보완 | 자연어 query → SQL/keyword 강화. | +| `PR-Search-Reranker-V2-Diagnose` (Phase 2B) | korean_only / mixed 약점 보완 | bge-reranker-v2-m3 swap 후보 측정. | +| `PR-2A-Chunks-Cand-Cleanup-1` | 본 PR closure 후 1주 | 4 cand 테이블 DROP + 컨테이너 정리. | + +## 8. Closure gate verify (§ 8 본 plan) + +- [x] G0-1 + G0-2 fixture 박제 (Phase 1 closure 시 commit `943ac5f`) +- [x] snapshot json 박제 (`v0_2_phase2a_snapshot_2026-05-23.json`, commit `a67df0a`) +- [x] 2 후보 (me5_large_inst + snowflake_l_v2) 51 case 측정 완료 (`overall.n = 46`, 5 failure 제외) +- [x] baseline rebaseline 51 case 측정 완료 (snapshot filter 적용) +- [x] 후보별 baseline json 2개 + baseline_snapshot json 1개 박제 +- [x] `documents_cand_<slug>` row count = 21365 verify (2 후보 동일) +- [x] `document_chunks_cand_<slug>` row count = 30605 verify (2 후보 동일) +- [x] baseline rebaseline 측정도 동일 snapshot_doc/chunk_id_max filter 통과 verify (dispatch log) +- [x] dispatcher 호출 시 unknown slug → HTTP 400 verify (smoke test `cand_invalid` 통과) +- [x] decision md 박제 (본 파일) +- [x] Apply 권고 1줄 작성 (H3) +- [x] production embedding (bge-m3 ollama) 변경 0 verify (`docker compose ps`, `ollama list`, `config.yaml` diff 0) +- [x] production `documents` row count + embedding 변경 0 verify +- [x] production `document_chunks` row count + content 변경 0 verify +- [x] 후보 cleanup 일정 명시 (1주 dormant → `PR-2A-Chunks-Cand-Cleanup-1`) +- [x] dispatch log audit (silent fallback 0, `embedding_backend_unavailable` 0, snapshot id 박제 verify) +- [x] DOCSRV_TOKEN 만료 사고 0 (3 측정 모두 15분 이내 완주) + +**Phase 2A Diagnose PR closure: PASS**. 
diff --git a/reports/v0_2_phase2a_baseline_snapshot_2026-05-23.csv b/reports/v0_2_phase2a_baseline_snapshot_2026-05-23.csv new file mode 100644 index 0000000..4815b17 --- /dev/null +++ b/reports/v0_2_phase2a_baseline_snapshot_2026-05-23.csv @@ -0,0 +1,52 @@ +label,id,category,legacy_category,intent,domain_hint,language,ocr_derived,failure_expected,query,relevant_ids,graded_relevance,returned_ids_top10,latency_ms,recall_at_10,mrr_at_10,ndcg_at_10,top3_hit,graded_ndcg_at_10,graded_recall_at_10_t2,graded_recall_at_10_t3,error +single,kw_001,standards,exact_keyword,fact_lookup,document,ko,0,0,산업안전보건법 제6장,3856;3868;3879,3856:3;3868:2;3879:2,3868;3879;3856;3851;4041;3890;3917;3863;3908;3855,418.5,1.000,1.000,1.000,1,0.808,1.000,1.000, +single,kw_002,standards,exact_keyword,fact_lookup,document,ko,0,0,중대재해 처벌 등에 관한 법률 제2장 중대산업재해,3917;3921,3917:3;3921:2,3921;3917;3919;10573;10571;3916;3874;3918;3854;3922,464.3,1.000,1.000,1.000,1,0.834,1.000,1.000, +single,kw_003,standards,exact_keyword,fact_lookup,document,ko,0,0,화학물질관리법 유해화학물질 영업자,3981,3981:3,3981;3985;3984;3993;3857;3978;3983;3957;3980;3903,291.5,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_004,standards,exact_keyword,fact_lookup,document,ko,0,0,근로기준법 안전과 보건,4041,4041:3,4041;3852;3851;3877;3905;3878;3858;3903;3781;3881,478.4,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_005,standards,exact_keyword,fact_lookup,document,ko,0,0,산업안전보건기준에 관한 규칙 보호구,3888,3888:3,10570;3888;3912;3913;3911;3905;3909;3906;3910;3893,489.4,1.000,0.500,0.631,1,0.631,1.000,1.000, +single,nl_001,korean_only,natural_language_ko,semantic_search,document,ko,0,0,기계로 인한 산업재해 관련 법령,3856;3868;3879;3854,3854:1;3856:3;3868:2;3879:2,5244;5249;3897;3863;5253;3856;3895;3867;3879;3851,505.3,0.500,0.167,0.257,0,0.314,0.667,1.000, +single,nl_002,korean_only,natural_language_ko,semantic_search,document,ko,0,0,사업주가 도급을 줄 때 산업재해를 예방하기 위해 해야 할 
일,3855;3867;3878,3855:3;3867:2;3878:2,3855;5227;3854;5244;3851;3867;3878;3863;3908;10573,460.3,1.000,1.000,0.793,1,0.873,1.000,1.000, +single,nl_003,korean_only,natural_language_ko,semantic_search,document,ko,0,0,유해화학물질을 다루는 회사가 지켜야 할 안전 의무,3980;3981;3982,3980:2;3981:2;3982:2,3980;3904;3903;3909;3905;3981;3760;5253;3985;3896,400.1,0.667,1.000,0.636,1,0.636,0.667,0.000, +single,nl_004,korean_only,natural_language_ko,semantic_search,document,ko,0,0,중대재해가 발생했을 때 경영책임자가 처벌받는 기준,3916;3917;3920;3921,3916:2;3917:3;3920:2;3921:2,10572;3917;3916;3918;5227;3854;3877;3922;5240;5226,363.9,0.500,0.500,0.441,1,0.506,0.500,1.000, +single,nl_005,korean_only,natural_language_ko,semantic_search,document,ko,0,0,안전보건교육은 누가 받아야 하고 어떤 내용을 다루는가,3853;3865,3853:3;3865:2,3853;3876;5249;5234;4025;6675;11677;10573;3757;3811,593.8,0.500,1.000,0.613,1,0.787,0.500,1.000, +single,news_001,korean_only,news_ko,semantic_search,news,ko,0,0,이란과 미국의 군사 충돌,4303;4304;4307;4316;4322;4323;4327;4335,4303:2;4304:2;4307:2;4316:2;4322:2;4323:2;4327:2;4335:2,16081;18077;22048;12213;23984;15793;4321;21273;21276;4307,465.2,0.125,0.100,0.073,1,0.073,0.125,0.000, +single,news_002,korean_only,news_ko,semantic_search,news,ko,0,0,호르무즈 해협 봉쇄,4316;4320;4322;4327,4316:3;4320:2;4322:2;4327:2,16532;21706;15922;17123;21890;22049;4346;9022;4767;6067,284.7,0.000,0.000,0.000,0,0.000,0.000,0.000, +single,misc_001,korean_only,other_domain,fact_lookup,document,ko,0,0,강체의 평면 운동학,4063;4065,4063:3;4065:2,4063;4064;4065;4066;4071;4068;4069;5063;5105;4067,568.8,1.000,1.000,0.920,1,0.956,1.000,1.000, +single,misc_002,korean_only,other_domain,semantic_search,document,ko,0,0,질점의 운동역학,4060;4061;4062,4060:2;4061:2;4062:2,4070;4062;4059;4058;4060;4063;4066;4071;4064;5095,515.5,0.667,0.500,0.478,1,0.478,0.667,0.000, +single,news_003,english_only,news_en,semantic_search,news,en,0,0,Trump Iran ultimatum,4258;4260;4262,4258:2;4260:2;4262:2,4775;23446;4776;4202;4679;24382;21155;4668;4199;21855,269.1,0.000,0.000,0.000,1,0.000,0.000,0.000, 
+single,cl_001,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,기계 안전 가드 설계 원리,3770;3856,3770:3;3856:2,5239;3770;3817;4540;5244;3762;3789;5249;3791;3793,545.1,0.500,0.500,0.387,1,0.497,0.500,1.000, +single,cl_002,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,산업 안전 입문서,3755;3775;3776;3777,3755:2;3775:2;3776:2;3777:2,5244;5236;5249;5229;3755;3774;3761;5230;10573;3787,470.0,0.250,0.200,0.151,1,0.151,0.250,0.000, +single,cl_003,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,전기 안전 위험,3772;3790,3772:2;3790:2,3790;3772;5260;3897;5248;3771;3769;11671;13936;3755,739.8,1.000,1.000,1.000,1,1.000,1.000,0.000, +single,news_004,mixed,news_fr,semantic_search,news,mixed,0,0,guerre en Iran,4199;4202;4210;4361;4363;4507;4519;4521,4199:2;4202:2;4210:2;4361:2;4363:2;4507:2;4519:2;4521:2,22342;19576;17069;15924;16935;23149;16019;16462;16010;4776,324.6,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,news_005,mixed,news_crosslingual,semantic_search,news,mixed,0,0,이란 미국 전쟁 글로벌 반응,4202;4258;4262;4536;4303;4304;4316,4202:2;4258:2;4262:2;4303:2;4304:2;4316:2;4536:2,16761;21275;16927;20893;16771;17242;4329;20886;4457;4307,513.2,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_001,failure_expected,failure_expected,semantic_search,document,mixed,0,1,Rust async runtime tokio scheduler 내부 구조,,,5161;5262;23732;24155;4546;20758;5145;4547;3774;5180,394.6,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_002,failure_expected,failure_expected,semantic_search,document,ko,0,1,양자컴퓨터 큐비트 디코히어런스,,,16289;5089;5092;5250;22202;20507;5070;5118;5173;23605,301.6,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_003,failure_expected,failure_expected,semantic_search,news,ko,0,1,재즈 보컬리스트 빌리 홀리데이,,,20022;20470;4634;15361;16059;9102;23336;18286;16218;5738,264.4,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,kw_006,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준에 관한 규칙 작업장 
통로,3886;3887,3886:3;3887:2,3886;3902;3887;3895;3898;3885;3905;3908;3911;3915,338.8,1.000,1.000,0.920,1,0.956,1.000,1.000, +single,kw_007,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준 폭발 화재 위험물 누출 방지,3896;3766,3766:1;3896:3,3896;13930;3895;3911;13929;3866;3903;3890;3910;3909,295.6,0.500,1.000,0.613,1,0.917,1.000,1.000, +single,kw_008,standards,standards,fact_lookup,document,ko,0,0,고압가스 안전관리법 전문,4025;4026,4025:3;4026:2,11644;11579;4025;4026;11645;13750;11676;13299;13749;13766,444.4,1.000,0.333,0.571,1,0.539,1.000,1.000, +single,kw_009,standards,standards,fact_lookup,document,ko,0,0,KGS FP111 가스설비 배관설비 기준,13305,13305:3,13305;13311;13306;13312;13302;13304;13309;13299;13313;13918,420.0,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_010,standards,standards,fact_lookup,document,ko,0,0,KGS FU551 가스설비 압력조정기 가스계량기,13652,13652:3,13652;11689;13657;13655;13656;13649;13651;13752;13659;13650,333.7,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_011,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준 전기로 인한 위험 방지,3897;3772,3772:1;3897:3,3897;3895;3902;3896;3887;13935;13938;3877;3900;3899,450.7,0.500,1.000,0.613,1,0.917,1.000,1.000, +single,en_001,english_only,english_only,semantic_search,document,en,0,0,pressure vessel flange design,5144;5136,5136:2;5144:3,5144;5136;5180;5193;5140;5137;5149;5178;5207;5148,1618.0,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,en_002,english_only,english_only,semantic_search,document,en,0,0,ASME Section VIII introduction,5204;5206,5204:3;5206:2,5204;5180;5208;5210;5143;5206;5137;5207;5182;5140,1458.1,1.000,1.000,0.832,1,0.907,1.000,1.000, +single,en_003,english_only,english_only,semantic_search,document,en,0,0,impact test requirements ASME,5205;5148,5148:1;5205:3,5205;5204;5178;5214;5224;5210;5148;5145;5186;5190,1600.2,1.000,1.000,0.818,1,0.961,1.000,1.000, +single,en_004,english_only,english_only,semantic_search,document,en,0,0,design of vessel 
supports,5149,5149:3,5149;5141;5137;5139;5136;5140;5186;5178;5145;5143,1564.2,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,en_005,english_only,english_only,semantic_search,document,en,0,0,hydrogen piping ASME code,5178,5178:3,5178;5210;5180;5182;5143;5204;5211;5207;5185;5186,1311.5,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,en_006,english_only,english_only,semantic_search,document,en,0,0,ASME welding qualification requirements,5209;3771,3771:1;5209:3,5204;5224;5208;5209;5205;5178;5180;5225;5187;5186,1417.4,0.500,0.250,0.264,0,0.395,1.000,1.000, +single,en_007,english_only,english_only,semantic_search,document,en,0,0,pressure vessel fabrication and inspection,5208;5145,5145:2;5208:3,5208;5189;5192;5180;5187;5186;5212;5188;5182;5137,1664.3,0.500,1.000,0.613,1,0.787,0.500,1.000, +single,en_008,english_only,english_only,semantic_search,document,en,0,0,Industrial Safety and Health Management ergonomics,3763;3755,3755:1;3763:3,3763;3759;3774;3755;3818;3812;3778;3756;3761;3771,1076.8,1.000,1.000,0.877,1,0.974,1.000,1.000, +single,cl_004,mixed,mixed,semantic_search,document,mixed,0,0,ASME 압력용기 설계 실무,5207;5210;5139;5135,5135:2;5139:2;5207:3;5210:3,5139;5207;5204;5225;5206;5208;5210;5137;5182;5145,749.0,0.750,1.000,0.767,1,0.686,0.750,1.000, +single,cl_005,mixed,mixed,semantic_search,document,mixed,0,0,ASME 용접 코드 해설,5224;5209,5209:2;5224:3,5224;5222;5225;5209;5180;5204;5210;5205;5178;5143,709.8,1.000,1.000,0.877,1,0.932,1.000,1.000, +single,cl_006,mixed,mixed,semantic_search,document,mixed,0,0,pressure vessel Korean industrial safety regulation,4025;3856;5136,3856:2;4025:2;5136:1,5210;5186;13913;5143;13760;13749;5145;5180;5240;5137,741.3,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,cl_007,mixed,mixed,semantic_search,document,mixed,0,0,KGS 코드 LPG safety standard,11647;11689;11645;4025,4025:1;11645:2;11647:3;11689:2,11647;13760;13674;13669;13774;13773;13675;13755;13924;13772,371.6,0.250,1.000,0.390,1,0.647,0.333,1.000, 
+single,cl_008,mixed,mixed,semantic_search,document,mixed,0,0,수소 가스 안전 기준 hydrogen safety,5178;5169,5169:2;5178:3,10575;11671;11649;11648;13915;5241;11563;5173;5177;11653,636.2,0.000,0.000,0.000,0,0.000,0.000,0.000, +single,exam_001,exam,exam,fact_lookup,document,ko,0,0,레이놀즈수 정의 공식,11504;11505,11504:3;11505:2,11504;11533;5081;11509;11476;11486;5064;3788;5134;5075,528.8,0.500,1.000,0.613,1,0.787,0.500,1.000, +single,exam_002,exam,exam,fact_lookup,document,ko,0,0,탱크 바닥 구멍 유체 유속 토리첼리,11500;11495;11496,11495:2;11496:2;11500:3,11500;11495;11501;5139;5090;5178;11515;5210;11493;11719,329.0,0.667,1.000,0.765,1,0.856,0.667,1.000, +single,exam_003,exam,exam,fact_lookup,document,ko,0,0,이상기체 음속 마하수,11514;11515,11514:3;11515:2,11514;11475;5090;5084;11531;11476;11473;5093;11479;5124,585.5,0.500,1.000,0.613,1,0.787,0.500,1.000, +single,exam_004,exam,exam,fact_lookup,document,ko,0,0,고압가스 용기 내압시험 영구증가량,11591;11644;11691,11591:3;11644:2;11691:2,11591;11664;13948;13660;5177;13652;11665;13917;11660;13752,351.0,0.333,1.000,0.469,1,0.674,0.333,1.000, +single,exam_005,exam,exam,fact_lookup,document,ko,0,0,도시가스 배관 매설 이격거리,11627;11625;11646,11625:2;11627:3;11646:1,11627;11658;11600;11625;11692;13918;13751;5177;13653;13753,359.0,0.667,1.000,0.671,1,0.883,1.000,1.000, +single,exam_006,exam,exam,fact_lookup,document,ko,0,0,LPG 저장탱크 안전거리 분말소화기,11617;11669;11620,11617:3;11620:1;11669:2,11595;11616;13669;11617;11649;11655;11690;11658;11653;11689,299.7,0.333,0.250,0.202,0,0.321,0.500,1.000, +single,exam_007,exam,exam,fact_lookup,document,ko,0,0,오리피스 차압식 유량계,11712;11711;11503,11503:2;11711:2;11712:3,11711;11712;11503;11500;11713;11714;13930;11717;11701;11502,357.6,1.000,1.000,1.000,1,0.858,1.000,1.000, +single,fail_004,failure_expected,failure_expected,fact_lookup,document,ko,0,1,KGS AC999 임의 가스 코드,,,11691;11693;11692;13665;13661;13664;13666;13670;13773;13934,360.5,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_005,failure_expected,failure_expected,fact_lookup,document,ko,0,1,초전도 안전 관리법 
시행규칙,,,4026;5236;3977;3971;3966;4018;3972;3973;3974;3895,420.6,0.000,0.000,0.000,1,0.000,0.000,0.000, diff --git a/reports/v0_2_phase2a_me5_large_inst_2026-05-23.csv b/reports/v0_2_phase2a_me5_large_inst_2026-05-23.csv new file mode 100644 index 0000000..fbe8d71 --- /dev/null +++ b/reports/v0_2_phase2a_me5_large_inst_2026-05-23.csv @@ -0,0 +1,52 @@ +label,id,category,legacy_category,intent,domain_hint,language,ocr_derived,failure_expected,query,relevant_ids,graded_relevance,returned_ids_top10,latency_ms,recall_at_10,mrr_at_10,ndcg_at_10,top3_hit,graded_ndcg_at_10,graded_recall_at_10_t2,graded_recall_at_10_t3,error +single,kw_001,standards,exact_keyword,fact_lookup,document,ko,0,0,산업안전보건법 제6장,3856;3868;3879,3856:3;3868:2;3879:2,10573;3868;3854;3879;3890;3856;3971;3867;3910;3876,129.9,1.000,0.500,0.665,0,0.546,1.000,1.000, +single,kw_002,standards,exact_keyword,fact_lookup,document,ko,0,0,중대재해 처벌 등에 관한 법률 제2장 중대산업재해,3917;3921,3917:3;3921:2,3921;3917;10573;3919;3923;3916;3874;3854;3918;3922,243.4,1.000,1.000,1.000,1,0.834,1.000,1.000, +single,kw_003,standards,exact_keyword,fact_lookup,document,ko,0,0,화학물질관리법 유해화학물질 영업자,3981,3981:3,3985;3981;3978;3984;3983;3980;3904;3869;3979;3988,115.7,1.000,0.500,0.631,1,0.631,1.000,1.000, +single,kw_004,standards,exact_keyword,fact_lookup,document,ko,0,0,근로기준법 안전과 보건,4041,4041:3,4041;3851;3858;5227;3881;4036;4040;4045;10573;3853,273.0,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_005,standards,exact_keyword,fact_lookup,document,ko,0,0,산업안전보건기준에 관한 규칙 보호구,3888,3888:3,3895;3890;3901;3899;3910;3905;3915;3911;3894;3913,476.0,0.000,0.000,0.000,0,0.000,0.000,0.000, +single,nl_001,korean_only,natural_language_ko,semantic_search,document,ko,0,0,기계로 인한 산업재해 관련 법령,3856;3868;3879;3854,3854:1;3856:3;3868:2;3879:2,5244;5253;3895;3868;3879;3856;3921;3854;3923;3915,289.2,1.000,0.250,0.581,0,0.486,1.000,1.000, +single,nl_002,korean_only,natural_language_ko,semantic_search,document,ko,0,0,사업주가 도급을 줄 때 산업재해를 예방하기 위해 해야 할 
일,3855;3867;3878,3855:3;3867:2;3878:2,3867;3855;10571;10573;3917;5231;3878;3918;3851;3854,182.6,1.000,1.000,0.922,1,0.810,1.000,1.000, +single,nl_003,korean_only,natural_language_ko,semantic_search,document,ko,0,0,유해화학물질을 다루는 회사가 지켜야 할 안전 의무,3980;3981;3982,3980:2;3981:2;3982:2,11686;3980;3903;3869;3918;3981;3985;3854;3896;3955,127.6,0.667,0.500,0.463,1,0.463,0.667,0.000, +single,nl_004,korean_only,natural_language_ko,semantic_search,document,ko,0,0,중대재해가 발생했을 때 경영책임자가 처벌받는 기준,3916;3917;3920;3921,3916:2;3917:3;3920:2;3921:2,10571;10572;10573;3918;3917;3921;3877;3923;3854;11677,99.8,0.500,0.200,0.290,0,0.323,0.500,1.000, +single,nl_005,korean_only,natural_language_ko,semantic_search,document,ko,0,0,안전보건교육은 누가 받아야 하고 어떤 내용을 다루는가,3853;3865,3853:3;3865:2,11677;10573;5234;3876;13926;5249;13935;11676;3853;3921,274.7,0.500,0.111,0.185,0,0.237,0.500,1.000, +single,news_001,korean_only,news_ko,semantic_search,news,ko,0,0,이란과 미국의 군사 충돌,4303;4304;4307;4316;4322;4323;4327;4335,4303:2;4304:2;4307:2;4316:2;4322:2;4323:2;4327:2;4335:2,22907;21273;23757;21276;23571;18077;16526;15922;15911;15919,194.4,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,news_002,korean_only,news_ko,semantic_search,news,ko,0,0,호르무즈 해협 봉쇄,4316;4320;4322;4327,4316:3;4320:2;4322:2;4327:2,20240;23572;22067;20898;16532;18087;17123;15922;15918;4327,100.8,0.250,0.100,0.113,0,0.074,0.250,0.000, +single,misc_001,korean_only,other_domain,fact_lookup,document,ko,0,0,강체의 평면 운동학,4063;4065,4063:3;4065:2,4063;4071;4066;4064;4065;5105;4067;5086;5064;4068,198.8,1.000,1.000,0.850,1,0.918,1.000,1.000, +single,misc_002,korean_only,other_domain,semantic_search,document,ko,0,0,질점의 운동역학,4060;4061;4062,4060:2;4061:2;4062:2,4062;4060;4064;4059;4070;4058;4068;4061;4066;5086,171.1,1.000,1.000,0.913,1,0.913,1.000,0.000, +single,news_003,english_only,news_en,semantic_search,news,en,0,0,Trump Iran 
ultimatum,4258;4260;4262,4258:2;4260:2;4262:2,21441;22075;20274;23242;21897;20440;18428;16404;17008;16823,70.8,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,cl_001,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,기계 안전 가드 설계 원리,3770;3856,3770:3;3856:2,5239;5244;11736;11638;11675;11634;11656;11737;11648;5236,214.6,0.000,0.000,0.000,0,0.000,0.000,0.000, +single,cl_002,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,산업 안전 입문서,3755;3775;3776;3777,3755:2;3775:2;3776:2;3777:2,5249;11637;6674;5230;11737;11638;11676;3876;3867;3859,153.1,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,cl_003,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,전기 안전 위험,3772;3790,3772:2;3790:2,13938;11565;13937;11572;11737;13769;13943;3897;5260;4020,516.6,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,news_004,mixed,news_fr,semantic_search,news,mixed,0,0,guerre en Iran,4199;4202;4210;4361;4363;4507;4519;4521,4199:2;4202:2;4210:2;4361:2;4363:2;4507:2;4519:2;4521:2,23149;25056;22342;16351;16842;17069;16457;4688;4670;4507,119.4,0.125,0.100,0.073,1,0.073,0.125,0.000, +single,news_005,mixed,news_crosslingual,semantic_search,news,mixed,0,0,이란 미국 전쟁 글로벌 반응,4202;4258;4262;4536;4303;4304;4316,4202:2;4258:2;4262:2;4303:2;4304:2;4316:2;4536:2,22907;17128;17242;19111;16526;16761;4761;4307;4457;4452,313.2,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_001,failure_expected,failure_expected,semantic_search,document,mixed,0,1,Rust async runtime tokio scheduler 내부 구조,,,11733;11698;11735;11613;11711;11736;24508;24268;5215;20238,90.2,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_002,failure_expected,failure_expected,semantic_search,document,ko,0,1,양자컴퓨터 큐비트 디코히어런스,,,11513;11510;11711;11739;11736;11738;11508;11735;11523;11509,44.4,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_003,failure_expected,failure_expected,semantic_search,news,ko,0,1,재즈 보컬리스트 빌리 홀리데이,,,11510;11735;23082;23336;11711;11513;11507;11712;11698;11508,43.0,0.000,0.000,0.000,1,0.000,0.000,0.000, 
+single,kw_006,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준에 관한 규칙 작업장 통로,3886;3887,3886:3;3887:2,3895;3913;3901;3899;3910;3905;3890;3915;3908;3911,160.4,0.000,0.000,0.000,0,0.000,0.000,0.000, +single,kw_007,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준 폭발 화재 위험물 누출 방지,3896;3766,3766:1;3896:3,11565;11637;11572;11636;11568;11673;11678;11634;3896;3894,110.8,0.500,0.111,0.185,0,0.276,1.000,1.000, +single,kw_008,standards,standards,fact_lookup,document,ko,0,0,고압가스 안전관리법 전문,4025;4026,4025:3;4026:2,11644;11676;11693;13941;13299;13749;13766;13306;4026;13302,139.7,0.500,0.111,0.185,0,0.102,0.500,0.000, +single,kw_009,standards,standards,fact_lookup,document,ko,0,0,KGS FP111 가스설비 배관설비 기준,13305,13305:3,13305;13941;13311;13913;13653;13307;13306;13317;13310;13313,111.7,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_010,standards,standards,fact_lookup,document,ko,0,0,KGS FU551 가스설비 압력조정기 가스계량기,13652,13652:3,13652;13941;11689;13752;13655;13319;13653;11690;11612;11693,73.1,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_011,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준 전기로 인한 위험 방지,3897;3772,3772:1;3897:3,13938;3897;13937;3854;3895;3901;3915;3890;3899;3867,194.5,0.500,0.500,0.387,1,0.579,1.000,1.000, +single,en_001,english_only,english_only,semantic_search,document,en,0,0,pressure vessel flange design,5144;5136,5136:2;5144:3,5144;5136;5147;5137;5210;5180;5140;5149;5133;5145,1273.4,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,en_002,english_only,english_only,semantic_search,document,en,0,0,ASME Section VIII introduction,5204;5206,5204:3;5206:2,5210;5137;5212;5178;5144;5180;5145;5147;11634;5141,1116.0,0.000,0.000,0.000,0,0.000,0.000,0.000, +single,en_003,english_only,english_only,semantic_search,document,en,0,0,impact test requirements ASME,5205;5148,5148:1;5205:3,5205;5178;5214;5210;5148;5145;5190;5186;5192;5209,1449.5,1.000,1.000,0.850,1,0.968,1.000,1.000, +single,en_004,english_only,english_only,semantic_search,document,en,0,0,design of 
vessel supports,5149,5149:3,5149;5141;5136;5178;5186;5145;5207;5140;5143;5204,1383.7,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,en_005,english_only,english_only,semantic_search,document,en,0,0,hydrogen piping ASME code,5178,5178:3,5178;5143;5180;5148;5207;5210;5179;5182;5133;5208,1312.7,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,en_006,english_only,english_only,semantic_search,document,en,0,0,ASME welding qualification requirements,5209;3771,3771:1;5209:3,5178;5180;5205;5209;5212;5145;5186;4835;4826;5182,1252.0,0.500,0.250,0.264,0,0.395,1.000,1.000, +single,en_007,english_only,english_only,semantic_search,document,en,0,0,pressure vessel fabrication and inspection,5208;5145,5145:2;5208:3,5208;5187;5191;5186;5188;5190;5148;5182;5143;5210,1480.7,0.500,1.000,0.613,1,0.787,0.500,1.000, +single,en_008,english_only,english_only,semantic_search,document,en,0,0,Industrial Safety and Health Management ergonomics,3763;3755,3755:1;3763:3,3774;3755;3763;3812;3815;3756;3758;3757;3773;3770,744.7,1.000,0.500,0.693,1,0.541,1.000,1.000, +single,cl_004,mixed,mixed,semantic_search,document,mixed,0,0,ASME 압력용기 설계 실무,5207;5210;5139;5135,5135:2;5139:2;5207:3;5210:3,5139;5207;5135;5204;5225;5133;5195;5224;5180;5209,529.6,0.750,1.000,0.832,1,0.628,0.750,0.500, +single,cl_005,mixed,mixed,semantic_search,document,mixed,0,0,ASME 용접 코드 해설,5224;5209,5209:2;5224:3,5224;5222;5225;5209;5135;5133;5208;5205;11601;5206,540.8,1.000,1.000,0.877,1,0.932,1.000,1.000, +single,cl_006,mixed,mixed,semantic_search,document,mixed,0,0,pressure vessel Korean industrial safety regulation,4025;3856;5136,3856:2;4025:2;5136:1,11585;11633;11634;11737;11590;13942;13919;13917;11693;3895,560.0,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,cl_007,mixed,mixed,semantic_search,document,mixed,0,0,KGS 코드 LPG safety standard,11647;11689;11645;4025,4025:1;11645:2;11647:3;11689:2,13764;11652;11690;11693;13941;11689;11650;11669;11651;11653,152.6,0.250,0.167,0.139,0,0.099,0.333,0.000, 
+single,cl_008,mixed,mixed,semantic_search,document,mixed,0,0,수소 가스 안전 기준 hydrogen safety,5178;5169,5169:2;5178:3,5173;5177;11671;11559;11651;11672;11588;11477;11652;5179,403.3,0.000,0.000,0.000,0,0.000,0.000,0.000, +single,exam_001,exam,exam,fact_lookup,document,ko,0,0,레이놀즈수 정의 공식,11504;11505,11504:3;11505:2,11533;11504;5090;11482;11509;11505;11513;11510;11514;11534,236.5,1.000,0.500,0.605,1,0.617,1.000,1.000, +single,exam_002,exam,exam,fact_lookup,document,ko,0,0,탱크 바닥 구멍 유체 유속 토리첼리,11500;11495;11496,11495:2;11496:2;11500:3,11500;11501;11503;11719;11517;11713;11715;11712;11594;11514,105.6,0.333,1.000,0.469,1,0.674,0.333,1.000, +single,exam_003,exam,exam,fact_lookup,document,ko,0,0,이상기체 음속 마하수,11514;11515,11514:3;11515:2,11514;11479;5090;11475;11518;11515;11516;11517;11694;11478,339.3,1.000,1.000,0.832,1,0.907,1.000,1.000, +single,exam_004,exam,exam,fact_lookup,document,ko,0,0,고압가스 용기 내압시험 영구증가량,11591;11644;11691,11591:3;11644:2;11691:2,13660;11691;11591;13943;13942;13917;13653;13752;11579;13753,132.6,0.667,0.500,0.531,1,0.519,0.667,1.000, +single,exam_005,exam,exam,fact_lookup,document,ko,0,0,도시가스 배관 매설 이격거리,11627;11625;11646,11625:2;11627:3;11646:1,11658;11692;11627;11655;13753;11651;11670;11646;11690;11617,151.5,0.667,0.333,0.383,1,0.406,0.500,1.000, +single,exam_006,exam,exam,fact_lookup,document,ko,0,0,LPG 저장탱크 안전거리 분말소화기,11617;11669;11620,11617:3;11620:1;11669:2,11658;11595;11690;11669;11581;11639;11655;11650;11649;11617,113.8,0.667,0.250,0.338,0,0.353,1.000,1.000, +single,exam_007,exam,exam,fact_lookup,document,ko,0,0,오리피스 차압식 유량계,11712;11711;11503,11503:2;11711:2;11712:3,11711;11712;11503;11713;11719;11715;11717;11716;11613;11502,149.2,1.000,1.000,1.000,1,0.858,1.000,1.000, +single,fail_004,failure_expected,failure_expected,fact_lookup,document,ko,0,1,KGS AC999 임의 가스 코드,,,11691;11647;11668;5177;11693;11692;13665;13661;13666;13663,160.4,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_005,failure_expected,failure_expected,fact_lookup,document,ko,0,1,초전도 
안전 관리법 시행규칙,,,5260;4026;3977;3971;3966;3972;3973;3974;3895;4019,218.3,0.000,0.000,0.000,1,0.000,0.000,0.000, diff --git a/reports/v0_2_phase2a_snowflake_l_v2_2026-05-23.csv b/reports/v0_2_phase2a_snowflake_l_v2_2026-05-23.csv new file mode 100644 index 0000000..ad79c1a --- /dev/null +++ b/reports/v0_2_phase2a_snowflake_l_v2_2026-05-23.csv @@ -0,0 +1,52 @@ +label,id,category,legacy_category,intent,domain_hint,language,ocr_derived,failure_expected,query,relevant_ids,graded_relevance,returned_ids_top10,latency_ms,recall_at_10,mrr_at_10,ndcg_at_10,top3_hit,graded_ndcg_at_10,graded_recall_at_10_t2,graded_recall_at_10_t3,error +single,kw_001,standards,exact_keyword,fact_lookup,document,ko,0,0,산업안전보건법 제6장,3856;3868;3879,3856:3;3868:2;3879:2,3879;3868;3890;3863;3856;3908;3851;4041;3862;3873,236.8,1.000,1.000,0.947,0,0.731,1.000,1.000, +single,kw_002,standards,exact_keyword,fact_lookup,document,ko,0,0,중대재해 처벌 등에 관한 법률 제2장 중대산업재해,3917;3921,3917:3;3921:2,3921;3917;3919;3923;10573;10571;3916;3874;3918;3854,243.0,1.000,1.000,1.000,1,0.834,1.000,1.000, +single,kw_003,standards,exact_keyword,fact_lookup,document,ko,0,0,화학물질관리법 유해화학물질 영업자,3981,3981:3,3985;3981;3984;3978;3983;3986;3957;3980;3992;3869,118.4,1.000,0.500,0.631,1,0.631,1.000,1.000, +single,kw_004,standards,exact_keyword,fact_lookup,document,ko,0,0,근로기준법 안전과 보건,4041,4041:3,4041;3852;3851;3858;3881;4036;4040;4045;3913;3912,281.7,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_005,standards,exact_keyword,fact_lookup,document,ko,0,0,산업안전보건기준에 관한 규칙 보호구,3888,3888:3,3888;3893;3887;3897;3892;3890;3896;3895;3902;3889,287.1,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,nl_001,korean_only,natural_language_ko,semantic_search,document,ko,0,0,기계로 인한 산업재해 관련 법령,3856;3868;3879;3854,3854:1;3856:3;3868:2;3879:2,3878;5249;3863;3868;3856;3879;3867;3921;3851;3923,288.6,0.750,0.250,0.458,0,0.468,1.000,1.000, +single,nl_002,korean_only,natural_language_ko,semantic_search,document,ko,0,0,사업주가 도급을 줄 때 산업재해를 예방하기 위해 해야 할 
일,3855;3867;3878,3855:3;3867:2;3878:2,3917;5246;3854;5227;3851;3867;3855;3878;3863;10573,233.7,1.000,0.167,0.472,0,0.418,1.000,1.000, +single,nl_003,korean_only,natural_language_ko,semantic_search,document,ko,0,0,유해화학물질을 다루는 회사가 지켜야 할 안전 의무,3980;3981;3982,3980:2;3981:2;3982:2,3980;3904;3903;3760;3985;3916;3851;3978;3905;3981,175.1,0.667,1.000,0.605,1,0.605,0.667,0.000, +single,nl_004,korean_only,natural_language_ko,semantic_search,document,ko,0,0,중대재해가 발생했을 때 경영책임자가 처벌받는 기준,3916;3917;3920;3921,3916:2;3917:3;3920:2;3921:2,10572;10573;3917;3916;3918;10571;5244;3919;5227;3854,163.1,0.500,0.333,0.363,1,0.410,0.500,1.000, +single,nl_005,korean_only,natural_language_ko,semantic_search,document,ko,0,0,안전보건교육은 누가 받아야 하고 어떤 내용을 다루는가,3853;3865,3853:3;3865:2,3853;5249;4025;5240;10573;11677;3876;3757;3811;3921,358.4,0.500,1.000,0.613,1,0.787,0.500,1.000, +single,news_001,korean_only,news_ko,semantic_search,news,ko,0,0,이란과 미국의 군사 충돌,4303;4304;4307;4316;4322;4323;4327;4335,4303:2;4304:2;4307:2;4316:2;4322:2;4323:2;4327:2;4335:2,18077;22048;12213;4317;21273;21276;15919;16404;17242;15922,259.9,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,news_002,korean_only,news_ko,semantic_search,news,ko,0,0,호르무즈 해협 봉쇄,4316;4320;4322;4327,4316:3;4320:2;4322:2;4327:2,16532;22049;20240;22055;15917;21890;15922;15918;4346;9022,99.0,0.000,0.000,0.000,0,0.000,0.000,0.000, +single,misc_001,korean_only,other_domain,fact_lookup,document,ko,0,0,강체의 평면 운동학,4063;4065,4063:3;4065:2,4063;4065;4066;4064;4071;4068;4058;4069;4067;5064,378.0,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,misc_002,korean_only,other_domain,semantic_search,document,ko,0,0,질점의 운동역학,4060;4061;4062,4060:2;4061:2;4062:2,4061;4060;4062;4070;4059;4064;4065;4066;4063;4058,356.3,1.000,1.000,1.000,1,1.000,1.000,0.000, +single,news_003,english_only,news_en,semantic_search,news,en,0,0,Trump Iran ultimatum,4258;4260;4262,4258:2;4260:2;4262:2,21441;4202;4776;4679;16941;21897;4775;21155;16823;4199,74.6,0.000,0.000,0.000,1,0.000,0.000,0.000, 
+single,cl_001,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,기계 안전 가드 설계 원리,3770;3856,3770:3;3856:2,5244;5239;3758;3791;3770;3817;3763;3787;4540;5253,319.4,0.500,0.200,0.237,0,0.305,0.500,1.000, +single,cl_002,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,산업 안전 입문서,3755;3775;3776;3777,3755:2;3775:2;3776:2;3777:2,5230;5249;3755;3863;3802;3851;3859;3895;3896;3890,215.7,0.250,0.333,0.195,1,0.195,0.250,0.000, +single,cl_003,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,전기 안전 위험,3772;3790,3772:2;3790:2,3897;3790;3772;3775;13935;4020;4021;13934;13938;4018,527.9,1.000,0.500,0.693,1,0.693,1.000,0.000, +single,news_004,mixed,news_fr,semantic_search,news,mixed,0,0,guerre en Iran,4199;4202;4210;4361;4363;4507;4519;4521,4199:2;4202:2;4210:2;4361:2;4363:2;4507:2;4519:2;4521:2,23242;19434;16606;24991;18723;23149;15924;16941;16404;16538,119.0,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,news_005,mixed,news_crosslingual,semantic_search,news,mixed,0,0,이란 미국 전쟁 글로벌 반응,4202;4258;4262;4536;4303;4304;4316,4202:2;4258:2;4262:2;4303:2;4304:2;4316:2;4536:2,23242;15894;16751;19434;22069;16912;18088;17242;16759;4345,305.4,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_001,failure_expected,failure_expected,semantic_search,document,mixed,0,1,Rust async runtime tokio scheduler 내부 구조,,,5161;23732;5262;5061;20758;4550;17810;4546;4547;20036,199.0,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_002,failure_expected,failure_expected,semantic_search,document,ko,0,1,양자컴퓨터 큐비트 디코히어런스,,,5057;4057;5135;5094;22202;5092;5066;5078;17899;23498,174.0,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_003,failure_expected,failure_expected,semantic_search,news,ko,0,1,재즈 보컬리스트 빌리 홀리데이,,,18567;20022;20470;19172;18286;21525;16320;21847;4780;16151,79.8,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,kw_006,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준에 관한 규칙 작업장 
통로,3886;3887,3886:3;3887:2,3886;3887;5249;3881;3912;3892;3898;3896;3888;3893,242.0,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_007,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준 폭발 화재 위험물 누출 방지,3896;3766,3766:1;3896:3,3896;3890;3901;11572;11562;13929;11567;3894;3899;3911,115.8,0.500,1.000,0.613,1,0.917,1.000,1.000, +single,kw_008,standards,standards,fact_lookup,document,ko,0,0,고압가스 안전관리법 전문,4025;4026,4025:3;4026:2,11644;11579;4026;4025;11693;13750;13299;13941;13749;13766,243.5,1.000,0.333,0.571,0,0.508,1.000,1.000, +single,kw_009,standards,standards,fact_lookup,document,ko,0,0,KGS FP111 가스설비 배관설비 기준,13305,13305:3,13305;13311;13306;13312;13302;13304;11688;13309;13313;13310,259.4,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_010,standards,standards,fact_lookup,document,ko,0,0,KGS FU551 가스설비 압력조정기 가스계량기,13652,13652:3,13652;11689;13651;13655;13656;13649;13658;13752;13648;13659,168.3,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,kw_011,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준 전기로 인한 위험 방지,3897;3772,3772:1;3897:3,3897;3886;3887;13935;3895;3902;3896;13938;3877;3900,271.8,0.500,1.000,0.613,1,0.917,1.000,1.000, +single,en_001,english_only,english_only,semantic_search,document,en,0,0,pressure vessel flange design,5144;5136,5136:2;5144:3,5144;5136;5149;5180;5140;5178;5207;5148;5212;5137,1436.2,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,en_002,english_only,english_only,semantic_search,document,en,0,0,ASME Section VIII introduction,5204;5206,5204:3;5206:2,5204;5212;5208;5210;5206;11634;5207;5141;5182;5183,1266.8,1.000,1.000,0.850,1,0.918,1.000,1.000, +single,en_003,english_only,english_only,semantic_search,document,en,0,0,impact test requirements ASME,5205;5148,5148:1;5205:3,5205;5204;5178;5214;5190;5148;5145;5185;5192;5212,1442.7,1.000,1.000,0.832,1,0.964,1.000,1.000, +single,en_004,english_only,english_only,semantic_search,document,en,0,0,design of vessel 
supports,5149,5149:3,5149;5136;5186;5178;5145;5148;5192;5185;5212;5147,1387.4,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,en_005,english_only,english_only,semantic_search,document,en,0,0,hydrogen piping ASME code,5178,5178:3,5178;5204;5207;5143;5147;5179;5180;5137;5210;5182,1294.3,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,en_006,english_only,english_only,semantic_search,document,en,0,0,ASME welding qualification requirements,5209;3771,3771:1;5209:3,5178;5224;5180;5205;5209;5212;5225;5145;4835;4826,1256.9,0.500,0.200,0.237,0,0.355,1.000,1.000, +single,en_007,english_only,english_only,semantic_search,document,en,0,0,pressure vessel fabrication and inspection,5208;5145,5145:2;5208:3,5208;5186;5212;5137;5148;5143;5204;5185;5140;5193,1483.3,0.500,1.000,0.613,1,0.787,0.500,1.000, +single,en_008,english_only,english_only,semantic_search,document,en,0,0,Industrial Safety and Health Management ergonomics,3763;3755,3755:1;3763:3,3812;3763;3813;3756;3755;3757;3815;3774;3814;3770,759.6,1.000,0.500,0.624,1,0.629,1.000,1.000, +single,cl_004,mixed,mixed,semantic_search,document,mixed,0,0,ASME 압력용기 설계 실무,5207;5210;5139;5135,5135:2;5139:2;5207:3;5210:3,5139;5207;5225;5204;5133;5212;5182;5140;5137;5224,567.3,0.500,1.000,0.637,1,0.522,0.500,0.500, +single,cl_005,mixed,mixed,semantic_search,document,mixed,0,0,ASME 용접 코드 해설,5224;5209,5209:2;5224:3,5224;5222;5225;5209;5210;5208;5205;5204;5178;11601,512.5,1.000,1.000,0.877,1,0.932,1.000,1.000, +single,cl_006,mixed,mixed,semantic_search,document,mixed,0,0,pressure vessel Korean industrial safety regulation,4025;3856;5136,3856:2;4025:2;5136:1,5143;3762;5210;11633;5204;3985;5182;3797;5186;11600,562.4,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,cl_007,mixed,mixed,semantic_search,document,mixed,0,0,KGS 코드 LPG safety standard,11647;11689;11645;4025,4025:1;11645:2;11647:3;11689:2,11647;13674;13669;13774;13773;13675;11688;13757;13769;11644,200.8,0.250,1.000,0.390,1,0.647,0.333,1.000, 
+single,cl_008,mixed,mixed,semantic_search,document,mixed,0,0,수소 가스 안전 기준 hydrogen safety,5178;5169,5169:2;5178:3,5241;11671;5177;11653;11568;5173;11538;11579;5178;11648,421.8,0.500,0.111,0.185,0,0.237,0.500,1.000, +single,exam_001,exam,exam,fact_lookup,document,ko,0,0,레이놀즈수 정의 공식,11504;11505,11504:3;11505:2,11533;11504;11482;11509;11505;11513;11510;11476;11712;11486,253.7,1.000,0.500,0.624,1,0.627,1.000,1.000, +single,exam_002,exam,exam,fact_lookup,document,ko,0,0,탱크 바닥 구멍 유체 유속 토리첼리,11500;11495;11496,11495:2;11496:2;11500:3,11500;11495;11501;11503;5071;5139;13771;11515;11719;13307,156.7,0.667,1.000,0.765,1,0.856,0.667,1.000, +single,exam_003,exam,exam,fact_lookup,document,ko,0,0,이상기체 음속 마하수,11514;11515,11514:3;11515:2,11514;11479;5090;11475;11473;11515;11518;5057;11487;11516,381.2,1.000,1.000,0.832,1,0.907,1.000,1.000, +single,exam_004,exam,exam,fact_lookup,document,ko,0,0,고압가스 용기 내압시험 영구증가량,11591;11644;11691,11591:3;11644:2;11691:2,13948;13660;5177;13652;13759;13942;13917;13752;4026;11579,161.2,0.000,0.000,0.000,0,0.000,0.000,0.000, +single,exam_005,exam,exam,fact_lookup,document,ko,0,0,도시가스 배관 매설 이격거리,11627;11625;11646,11625:2;11627:3;11646:1,11658;11600;11625;11692;11627;13751;11655;13753;11624;13652,170.4,0.667,0.333,0.416,0,0.448,1.000,1.000, +single,exam_006,exam,exam,fact_lookup,document,ko,0,0,LPG 저장탱크 안전거리 분말소화기,11617;11669;11620,11617:3;11620:1;11669:2,11595;11605;11655;11658;11690;11653;11669;13670;11639;11649,123.0,0.333,0.143,0.156,0,0.106,0.500,0.000, +single,exam_007,exam,exam,fact_lookup,document,ko,0,0,오리피스 차압식 유량계,11712;11711;11503,11503:2;11711:2;11712:3,11712;11711;11503;11500;11713;13930;11717;11715;11716;11719,158.3,1.000,1.000,1.000,1,1.000,1.000,1.000, +single,fail_004,failure_expected,failure_expected,fact_lookup,document,ko,0,1,KGS AC999 임의 가스 코드,,,11691;11647;11668;11583;11693;11692;13664;13665;13661;13666,161.9,0.000,0.000,0.000,1,0.000,0.000,0.000, +single,fail_005,failure_expected,failure_expected,fact_lookup,document,ko,0,1,초전도 안전 
관리법 시행규칙,,,3977;4026;3971;3966;4018;3972;3973;3974;4019;13913,226.3,0.000,0.000,0.000,1,0.000,0.000,0.000, diff --git a/tests/search_eval/baselines/v0_2_phase2a_baseline_snapshot_2026-05-23.json b/tests/search_eval/baselines/v0_2_phase2a_baseline_snapshot_2026-05-23.json new file mode 100644 index 0000000..7b22b90 --- /dev/null +++ b/tests/search_eval/baselines/v0_2_phase2a_baseline_snapshot_2026-05-23.json @@ -0,0 +1,46 @@ +{ + "version": "v0.2-phase2a", + "label": "baseline_snapshot", + "date": "2026-05-23", + "snapshot": { + "doc_id_max": 25180, + "chunk_id_max": 56526, + "documents_n": 21365, + "chunks_n": 30605 + }, + "eval_set": { + "total_cases": 51, + "scored_cases": 46, + "failure_expected_cases": 5 + }, + "model_config": { + "embedding": "BAAI/bge-m3 (production)", + "reranker": "BAAI/bge-reranker-v2-m3", + "search_mode": "hybrid", + "rerank_enabled": "server_default", + "embedding_backend": "baseline", + "plan": "phase-2a-embedding-diagnose.md v4" + }, + "overall": { + "n": 46, + "graded_ndcg_at_10": 0.659, + "graded_recall_at_10_t2": 0.695, + "graded_recall_at_10_t3": 0.761, + "latency_p50_ms": 464, + "latency_p95_ms": 1582, + "failure_correct": "0/5" + }, + "by_category": { + "english_only": { "n": 9, "recall_at_10": 0.78, "ndcg_at_10": 0.71, "graded_ndcg_at_10": 0.78 }, + "exam": { "n": 7, "recall_at_10": 0.57, "ndcg_at_10": 0.62, "graded_ndcg_at_10": 0.74 }, + "korean_only": { "n": 9, "recall_at_10": 0.55, "ndcg_at_10": 0.47, "graded_ndcg_at_10": 0.51 }, + "mixed": { "n": 10, "recall_at_10": 0.38, "ndcg_at_10": 0.36, "graded_ndcg_at_10": 0.39 }, + "standards": { "n": 11, "recall_at_10": 0.91, "ndcg_at_10": 0.85, "graded_ndcg_at_10": 0.87 } + }, + "by_language": { + "en": { "n": 9, "recall_at_10": 0.78, "graded_ndcg_at_10": 0.78 }, + "ko": { "n": 27, "recall_at_10": 0.70, "graded_ndcg_at_10": 0.72 }, + "mixed": { "n": 10, "recall_at_10": 0.38, "graded_ndcg_at_10": 0.39 } + }, + "raw_csv": "reports/v0_2_phase2a_baseline_snapshot_2026-05-23.csv" +} diff 
--git a/tests/search_eval/baselines/v0_2_phase2a_me5_large_inst_2026-05-23.json b/tests/search_eval/baselines/v0_2_phase2a_me5_large_inst_2026-05-23.json new file mode 100644 index 0000000..3b4d9b3 --- /dev/null +++ b/tests/search_eval/baselines/v0_2_phase2a_me5_large_inst_2026-05-23.json @@ -0,0 +1,60 @@ +{ + "version": "v0.2-phase2a", + "label": "cand_me5_large_inst", + "date": "2026-05-23", + "snapshot": { + "doc_id_max": 25180, + "chunk_id_max": 56526, + "documents_n": 21365, + "chunks_n": 30605 + }, + "eval_set": { + "total_cases": 51, + "scored_cases": 46, + "failure_expected_cases": 5 + }, + "model_config": { + "embedding": "intfloat/multilingual-e5-large-instruct", + "dim": 1024, + "context": 512, + "reranker": "BAAI/bge-reranker-v2-m3", + "search_mode": "hybrid", + "rerank_enabled": "server_default", + "embedding_backend": "cand_me5_large_inst", + "endpoint": "http://embedding-cand-me5-inst:80/embed", + "truncate": true, + "prefix": "NOT_APPLIED — mE5-instruct 권장 'Instruct: ' query prefix 미적용 (별 PR 후보)", + "plan": "phase-2a-embedding-diagnose.md v4" + }, + "overall": { + "n": 46, + "graded_ndcg_at_10": 0.477, + "graded_recall_at_10_t2": 0.622, + "graded_recall_at_10_t3": 0.620, + "latency_p50_ms": 194, + "latency_p95_ms": 1348, + "failure_correct": "0/5" + }, + "by_category": { + "english_only": { "n": 9, "recall_at_10": 0.67, "ndcg_at_10": 0.60, "graded_ndcg_at_10": 0.63 }, + "exam": { "n": 7, "recall_at_10": 0.76, "ndcg_at_10": 0.59, "graded_ndcg_at_10": 0.62 }, + "korean_only": { "n": 9, "recall_at_10": 0.66, "ndcg_at_10": 0.48, "graded_ndcg_at_10": 0.47 }, + "mixed": { "n": 10, "recall_at_10": 0.21, "ndcg_at_10": 0.19, "graded_ndcg_at_10": 0.17 }, + "standards": { "n": 11, "recall_at_10": 0.68, "ndcg_at_10": 0.55, "graded_ndcg_at_10": 0.54 } + }, + "by_language": { + "en": { "n": 9, "recall_at_10": 0.67, "graded_ndcg_at_10": 0.63 }, + "ko": { "n": 27, "recall_at_10": 0.69, "graded_ndcg_at_10": 0.54 }, + "mixed": { "n": 10, "recall_at_10": 0.21, 
"graded_ndcg_at_10": 0.17 } + }, + "raw_csv": "reports/v0_2_phase2a_me5_large_inst_2026-05-23.csv", + "delta_vs_baseline": { + "graded_ndcg_at_10": -0.182, + "mixed": -0.22, + "korean_only": -0.04, + "standards": -0.33, + "english_only": -0.15, + "exam": -0.12, + "latency_p50_ms": -270 + } +} diff --git a/tests/search_eval/baselines/v0_2_phase2a_snowflake_l_v2_2026-05-23.json b/tests/search_eval/baselines/v0_2_phase2a_snowflake_l_v2_2026-05-23.json new file mode 100644 index 0000000..ec7e8c3 --- /dev/null +++ b/tests/search_eval/baselines/v0_2_phase2a_snowflake_l_v2_2026-05-23.json @@ -0,0 +1,59 @@ +{ + "version": "v0.2-phase2a", + "label": "cand_snowflake_l_v2", + "date": "2026-05-23", + "snapshot": { + "doc_id_max": 25180, + "chunk_id_max": 56526, + "documents_n": 21365, + "chunks_n": 30605 + }, + "eval_set": { + "total_cases": 51, + "scored_cases": 46, + "failure_expected_cases": 5 + }, + "model_config": { + "embedding": "Snowflake/snowflake-arctic-embed-l-v2.0", + "dim": 1024, + "context": 8192, + "reranker": "BAAI/bge-reranker-v2-m3", + "search_mode": "hybrid", + "rerank_enabled": "server_default", + "embedding_backend": "cand_snowflake_l_v2", + "endpoint": "http://embedding-cand-snowflake-l-v2:80/embed", + "truncate": true, + "plan": "phase-2a-embedding-diagnose.md v4" + }, + "overall": { + "n": 46, + "graded_ndcg_at_10": 0.616, + "graded_recall_at_10_t2": 0.726, + "graded_recall_at_10_t3": 0.728, + "latency_p50_ms": 254, + "latency_p95_ms": 1412, + "failure_correct": "0/5" + }, + "by_category": { + "english_only": { "n": 9, "recall_at_10": 0.78, "ndcg_at_10": 0.68, "graded_ndcg_at_10": 0.74 }, + "exam": { "n": 7, "recall_at_10": 0.67, "ndcg_at_10": 0.54, "graded_ndcg_at_10": 0.56 }, + "korean_only": { "n": 9, "recall_at_10": 0.60, "ndcg_at_10": 0.50, "graded_ndcg_at_10": 0.52 }, + "mixed": { "n": 10, "recall_at_10": 0.40, "ndcg_at_10": 0.32, "graded_ndcg_at_10": 0.35 }, + "standards": { "n": 11, "recall_at_10": 0.91, "ndcg_at_10": 0.85, "graded_ndcg_at_10": 
0.87 } + }, + "by_language": { + "en": { "n": 9, "recall_at_10": 0.78, "graded_ndcg_at_10": 0.74 }, + "ko": { "n": 27, "recall_at_10": 0.74, "graded_ndcg_at_10": 0.67 }, + "mixed": { "n": 10, "recall_at_10": 0.40, "graded_ndcg_at_10": 0.35 } + }, + "raw_csv": "reports/v0_2_phase2a_snowflake_l_v2_2026-05-23.csv", + "delta_vs_baseline": { + "graded_ndcg_at_10": -0.043, + "mixed": -0.04, + "korean_only": 0.01, + "standards": 0.00, + "english_only": -0.04, + "exam": -0.18, + "latency_p50_ms": -210 + } +} diff --git a/tests/search_eval/run_eval.py b/tests/search_eval/run_eval.py index 013c44c..0dbd707 100644 --- a/tests/search_eval/run_eval.py +++ b/tests/search_eval/run_eval.py @@ -199,6 +199,9 @@ async def call_search( fusion: str | None = None, rerank: str | None = None, analyze: str | None = None, + embedding_backend: str | None = None, + snapshot_doc_id_max: int | None = None, + snapshot_chunk_id_max: int | None = None, ) -> tuple[list[int], float]: """검색 API 호출 → (doc_ids, latency_ms).""" url = f"{base_url.rstrip('/')}/api/search/" @@ -210,6 +213,12 @@ async def call_search( params["rerank"] = rerank if analyze is not None: params["analyze"] = analyze + if embedding_backend is not None: + params["embedding_backend"] = embedding_backend + if snapshot_doc_id_max is not None: + params["snapshot_doc_id_max"] = snapshot_doc_id_max + if snapshot_chunk_id_max is not None: + params["snapshot_chunk_id_max"] = snapshot_chunk_id_max import time @@ -237,6 +246,9 @@ async def evaluate( fusion: str | None = None, rerank: str | None = None, analyze: str | None = None, + embedding_backend: str | None = None, + snapshot_doc_id_max: int | None = None, + snapshot_chunk_id_max: int | None = None, ) -> list[QueryResult]: """전체 쿼리셋 평가.""" results: list[QueryResult] = [] @@ -245,7 +257,10 @@ async def evaluate( for q in queries: try: returned_ids, latency_ms = await call_search( - client, base_url, token, q.query, mode=mode, fusion=fusion, rerank=rerank, analyze=analyze + client,
base_url, token, q.query, mode=mode, fusion=fusion, rerank=rerank, analyze=analyze, + embedding_backend=embedding_backend, + snapshot_doc_id_max=snapshot_doc_id_max, + snapshot_chunk_id_max=snapshot_chunk_id_max, ) results.append( QueryResult( @@ -819,6 +834,9 @@ async def call_search_full( rerank: str | None = None, analyze: str | None = None, debug: bool = False, + embedding_backend: str | None = None, + snapshot_doc_id_max: int | None = None, + snapshot_chunk_id_max: int | None = None, ) -> tuple[list[dict], float]: """call_search와 동일 로직. 단 full result dict 리스트 반환.""" url = f"{base_url.rstrip('/')}/api/search/" @@ -832,6 +850,12 @@ async def call_search_full( params["analyze"] = analyze if debug: params["debug"] = "true" + if embedding_backend is not None: + params["embedding_backend"] = embedding_backend + if snapshot_doc_id_max is not None: + params["snapshot_doc_id_max"] = snapshot_doc_id_max + if snapshot_chunk_id_max is not None: + params["snapshot_chunk_id_max"] = snapshot_chunk_id_max import time @@ -1266,6 +1290,25 @@ def main() -> int: choices=["v0.1", "v0.2", "both"], help="점수 출력 모드 (Phase 1, default both). v0.1=binary only / v0.2=graded only / both=둘 다", ) + parser.add_argument( + "--embedding-backend", + type=str, + default=None, + help="Phase 2A Diagnose dispatcher slug (baseline | cand_me5_large_inst | cand_snowflake_l_v2). 미지정 = production.", + ) + parser.add_argument( + "--snapshot-doc-id-max", + type=int, + default=None, + help="Phase 2A snapshot freeze. documents.id <= 값 filter. baseline rebaseline 도 동일 적용.", + ) + parser.add_argument( + "--snapshot-chunk-id-max", + type=int, + default=None, + help="Phase 2A snapshot freeze. document_chunks.id <= 값 filter. 
baseline rebaseline 도 동일 적용.", + ) + args = parser.parse_args() if not args.token: @@ -1318,21 +1361,21 @@ def main() -> int: if args.base_url: print(f"\n>>> evaluating: {args.base_url}") results = asyncio.run( - evaluate(queries, args.base_url, args.token, "single", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze) + evaluate(queries, args.base_url, args.token, "single", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max) ) print_summary("single", results, eval_version=args.eval_version) all_results.extend(results) else: print(f"\n>>> baseline: {args.baseline_url}") baseline_results = asyncio.run( - evaluate(queries, args.baseline_url, args.token, "baseline", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze) + evaluate(queries, args.baseline_url, args.token, "baseline", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max) ) baseline_summary = print_summary("baseline", baseline_results, eval_version=args.eval_version) print(f"\n>>> candidate: {args.candidate_url}") candidate_results = asyncio.run( evaluate( - queries, args.candidate_url, args.token, "candidate", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze + queries, args.candidate_url, args.token, "candidate", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max ) ) candidate_summary = print_summary("candidate", candidate_results, eval_version=args.eval_version)