feat(search): corpus_variant + exact_knn measurement dispatch (replace-diagnose c4+c5)

PR-DocSrv-Hier-Replace-Diagnose-1 c4+c5. hier vs prehier(legacy) go/no-go 비파괴 측정 hook.
- 측정 뷰 3종 (hier_measure_views.sql, additive/droppable): corpus_chunks_prehier
  (legacy+null-source 375 포함) / hier_sim_raw / hier_sim_clean (childless-tiny<30 제외,
  all-tiny doc 은 legacy fallback 정합).
- retrieval_service: _resolve_corpus_variant + CORPUS_VARIANT_MAP + _VALID_CHUNKS_TABLE
  3 뷰 추가 + exact_knn(SET LOCAL enable_indexscan/bitmapscan=off, eval 전용).
  chunk leg 만 영향 (doc-level + fts/trgm = documents 무관). baseline/None path 회귀 0.
- search_pipeline.run_search + search.py: corpus_variant/exact_knn 전달, unknown→400,
  embedding_backend cand 와 동시 사용 금지(400).
- run_eval: --corpus-variant + --exact-knn flag.
- tests/test_corpus_variant.py 22 PASS (resolver/map/allowlist + SQL injection 거부).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
hyungi
2026-05-25 05:37:15 +00:00
parent e860baa179
commit 100aaa3b0c
6 changed files with 249 additions and 11 deletions
+37
View File
@@ -0,0 +1,37 @@
-- PR-DocSrv-Hier-Replace-Diagnose-1 c4: measurement-only views (additive, droppable, independent of in_corpus).
-- prehier = pre-hier baseline (legacy + null-source). hier_sim_* = post-replace simulation (per-document fallback).
-- clean = excludes childless-tiny (<30 chars) leaves (found in the A1 held-out run). kept-leaf = is_leaf AND (len>=30 OR has child).
--
-- Drop any earlier definitions first so this script can be re-run as a whole.
-- Each view below selects only from document_chunks, never from another view
-- defined here, so the order of the drops is not significant.
DROP VIEW IF EXISTS corpus_chunks_prehier;
DROP VIEW IF EXISTS corpus_chunks_hier_sim_raw;
DROP VIEW IF EXISTS corpus_chunks_hier_sim_clean;
-- Pre-hier baseline corpus: every embedded chunk that is not a 'hier_section' row.
-- IS DISTINCT FROM is NULL-safe, so chunks whose source_type is NULL are kept.
CREATE VIEW corpus_chunks_prehier AS
    SELECT c.*
    FROM document_chunks AS c
    WHERE c.embedding IS NOT NULL
      AND c.source_type IS DISTINCT FROM 'hier_section';
-- Simulated post-replace corpus (raw): embedded 'hier_section' leaves, plus the
-- non-hier chunks of any document that has no embedded 'hier_section' leaf.
CREATE VIEW corpus_chunks_hier_sim_raw AS
    SELECT chunk.*
    FROM document_chunks AS chunk
    WHERE chunk.embedding IS NOT NULL
      AND (
            -- legacy fallback: only for documents without any embedded hier leaf
            (
                chunk.source_type IS DISTINCT FROM 'hier_section'
                AND NOT EXISTS (
                    SELECT 1
                    FROM document_chunks AS leaf
                    WHERE leaf.doc_id = chunk.doc_id
                      AND leaf.embedding IS NOT NULL
                      AND leaf.is_leaf = true
                      AND leaf.source_type = 'hier_section'
                )
            )
            -- hier leaf rows are always kept
            OR (chunk.is_leaf = true AND chunk.source_type = 'hier_section')
      );
-- Simulated post-replace corpus (clean): like the raw simulation, but a hier leaf
-- is kept only when its trimmed text has at least 30 characters or some row
-- references it as parent_id. Non-hier chunks are used as a fallback only for
-- documents that have no such kept, embedded hier leaf.
CREATE VIEW corpus_chunks_hier_sim_clean AS
    SELECT chunk.*
    FROM document_chunks AS chunk
    WHERE chunk.embedding IS NOT NULL
      AND (
            -- legacy fallback: only when the document has no kept (clean) hier leaf at all
            (
                chunk.source_type IS DISTINCT FROM 'hier_section'
                AND NOT EXISTS (
                    SELECT 1
                    FROM document_chunks AS leaf
                    WHERE leaf.doc_id = chunk.doc_id
                      AND leaf.embedding IS NOT NULL
                      AND leaf.is_leaf = true
                      AND leaf.source_type = 'hier_section'
                      AND (
                            EXISTS (
                                SELECT 1
                                FROM document_chunks AS leaf_child
                                WHERE leaf_child.parent_id = leaf.id
                            )
                            OR length(trim(leaf.text)) >= 30
                      )
                )
            )
            -- kept hier leaf: is_leaf AND NOT childless-tiny
            OR (
                chunk.is_leaf = true
                AND chunk.source_type = 'hier_section'
                AND (
                      EXISTS (
                          SELECT 1
                          FROM document_chunks AS child
                          WHERE child.parent_id = chunk.id
                      )
                      OR length(trim(chunk.text)) >= 30
                )
            )
      );
+25 -3
View File
@@ -243,6 +243,8 @@ async def call_search(
snapshot_chunk_id_max: int | None = None,
reranker_backend: str | None = None,
rewrite_backend: str | None = None,
corpus_variant: str | None = None,
exact_knn: bool = False,
) -> tuple[list[int], float]:
"""검색 API 호출 → (doc_ids, latency_ms)."""
url = f"{base_url.rstrip('/')}/api/search/"
@@ -264,6 +266,10 @@ async def call_search(
params["reranker_backend"] = reranker_backend
if rewrite_backend is not None:
params["rewrite_backend"] = rewrite_backend
if corpus_variant is not None:
params["corpus_variant"] = corpus_variant
if exact_knn:
params["exact_knn"] = "true"
import time
@@ -296,6 +302,8 @@ async def evaluate(
snapshot_chunk_id_max: int | None = None,
reranker_backend: str | None = None,
rewrite_backend: str | None = None,
corpus_variant: str | None = None,
exact_knn: bool = False,
) -> list[QueryResult]:
"""전체 쿼리셋 평가."""
results: list[QueryResult] = []
@@ -310,6 +318,8 @@ async def evaluate(
snapshot_chunk_id_max=snapshot_chunk_id_max,
reranker_backend=reranker_backend,
rewrite_backend=rewrite_backend,
corpus_variant=corpus_variant,
exact_knn=exact_knn,
)
dedup_count = count_dedup(returned_ids, 10)
if dedup_count > 0:
@@ -1392,6 +1402,18 @@ def main() -> int:
default=None,
help="Phase 2Q Diagnose query rewrite dispatcher slug (baseline | cand_multi_query_macmini | cand_multi_query_macbook). 미지정 = single-query path. Phase 1B scaffold = variants 박제만, retrieval 합성은 Phase 2.",
)
parser.add_argument(
"--corpus-variant",
type=str,
default=None,
choices=["prehier", "hier_sim_raw", "hier_sim_clean"],
help="Hier-Replace-Diagnose-1: chunk leg 측정 뷰 (prehier=legacy baseline | hier_sim_raw | hier_sim_clean). 미지정 = production corpus_chunks.",
)
parser.add_argument(
"--exact-knn",
action="store_true",
help="Hier-Replace-Diagnose-1: vector leg exact KNN (ivfflat 근사 제거). prehier vs hier_sim 공정 비교용. eval 전용.",
)
args = parser.parse_args()
@@ -1445,21 +1467,21 @@ def main() -> int:
if args.base_url:
print(f"\n>>> evaluating: {args.base_url}")
results = asyncio.run(
evaluate(queries, args.base_url, args.token, "single", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max, reranker_backend=args.reranker_backend, rewrite_backend=args.rewrite_backend)
evaluate(queries, args.base_url, args.token, "single", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max, reranker_backend=args.reranker_backend, rewrite_backend=args.rewrite_backend, corpus_variant=args.corpus_variant, exact_knn=args.exact_knn)
)
print_summary("single", results, eval_version=args.eval_version)
all_results.extend(results)
else:
print(f"\n>>> baseline: {args.baseline_url}")
baseline_results = asyncio.run(
evaluate(queries, args.baseline_url, args.token, "baseline", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max, reranker_backend=args.reranker_backend, rewrite_backend=args.rewrite_backend)
evaluate(queries, args.baseline_url, args.token, "baseline", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max, reranker_backend=args.reranker_backend, rewrite_backend=args.rewrite_backend, corpus_variant=args.corpus_variant, exact_knn=args.exact_knn)
)
baseline_summary = print_summary("baseline", baseline_results, eval_version=args.eval_version)
print(f"\n>>> candidate: {args.candidate_url}")
candidate_results = asyncio.run(
evaluate(
queries, args.candidate_url, args.token, "candidate", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max, reranker_backend=args.reranker_backend, rewrite_backend=args.rewrite_backend
queries, args.candidate_url, args.token, "candidate", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max, reranker_backend=args.reranker_backend, rewrite_backend=args.rewrite_backend, corpus_variant=args.corpus_variant, exact_knn=args.exact_knn
)
)
candidate_summary = print_summary("candidate", candidate_results, eval_version=args.eval_version)