Files
hyungi 100aaa3b0c feat(search): corpus_variant + exact_knn measurement dispatch (replace-diagnose c4+c5)
PR-DocSrv-Hier-Replace-Diagnose-1 c4+c5. hier vs prehier(legacy) go/no-go 비파괴 측정 hook.
- 측정 뷰 3종 (hier_measure_views.sql, additive/droppable): corpus_chunks_prehier
  (legacy+null-source 375 포함) / hier_sim_raw / hier_sim_clean (childless-tiny<30 제외,
  all-tiny doc 은 legacy fallback 정합).
- retrieval_service: _resolve_corpus_variant + CORPUS_VARIANT_MAP + _VALID_CHUNKS_TABLE
  3 뷰 추가 + exact_knn(SET LOCAL enable_indexscan/bitmapscan=off, eval 전용).
  chunk leg 만 영향 (doc-level + fts/trgm = documents 무관). baseline/None path 회귀 0.
- search_pipeline.run_search + search.py: corpus_variant/exact_knn 전달, unknown→400,
  embedding_backend cand 와 동시 사용 금지(400).
- run_eval: --corpus-variant + --exact-knn flag.
- tests/test_corpus_variant.py 22 PASS (resolver/map/allowlist + SQL injection 거부).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 05:37:15 +00:00

75 lines
2.4 KiB
Python

"""Hier-Replace-Diagnose-1 c5 — unit tests for the corpus_variant dispatcher.

Guards:
1. _resolve_corpus_variant — slug→view, unknown slug raises ValueError, None→None
2. CORPUS_VARIANT_MAP — the 3 slugs, 1:1
3. _VALID_CHUNKS_TABLE — allows the 3 measurement views, rejects junk/injection
"""
from __future__ import annotations
import logging
import os
import sys
import pytest
# logs/llm_gate.log is owned by root, so building a FileHandler while the app
# modules are imported can raise PermissionError. Wrap logging.FileHandler so
# an unwritable path degrades to a NullHandler (same pattern as
# test_query_rewriter).
_orig = logging.FileHandler


def _try(f):
    """Return True when *f* can be opened for appending, False otherwise.

    The probe really opens the path in append mode, so a missing file is
    created as a side effect.
    """
    try:
        with open(f, "a"):
            pass
    except Exception:  # deliberate best-effort: any failure means "not writable"
        return False
    return True


def _safe_file_handler(filename, *args, **kwargs):
    """Build a real FileHandler when *filename* is writable, else a NullHandler.

    The first parameter is named ``filename`` like the real
    ``logging.FileHandler`` so keyword-style construction
    (``FileHandler(filename=...)``, which is how ``logging.config.dictConfig``
    instantiates handlers) keeps working through the wrapper. Positional
    calls behave exactly as before.
    """
    if _try(filename):
        return _orig(filename, *args, **kwargs)
    return logging.NullHandler()


logging.FileHandler = _safe_file_handler  # type: ignore
# Provide a placeholder DATABASE_URL only when the environment has none.
# NOTE(review): presumably the app modules read it at import time — confirm.
os.environ.setdefault("DATABASE_URL", "postgresql+asyncpg://test:test@localhost:5432/test")
# Put ../app (relative to this test file) first on sys.path so the
# `services.search` import that follows can resolve.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app"))
from services.search import retrieval_service as rs
def test_resolve_valid_slugs():
    """Each known slug resolves to its measurement view name."""
    expected_views = {
        "prehier": "corpus_chunks_prehier",
        "hier_sim_raw": "corpus_chunks_hier_sim_raw",
        "hier_sim_clean": "corpus_chunks_hier_sim_clean",
    }
    for slug, view_name in expected_views.items():
        assert rs._resolve_corpus_variant(slug) == view_name
def test_resolve_none():
    """A missing variant (None) is passed through as None."""
    resolved = rs._resolve_corpus_variant(None)
    assert resolved is None
@pytest.mark.parametrize(
    "bad",
    [
        "",
        "hier",
        "corpus_chunks",
        "prehier; DROP TABLE",
        "hier_sim",
        "HIER_SIM_CLEAN",
    ],
)
def test_resolve_unknown_raises(bad):
    """Empty, partial, wrong-case or injection-like slugs raise ValueError."""
    with pytest.raises(ValueError, match="unknown_corpus_variant"):
        rs._resolve_corpus_variant(bad)
def test_variant_map_keys():
    """CORPUS_VARIANT_MAP exposes exactly the three known slugs."""
    expected_slugs = {"prehier", "hier_sim_raw", "hier_sim_clean"}
    actual_slugs = set(rs.CORPUS_VARIANT_MAP)
    assert actual_slugs == expected_slugs
@pytest.mark.parametrize(
    "view",
    [
        "corpus_chunks_prehier",
        "corpus_chunks_hier_sim_raw",
        "corpus_chunks_hier_sim_clean",
        "document_chunks",
        "corpus_chunks",
        "document_chunks_cand_me5",
    ],
)
def test_valid_chunks_table_allows(view):
    """The allowlist accepts the three measurement views and the listed chunk tables."""
    matched = rs._VALID_CHUNKS_TABLE.match(view)
    assert matched
@pytest.mark.parametrize(
    "bad",
    [
        "corpus_chunks_prehier; DROP TABLE x",
        "corpus_chunks_hier_sim",
        "documents",
        "corpus_chunks_evil",
        "'; DELETE--",
        "corpus_chunks_hier_sim_cleanX",
    ],
)
def test_valid_chunks_table_rejects(bad):
    """The allowlist refuses junk names, near-misses and injection payloads."""
    matched = rs._VALID_CHUNKS_TABLE.match(bad)
    assert not matched
def test_all_mapped_views_pass_allowlist():
    """Every view in CORPUS_VARIANT_MAP must pass the SQL interpolation gate."""
    # all() stops at the first rejected view, like an assert inside a loop.
    assert all(
        rs._VALID_CHUNKS_TABLE.match(view_name)
        for view_name in rs.CORPUS_VARIANT_MAP.values()
    )