feat(search): route retrieval through corpus_chunks view (Hier-Decomp-1 c2)

baseline chunk 벡터검색을 document_chunks → corpus_chunks 뷰(in_corpus=true)로 rewire.
in_corpus=false(비활성 hier leaf 등) 자동 제외 = 검색 오염 구조적 차단(B choke point).

- retrieval_service: baseline chunks_table=corpus_chunks, _VALID_CHUNKS_TABLE 에 corpus_chunks 허용,
  snapshot_clause 조건 corpus_chunks 포함(eval snapshot 보존). candidate(cand_*) 경로 불변.
  documents 측(FTS+doc embedding) 무변경 — doc row 는 교체 무관.
- models/chunk: 5 신규 컬럼 매핑(parent_id/level/node_type/is_leaf/in_corpus). server_default 로
  기존 chunk_worker INSERT 무영향(legacy=in_corpus true/is_leaf false).
- subject_note_rag/explanation_rag: RAG chunk 로드에 in_corpus=true 필터(교체 doc legacy 중복 방지).

게이트: G4b(rewire 불변) — rewire before/after 결과 IDENTICAL(현재 모든 청크가 in_corpus=true 라 view==table, no-op).
G4a(누출) — synthetic in_corpus=false leaf 가 corpus_chunks 에서는 0건, document_chunks raw 조회에서는 top(dist 0.0) 으로 나오는 것을 양방향으로 증명. /health 200.

plan: hierarchical-decomposition-tiered-nesting-marmot.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
hyungi
2026-05-24 12:58:28 +00:00
parent 7971e69e3e
commit f940f50c60
4 changed files with 20 additions and 4 deletions
+9 -1
View File
@@ -3,7 +3,7 @@
from datetime import datetime
from pgvector.sqlalchemy import Vector
from sqlalchemy import BigInteger, DateTime, ForeignKey, Integer, String, Text, UniqueConstraint
from sqlalchemy import BigInteger, Boolean, DateTime, ForeignKey, Integer, SmallInteger, String, Text, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship
from core.database import Base
@@ -34,6 +34,14 @@ class DocumentChunk(Base):
text: Mapped[str] = mapped_column(Text, nullable=False)
embedding = mapped_column(Vector(1024), nullable=True)
# Hier-Decomp-1: 계층 분해 트리 (migration 282). 기존 chunk_worker INSERT 는 미설정 →
# server_default 로 legacy 행 = in_corpus=true / is_leaf=false 보장.
parent_id: Mapped[int | None] = mapped_column(BigInteger) # 트리 부모. DB FK 미설정(app-level).
level: Mapped[int | None] = mapped_column(SmallInteger) # authoritative depth.
node_type: Mapped[str | None] = mapped_column(Text) # nullable hint, retrieval/replace 활성 조건 미사용.
is_leaf: Mapped[bool] = mapped_column(Boolean, nullable=False, server_default="false") # authoritative leaf 마커.
in_corpus: Mapped[bool] = mapped_column(Boolean, nullable=False, server_default="true") # 검색 코퍼스 편입 여부.
created_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True), default=datetime.now
)
+7 -3
View File
@@ -67,7 +67,9 @@ CANDIDATE_BACKEND_MAP: dict[str, dict[str, str] | None] = {
# 2단계 gate (R2-B1) — SQL string interpolation 직전 final allowlist.
_VALID_DOCS_TABLE = re.compile(r"^(documents|documents_cand_[a-z0-9_]+)$")
_VALID_CHUNKS_TABLE = re.compile(r"^(document_chunks|document_chunks_cand_[a-z0-9_]+)$")
# corpus_chunks = document_chunks WHERE in_corpus=true 뷰 (Hier-Decomp-1 c2 choke point).
# baseline retrieval 은 이 뷰만 본다 → in_corpus=false(비활성 hier leaf 등) 자동 제외.
_VALID_CHUNKS_TABLE = re.compile(r"^(document_chunks|corpus_chunks|document_chunks_cand_[a-z0-9_]+)$")
def _resolve_backend(slug: str | None) -> dict[str, str] | None:
@@ -266,7 +268,9 @@ async def search_vector(
if cfg is None:
docs_table = "documents"
chunks_table = "document_chunks"
# Hier-Decomp-1 c2: baseline chunk 검색은 corpus_chunks 뷰(in_corpus=true) 경유.
# 현재는 모든 청크 in_corpus=true 라 document_chunks 와 동일 결과(rewire=no-op).
chunks_table = "corpus_chunks"
client = AIClient()
try:
query_embedding = await _get_query_embedding(client, query)
@@ -398,7 +402,7 @@ async def _search_vector_chunks(
params: dict[str, Any] = {"embedding": embedding_str, "inner_k": inner_k, "limit": limit}
snapshot_clause = ""
if chunks_table == "document_chunks" and snapshot_chunk_id_max is not None:
if chunks_table in ("document_chunks", "corpus_chunks") and snapshot_chunk_id_max is not None:
snapshot_clause = " AND c.id <= :snapshot_chunk_id_max"
params["snapshot_chunk_id_max"] = snapshot_chunk_id_max
+2
View File
@@ -147,6 +147,8 @@ async def _gather_document_evidence(
.where(
DocumentChunk.doc_id.in_(valid_doc_ids),
DocumentChunk.chunk_index < 4,
# Hier-Decomp-1 c2: 교체된 doc 의 legacy(in_corpus=false) chunk 중복 로드 방지.
DocumentChunk.in_corpus.is_(True),
)
.order_by(DocumentChunk.doc_id, DocumentChunk.chunk_index)
)
+2
View File
@@ -129,6 +129,8 @@ async def _gather_document_evidence(
.where(
DocumentChunk.doc_id.in_(valid_doc_ids),
DocumentChunk.chunk_index < 4,
# Hier-Decomp-1 c2: 교체된 doc 의 legacy(in_corpus=false) chunk 중복 로드 방지.
DocumentChunk.in_corpus.is_(True),
)
.order_by(DocumentChunk.doc_id, DocumentChunk.chunk_index)
)