From f940f50c6020e286286fd8db6637c1e8621d3521 Mon Sep 17 00:00:00 2001
From: hyungi <hyun49196@gmail.com>
Date: Sun, 24 May 2026 12:58:28 +0000
Subject: [PATCH] feat(search): route retrieval through corpus_chunks view
 (Hier-Decomp-1 c2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

baseline chunk 벡터검색을 document_chunks → corpus_chunks 뷰(in_corpus=true)로 rewire.
in_corpus=false(비활성 hier leaf 등) 자동 제외 = 검색 오염 구조적 차단(B choke point).

- retrieval_service: baseline chunks_table=corpus_chunks, _VALID_CHUNKS_TABLE 에 corpus_chunks 허용,
  snapshot_clause 조건 corpus_chunks 포함(eval snapshot 보존). candidate(cand_*) 경로 불변.
  documents 측(FTS+doc embedding) 무변경 — doc row 는 교체 무관.
- models/chunk: 5 신규 컬럼 매핑(parent_id/level/node_type/is_leaf/in_corpus). server_default 로
  기존 chunk_worker INSERT 무영향(legacy=in_corpus true/is_leaf false).
- subject_note_rag/explanation_rag: RAG chunk 로드에 in_corpus=true 필터(교체 doc legacy 중복 방지).

게이트:
- G4b(rewire 불변): before/after IDENTICAL (현재 view==table 이므로 no-op).
- G4a(누출): synthetic in_corpus=false leaf 가 corpus_chunks 에서는 0건,
  document_chunks raw 에서는 top(dist 0.0) 임을 양방향으로 증명.
- /health 200.

plan: hierarchical-decomposition-tiered-nesting-marmot.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/models/chunk.py                      | 10 +++++++++-
 app/services/search/retrieval_service.py | 10 +++++++---
 app/services/study/explanation_rag.py    |  2 ++
 app/services/study/subject_note_rag.py   |  2 ++
 4 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/app/models/chunk.py b/app/models/chunk.py
index cc37364..23dfba7 100644
--- a/app/models/chunk.py
+++ b/app/models/chunk.py
@@ -3,7 +3,7 @@
 from datetime import datetime
 
 from pgvector.sqlalchemy import Vector
-from sqlalchemy import BigInteger, DateTime, ForeignKey, Integer, String, Text, UniqueConstraint
+from sqlalchemy import BigInteger, Boolean, DateTime, ForeignKey, Integer, SmallInteger, String, Text, UniqueConstraint
 from sqlalchemy.orm import Mapped, mapped_column, relationship
 
 from core.database import Base
@@ -34,6 +34,14 @@ class DocumentChunk(Base):
     text: Mapped[str] = mapped_column(Text, nullable=False)
     embedding = mapped_column(Vector(1024), nullable=True)
 
+    # Hier-Decomp-1: 계층 분해 트리 (migration 282). 기존 chunk_worker INSERT 는 미설정 →
+    # server_default 로 legacy 행 = in_corpus=true / is_leaf=false 보장.
+    parent_id: Mapped[int | None] = mapped_column(BigInteger)  # 트리 부모. DB FK 미설정(app-level).
+    level: Mapped[int | None] = mapped_column(SmallInteger)    # authoritative depth.
+    node_type: Mapped[str | None] = mapped_column(Text)        # nullable hint, retrieval/replace 활성 조건 미사용.
+    is_leaf: Mapped[bool] = mapped_column(Boolean, nullable=False, server_default="false")   # authoritative leaf 마커.
+    in_corpus: Mapped[bool] = mapped_column(Boolean, nullable=False, server_default="true")  # 검색 코퍼스 편입 여부.
+
     created_at: Mapped[datetime] = mapped_column(
         DateTime(timezone=True), default=datetime.now
     )
diff --git a/app/services/search/retrieval_service.py b/app/services/search/retrieval_service.py
index 8fcf6d1..75c4fb6 100644
--- a/app/services/search/retrieval_service.py
+++ b/app/services/search/retrieval_service.py
@@ -67,7 +67,9 @@ CANDIDATE_BACKEND_MAP: dict[str, dict[str, str] | None] = {
 
 # 2단계 gate (R2-B1) — SQL string interpolation 직전 final allowlist.
 _VALID_DOCS_TABLE = re.compile(r"^(documents|documents_cand_[a-z0-9_]+)$")
-_VALID_CHUNKS_TABLE = re.compile(r"^(document_chunks|document_chunks_cand_[a-z0-9_]+)$")
+# corpus_chunks = document_chunks WHERE in_corpus=true 뷰 (Hier-Decomp-1 c2 choke point).
+# baseline retrieval 은 이 뷰만 본다 → in_corpus=false(비활성 hier leaf 등) 자동 제외.
+_VALID_CHUNKS_TABLE = re.compile(r"^(document_chunks|corpus_chunks|document_chunks_cand_[a-z0-9_]+)$")
 
 
 def _resolve_backend(slug: str | None) -> dict[str, str] | None:
@@ -266,7 +268,9 @@ async def search_vector(
 
     if cfg is None:
         docs_table = "documents"
-        chunks_table = "document_chunks"
+        # Hier-Decomp-1 c2: baseline chunk 검색은 corpus_chunks 뷰(in_corpus=true) 경유.
+        # 현재는 모든 청크 in_corpus=true 라 document_chunks 와 동일 결과(rewire=no-op).
+        chunks_table = "corpus_chunks"
         client = AIClient()
         try:
             query_embedding = await _get_query_embedding(client, query)
@@ -398,7 +402,7 @@ async def _search_vector_chunks(
     params: dict[str, Any] = {"embedding": embedding_str, "inner_k": inner_k, "limit": limit}
 
     snapshot_clause = ""
-    if chunks_table == "document_chunks" and snapshot_chunk_id_max is not None:
+    if chunks_table in ("document_chunks", "corpus_chunks") and snapshot_chunk_id_max is not None:
         snapshot_clause = " AND c.id <= :snapshot_chunk_id_max"
         params["snapshot_chunk_id_max"] = snapshot_chunk_id_max
 
diff --git a/app/services/study/explanation_rag.py b/app/services/study/explanation_rag.py
index 4fc5330..a690d8c 100644
--- a/app/services/study/explanation_rag.py
+++ b/app/services/study/explanation_rag.py
@@ -147,6 +147,8 @@ async def _gather_document_evidence(
             .where(
                 DocumentChunk.doc_id.in_(valid_doc_ids),
                 DocumentChunk.chunk_index < 4,
+                # Hier-Decomp-1 c2: 교체된 doc 의 legacy(in_corpus=false) chunk 중복 로드 방지.
+                DocumentChunk.in_corpus.is_(True),
             )
             .order_by(DocumentChunk.doc_id, DocumentChunk.chunk_index)
         )
diff --git a/app/services/study/subject_note_rag.py b/app/services/study/subject_note_rag.py
index 00aa9ce..17bcfeb 100644
--- a/app/services/study/subject_note_rag.py
+++ b/app/services/study/subject_note_rag.py
@@ -129,6 +129,8 @@ async def _gather_document_evidence(
             .where(
                 DocumentChunk.doc_id.in_(valid_doc_ids),
                 DocumentChunk.chunk_index < 4,
+                # Hier-Decomp-1 c2: 교체된 doc 의 legacy(in_corpus=false) chunk 중복 로드 방지.
+                DocumentChunk.in_corpus.is_(True),
             )
             .order_by(DocumentChunk.doc_id, DocumentChunk.chunk_index)
         )