f940f50c60
baseline chunk 벡터검색을 document_chunks → corpus_chunks 뷰(in_corpus=true)로 rewire. in_corpus=false(비활성 hier leaf 등) 자동 제외 = 검색 오염 구조적 차단(B choke point). - retrieval_service: baseline chunks_table=corpus_chunks, _VALID_CHUNKS_TABLE 에 corpus_chunks 허용, snapshot_clause 조건 corpus_chunks 포함(eval snapshot 보존). candidate(cand_*) 경로 불변. documents 측(FTS+doc embedding) 무변경 — doc row 는 교체 무관. - models/chunk: 5 신규 컬럼 매핑(parent_id/level/node_type/is_leaf/in_corpus). server_default 로 기존 chunk_worker INSERT 무영향(legacy=in_corpus true/is_leaf false). - subject_note_rag/explanation_rag: RAG chunk 로드에 in_corpus=true 필터(교체 doc legacy 중복 방지). 게이트: G4b(rewire 불변) before/after IDENTICAL(현재 view==table no-op) / G4a(누출) synthetic in_corpus=false leaf 가 corpus_chunks 0건·document_chunks raw top(dist 0.0) 양방향 증명. /health 200. plan: hierarchical-decomposition-tiered-nesting-marmot.md Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
55 lines
2.4 KiB
Python
55 lines
2.4 KiB
Python
"""document_chunks 테이블 ORM — chunk 단위 검색 (Phase 0.1)"""
|
|
|
|
from datetime import datetime, timezone

from pgvector.sqlalchemy import Vector
from sqlalchemy import BigInteger, Boolean, DateTime, ForeignKey, Integer, SmallInteger, String, Text, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship

from core.database import Base
|
|
|
|
|
|
def _utc_now() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(timezone.utc)


class DocumentChunk(Base):
    """ORM mapping for the ``document_chunks`` table (one row per document chunk).

    Each row belongs to a row in ``documents`` (``doc_id``, deleted in cascade)
    and is unique per ``(doc_id, chunk_index)``.
    """

    __tablename__ = "document_chunks"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
    doc_id: Mapped[int] = mapped_column(
        BigInteger, ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    chunk_index: Mapped[int] = mapped_column(Integer, nullable=False)

    # Chunking-strategy metadata.
    chunk_type: Mapped[str] = mapped_column(String(30), nullable=False)
    section_title: Mapped[str | None] = mapped_column(Text)
    heading_path: Mapped[str | None] = mapped_column(Text)
    page: Mapped[int | None] = mapped_column(Integer)

    # Multilingual / domain metadata.
    language: Mapped[str | None] = mapped_column(String(10))
    country: Mapped[str | None] = mapped_column(String(10))
    source: Mapped[str | None] = mapped_column(String(100))
    domain_category: Mapped[str] = mapped_column(String(20), nullable=False)

    # Body text + embedding (1024-dimensional pgvector column, nullable).
    text: Mapped[str] = mapped_column(Text, nullable=False)
    embedding = mapped_column(Vector(1024), nullable=True)

    # Hier-Decomp-1: hierarchical decomposition tree (migration 282).
    # The existing chunk_worker INSERT does not set these columns, so the
    # server defaults guarantee legacy rows get in_corpus=true / is_leaf=false.

    # Tree parent. No DB-level foreign key (handled at application level).
    parent_id: Mapped[int | None] = mapped_column(BigInteger)
    # Authoritative depth in the tree.
    level: Mapped[int | None] = mapped_column(SmallInteger)
    # Nullable hint; not used as an activation condition by retrieval/replace.
    node_type: Mapped[str | None] = mapped_column(Text)
    # Authoritative leaf marker.
    is_leaf: Mapped[bool] = mapped_column(
        Boolean, nullable=False, server_default="false"
    )
    # Whether the chunk is included in the search corpus.
    in_corpus: Mapped[bool] = mapped_column(
        Boolean, nullable=False, server_default="true"
    )

    # Timestamps are generated as timezone-aware UTC values: the columns are
    # declared timezone-aware, and a naive datetime.now() would make the stored
    # instant depend on the host/session timezone and could not be compared
    # with the aware values read back from the database.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=_utc_now
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=_utc_now, onupdate=_utc_now
    )

    __table_args__ = (
        UniqueConstraint("doc_id", "chunk_index", name="uq_chunks_doc_index"),
    )
|