GPU 서버에 untracked로만 존재하던 Phase 0.1 코드를 정식 commit:

- app/models/chunk.py — DocumentChunk ORM (country/source/domain 메타 포함)
- app/workers/chunk_worker.py — 6가지 chunking 전략 (legal/news/markdown/email/long_pdf/default)
- migrations/014_document_chunks.sql — pgvector + FTS + trigram 인덱스
- app/models/queue.py — ProcessingQueue enum에 'chunk' stage 추가
- app/workers/queue_consumer.py — chunk stage 등록, classify→[embed,chunk] 자동 연결

Phase 1 reranker 통합 작업의 전제 조건. document_chunks 테이블 기반 retrieval에 사용.
47 lines
1.7 KiB
Python
47 lines
1.7 KiB
Python
"""document_chunks 테이블 ORM — chunk 단위 검색 (Phase 0.1)"""
|
|
|
|
from datetime import datetime
|
|
|
|
from pgvector.sqlalchemy import Vector
|
|
from sqlalchemy import BigInteger, DateTime, ForeignKey, Integer, String, Text, UniqueConstraint
|
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
|
|
|
from core.database import Base
|
|
|
|
|
|
def _aware_now() -> datetime:
    """Return the current time as a timezone-aware datetime.

    ``datetime.now()`` on its own is naive. Calling ``astimezone()`` with no
    argument attaches the system's local UTC offset, so the value identifies
    an unambiguous instant when bound to a ``DateTime(timezone=True)`` column.
    """
    return datetime.now().astimezone()


class DocumentChunk(Base):
    """ORM mapping for the ``document_chunks`` table (one row per chunk).

    Each row references its parent row in ``documents`` through ``doc_id``
    (deleted together with the parent via ``ON DELETE CASCADE``) and is
    uniquely identified within that document by ``chunk_index``.
    """

    __tablename__ = "document_chunks"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
    doc_id: Mapped[int] = mapped_column(
        BigInteger, ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    chunk_index: Mapped[int] = mapped_column(Integer, nullable=False)

    # Chunking-strategy metadata
    chunk_type: Mapped[str] = mapped_column(String(30), nullable=False)
    section_title: Mapped[str | None] = mapped_column(Text)
    heading_path: Mapped[str | None] = mapped_column(Text)
    page: Mapped[int | None] = mapped_column(Integer)

    # Multilingual / domain metadata
    language: Mapped[str | None] = mapped_column(String(10))
    country: Mapped[str | None] = mapped_column(String(10))
    source: Mapped[str | None] = mapped_column(String(100))
    domain_category: Mapped[str] = mapped_column(String(20), nullable=False)

    # Body text + embedding
    text: Mapped[str] = mapped_column(Text, nullable=False)
    # 1024-dimensional pgvector column; nullable, so a row may exist without
    # an embedding.
    embedding = mapped_column(Vector(1024), nullable=True)

    # Both columns are timezone-aware, so the Python-side defaults must
    # produce aware datetimes too; a naive value would leave the stored
    # instant dependent on how the driver / DB session interprets it.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=_aware_now
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=_aware_now, onupdate=_aware_now
    )

    __table_args__ = (
        # A document cannot contain two chunks with the same index.
        UniqueConstraint("doc_id", "chunk_index", name="uq_chunks_doc_index"),
    )
|