From 72190cf90a4bf89cb6f4c37627384f142a2cb2d7 Mon Sep 17 00:00:00 2001
From: hyungi
Date: Sun, 24 May 2026 07:01:34 +0000
Subject: [PATCH] feat(search): add document_chunks page/source columns + unique idx

migrations 279-281: page_start/end +
source_type/chunker_version/source_hash/chunk_content_hash, legacy backfill
(30,952 rows), unique (doc_id,source_type,chunker_version,chunk_index).
PR-DocSrv-LargeDoc-Split-Markdown-1 commit 1.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 migrations/279_document_chunks_split_columns.sql     | 15 +++++++++++++++
 migrations/280_document_chunks_legacy_backfill.sql   | 17 +++++++++++++++++
 migrations/281_document_chunks_source_unique_idx.sql | 12 ++++++++++++
 3 files changed, 44 insertions(+)
 create mode 100644 migrations/279_document_chunks_split_columns.sql
 create mode 100644 migrations/280_document_chunks_legacy_backfill.sql
 create mode 100644 migrations/281_document_chunks_source_unique_idx.sql

diff --git a/migrations/279_document_chunks_split_columns.sql b/migrations/279_document_chunks_split_columns.sql
new file mode 100644
index 0000000..3b23093
--- /dev/null
+++ b/migrations/279_document_chunks_split_columns.sql
@@ -0,0 +1,15 @@
+-- PR-DocSrv-LargeDoc-Split-Markdown-1 (commit 1: schema)
+-- Unified schema, landed ahead of time for two workstreams:
+--   LargeDoc: page_start / page_end
+--   Phase 3A: source_type / chunker_version / source_hash / chunk_content_hash
+-- The Phase 3A session builds on these columns and does not touch the schema.
+-- Work-split plan: brisk-paging-quokka.md
+-- Every column is added nullable with no default; IF NOT EXISTS makes the
+-- statement safe to re-run.
+ALTER TABLE document_chunks
+    ADD COLUMN IF NOT EXISTS page_start integer,
+    ADD COLUMN IF NOT EXISTS page_end integer,
+    ADD COLUMN IF NOT EXISTS source_type text,
+    ADD COLUMN IF NOT EXISTS chunker_version text,
+    ADD COLUMN IF NOT EXISTS source_hash text,
+    ADD COLUMN IF NOT EXISTS chunk_content_hash text;
diff --git a/migrations/280_document_chunks_legacy_backfill.sql b/migrations/280_document_chunks_legacy_backfill.sql
new file mode 100644
index 0000000..278ddfe
--- /dev/null
+++ b/migrations/280_document_chunks_legacy_backfill.sql
@@ -0,0 +1,17 @@
+-- Safe backfill for pre-existing rows: source_type / chunker_version = 'legacy'
+-- (deliberately no finer classification; Phase 3A regenerates or cleans up
+-- these rows with the accurate source_type).
+-- page_start / page_end are seeded from the existing single page column and
+-- stay NULL when page is NULL.
+-- COALESCE never overwrites a value that is already set. The page predicate
+-- also requires page IS NOT NULL, so a re-run skips rows it cannot change
+-- instead of rewriting them with identical values.
+UPDATE document_chunks
+SET
+    source_type = COALESCE(source_type, 'legacy'),
+    chunker_version = COALESCE(chunker_version, 'legacy'),
+    page_start = COALESCE(page_start, page),
+    page_end = COALESCE(page_end, page)
+WHERE source_type IS NULL
+    OR chunker_version IS NULL
+    OR (page IS NOT NULL AND (page_start IS NULL OR page_end IS NULL));
diff --git a/migrations/281_document_chunks_source_unique_idx.sql b/migrations/281_document_chunks_source_unique_idx.sql
new file mode 100644
index 0000000..2e7c04d
--- /dev/null
+++ b/migrations/281_document_chunks_source_unique_idx.sql
@@ -0,0 +1,12 @@
+-- UNIQUE (doc_id, source_type, chunker_version, chunk_index).
+-- A duplicate check on (doc_id, chunk_index) returned 0 rows (2026-05-24), so
+-- the key holds once the legacy backfill has run.
+-- Built without CONCURRENTLY so it is safe inside the runner's transaction
+-- (CREATE INDEX CONCURRENTLY cannot run in a transaction block); a plain build
+-- is realistic at ~30K rows.
+-- NOTE(review): source_type and chunker_version are nullable, and PostgreSQL
+-- unique indexes treat NULLs as distinct by default, so a row written with
+-- either column NULL is never rejected as a duplicate. Confirm every writer
+-- sets both columns.
+CREATE UNIQUE INDEX IF NOT EXISTS uq_document_chunks_source_version_index
+    ON document_chunks (doc_id, source_type, chunker_version, chunk_index);