From d75fb7adaa5d1cccc1abc52448a41f992e3b7939 Mon Sep 17 00:00:00 2001 From: hyungi Date: Thu, 18 Jun 2026 16:43:38 +0900 Subject: [PATCH] =?UTF-8?q?feat(presegment):=20G2=20PR-1=20=EC=8A=A4?= =?UTF-8?q?=ED=82=A4=EB=A7=88=20=E2=80=94=20documents=20=EB=B6=84=ED=95=A0?= =?UTF-8?q?=20=EC=BB=AC=EB=9F=BC=20+=20lineage=20segmented=5Ffrom=20+=20pr?= =?UTF-8?q?esegment=20=EC=8A=A4=ED=85=8C=EC=9D=B4=EC=A7=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit G2 pre-segmentation 기반 스키마(추가형, 미사용까지 무동작). 권장 기본값 채택: - 362: documents.bundle_page_start/end(1-based)+presegment_role(NULL/parent/child) - 363: document_lineage CHECK 에 'segmented_from' 추가(부모→자식 관계, RESTRICT-delete 재사용) - 364: process_stage enum 에 'presegment'(extract 前 번들 분할 스테이지) - ORM: Document 3컬럼 + queue enum literal + 신규 DocumentLineage 모델 배포 DB(PG16.13, schema_migrations=361) 대비 txn-rollback 실측 PASS(362/363/364 전부). PR-2(presegment_worker+큐 배선+extract/marker range-clamp)·PR-3(LLM 경계 폴백) 후속. Co-Authored-By: Claude Opus 4.8 (1M context) --- app/models/document.py | 8 +++++ app/models/document_lineage.py | 31 +++++++++++++++++++ app/models/queue.py | 3 +- migrations/362_documents_presegment_cols.sql | 10 ++++++ .../363_document_lineage_segmented_from.sql | 8 +++++ migrations/364_process_stage_presegment.sql | 5 +++ 6 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 app/models/document_lineage.py create mode 100644 migrations/362_documents_presegment_cols.sql create mode 100644 migrations/363_document_lineage_segmented_from.sql create mode 100644 migrations/364_process_stage_presegment.sql diff --git a/app/models/document.py b/app/models/document.py index 8436da8..14a2d60 100644 --- a/app/models/document.py +++ b/app/models/document.py @@ -41,6 +41,14 @@ class Document(Base): Integer, nullable=False, default=0, server_default="0" ) + # G2 pre-segmentation (migration 362): 번들 PDF → N 자식 분할. 
+    # presegment_role: NULL = ordinary single document / 'parent' = original bundle (not extracted/embedded itself) /
+    #   'child' = logical sub-document (shares the parent's file_path; bundle_page_start/end give its 1-based inclusive page range).
+    # The parent-child relation itself is recorded in document_lineage (relation_type='segmented_from').
+    bundle_page_start: Mapped[int | None] = mapped_column(Integer)
+    bundle_page_end: Mapped[int | None] = mapped_column(Integer)
+    presegment_role: Mapped[str | None] = mapped_column(Text)
+
     # 2계층: 텍스트 추출
     extracted_text: Mapped[str | None] = mapped_column(Text)
     extracted_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
diff --git a/app/models/document_lineage.py b/app/models/document_lineage.py
new file mode 100644
index 0000000..4cb29fc
--- /dev/null
+++ b/app/models/document_lineage.py
@@ -0,0 +1,31 @@
+"""ORM model for the document_lineage table — history of document derivation relations (migration 217).
+
+G2 pre-segmentation uses it with relation_type='segmented_from' (bundle → child) (migration 363).
+The FKs of this history table are ON DELETE RESTRICT (hard delete of the parent is blocked; only soft delete is allowed).
+"""
+from datetime import datetime
+
+from sqlalchemy import BigInteger, ForeignKey, Text, func
+from sqlalchemy.dialects.postgresql import JSONB
+from sqlalchemy.orm import Mapped, mapped_column
+from sqlalchemy.types import TIMESTAMP
+
+from core.database import Base
+
+
+class DocumentLineage(Base):
+    __tablename__ = "document_lineage"
+
+    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
+    source_document_id: Mapped[int] = mapped_column(
+        BigInteger, ForeignKey("documents.id", ondelete="RESTRICT"), nullable=False
+    )
+    derived_document_id: Mapped[int] = mapped_column(
+        BigInteger, ForeignKey("documents.id", ondelete="RESTRICT"), nullable=False
+    )
+    relation_type: Mapped[str] = mapped_column(Text, nullable=False)
+    # 'metadata' is a reserved attribute name in SQLAlchemy → the Python attribute is meta, the DB column is metadata.
+    meta: Mapped[dict] = mapped_column(
+        "metadata", JSONB, nullable=False, default=dict, server_default="{}"
+    )
+    created_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True), server_default=func.now())
diff --git a/app/models/queue.py b/app/models/queue.py
index f750b1a..c0ed78e 100644
--- a/app/models/queue.py
+++ b/app/models/queue.py
@@ -46,9 +46,10 @@ class ProcessingQueue(Base):
         # 'stt' (audio): migration 150 / 'thumbnail' (video): queue_consumer 가 enqueue.
         # 'deep_summary' (PR-B B-1): classify_worker 가 에스컬레이션 시 enqueue.
         # 'fulltext' (crawl-24x7 A-2): migration 321 — 기사 페이지 fetch 후 본문 승격.
+        # 'presegment' (G2): migration 364 — splits a bundle PDF into N child documents before extract.
         # DB enum 변경은 마이그레이션이 처리하므로 create_type=False.
         Enum(
-            "extract", "classify", "summarize", "embed", "chunk", "preview",
+            "presegment", "extract", "classify", "summarize", "embed", "chunk", "preview",
             "stt", "thumbnail", "deep_summary", "markdown", "fulltext",
             name="process_stage",
             create_type=False,
diff --git a/migrations/362_documents_presegment_cols.sql b/migrations/362_documents_presegment_cols.sql
new file mode 100644
index 0000000..b1008fd
--- /dev/null
+++ b/migrations/362_documents_presegment_cols.sql
@@ -0,0 +1,10 @@
+-- 362: G2 pre-segmentation — split a bundle PDF (several logical documents in one file) into N child documents.
+-- Adds a child doc's page range within the original (1-based inclusive) plus a segmentation-role marker.
+-- The parent-child relation itself lives in document_lineage (relation_type='segmented_from', migration 363).
+-- presegment_role: NULL = ordinary single document (the vast majority) / 'parent' = original bundle (not extracted/embedded itself) /
+--   'child' = logical sub-document (shares the parent's file_path, sliced by the bundle_page_start/end range).
+-- Single ALTER with multiple clauses = 1 statement (complies with the asyncpg multi-statement restriction).
+ALTER TABLE documents
+    ADD COLUMN IF NOT EXISTS bundle_page_start INTEGER,
+    ADD COLUMN IF NOT EXISTS bundle_page_end INTEGER,
+    ADD COLUMN IF NOT EXISTS presegment_role TEXT;
diff --git a/migrations/363_document_lineage_segmented_from.sql b/migrations/363_document_lineage_segmented_from.sql
new file mode 100644
index 0000000..e13faf5
--- /dev/null
+++ b/migrations/363_document_lineage_segmented_from.sql
@@ -0,0 +1,8 @@
+-- 363: G2 — add 'segmented_from' (bundle → child) to document_lineage.relation_type.
+-- Replaces migration 217's column-level CHECK (PG auto-generated name document_lineage_relation_type_check, verified on the deployed DB).
+-- DROP + ADD are two clauses of a single ALTER = 1 statement.
+-- Idempotent: DROP ... IF EXISTS makes a re-run safe (if already replaced, the new constraint is dropped and recreated identically).
+ALTER TABLE document_lineage
+    DROP CONSTRAINT IF EXISTS document_lineage_relation_type_check,
+    ADD CONSTRAINT document_lineage_relation_type_check
+        CHECK (relation_type IN ('cited','summarized_from','generated_from','revised_from','segmented_from'));
diff --git a/migrations/364_process_stage_presegment.sql b/migrations/364_process_stage_presegment.sql
new file mode 100644
index 0000000..218d1e7
--- /dev/null
+++ b/migrations/364_process_stage_presegment.sql
@@ -0,0 +1,5 @@
+-- 364: G2 — add 'presegment' to the process_stage queue-stage enum (bundle-splitting stage that runs before extract).
+-- PG16: ALTER TYPE ADD VALUE may run inside a transaction (it only adds the value; the value is not used within the same transaction —
+-- it is first used by later migrations / at runtime). IF NOT EXISTS = idempotent on re-run.
+-- (This single line is the whole file — 1 statement.)
+ALTER TYPE process_stage ADD VALUE IF NOT EXISTS 'presegment';