diff --git a/app/models/document.py b/app/models/document.py
index cf6d3cc..dff88c8 100644
--- a/app/models/document.py
+++ b/app/models/document.py
@@ -35,6 +35,7 @@ class Document(Base):
 
     # 2계층: 추출 메타 (OCR 판정/실행)
     extract_meta: Mapped[dict | None] = mapped_column(JSONB, default=dict)
+    ocr_derived: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
 
     # 2계층: AI 가공
     ai_summary: Mapped[str | None] = mapped_column(Text)
diff --git a/migrations/277_documents_ocr_derived.sql b/migrations/277_documents_ocr_derived.sql
new file mode 100644
index 0000000..0e0942a
--- /dev/null
+++ b/migrations/277_documents_ocr_derived.sql
@@ -0,0 +1,4 @@
+-- 2026-05-24 PR-Chore-OCR-Column-1 (1/2): add the documents.ocr_derived boolean column.
+-- RAG-independent data hygiene. The lack of a column identifying OCR-derived rows was the cause of the PR-Eval-V0_2 TBD-O FAILED result.
+-- The backfill lives in a separate migration, 278 (asyncpg single-statement constraint).
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_derived boolean DEFAULT false NOT NULL;
diff --git a/migrations/278_documents_ocr_derived_backfill.sql b/migrations/278_documents_ocr_derived_backfill.sql
new file mode 100644
index 0000000..132fbd2
--- /dev/null
+++ b/migrations/278_documents_ocr_derived_backfill.sql
@@ -0,0 +1,8 @@
+-- 2026-05-24 PR-Chore-OCR-Column-1 (2/2): ocr_derived backfill.
+-- Rule R1 only (after a measured audit): extract_meta.ocr_attempted is JSON true -> 8 rows.
+-- R2 (image file_format) dropped: 1 row is already covered by R1 + 1 row not processed by marker stays false.
+-- R3 (marker PDFs without extract_meta, 283 rows) dropped: risk of born-digital false positives.
+-- Uses jsonb containment rather than (extract_meta->>'ocr_attempted')::boolean: the text-to-boolean
+-- cast raises on any value that is not a valid boolean literal, and WHERE evaluation order is not
+-- guaranteed, so one malformed row could abort the whole backfill. Containment matches JSON true only.
+UPDATE documents SET ocr_derived = true WHERE deleted_at IS NULL AND extract_meta @> '{"ocr_attempted": true}'::jsonb;