feat(documents): §1 category enum + ai_suggestion 승인 파이프
plan: ~/.claude/plans/luminous-sprouting-hamster.md §1
- migrations/143_category.sql: doc_category enum (6 활성 + 3 유보) +
documents.category + documents.ai_suggestion JSONB + 2 idx.
- app/models/document.py: category (Enum, create_type=False), ai_suggestion (JSONB).
- app/prompts/classify.txt: document_type enum 에 7 실무 doctype 추가
(발주서/세금계산서/명세표/도면/증명서/계획서/시방서) + facet_doctype
필드 directive.
- config.yaml: document_types 에 7 항목 추가 (worker 검증 통과).
- app/workers/classify_worker.py: FACET_DOCTYPES / LIBRARY_SUGGESTION_DOCTYPES
상수, facet_doctype 파싱(기존값 미덮어씀), 발주서/세금계산서/명세표
감지 시 ai_suggestion={proposed_category=library, proposed_path=@library/
거래/{YYYY}/{doctype}, source_updated_at=doc.updated_at.isoformat(), ...}.
category / user_tags 자동 전이 금지 (suggestion-only).
- app/api/documents.py:
· DocumentResponse 에 category / ai_suggestion 노출
· GET /documents ?category=<cat> / ?has_suggestion / ?proposed_category
(category 지정 시 기본 news/memo 제외 해제 — §2 승인 UI 계약)
· GET /documents/library 를 Document.category=='library' 기반으로 재구현
(path subquery 는 user_tags 유지 — 분류 내부 서가 경로)
· POST /documents/{id}/accept-suggestion — FOR UPDATE + idempotent no-op +
dual 409 stale (payload source_updated_at / documents.updated_at) +
user_tags idempotent append
· DELETE /documents/{id}/suggestion — idempotent, stale 검사 없음
- scripts/backfill_category.py: dry-run / apply. 매핑(news/memo/@library/else)
+ 3-way 상대 검증 (all_rows==categorized, uncategorized==0,
cat_library==has_library_tag — 자동 전이 금지 정책 검증).
남은 DoD (원격 배포 후): docker compose up → migration 143 적용 → backfill
apply → smoke (drive_sync 발주서 업로드 suggestion 생성 / category 유지,
accept-suggestion idempotency + 409 stale 두 벡터, /documents?category=library
== /documents/library 건수 일치).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,182 @@
|
||||
"""§1 백필 — documents.category 전체 행 채우기.
|
||||
|
||||
plan: luminous-sprouting-hamster.md §1

Mapping rules (applied to every row where category IS NULL; the first
matching rule wins, top to bottom):
  source_channel='news'                      → category='news'
  source_channel='memo'                      → category='memo'
  file_type='note'                           → category='memo'
  user_tags contains an '@library/' tag      → category='library'
  otherwise                                  → category='document'

No automatic promotion to library — only rows that already carry an
@library/ tag are moved to 'library'.
audio/video are created from §3 onward (nothing to backfill).

Run:
  docker compose exec fastapi python /app/scripts/backfill_category.py --dry-run
  docker compose exec fastapi python /app/scripts/backfill_category.py --apply

Local:
  python scripts/backfill_category.py --dry-run
  DATABASE_URL=postgresql+asyncpg://... python scripts/backfill_category.py --apply
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app"))
|
||||
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
||||
|
||||
|
||||
# Dry-run projection: for every row whose category is still NULL, compute the
# category the backfill would assign and return one (target_category, n) row
# per category, largest count first. The CASE is evaluated top to bottom, so
# news/memo/note take precedence over an '@library/' tag.
CLASSIFY_SQL = """
WITH classified AS (
    SELECT
        id,
        CASE
            WHEN source_channel = 'news' THEN 'news'
            WHEN source_channel = 'memo' THEN 'memo'
            WHEN file_type = 'note' THEN 'memo'
            WHEN EXISTS (
                SELECT 1 FROM jsonb_array_elements_text(
                    COALESCE(user_tags, '[]'::jsonb)
                ) AS t
                WHERE t LIKE '@library/%'
            ) THEN 'library'
            ELSE 'document'
        END AS target_category
    FROM documents
    WHERE category IS NULL
)
SELECT target_category, COUNT(*) AS n FROM classified GROUP BY 1 ORDER BY 2 DESC;
"""
|
||||
|
||||
# Backfill statement: assigns a doc_category to every row whose category is
# NULL, using the same top-to-bottom precedence as CLASSIFY_SQL. Rows that
# already have a category are never touched (WHERE category IS NULL).
APPLY_SQL = """
UPDATE documents
SET category = CASE
    WHEN source_channel = 'news' THEN 'news'::doc_category
    WHEN source_channel = 'memo' THEN 'memo'::doc_category
    WHEN file_type = 'note' THEN 'memo'::doc_category
    WHEN EXISTS (
        SELECT 1 FROM jsonb_array_elements_text(
            COALESCE(documents.user_tags, '[]'::jsonb)
        ) AS t
        WHERE t LIKE '@library/%'
    ) THEN 'library'::doc_category
    ELSE 'document'::doc_category
END
WHERE category IS NULL;
"""
|
||||
|
||||
# Post-apply verification counters, returned as a single row:
#   all_rows / categorized / uncategorized — every row must have a category.
#   cat_library      — rows whose category is 'library'.
#   has_library_tag  — rows carrying an '@library/' tag that the mapping would
#                      actually classify as 'library'.
# The mapping gives news/memo/note precedence over the '@library/' tag, so
# those rows are excluded here; otherwise a tagged memo/news/note row would
# make cat_library != has_library_tag and report a false policy violation.
# IS DISTINCT FROM keeps rows whose source_channel / file_type is NULL.
VERIFY_SQL = """
SELECT
    (SELECT COUNT(*) FROM documents) AS all_rows,
    (SELECT COUNT(*) FROM documents WHERE category IS NOT NULL) AS categorized,
    (SELECT COUNT(*) FROM documents WHERE category IS NULL) AS uncategorized,
    (SELECT COUNT(*) FROM documents WHERE category = 'library') AS cat_library,
    (SELECT COUNT(*) FROM documents
      WHERE source_channel IS DISTINCT FROM 'news'
        AND source_channel IS DISTINCT FROM 'memo'
        AND file_type IS DISTINCT FROM 'note'
        AND EXISTS (
            SELECT 1 FROM jsonb_array_elements_text(
                COALESCE(user_tags, '[]'::jsonb)
            ) AS t
            WHERE t LIKE '@library/%'
        )) AS has_library_tag;
"""
|
||||
|
||||
# Current distribution of documents.category, largest bucket first. NULL is
# rendered as the text '(null)' so it can be printed with a string format spec.
DIST_SQL = """
SELECT COALESCE(category::text, '(null)') AS category, COUNT(*) AS n
FROM documents
GROUP BY category
ORDER BY n DESC;
"""
|
||||
|
||||
|
||||
async def run(apply: bool) -> int:
    """Report the category distribution and optionally backfill NULL rows.

    Prints the current ``documents.category`` distribution and the projected
    mapping for rows whose category is NULL. When ``apply`` is true, the
    backfill UPDATE is executed and committed, then the verification query is
    run against the committed state.

    The database URL is read from the ``DATABASE_URL`` environment variable,
    falling back to a local default.

    Args:
        apply: False for a dry-run (report only); True to execute the UPDATE.

    Returns:
        0 when there is nothing to backfill, after a dry-run, or when the
        post-apply verification passes; 1 when any verification check fails.
    """
    database_url = os.getenv(
        "DATABASE_URL",
        "postgresql+asyncpg://pkm:pkm@localhost:5432/pkm",
    )

    engine = create_async_engine(database_url)
    session_factory = async_sessionmaker(
        engine, class_=AsyncSession, expire_on_commit=False
    )

    # try/finally guarantees the engine is disposed on every exit path,
    # including when a query raises.
    try:
        async with session_factory() as session:
            # 1. Current distribution.
            print("=== 현재 category 분포 ===")
            rows = (await session.execute(text(DIST_SQL))).all()
            for row in rows:
                print(f" {row.category:12} {row.n}")

            # 2. Projected mapping (NULL rows only).
            print("\n=== NULL → target category (매핑 예상) ===")
            rows = (await session.execute(text(CLASSIFY_SQL))).all()
            pending_total = 0
            for row in rows:
                print(f" {row.target_category:12} {row.n}")
                pending_total += row.n

            if pending_total == 0:
                print("\n백필 대상 없음 (모든 행 이미 category 설정됨).")
                return 0

            if not apply:
                print(f"\n[dry-run] {pending_total}건 영향. --apply 로 실제 적용.")
                return 0

            # 3. Apply. The UPDATE is committed before verification, so a
            # verification failure is reported via the return code only; the
            # backfilled rows stay in place.
            print(f"\n[apply] UPDATE 실행 — {pending_total}건 대상")
            result = await session.execute(text(APPLY_SQL))
            await session.commit()
            print(f" rowcount = {result.rowcount}")

            # 4. Verify.
            print("\n=== 백필 후 검증 ===")
            row = (await session.execute(text(VERIFY_SQL))).one()
            print(f" all_rows = {row.all_rows}")
            print(f" categorized = {row.categorized}")
            print(f" uncategorized = {row.uncategorized}")
            print(f" cat_library = {row.cat_library}")
            print(f" has_library_tag = {row.has_library_tag}")

            fail = []
            if row.uncategorized != 0:
                fail.append(f"uncategorized={row.uncategorized} (기대 0)")
            if row.all_rows != row.categorized:
                fail.append(f"all={row.all_rows} != categorized={row.categorized}")
            if row.cat_library != row.has_library_tag:
                fail.append(
                    f"cat_library={row.cat_library} != has_library_tag={row.has_library_tag} "
                    "(자동 전이 없음 정책 위반)"
                )

            if fail:
                print("\n!! 검증 실패:")
                for reason in fail:
                    print(f" - {reason}")
                return 1

            print("\n검증 통과.")
            return 0
    finally:
        await engine.dispose()
|
||||
|
||||
|
||||
def main():
    """Parse the command line and exit with the status code returned by ``run``.

    Exactly one of ``--dry-run`` or ``--apply`` must be given; argparse rejects
    the invocation otherwise.
    """
    parser = argparse.ArgumentParser(description="documents.category 백필")
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument("--dry-run", action="store_true", help="변경 없이 분포만 보고")
    mode.add_argument("--apply", action="store_true", help="실제 UPDATE 실행")
    args = parser.parse_args()

    # --dry-run needs no explicit handling: apply is simply False in that case.
    rc = asyncio.run(run(apply=args.apply))
    sys.exit(rc)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user