"""§1 백필 — documents.category 전체 행 채우기. plan: luminous-sprouting-hamster.md §1 확장: stateless-churning-raccoon.md (law 카테고리 분기 + idempotent 재실행) 매핑 규칙: source_channel='law_monitor' → category='law' source_channel='news' → category='news' source_channel='memo' → category='memo' file_type='note' → category='memo' user_tags 에 '@library/' 태그 보유 → category='library' 그 외 → category='document' 대상 WHERE: category IS NULL OR (source_channel='law_monitor' AND category='document') — law 신규 enum 도입 전 backfill 로 'document' 배정됐던 206건 재분류. 자동 library 전이 금지 — 기존 @library/ 태그 보유분만 'library' 로 이행. audio/video 는 §3 이후 생성 (백필 대상 없음). 실행: docker compose exec fastapi python /app/scripts/backfill_category.py --dry-run docker compose exec fastapi python /app/scripts/backfill_category.py --apply 로컬: python scripts/backfill_category.py --dry-run DATABASE_URL=postgresql+asyncpg://... python scripts/backfill_category.py --apply """ import argparse import asyncio import os import sys sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app")) from sqlalchemy import text from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine CLASSIFY_SQL = """ WITH classified AS ( SELECT id, CASE WHEN source_channel = 'law_monitor' THEN 'law' WHEN source_channel = 'news' THEN 'news' WHEN source_channel = 'memo' THEN 'memo' WHEN file_type = 'note' THEN 'memo' WHEN EXISTS ( SELECT 1 FROM jsonb_array_elements_text( COALESCE(user_tags, '[]'::jsonb) ) AS t WHERE t LIKE '@library/%' ) THEN 'library' ELSE 'document' END AS target_category FROM documents WHERE category IS NULL OR (source_channel = 'law_monitor' AND category = 'document') ) SELECT target_category, COUNT(*) AS n FROM classified GROUP BY 1 ORDER BY 2 DESC; """ APPLY_SQL = """ UPDATE documents SET category = CASE WHEN source_channel = 'law_monitor' THEN 'law'::doc_category WHEN source_channel = 'news' THEN 'news'::doc_category WHEN source_channel = 'memo' THEN 'memo'::doc_category WHEN file_type = 'note' THEN 'memo'::doc_category WHEN EXISTS ( SELECT 1 FROM jsonb_array_elements_text( COALESCE(documents.user_tags, '[]'::jsonb) ) AS t WHERE t LIKE '@library/%' ) THEN 'library'::doc_category ELSE 'document'::doc_category END WHERE category IS NULL OR (source_channel = 'law_monitor' AND category = 'document'); """ VERIFY_SQL = """ SELECT (SELECT COUNT(*) FROM documents) AS all_rows, (SELECT COUNT(*) FROM documents WHERE category IS NOT NULL) AS categorized, (SELECT COUNT(*) FROM documents WHERE category IS NULL) AS uncategorized, (SELECT COUNT(*) FROM documents WHERE category = 'library') AS cat_library, (SELECT COUNT(*) FROM documents WHERE EXISTS ( SELECT 1 FROM jsonb_array_elements_text( COALESCE(user_tags, '[]'::jsonb) ) AS t WHERE t LIKE '@library/%' )) AS has_library_tag, (SELECT COUNT(*) FROM documents WHERE category = 'law') AS cat_law, (SELECT COUNT(*) FROM documents WHERE source_channel = 'law_monitor') AS law_source_count; """ DIST_SQL = """ SELECT COALESCE(category::text, '(null)') AS category, COUNT(*) AS n FROM documents GROUP BY category ORDER BY n DESC; """ async def run(apply: bool) -> int: database_url = os.getenv( "DATABASE_URL", "postgresql+asyncpg://pkm:pkm@localhost:5432/pkm", ) engine = create_async_engine(database_url) session_factory = async_sessionmaker( engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: # 1. 현재 분포 print("=== 현재 category 분포 ===") rows = (await session.execute(text(DIST_SQL))).all() for row in rows: print(f" {row.category:12} {row.n}") # 2. 분류 예상 (NULL 대상만) print("\n=== NULL → target category (매핑 예상) ===") rows = (await session.execute(text(CLASSIFY_SQL))).all() pending_total = 0 for row in rows: print(f" {row.target_category:12} {row.n}") pending_total += row.n if pending_total == 0: print("\n백필 대상 없음 (모든 행 이미 category 설정됨).") await engine.dispose() return 0 if not apply: print(f"\n[dry-run] {pending_total}건 영향. --apply 로 실제 적용.") await engine.dispose() return 0 # 3. apply print(f"\n[apply] UPDATE 실행 — {pending_total}건 대상") result = await session.execute(text(APPLY_SQL)) await session.commit() print(f" rowcount = {result.rowcount}") # 4. verify print("\n=== 백필 후 검증 ===") row = (await session.execute(text(VERIFY_SQL))).one() print(f" all_rows = {row.all_rows}") print(f" categorized = {row.categorized}") print(f" uncategorized = {row.uncategorized}") print(f" cat_library = {row.cat_library}") print(f" has_library_tag = {row.has_library_tag}") print(f" cat_law = {row.cat_law}") print(f" law_source_count = {row.law_source_count}") fail = [] if row.uncategorized != 0: fail.append(f"uncategorized={row.uncategorized} (기대 0)") if row.all_rows != row.categorized: fail.append(f"all={row.all_rows} != categorized={row.categorized}") if row.cat_library != row.has_library_tag: fail.append( f"cat_library={row.cat_library} != has_library_tag={row.has_library_tag} " "(자동 전이 없음 정책 위반)" ) if row.cat_law != row.law_source_count: fail.append( f"cat_law={row.cat_law} != law_source_count={row.law_source_count} " "(law_monitor 전체가 category='law' 여야 함)" ) if fail: print("\n!! 검증 실패:") for f in fail: print(f" - {f}") await engine.dispose() return 1 print("\n검증 통과.") await engine.dispose() return 0 def main(): parser = argparse.ArgumentParser(description="documents.category 백필") mode = parser.add_mutually_exclusive_group(required=True) mode.add_argument("--dry-run", action="store_true", help="변경 없이 분포만 보고") mode.add_argument("--apply", action="store_true", help="실제 UPDATE 실행") args = parser.parse_args() rc = asyncio.run(run(apply=args.apply)) sys.exit(rc) if __name__ == "__main__": main()