e88640d3d8
- migrations/152: ALTER TYPE doc_category ADD VALUE 'law' (DDL only; PG16 단일-트랜잭션 제약상 backfill 은 별도) - models/document.py: Enum 에 'law' 추가 (7 활성 + 3 유보) - workers/law_monitor.py: Document(..., category='law') — 신규 유입부터 세팅 - workers/classify_worker.py: source_channel='law_monitor' early-return + 최소 필드 (ai_domain='법령', ai_tags=['법령'], importance='medium'). AI classify skip — 법령 구조 고정/외부 source of truth/자동 재수집 - scripts/backfill_category.py: law 분기 + WHERE re-target ((source_channel='law_monitor' AND category='document')) + VERIFY cat_law/law_source_count + fail 조건 - api/documents.py: default 목록 제외에 law_monitor 추가 (news 와 동일 패턴) - api/dashboard.py: documents count FILTER 에 law_monitor 제외 (category_counts.law 는 기존 GROUP BY category 로 자동 노출) - frontend/Sidebar.svelte: '법령 알림' 버튼 ?source=law_monitor → ?category=law (explicit category 경로가 default exclusion 을 skip) plan: ~/.claude/plans/stateless-churning-raccoon.md axis 원칙: category=UI 축, policy/telemetry=source_channel+ai_domain 축 (feedback_category_vs_ai_domain_axis.md) 배포 순서: push → GPU pull → compose up --build fastapi frontend → backfill --dry-run → --apply. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
204 lines
7.6 KiB
Python
204 lines
7.6 KiB
Python
"""§1 백필 — documents.category 전체 행 채우기.
|
|
|
|
plan: luminous-sprouting-hamster.md §1
|
|
확장: stateless-churning-raccoon.md (law 카테고리 분기 + idempotent 재실행)
|
|
|
|
매핑 규칙:
|
|
source_channel='law_monitor' → category='law'
|
|
source_channel='news' → category='news'
|
|
source_channel='memo' → category='memo'
|
|
file_type='note' → category='memo'
|
|
user_tags 에 '@library/' 태그 보유 → category='library'
|
|
그 외 → category='document'
|
|
|
|
대상 WHERE:
|
|
category IS NULL
|
|
OR (source_channel='law_monitor' AND category='document')
|
|
— law 신규 enum 도입 전 backfill 로 'document' 배정됐던 206건 재분류.
|
|
|
|
자동 library 전이 금지 — 기존 @library/ 태그 보유분만 'library' 로 이행.
|
|
audio/video 는 §3 이후 생성 (백필 대상 없음).
|
|
|
|
실행:
|
|
docker compose exec fastapi python /app/scripts/backfill_category.py --dry-run
|
|
docker compose exec fastapi python /app/scripts/backfill_category.py --apply
|
|
|
|
로컬:
|
|
python scripts/backfill_category.py --dry-run
|
|
DATABASE_URL=postgresql+asyncpg://... python scripts/backfill_category.py --apply
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import os
|
|
import sys
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app"))
|
|
|
|
from sqlalchemy import text
|
|
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
|
|
|
|
|
# Preview query: for every row the backfill would touch (category IS NULL, or
# a law_monitor row still filed under 'document'), compute the category it
# would receive and return one (target_category, n) row per category,
# largest count first. Read-only.
CLASSIFY_SQL = """
WITH classified AS (
    SELECT
        id,
        CASE
            WHEN source_channel = 'law_monitor' THEN 'law'
            WHEN source_channel = 'news' THEN 'news'
            WHEN source_channel = 'memo' THEN 'memo'
            WHEN file_type = 'note' THEN 'memo'
            WHEN EXISTS (
                SELECT 1 FROM jsonb_array_elements_text(
                    COALESCE(user_tags, '[]'::jsonb)
                ) AS t
                WHERE t LIKE '@library/%'
            ) THEN 'library'
            ELSE 'document'
        END AS target_category
    FROM documents
    WHERE category IS NULL
       OR (source_channel = 'law_monitor' AND category = 'document')
)
SELECT target_category, COUNT(*) AS n FROM classified GROUP BY 1 ORDER BY 2 DESC;
"""
|
|
|
|
# Write query: assigns documents.category using the same rule order as the
# preview query, with each literal cast to the doc_category enum. Touches
# only rows whose category IS NULL or law_monitor rows still filed under
# 'document'.
APPLY_SQL = """
UPDATE documents
SET category = CASE
    WHEN source_channel = 'law_monitor' THEN 'law'::doc_category
    WHEN source_channel = 'news' THEN 'news'::doc_category
    WHEN source_channel = 'memo' THEN 'memo'::doc_category
    WHEN file_type = 'note' THEN 'memo'::doc_category
    WHEN EXISTS (
        SELECT 1 FROM jsonb_array_elements_text(
            COALESCE(documents.user_tags, '[]'::jsonb)
        ) AS t
        WHERE t LIKE '@library/%'
    ) THEN 'library'::doc_category
    ELSE 'document'::doc_category
END
WHERE category IS NULL
   OR (source_channel = 'law_monitor' AND category = 'document');
"""
|
|
|
|
# Post-apply check: returns a single row of table-wide counts (total rows,
# rows with and without a category, rows in 'library', rows carrying an
# '@library/' user tag, rows in 'law', and rows whose source_channel is
# 'law_monitor').
VERIFY_SQL = """
SELECT
    (SELECT COUNT(*) FROM documents) AS all_rows,
    (SELECT COUNT(*) FROM documents WHERE category IS NOT NULL) AS categorized,
    (SELECT COUNT(*) FROM documents WHERE category IS NULL) AS uncategorized,
    (SELECT COUNT(*) FROM documents WHERE category = 'library') AS cat_library,
    (SELECT COUNT(*) FROM documents
        WHERE EXISTS (
            SELECT 1 FROM jsonb_array_elements_text(
                COALESCE(user_tags, '[]'::jsonb)
            ) AS t
            WHERE t LIKE '@library/%'
        )) AS has_library_tag,
    (SELECT COUNT(*) FROM documents WHERE category = 'law') AS cat_law,
    (SELECT COUNT(*) FROM documents WHERE source_channel = 'law_monitor') AS law_source_count;
"""
|
|
|
|
# Current distribution: one (category, n) row per category value, with NULL
# rendered as the text '(null)', largest count first.
DIST_SQL = """
SELECT COALESCE(category::text, '(null)') AS category, COUNT(*) AS n
FROM documents
GROUP BY category
ORDER BY n DESC;
"""
|
|
|
|
|
|
def _verification_failures(row):
    """Return one message per post-backfill invariant that *row* violates.

    *row* is the single result row of VERIFY_SQL; only its count attributes
    are read. An empty list means every check passed.
    """
    failures = []
    if row.uncategorized != 0:
        failures.append(f"uncategorized={row.uncategorized} (기대 0)")
    if row.all_rows != row.categorized:
        failures.append(f"all={row.all_rows} != categorized={row.categorized}")
    # Every row carrying an '@library/' tag is expected to be in 'library',
    # and no untagged row may be.
    # NOTE(review): the CASE in APPLY_SQL matches source_channel / file_type
    # before the tag test, so a tagged news/memo/law row would make this
    # check fail — confirm no such rows exist.
    if row.cat_library != row.has_library_tag:
        failures.append(
            f"cat_library={row.cat_library} != has_library_tag={row.has_library_tag} "
            "(자동 전이 없음 정책 위반)"
        )
    if row.cat_law != row.law_source_count:
        failures.append(
            f"cat_law={row.cat_law} != law_source_count={row.law_source_count} "
            "(law_monitor 전체가 category='law' 여야 함)"
        )
    return failures


async def run(apply: bool) -> int:
    """Report the category distribution, then optionally apply and verify the backfill.

    Connects to the database named by the DATABASE_URL environment variable
    (falling back to a local default), prints the current distribution and the
    projected per-category changes, and, when *apply* is true, executes the
    UPDATE, commits it, and checks the result.

    Args:
        apply: When false, only report what would change (dry run). When
            true, execute APPLY_SQL and run the verification checks.

    Returns:
        0 when there is nothing to backfill, after a dry run, or when
        verification passes; 1 when any verification check fails. The UPDATE
        is committed before verification, so a return value of 1 does not
        roll it back.
    """
    database_url = os.getenv(
        "DATABASE_URL",
        "postgresql+asyncpg://pkm:pkm@localhost:5432/pkm",
    )

    engine = create_async_engine(database_url)
    session_factory = async_sessionmaker(
        engine, class_=AsyncSession, expire_on_commit=False
    )

    # A single finally releases the connection pool on every exit path,
    # including exceptions raised by the connection or by any statement.
    try:
        async with session_factory() as session:
            # 1. Current distribution.
            print("=== 현재 category 분포 ===")
            rows = (await session.execute(text(DIST_SQL))).all()
            for row in rows:
                print(f" {row.category:12} {row.n}")

            # 2. Projected classification of the target rows: NULL category,
            #    plus law_monitor rows currently filed under 'document'.
            print("\n=== NULL → target category (매핑 예상) ===")
            rows = (await session.execute(text(CLASSIFY_SQL))).all()
            pending_total = 0
            for row in rows:
                print(f" {row.target_category:12} {row.n}")
                pending_total += row.n

            if pending_total == 0:
                print("\n백필 대상 없음 (모든 행 이미 category 설정됨).")
                return 0

            if not apply:
                print(f"\n[dry-run] {pending_total}건 영향. --apply 로 실제 적용.")
                return 0

            # 3. Apply.
            print(f"\n[apply] UPDATE 실행 — {pending_total}건 대상")
            result = await session.execute(text(APPLY_SQL))
            await session.commit()
            print(f" rowcount = {result.rowcount}")

            # 4. Verify.
            print("\n=== 백필 후 검증 ===")
            row = (await session.execute(text(VERIFY_SQL))).one()
            print(f" all_rows = {row.all_rows}")
            print(f" categorized = {row.categorized}")
            print(f" uncategorized = {row.uncategorized}")
            print(f" cat_library = {row.cat_library}")
            print(f" has_library_tag = {row.has_library_tag}")
            print(f" cat_law = {row.cat_law}")
            print(f" law_source_count = {row.law_source_count}")

            fail = _verification_failures(row)
            if fail:
                print("\n!! 검증 실패:")
                for f in fail:
                    print(f" - {f}")
                return 1

            print("\n검증 통과.")
            return 0
    finally:
        await engine.dispose()
|
|
|
|
|
|
def main():
    """Parse the command line and exit with the status code returned by run().

    Exactly one of --dry-run or --apply must be given; only --apply makes
    run() execute the UPDATE.
    """
    cli = argparse.ArgumentParser(description="documents.category 백필")

    # The two modes are mutually exclusive and one of them is mandatory.
    mode_group = cli.add_mutually_exclusive_group(required=True)
    mode_group.add_argument(
        "--dry-run", action="store_true", help="변경 없이 분포만 보고"
    )
    mode_group.add_argument(
        "--apply", action="store_true", help="실제 UPDATE 실행"
    )

    options = cli.parse_args()
    sys.exit(asyncio.run(run(apply=options.apply)))
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|