"""Phase 1D — marker markdown 품질 pilot (one-shot admin script). * 영구 worker 경로 아님. 30건 한정 sample 로 baseline 품질 측정. * Phase 2 전체 백필 결정은 1D 결과 보고 후행. * 1B.5 (이미지 추출 / _meta 보존) 는 별도 PR — 본 스크립트 영역 아님. Stratification (Round 2 refined, plan: ~/.claude/plans/stratified-mingling-otter.md): 4 축: doc_type × file_size_band × text_density_band × handwritten_hint + sample_source ∈ {existing_success, controlled_backfill} - existing_success 5건 (anchor 1 + calibration 4) - controlled_backfill 25건 (handwritten 3 / scan_likely 2~3 / mixed 5 / born_digital 12 / large 2) + forced_include: doc 4809 (Note_240805_용접교육 필기) — known bad handwritten anchor. document_type ∈ SKIP_DOC_TYPES 제외 (marker_worker 룰 미러). Subcommands: select stratified 30건 dry-run + CSV+JSON 저장 enqueue select 결과를 markdown 큐에 enqueue (uq_queue_active 위반 회피) report md_status 분포·실패사유·quality 메트릭·UI 검수 URL 출력 eval_template pilot_1d_eval.csv 스켈레톤 출력 (사용자가 rubric 5축 점수 채움) 실행 (GPU 서버): docker compose exec fastapi python /app/scripts/phase1d_pilot.py select \ --csv /app/evals/markdown/pilot_1d_sample.csv docker compose exec fastapi python /app/scripts/phase1d_pilot.py enqueue --yes docker compose exec fastapi python /app/scripts/phase1d_pilot.py report docker compose exec fastapi python /app/scripts/phase1d_pilot.py eval_template \ --csv /app/evals/markdown/pilot_1d_eval.csv """ import argparse import asyncio import csv import json import os import random import re import sys from collections import Counter, defaultdict from pathlib import Path # fastapi 컨테이너는 WORKDIR=/app 에 코드를 펼쳐놓음 (app/ 디렉토리 없음). # /app/scripts/../app 이 아니라 /app 자체가 sys.path 에 있어야 `from models...` import 가능. sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine # marker_worker 의 SKIP 룰 미러 — drift 회피 위해 한 곳만 진실. 변경 시 동기 필요. 
# Document types that are excluded from sampling.
SKIP_DOC_TYPES = {
    "발주서",
    "세금계산서",
    "명세표",
    "Invoice",
    "Purchase_Order",
    "Estimate",
    "Statement",
}

# file_size bucket (a proxy for page count). An average PDF page is roughly 10~50KB.
# Each entry is (name, lower bound inclusive, upper bound exclusive) in bytes,
# matching the `lo <= file_size < hi` test in _bucket().
SIZE_BUCKETS = [
    ("small", 0, 500 * 1024),  # [0, 500KB)
    ("medium", 500 * 1024, 5 * 1024 * 1024),  # [500KB, 5MB)
    ("large", 5 * 1024 * 1024, 10**12),  # [5MB, 10**12 bytes)
]

# file_size_band axis of the 4-axis stratification — Round 2 plan.
# Same (name, inclusive lower, exclusive upper) layout as SIZE_BUCKETS.
FILE_SIZE_BAND_THRESHOLDS = [
    ("S", 0, 1 * 1024 * 1024),  # [0, 1MB)
    ("M", 1 * 1024 * 1024, 10 * 1024 * 1024),  # [1MB, 10MB)
    ("L", 10 * 1024 * 1024, 10**12),  # [10MB, 10**12 bytes)
]

# text_density (chars per KB of file) — a single clean proxy for telling
# born-digital files apart from scans.
# Both extremes were verified: 0.17 (handwritten doc 4809) <-> 94 (born-digital doc 3759).
TEXT_DENSITY_BANDS = [
    ("scan-likely", 0.0, 5.0),
    ("mixed", 5.0, 50.0),
    ("born-digital", 50.0, float("inf")),
]

# Case-insensitive keyword pattern used by _handwritten_hint() on title/file_path.
HANDWRITTEN_HINT_REGEX = re.compile(r"필기|노트|handwritten|scan|스캔|note", re.IGNORECASE)

# Forced include — a known-bad anchor found during the user's visual inspection.
# When tuning the next round based on the 1D results, the same document is
# re-converted to judge whether quality improved.
FORCED_INCLUDES: dict[int, str] = {
    4809: "known_bad_handwritten_anchor",
}

# Reproducibility seed — a sample CSV, once generated, is guaranteed to come out the same.
SAMPLE_SEED = 20260502
PILOT_TARGET = 30
EXISTING_SUCCESS_TARGET = 5
CONTROLLED_BACKFILL_TARGET = PILOT_TARGET - EXISTING_SUCCESS_TARGET  # 25

# Default output locations for the generated artifacts.
DEFAULT_OUT = Path("/tmp/phase1d_pilot.json")
DEFAULT_CSV = Path("/tmp/phase1d_pilot.csv")
DEFAULT_EVAL_CSV = Path("/tmp/phase1d_eval.csv")


def _bucket(file_size: int | None) -> str:
    """Return the legacy 3-bucket size label (kept for cmd_report's file_size bucket).

    Args:
        file_size: File size in bytes, or None when it is not known.

    Returns:
        "unknown" when file_size is None, the name of the first SIZE_BUCKETS
        entry whose half-open range [lo, hi) contains file_size, or "outlier"
        when no bucket matches (e.g. a negative size or one >= 10**12).
    """
    if file_size is None:
        return "unknown"
    for name, lo, hi in SIZE_BUCKETS:
        if lo <= file_size < hi:
            return name
    return "outlier"


def _file_size_band(file_size: int | None) -> str:
    """Return the Round 2 refined size band: S / M / L.

    Args:
        file_size: File size in bytes, or None when it is not known.

    Returns:
        "unknown" when file_size is None, otherwise the name of the first
        FILE_SIZE_BAND_THRESHOLDS entry whose range [lo, hi) contains it.
        Any other value falls through to "L".
    """
    if file_size is None:
        return "unknown"
    for name, lo, hi in FILE_SIZE_BAND_THRESHOLDS:
        if lo <= file_size < hi:
            return name
    # Unlike _bucket(), an unmatched size is reported as the largest band.
    return "L"


def _text_density(text_len: int, file_size: int | None) -> float | None:
    """Return characters per KB of file.

    Args:
        text_len: Number of characters of text.
        file_size: File size in bytes, or None.

    Returns:
        text_len divided by the file size in KB, or None when file_size is
        None, zero, or negative.
    """
    if not file_size or file_size <= 0:
        return None
    return text_len / (file_size / 1024.0)


def _text_density_band(density: float | None) -> str:
    """Map a text density value onto a TEXT_DENSITY_BANDS label.

    Args:
        density: Characters per KB as produced by _text_density(), or None.

    Returns:
        The name of the first band whose range [lo, hi) contains density, or
        "unknown" when density is None or lies in no band (e.g. negative).
    """
    if density is None:
        return "unknown"
    for name, lo, hi in TEXT_DENSITY_BANDS:
        if lo <= density < hi:
            return name
    return "unknown"


def _handwritten_hint(title: str | None, file_path: str | None) -> str:
    """Return 'hi' when title or file_path matches HANDWRITTEN_HINT_REGEX, else 'lo'.

    Args:
        title: Document title, or None.
        file_path: Document file path, or None.
    """
    # filter(None, ...) drops empty strings so a missing field adds no stray space.
    blob = " ".join(filter(None, [title or "", file_path or ""]))
    return "hi" if HANDWRITTEN_HINT_REGEX.search(blob) else "lo"


def _scan_likely(text_len: int, file_size: int | None, density: float | None) -> bool:
    """Return True when the document is likely a scan.

    A document counts as scan-likely when it has no extracted text
    (text_len == 0) or when its text density is below 5.

    Args:
        text_len: Number of characters of extracted text.
        file_size: File size in bytes; not read by this function's body.
        density: Characters per KB, or None when it could not be computed.
    """
    if text_len == 0:
        return True
    # 5.0 is the same value as the upper bound of the "scan-likely" band in TEXT_DENSITY_BANDS.
    if density is not None and density < 5.0:
        return True
    return False


# NOTE(review): _script_mix continues beyond the reviewed span; its header and
# the opening of its docstring are reproduced unchanged.
def _script_mix(extracted_text: str | None, sample_chars: int = 10000) -> str:
    """첫 N자에서 Hangul/CJK/Hiragana/Katakana/Latin 비율로 라벨링. 한 script ≥ 0.7 → '