From 7cab78e49085d81eb7a4c1fe385164bdf8856e08 Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Fri, 1 May 2026 10:00:23 +0900 Subject: [PATCH] =?UTF-8?q?ops(canonical):=20Phase=201D=20enqueue=20?= =?UTF-8?q?=EC=A0=84=20backup=20+=20targets=20+=20md=5Fstatus=20=EC=8A=A4?= =?UTF-8?q?=EB=83=85=EC=83=B7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit enqueue 시작 직전 3가지 흔적 남김: (1) /tmp/phase1d_pilot.json 의 timestamped 사본 (재실행 대비) (2) 대상 30건 document_id 한 줄 출력 (3) documents.md_status 분포 스냅샷 JSON 저장 Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/phase1d_pilot.py | 52 +++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/scripts/phase1d_pilot.py b/scripts/phase1d_pilot.py index 71d7612..2c43e18 100644 --- a/scripts/phase1d_pilot.py +++ b/scripts/phase1d_pilot.py @@ -176,20 +176,56 @@ async def cmd_select(out_path: Path) -> None: # ─── enqueue ─── async def cmd_enqueue(in_path: Path, yes: bool) -> None: - payload = json.loads(in_path.read_text()) - ids = payload["ids"] + from datetime import datetime, timezone + from sqlalchemy import func + + raw_payload = in_path.read_text() + payload = json.loads(raw_payload) + ids: list[int] = payload["ids"] + + ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + + # (1) /tmp/phase1d_pilot.json 원본 보존 — 재실행/덮어쓰기 대비 timestamped 사본 + backup_path = in_path.with_name(f"{in_path.stem}_pre_enqueue_{ts}.json") + backup_path.write_text(raw_payload) + print(f"[backup] {backup_path}") + + # (2) enqueue 대상 document_id 목록 (한 줄) + print(f"[targets] {len(ids)}건: {ids}") + + engine = _build_engine() + Session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False) + + from models.document import Document # type: ignore + from models.queue import enqueue_stage # type: ignore + + # (3) 실행 전 md_status 분포 스냅샷 + async with Session() as session: + snap_rows = ( + await session.execute( + select(Document.md_status, func.count()) + .where(Document.deleted_at.is_(None)) + .group_by(Document.md_status) + ) + ).all() + snapshot = { + "timestamp_utc": ts, + "scope": "documents WHERE deleted_at IS NULL", + "md_status_distribution": {str(s): int(c) for s, c in snap_rows}, + "sample_ids": ids, + } + snap_path = in_path.with_name(f"phase1d_md_status_pre_{ts}.json") + snap_path.write_text(json.dumps(snapshot, ensure_ascii=False, indent=2)) + print(f"[snapshot] {snap_path}") + print(f" {snapshot['md_status_distribution']}") if not yes: confirm = input(f"\n{len(ids)}건 markdown 큐에 enqueue 합니다. 진행? [y/N] ") if confirm.strip().lower() not in ("y", "yes"): print("취소됨.") + await engine.dispose() return - engine = _build_engine() - Session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False) - - from models.queue import enqueue_stage # type: ignore - enqueued, skipped = [], [] async with Session() as session: for doc_id in ids: @@ -197,7 +233,7 @@ async def cmd_enqueue(in_path: Path, yes: bool) -> None: (enqueued if ok else skipped).append(doc_id) await session.commit() - print(f"enqueued: {len(enqueued)}, skipped (이미 active): {len(skipped)}") + print(f"\nenqueued: {len(enqueued)}, skipped (이미 active): {len(skipped)}") if skipped: print(f" skipped ids: {skipped[:20]}{' …' if len(skipped) > 20 else ''}") await engine.dispose()