diff --git a/scripts/phase1d_pilot.py b/scripts/phase1d_pilot.py index 75d5d45..51ecfda 100644 --- a/scripts/phase1d_pilot.py +++ b/scripts/phase1d_pilot.py @@ -484,13 +484,34 @@ async def cmd_select(out_path: Path, csv_path: Path | None) -> None: # ─── enqueue ─── -async def cmd_enqueue(in_path: Path, yes: bool) -> None: +async def cmd_enqueue(in_path: Path, yes: bool, include_existing: bool = False) -> None: from datetime import datetime, timezone from sqlalchemy import func raw_payload = in_path.read_text() payload = json.loads(raw_payload) - ids: list[int] = payload["ids"] + items: list[dict] = payload.get("items", []) + + # Round 2 정책: existing_success (5건, anchor + calibration) 는 enqueue 제외. + # 그들은 기존 md_content 그대로 두고 평가 anchor 로 사용. 재처리 시 marker 시간 + # 낭비 + 같은 quality output overwrite 로 baseline 유실. controlled_backfill (25건) + # 만 새로 변환. 후속 라운드 (Marker 튜닝 후) anchor 재처리 필요 시 --include-existing. + if include_existing: + target_items = items + ids: list[int] = [it["id"] for it in target_items] + print(f"[mode] include_existing=True — sample 30건 전부 enqueue") + else: + target_items = [it for it in items if it.get("sample_source") == "controlled_backfill"] + ids = [it["id"] for it in target_items] + excluded = [it["id"] for it in items if it.get("sample_source") != "controlled_backfill"] + print( + f"[mode] controlled_backfill 만 enqueue ({len(ids)}건). " + f"existing_success 제외 ({len(excluded)}건): {excluded}" + ) + + if not ids: + print("[abort] enqueue 대상 없음.") + return ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") @@ -740,6 +761,11 @@ def main() -> None: p_enq = sub.add_parser("enqueue", help="markdown 큐 enqueue") p_enq.add_argument("--in", dest="in_path", type=Path, default=DEFAULT_OUT) p_enq.add_argument("--yes", action="store_true") + p_enq.add_argument( + "--include-existing", + action="store_true", + help="existing_success (anchor + calibration) 도 재처리. default 는 controlled_backfill 만.", + ) p_rep = sub.add_parser("report", help="결과 집계") p_rep.add_argument("--in", dest="in_path", type=Path, default=DEFAULT_OUT) @@ -752,7 +778,7 @@ def main() -> None: if args.cmd == "select": asyncio.run(cmd_select(args.out, args.csv)) elif args.cmd == "enqueue": - asyncio.run(cmd_enqueue(args.in_path, args.yes)) + asyncio.run(cmd_enqueue(args.in_path, args.yes, args.include_existing)) elif args.cmd == "report": asyncio.run(cmd_report(args.in_path)) elif args.cmd == "eval_template":