"""HWP(library) 백필 — 지정 PKM 폴더의 .hwp 를 content-hash dedup 후 일회성 ingest.

산업안전기사 등 외부 학습자료(category='library')를 코퍼스에 편입한다. file_watcher 의 PKM
트랙 로직을 재사용하되 dedup 을 file_path 가 아닌 **file_hash** 기준으로 해서 (a) Inbox 중복
(b) `_1`/`카피본` 사본을 1건으로 수렴시킨다(file_watcher 는 path dedup 이라 동일내용 다른경로를
중복 ingest 함).

이후 파이프라인: extract(텍스트) → classify → embed/chunk(검색) → markdown(.hwp=pyhwp
hwp5html + raster NAS 영속)

실행 (GPU 서버):
    # dry-run (기본) — 무엇이 ingest/skip 될지만 출력
    docker exec hyungi_document_server-fastapi-1 \
        python /app/scripts/backfill_hwp_library.py --subdir Knowledge/Engineering
    # 실제 ingest
    docker exec hyungi_document_server-fastapi-1 \
        python /app/scripts/backfill_hwp_library.py --subdir Knowledge/Engineering --commit
"""
# NOTE: the module docstring above is passed to argparse as the --help
# description (see main()), so its wording is runtime output and is left as-is.
# English summary: one-off backfill that ingests the .hwp files found under a
# PKM sub-folder as category='library' documents, de-duplicating on file
# content hash instead of file path so copies stored under different paths
# collapse into a single document. Default mode is a dry run; pass --commit to
# actually write.

import argparse
import asyncio
import sys
from pathlib import Path

from sqlalchemy import select

from core.config import settings
from core.database import async_session
from core.utils import file_hash
from models.document import Document
from models.queue import enqueue_stage


def _collect_hwp_files(root: Path) -> list[Path]:
    """Return every regular file under *root* whose suffix is ``.hwp``.

    The suffix comparison is case-insensitive and the result is sorted by path.
    """
    found = [
        entry
        for entry in root.rglob("*")
        if entry.is_file() and entry.suffix.lower() == ".hwp"
    ]
    found.sort()
    return found


def _build_library_document(source: Path, rel_path: str, digest: str) -> Document:
    """Create the (not yet persisted) ``Document`` row for one .hwp file.

    Args:
        source: Path of the file on disk; used for its size and stem.
        rel_path: Path of the file relative to the NAS mount root.
        digest: Content hash of the file.
    """
    return Document(
        file_path=rel_path,
        file_hash=digest,
        file_format="hwp",
        file_size=source.stat().st_size,
        file_type="immutable",
        title=source.stem,
        source_channel="drive_sync",
        category="library",
        needs_conversion=False,
    )


async def run(subdir: str, commit: bool) -> int:
    """Scan ``<nas_mount_path>/PKM/<subdir>`` for .hwp files and ingest new ones.

    A file is skipped when its content hash was already seen earlier in this
    run, or when a ``Document`` with the same ``file_hash`` already exists.
    Every remaining file is counted as an ingest; only when *commit* is true is
    a ``Document`` actually added and its "extract" stage enqueued. All writes
    share one session and are committed once, after the whole scan.

    Args:
        subdir: Folder below ``PKM`` to scan recursively.
        commit: ``True`` to write to the database, ``False`` for a dry run
            that only prints what would happen.

    Returns:
        ``2`` if the scan folder does not exist, otherwise ``0``.
    """
    mount = Path(settings.nas_mount_path)
    target_dir = mount / "PKM" / subdir
    if not target_dir.exists():
        print(f"[backfill] scan_root 부재: {target_dir}", file=sys.stderr)
        return 2

    candidates = _collect_hwp_files(target_dir)
    mode_label = "COMMIT" if commit else "DRY-RUN"
    print(f"[backfill] {target_dir} 하위 .hwp {len(candidates)}개 발견 / mode={mode_label}")

    n_ingest = 0
    n_existing = 0
    n_batchdup = 0
    hashes_this_run: set[str] = set()

    async with async_session() as session:
        for hwp in candidates:
            rel = str(hwp.relative_to(mount))
            digest = file_hash(hwp)

            # Same content already handled earlier in this very scan.
            if digest in hashes_this_run:
                print(f" SKIP(batch-dup) {rel}")
                n_batchdup += 1
                continue
            hashes_this_run.add(digest)

            # Content-hash dedup (path-independent): absorbs Inbox duplicates
            # and `_1` / "copy" variants of a file that is already stored.
            lookup = select(Document.id).where(Document.file_hash == digest).limit(1)
            hit = (await session.execute(lookup)).first()
            if hit:
                print(f" SKIP(exists id={hit[0]}) {rel}")
                n_existing += 1
                continue

            # Counted in both modes, so a dry run reports what a commit would do.
            n_ingest += 1
            if commit:
                doc = _build_library_document(hwp, rel, digest)
                session.add(doc)
                # Flush before enqueueing — presumably so doc.id is assigned;
                # confirm against the Document model's key generation.
                await session.flush()
                await enqueue_stage(session, doc.id, "extract")
                print(f" INGEST id={doc.id} {rel}")
            else:
                print(f" INGEST(dry) {rel}")

        if commit:
            await session.commit()

    print(
        f"[backfill] done — ingest={n_ingest} "
        f"skip_existing={n_existing} skip_batchdup={n_batchdup}"
    )
    return 0


def main() -> int:
    """Parse ``--subdir`` / ``--commit`` and run the backfill.

    Returns:
        The exit code produced by :func:`run`.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--subdir",
        default="Knowledge/Engineering",
        help="PKM 하위 스캔 폴더",
    )
    parser.add_argument(
        "--commit",
        action="store_true",
        help="실제 ingest (없으면 dry-run)",
    )
    opts = parser.parse_args()
    return asyncio.run(run(opts.subdir, opts.commit))


if __name__ == "__main__":
    sys.exit(main())