#!/usr/bin/env python3
"""Extract the full text of a PDF and report size/token statistics as JSON.

The extracted text is written to ``<outdir>/<pdf stem>.txt`` and a single JSON
object describing the run is printed to stdout. Diagnostics go to stderr so
that stdout stays machine-readable.
"""
import argparse
import json
import os
import re
import sys
from pathlib import Path

# Hangul Compatibility Jamo (U+3131-U+318E) and Hangul Syllables (U+AC00-U+D7A3).
# Compiled once so repeated calls do not rebuild the pattern.
_HANGUL_RE = re.compile(r"[\u3131-\u318E\uAC00-\uD7A3]")


def detect_hangul_ratio(text: str) -> float:
    """Return the fraction of characters in *text* that are Hangul.

    The denominator is ``len(text)`` (all characters, including whitespace),
    clamped to at least 1 so that an empty string yields ``0.0`` instead of
    raising ``ZeroDivisionError``.
    """
    hangul_count = sum(1 for _ in _HANGUL_RE.finditer(text))
    return hangul_count / max(len(text), 1)


def ensure_dir(path: Path) -> None:
    """Create *path* (and any missing parents) if it does not already exist.

    Raises:
        FileExistsError: if *path* exists but is not a directory.
    """
    # ``exist_ok=True`` already makes this a no-op for an existing directory,
    # so no separate (race-prone) ``exists()`` check is needed. It also fails
    # up front when the path is a regular file, instead of at write time.
    path.mkdir(parents=True, exist_ok=True)


def _load_token_counter():
    """Return ``(count_tokens, tokenizer_name)``.

    Uses tiktoken's ``cl100k_base`` encoding when it can be loaded; otherwise
    falls back to a rough ``len(text) / 3.3`` heuristic.
    """
    try:
        import tiktoken

        enc = tiktoken.get_encoding("cl100k_base")
    except Exception:
        # Deliberately broad: the fallback should apply whether tiktoken is
        # missing or loading the encoding fails for any other reason.
        def count_tokens(s: str) -> int:
            return int(len(s) / 3.3)

        return count_tokens, "heuristic_div_3.3"

    def count_tokens(s: str) -> int:
        return len(enc.encode(s))

    return count_tokens, "tiktoken(cl100k_base)"


def _extract_full_text(reader, num_pages: int) -> str:
    """Return the text of the first *num_pages* pages joined by blank lines.

    *reader* must expose an indexable ``pages`` attribute whose items provide
    ``extract_text()``. Extraction is best-effort: a page that fails
    contributes an empty string and a warning is written to stderr.
    """
    parts = []
    for index in range(num_pages):
        # Indexing stays inside the ``try`` so that a failure while fetching
        # the page object is tolerated the same way as a failure in
        # ``extract_text()`` itself.
        try:
            page_text = reader.pages[index].extract_text() or ""
        except Exception as exc:
            print(
                f"warning: could not extract text from page {index + 1}: {exc}",
                file=sys.stderr,
            )
            page_text = ""
        parts.append(page_text)
    return "\n\n".join(parts).strip()


def main() -> None:
    """Command-line entry point.

    Parses arguments, extracts the PDF text, writes it to the output
    directory and prints a JSON summary. When no PDF argument is given and the
    current working directory contains no ``*.pdf`` file, prints ``{}`` and
    returns.

    Raises:
        SystemExit: if ``pypdf`` cannot be imported.
    """
    parser = argparse.ArgumentParser(
        description="Extract full text from PDF and estimate token count"
    )
    parser.add_argument(
        "pdf",
        nargs="?",
        help="Path to PDF; if omitted, first PDF in repo root is used",
    )
    parser.add_argument(
        "--outdir", default="data", help="Output directory for extracted text"
    )
    args = parser.parse_args()

    if args.pdf:
        pdf_path = Path(args.pdf)
    else:
        # "Repo root" is taken to be the current working directory; pick the
        # alphabetically first PDF found there.
        candidates = sorted(Path(os.getcwd()).glob("*.pdf"))
        if not candidates:
            print("{}")
            return
        pdf_path = candidates[0]

    # Imported lazily so a missing dependency yields an actionable message
    # rather than failing at module import time.
    try:
        from pypdf import PdfReader
    except ImportError as exc:
        raise SystemExit(
            "pypdf가 설치되어 있지 않습니다. 가상환경 생성 후 'pip install pypdf tiktoken'을 실행하세요."
        ) from exc

    count_tokens, tokenizer = _load_token_counter()

    reader = PdfReader(str(pdf_path))
    num_pages = len(reader.pages)
    full_text = _extract_full_text(reader, num_pages)

    # Persist the extracted text next to the statistics it describes.
    outdir = Path(args.outdir)
    ensure_dir(outdir)
    out_txt = outdir / (pdf_path.stem + ".txt")
    out_txt.write_text(full_text, encoding="utf-8")

    result = {
        "pdf": str(pdf_path),
        "pages": num_pages,
        "size_bytes": pdf_path.stat().st_size,
        "chars": len(full_text),
        "tokens": count_tokens(full_text),
        "hangul_ratio": round(detect_hangul_ratio(full_text), 4),
        "tokenizer": tokenizer,
        "text_path": str(out_txt),
    }
    print(json.dumps(result, ensure_ascii=False))


if __name__ == "__main__":
    main()