100 lines
2.7 KiB
Python
100 lines
2.7 KiB
Python
#!/usr/bin/env python3
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
|
|
|
|
def detect_hangul_ratio(text: str) -> float:
    """Return the fraction of characters in *text* that are Hangul.

    Counts Hangul compatibility jamo (U+3131–U+318E) and precomposed
    syllables (U+AC00–U+D7A3), then divides by the character count,
    clamped to at least 1 so an empty string yields 0.0 rather than
    raising ZeroDivisionError.
    """
    hangul_chars = re.findall(r"[\u3131-\u318E\uAC00-\uD7A3]", text)
    return len(hangul_chars) / max(len(text), 1)
|
|
|
|
|
|
def ensure_dir(path: Path) -> None:
    """Create *path* (including missing parents) if it does not exist.

    ``mkdir(parents=True, exist_ok=True)`` already tolerates an existing
    directory, so the previous ``path.exists()`` pre-check was redundant
    and race-prone (TOCTOU: the directory could appear or vanish between
    the check and the mkdir). Calling mkdir unconditionally is simpler
    and safe.
    """
    path.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def main() -> None:
    """Extract the full text of a PDF, save it under --outdir, and print stats.

    Emits a single JSON object on stdout containing the page count,
    character/token counts, Hangul ratio, tokenizer used, and the path of
    the saved text file. Prints ``{}`` and returns quietly when no PDF
    argument is given and no ``*.pdf`` exists in the current directory.

    Fixes vs. previous version: the pypdf import failure now chains the
    original exception (``raise ... from e``) instead of discarding it, and
    the page loop iterates ``reader.pages`` directly rather than indexing
    by ``range(num_pages)``.
    """
    parser = argparse.ArgumentParser(description="Extract full text from PDF and estimate token count")
    parser.add_argument("pdf", nargs="?", help="Path to PDF; if omitted, first PDF in repo root is used")
    parser.add_argument("--outdir", default="data", help="Output directory for extracted text")
    args = parser.parse_args()

    repo_root = Path(os.getcwd())
    if args.pdf:
        pdf_path = Path(args.pdf)
    else:
        # Pick the first PDF in the repo root; sorted() keeps the choice
        # deterministic across runs.
        cands = sorted(repo_root.glob("*.pdf"))
        if not cands:
            # No PDF available: emit an empty JSON object so callers that
            # parse stdout still receive valid JSON.
            print("{}")
            return
        pdf_path = cands[0]

    # Lazy import with a helpful install hint (Korean, user-facing) if
    # pypdf is missing; chain the cause for debuggability.
    try:
        from pypdf import PdfReader
    except Exception as e:
        raise SystemExit(
            "pypdf가 설치되어 있지 않습니다. 가상환경 생성 후 'pip install pypdf tiktoken'을 실행하세요."
        ) from e

    # Tokenizer: prefer tiktoken's cl100k_base; fall back to a chars/3.3
    # heuristic when tiktoken is unavailable.
    try:
        import tiktoken

        enc = tiktoken.get_encoding("cl100k_base")

        def count_tokens(s: str) -> int:
            return len(enc.encode(s))

        tokenizer = "tiktoken(cl100k_base)"
    except Exception:

        def count_tokens(s: str) -> int:
            # Fallback heuristic: roughly 3.3 characters per token.
            return int(len(s) / 3.3)

        tokenizer = "heuristic_div_3.3"

    reader = PdfReader(str(pdf_path))
    num_pages = len(reader.pages)

    # Full extraction. A page whose extraction fails contributes empty
    # text instead of aborting the whole run.
    all_text_parts = []
    for page in reader.pages:
        try:
            page_text = page.extract_text() or ""
        except Exception:
            page_text = ""
        all_text_parts.append(page_text)
    full_text = "\n\n".join(all_text_parts).strip()

    # Stats over the joined text.
    chars = len(full_text)
    tokens = count_tokens(full_text)
    hangul_ratio = detect_hangul_ratio(full_text)
    size_bytes = pdf_path.stat().st_size

    # Save the extracted text as <outdir>/<pdf stem>.txt.
    outdir = Path(args.outdir)
    ensure_dir(outdir)
    txt_name = pdf_path.stem + ".txt"
    out_txt = outdir / txt_name
    out_txt.write_text(full_text, encoding="utf-8")

    result = {
        "pdf": str(pdf_path),
        "pages": num_pages,
        "size_bytes": size_bytes,
        "chars": chars,
        "tokens": tokens,
        "hangul_ratio": round(hangul_ratio, 4),
        "tokenizer": tokenizer,
        "text_path": str(out_txt),
    }
    print(json.dumps(result, ensure_ascii=False))
|
|
|
|
|
|
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
|
|