100 lines
2.7 KiB
Python
100 lines
2.7 KiB
Python
#!/usr/bin/env python3
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
|
|
|
|
def detect_hangul_ratio(text: str) -> float:
    """Return the fraction of characters in *text* that are Hangul.

    Counts Hangul compatibility jamo (U+3131–U+318E) and precomposed
    syllables (U+AC00–U+D7A3), then divides by the character count,
    clamped to at least 1 so an empty string yields 0.0 rather than
    raising ZeroDivisionError.
    """
    hangul_chars = re.findall(r"[\u3131-\u318E\uAC00-\uD7A3]", text)
    return len(hangul_chars) / max(len(text), 1)
|
|
|
|
|
|
def ensure_dir(path: Path) -> None:
    """Create *path* (including missing parents) if it does not exist.

    ``mkdir(parents=True, exist_ok=True)`` already tolerates an existing
    directory, so the previous ``path.exists()`` pre-check was redundant
    and race-prone (TOCTOU: the directory could appear or vanish between
    the check and the mkdir). Calling mkdir unconditionally is simpler
    and safe.
    """
    path.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def main() -> None:
    """Extract the full text of a PDF, save it under --outdir, and print stats.

    Emits a single JSON object on stdout containing the page count,
    character/token counts, Hangul ratio, tokenizer used, and the path of
    the saved text file. Prints ``{}`` and returns quietly when no PDF
    argument is given and no ``*.pdf`` exists in the current directory.

    Fixes vs. previous version: the pypdf import failure now chains the
    original exception (``raise ... from e``) instead of discarding it, and
    the page loop iterates ``reader.pages`` directly rather than indexing
    by ``range(num_pages)``.
    """
    parser = argparse.ArgumentParser(description="Extract full text from PDF and estimate token count")
    parser.add_argument("pdf", nargs="?", help="Path to PDF; if omitted, first PDF in repo root is used")
    parser.add_argument("--outdir", default="data", help="Output directory for extracted text")
    args = parser.parse_args()

    repo_root = Path(os.getcwd())
    if args.pdf:
        pdf_path = Path(args.pdf)
    else:
        # Pick the first PDF in the repo root; sorted() keeps the choice
        # deterministic across runs.
        cands = sorted(repo_root.glob("*.pdf"))
        if not cands:
            # No PDF available: emit an empty JSON object so callers that
            # parse stdout still receive valid JSON.
            print("{}")
            return
        pdf_path = cands[0]

    # Lazy import with a helpful install hint (Korean, user-facing) if
    # pypdf is missing; chain the cause for debuggability.
    try:
        from pypdf import PdfReader
    except Exception as e:
        raise SystemExit(
            "pypdf가 설치되어 있지 않습니다. 가상환경 생성 후 'pip install pypdf tiktoken'을 실행하세요."
        ) from e

    # Tokenizer: prefer tiktoken's cl100k_base; fall back to a chars/3.3
    # heuristic when tiktoken is unavailable.
    try:
        import tiktoken

        enc = tiktoken.get_encoding("cl100k_base")

        def count_tokens(s: str) -> int:
            return len(enc.encode(s))

        tokenizer = "tiktoken(cl100k_base)"
    except Exception:

        def count_tokens(s: str) -> int:
            # Fallback heuristic: roughly 3.3 characters per token.
            return int(len(s) / 3.3)

        tokenizer = "heuristic_div_3.3"

    reader = PdfReader(str(pdf_path))
    num_pages = len(reader.pages)

    # Full extraction. A page whose extraction fails contributes empty
    # text instead of aborting the whole run.
    all_text_parts = []
    for page in reader.pages:
        try:
            page_text = page.extract_text() or ""
        except Exception:
            page_text = ""
        all_text_parts.append(page_text)
    full_text = "\n\n".join(all_text_parts).strip()

    # Stats over the joined text.
    chars = len(full_text)
    tokens = count_tokens(full_text)
    hangul_ratio = detect_hangul_ratio(full_text)
    size_bytes = pdf_path.stat().st_size

    # Save the extracted text as <outdir>/<pdf stem>.txt.
    outdir = Path(args.outdir)
    ensure_dir(outdir)
    txt_name = pdf_path.stem + ".txt"
    out_txt = outdir / txt_name
    out_txt.write_text(full_text, encoding="utf-8")

    result = {
        "pdf": str(pdf_path),
        "pages": num_pages,
        "size_bytes": size_bytes,
        "chars": chars,
        "tokens": tokens,
        "hangul_ratio": round(hangul_ratio, 4),
        "tokenizer": tokenizer,
        "text_path": str(out_txt),
    }
    print(json.dumps(result, ensure_ascii=False))
|
|
|
|
|
|
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
|
|