#!/usr/bin/env python3
"""Build a simple JSONL vector index from an extracted text file using Ollama embeddings.

Reads a .txt file (explicit --text path, or the first data/*.txt), splits it
into overlapping character chunks, embeds each chunk via a local Ollama
server, and writes one JSON object per chunk to a JSONL index file.
"""
import argparse
import json
import os  # kept for compatibility; not used directly in this module
from pathlib import Path
from typing import Any, Dict, List


def chunk_text(text: str, max_chars: int = 1200, overlap: int = 200) -> List[str]:
    """Split *text* into chunks of at most *max_chars* characters.

    Consecutive chunks overlap by *overlap* characters so that context is not
    lost at chunk boundaries. Whitespace-only chunks are dropped.

    Fix: the original computed ``start = max(0, end - overlap)``, which makes
    no forward progress when ``overlap >= max_chars`` (infinite loop). We now
    fall through to ``end`` whenever stepping back would not advance.
    """
    chunks: List[str] = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + max_chars, n)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end == n:
            break
        # Step back by `overlap`, but never stand still or move backwards:
        # guarantees termination even for pathological overlap values.
        next_start = end - overlap
        start = next_start if next_start > start else end
    return chunks


def embed_texts_ollama(
    texts: List[str],
    model: str = "nomic-embed-text",
    host: str = "http://localhost:11434",
) -> List[List[float]]:
    """Embed each string in *texts* via the Ollama ``/api/embeddings`` endpoint.

    Returns one embedding vector per input text, in order.
    Raises ``requests.HTTPError`` on any non-2xx response.
    """
    # Lazy import: lets this module (and chunk_text) be imported and used
    # without the optional `requests` dependency installed.
    import requests

    url = f"{host}/api/embeddings"
    vectors: List[List[float]] = []
    for t in texts:
        resp = requests.post(url, json={"model": model, "prompt": t}, timeout=120)
        resp.raise_for_status()
        data = resp.json()
        vectors.append(data["embedding"])  # type: ignore[index]
    return vectors


def main() -> None:
    """CLI entry point: chunk, embed, and write the JSONL index."""
    parser = argparse.ArgumentParser(description="Build simple vector index using Ollama embeddings")
    parser.add_argument("--text", default=None, help="Path to extracted .txt; default = first in data/")
    parser.add_argument("--model", default="nomic-embed-text", help="Ollama embedding model name")
    parser.add_argument("--host", default="http://localhost:11434", help="Ollama host")
    parser.add_argument("--out", default="data/index.jsonl", help="Output JSONL path")
    parser.add_argument("--max-chars", type=int, default=1200, help="Max characters per chunk")
    parser.add_argument("--overlap", type=int, default=200, help="Characters overlap between chunks")
    args = parser.parse_args()

    data_dir = Path("data")
    if args.text:
        text_path = Path(args.text)
    else:
        txts = sorted(data_dir.glob("*.txt"))
        if not txts:
            # Korean: "data/*.txt not found. Extract the PDF with scripts/pdf_stats.py first."
            # (Original literal was broken across a line; reassembled into one string.)
            raise SystemExit("data/*.txt가 없습니다. 먼저 scripts/pdf_stats.py로 PDF를 추출하세요.")
        text_path = txts[0]

    text = text_path.read_text(encoding="utf-8")
    chunks = chunk_text(text, max_chars=args.max_chars, overlap=args.overlap)
    vectors = embed_texts_ollama(chunks, model=args.model, host=args.host)

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        for i, (chunk, vec) in enumerate(zip(chunks, vectors)):
            row: Dict[str, Any] = {
                "id": f"{text_path.stem}:{i}",
                "text": chunk,
                "vector": vec,
                "source": text_path.name,
            }
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    # Summary metadata printed to stdout so callers can capture it.
    meta = {
        "source_text": str(text_path),
        "embedding_model": args.model,
        "host": args.host,
        "chunks": len(chunks),
        "index_path": str(out_path),
    }
    print(json.dumps(meta, ensure_ascii=False))


if __name__ == "__main__":
    main()