#!/usr/bin/env python3

import argparse
import json
import os
from pathlib import Path
from typing import List, Dict, Any

import requests
def chunk_text(text: str, max_chars: int = 1200, overlap: int = 200) -> List[str]:
    """Split *text* into overlapping chunks of at most *max_chars* characters.

    Each chunk is stripped of surrounding whitespace and empty chunks are
    dropped. Consecutive chunks share up to *overlap* characters so content
    cut at a boundary still appears whole in at least one chunk.

    Args:
        text: Full document text to split.
        max_chars: Maximum characters per chunk; must be positive.
        overlap: Characters shared between consecutive chunks; must be >= 0.

    Returns:
        Non-empty, stripped chunk strings in document order.

    Raises:
        ValueError: If max_chars <= 0 or overlap < 0.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be positive")
    if overlap < 0:
        raise ValueError("overlap must be non-negative")

    chunks: List[str] = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + max_chars, n)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end == n:
            break
        # Bug fix: the original `start = max(0, end - overlap)` never advances
        # when overlap >= max_chars, so the loop ran forever. Always make
        # forward progress; fall back to a non-overlapping step if needed.
        next_start = end - overlap
        start = next_start if next_start > start else end
    return chunks
def embed_texts_ollama(texts: List[str], model: str = "nomic-embed-text", host: str = "http://localhost:11434") -> List[List[float]]:
    """Embed each string in *texts* via a local Ollama server.

    Calls the /api/embeddings endpoint once per text (the endpoint accepts a
    single prompt per request). One Session is reused for all requests so the
    underlying HTTP connection is kept alive instead of reconnecting per text.

    Args:
        texts: Strings to embed; an empty list returns [] without any request.
        model: Ollama embedding model name.
        host: Base URL of the Ollama server; a trailing slash is tolerated.

    Returns:
        One embedding vector (list of floats) per input text, in order.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    vectors: List[List[float]] = []
    if not texts:
        return vectors
    # rstrip avoids "http://host//api/embeddings" when host ends with "/".
    url = f"{host.rstrip('/')}/api/embeddings"
    with requests.Session() as session:
        for t in texts:
            resp = session.post(url, json={"model": model, "prompt": t}, timeout=120)
            resp.raise_for_status()
            data = resp.json()
            vectors.append(data["embedding"])  # type: ignore[index]
    return vectors
def main() -> None:
    """CLI entry point: chunk a text file, embed the chunks, write JSONL.

    Resolves the input text (--text, or the first data/*.txt), splits it into
    overlapping chunks, embeds each chunk through an Ollama server, writes one
    JSON object per chunk to the output JSONL file, and prints a one-line JSON
    metadata summary to stdout.
    """
    parser = argparse.ArgumentParser(description="Build simple vector index using Ollama embeddings")
    parser.add_argument("--text", default=None, help="Path to extracted .txt; default = first in data/")
    parser.add_argument("--model", default="nomic-embed-text", help="Ollama embedding model name")
    parser.add_argument("--host", default="http://localhost:11434", help="Ollama host")
    parser.add_argument("--out", default="data/index.jsonl", help="Output JSONL path")
    parser.add_argument("--max-chars", type=int, default=1200, help="Max characters per chunk")
    parser.add_argument("--overlap", type=int, default=200, help="Characters overlap between chunks")
    opts = parser.parse_args()

    # Explicit --text wins; otherwise take the alphabetically first .txt in data/.
    if opts.text:
        text_path = Path(opts.text)
    else:
        candidates = sorted(Path("data").glob("*.txt"))
        if not candidates:
            raise SystemExit("data/*.txt가 없습니다. 먼저 scripts/pdf_stats.py로 PDF를 추출하세요.")
        text_path = candidates[0]

    chunks = chunk_text(
        text_path.read_text(encoding="utf-8"),
        max_chars=opts.max_chars,
        overlap=opts.overlap,
    )
    vectors = embed_texts_ollama(chunks, model=opts.model, host=opts.host)

    out_path = Path(opts.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        for idx, (chunk, vec) in enumerate(zip(chunks, vectors)):
            record: Dict[str, Any] = {
                "id": f"{text_path.stem}:{idx}",
                "text": chunk,
                "vector": vec,
                "source": text_path.name,
            }
            # ensure_ascii=False keeps non-ASCII (e.g. Korean) text readable.
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    summary = {
        "source_text": str(text_path),
        "embedding_model": opts.model,
        "host": opts.host,
        "chunks": len(chunks),
        "index_path": str(out_path),
    }
    print(json.dumps(summary, ensure_ascii=False))
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()