feat: local AI server scaffolding (FastAPI, RAG, embeddings). Port policy (>=26000), README/API docs, scripts.
scripts/dev_server.sh (executable file, 13 lines added)
@@ -0,0 +1,13 @@
#!/usr/bin/env bash
set -euo pipefail

export OLLAMA_HOST=${OLLAMA_HOST:-http://localhost:11434}
export BASE_MODEL=${BASE_MODEL:-qwen2.5:7b-instruct}
export BOOST_MODEL=${BOOST_MODEL:-qwen2.5:14b-instruct}
export EMBEDDING_MODEL=${EMBEDDING_MODEL:-nomic-embed-text}
export INDEX_PATH=${INDEX_PATH:-data/index.jsonl}
# Port policy: local services bind to ports >= 26000 (see commit message).
export AI_SERVER_PORT=${AI_SERVER_PORT:-26000}

source .venv/bin/activate
exec uvicorn server.main:app --host 0.0.0.0 --port "$AI_SERVER_PORT" --reload
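Not part of the diff, but a quick way to check the scaffold end to end: FastAPI serves an auto-generated OpenAPI schema at /openapi.json by default, so a minimal smoke test can confirm the app came up on the policy port. This is only a sketch and assumes server/main.py keeps the default docs routes enabled.

#!/usr/bin/env python3
# Smoke-test sketch (not part of this commit): verify the FastAPI app is reachable.
import os

import requests

port = int(os.environ.get("AI_SERVER_PORT", "26000"))  # matches the dev_server.sh default
resp = requests.get(f"http://localhost:{port}/openapi.json", timeout=5)
resp.raise_for_status()
paths = sorted(resp.json().get("paths", {}))
print(f"server up on port {port}, routes: {paths}")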
scripts/embed_ollama.py (normal file, 85 lines added)
@@ -0,0 +1,85 @@
#!/usr/bin/env python3
import argparse
import json
from pathlib import Path
from typing import List, Dict, Any

import requests


def chunk_text(text: str, max_chars: int = 1200, overlap: int = 200) -> List[str]:
    # Split the text into overlapping character windows.
    chunks: List[str] = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + max_chars, n)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end == n:
            break
        start = max(0, end - overlap)
    return chunks


def embed_texts_ollama(texts: List[str], model: str = "nomic-embed-text", host: str = "http://localhost:11434") -> List[List[float]]:
    # One /api/embeddings call per chunk; Ollama returns a single vector per prompt.
    url = f"{host}/api/embeddings"
    vectors: List[List[float]] = []
    for t in texts:
        resp = requests.post(url, json={"model": model, "prompt": t}, timeout=120)
        resp.raise_for_status()
        data = resp.json()
        vectors.append(data["embedding"])  # type: ignore[index]
    return vectors


def main() -> None:
    parser = argparse.ArgumentParser(description="Build simple vector index using Ollama embeddings")
    parser.add_argument("--text", default=None, help="Path to extracted .txt; default = first in data/")
    parser.add_argument("--model", default="nomic-embed-text", help="Ollama embedding model name")
    parser.add_argument("--host", default="http://localhost:11434", help="Ollama host")
    parser.add_argument("--out", default="data/index.jsonl", help="Output JSONL path")
    parser.add_argument("--max-chars", type=int, default=1200, help="Max characters per chunk")
    parser.add_argument("--overlap", type=int, default=200, help="Characters overlap between chunks")
    args = parser.parse_args()

    data_dir = Path("data")
    if args.text:
        text_path = Path(args.text)
    else:
        txts = sorted(data_dir.glob("*.txt"))
        if not txts:
            raise SystemExit("No data/*.txt found. Extract the PDF first with scripts/pdf_stats.py.")
        text_path = txts[0]

    text = text_path.read_text(encoding="utf-8")
    chunks = chunk_text(text, max_chars=args.max_chars, overlap=args.overlap)

    vectors = embed_texts_ollama(chunks, model=args.model, host=args.host)

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        for i, (chunk, vec) in enumerate(zip(chunks, vectors)):
            row: Dict[str, Any] = {
                "id": f"{text_path.stem}:{i}",
                "text": chunk,
                "vector": vec,
                "source": text_path.name,
            }
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    meta = {
        "source_text": str(text_path),
        "embedding_model": args.model,
        "host": args.host,
        "chunks": len(chunks),
        "index_path": str(out_path),
    }
    print(json.dumps(meta, ensure_ascii=False))


if __name__ == "__main__":
    main()
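Each index row written above is one JSON object carrying id, text, vector, and source. The commit does not show the retrieval side, so the following is only a sketch of how a RAG lookup could consume data/index.jsonl: embed the query through the same Ollama /api/embeddings endpoint and rank chunks by cosine similarity. The names retrieve_top_k and cosine are illustrative, not from the repo.

# Retrieval sketch (assumed consumer of data/index.jsonl, not code from this commit).
import json
import math
from pathlib import Path
from typing import Any, Dict, List

import requests


def cosine(a: List[float], b: List[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(y * y for y in b))
    return dot / (na * nb) if na and nb else 0.0


def retrieve_top_k(query: str, index_path: str = "data/index.jsonl", k: int = 3,
                   model: str = "nomic-embed-text", host: str = "http://localhost:11434") -> List[Dict[str, Any]]:
    # Embed the query with the same endpoint the indexer used, then rank stored chunks.
    resp = requests.post(f"{host}/api/embeddings", json={"model": model, "prompt": query}, timeout=120)
    resp.raise_for_status()
    qvec = resp.json()["embedding"]
    rows = [json.loads(line) for line in Path(index_path).read_text(encoding="utf-8").splitlines() if line.strip()]
    rows.sort(key=lambda r: cosine(qvec, r["vector"]), reverse=True)
    return rows[:k]


if __name__ == "__main__":
    for row in retrieve_top_k("example question about the source document"):
        print(row["id"], row["text"][:80])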
scripts/install_server.sh (executable file, 15 lines added)
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -euo pipefail

VENV_DIR=".venv"

if [ ! -d "$VENV_DIR" ]; then
  python3 -m venv "$VENV_DIR"
fi

source "$VENV_DIR/bin/activate"
python -m pip install --upgrade pip
pip install -r requirements.txt

echo "[ok] server deps installed in $VENV_DIR"
scripts/pdf_stats.py (normal file, 99 lines added)
@@ -0,0 +1,99 @@
#!/usr/bin/env python3
import argparse
import json
import os
import re
from pathlib import Path


def detect_hangul_ratio(text: str) -> float:
    # Share of Hangul characters (jamo and syllables) in the text.
    han = len(re.findall(r"[\u3131-\u318E\uAC00-\uD7A3]", text))
    total = max(len(text), 1)
    return han / total


def ensure_dir(path: Path) -> None:
    if not path.exists():
        path.mkdir(parents=True, exist_ok=True)


def main() -> None:
    parser = argparse.ArgumentParser(description="Extract full text from PDF and estimate token count")
    parser.add_argument("pdf", nargs="?", help="Path to PDF; if omitted, first PDF in repo root is used")
    parser.add_argument("--outdir", default="data", help="Output directory for extracted text")
    args = parser.parse_args()

    repo_root = Path(os.getcwd())
    if args.pdf:
        pdf_path = Path(args.pdf)
    else:
        # pick the first PDF in repo root
        cands = sorted(repo_root.glob("*.pdf"))
        if not cands:
            print("{}")
            return
        pdf_path = cands[0]

    # Lazy import with helpful error if missing
    try:
        from pypdf import PdfReader
    except Exception as e:
        raise SystemExit(
            "pypdf is not installed. Create the virtual environment, then run 'pip install pypdf tiktoken'."
        ) from e

    # Tokenizer
    try:
        import tiktoken
        enc = tiktoken.get_encoding("cl100k_base")
        def count_tokens(s: str) -> int:
            return len(enc.encode(s))
        tokenizer = "tiktoken(cl100k_base)"
    except Exception:
        def count_tokens(s: str) -> int:
            # fallback heuristic
            return int(len(s) / 3.3)
        tokenizer = "heuristic_div_3.3"

    reader = PdfReader(str(pdf_path))
    num_pages = len(reader.pages)

    # Full extraction
    all_text_parts = []
    for i in range(num_pages):
        try:
            page_text = reader.pages[i].extract_text() or ""
        except Exception:
            page_text = ""
        all_text_parts.append(page_text)
    full_text = "\n\n".join(all_text_parts).strip()

    # Stats
    chars = len(full_text)
    tokens = count_tokens(full_text)
    hangul_ratio = detect_hangul_ratio(full_text)
    size_bytes = pdf_path.stat().st_size

    # Save text
    outdir = Path(args.outdir)
    ensure_dir(outdir)
    txt_name = pdf_path.stem + ".txt"
    out_txt = outdir / txt_name
    out_txt.write_text(full_text, encoding="utf-8")

    result = {
        "pdf": str(pdf_path),
        "pages": num_pages,
        "size_bytes": size_bytes,
        "chars": chars,
        "tokens": tokens,
        "hangul_ratio": round(hangul_ratio, 4),
        "tokenizer": tokenizer,
        "text_path": str(out_txt),
    }
    print(json.dumps(result, ensure_ascii=False))


if __name__ == "__main__":
    main()
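pdf_stats.py writes the extracted text to data/<pdf-stem>.txt and prints its stats as a single JSON object on stdout, which makes the extract-then-index workflow easy to script. A small orchestration sketch follows (assumed usage, not part of this commit; it expects a PDF in the repo root so the stats JSON is non-empty):

# Orchestration sketch (assumed workflow, not part of this commit): extract, then index.
import json
import subprocess
import sys

stats_json = subprocess.run(
    [sys.executable, "scripts/pdf_stats.py", "--outdir", "data"],
    check=True, capture_output=True, text=True,
).stdout
stats = json.loads(stats_json)  # prints "{}" if no PDF was found in the repo root
print(f"{stats['pages']} pages, ~{stats['tokens']} tokens, hangul_ratio={stats['hangul_ratio']}")

subprocess.run(
    [sys.executable, "scripts/embed_ollama.py", "--text", stats["text_path"], "--out", "data/index.jsonl"],
    check=True,
)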
scripts/venv_setup.sh (executable file, 16 lines added)
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -euo pipefail

VENV_DIR=".venv"

if [ ! -d "$VENV_DIR" ]; then
  python3 -m venv "$VENV_DIR"
fi

source "$VENV_DIR/bin/activate"

python -m pip install --upgrade pip
pip install pypdf tiktoken

echo "[ok] venv ready at $VENV_DIR"