diff --git a/README.md b/README.md
index 5c5ef24..9b85c23 100644
--- a/README.md
+++ b/README.md
@@ -213,6 +213,22 @@ curl -s -X POST http://localhost:26000/paperless/hook \
 해당 훅은 문서 도착을 통지받는 용도로 제공됩니다. 실제 본문 텍스트는 Paperless API로 조회해 `/index/upsert`로 추가하세요.
 
+### 문서 파이프라인(`/pipeline/ingest`)
+
+첨부 문서(텍스트가 준비된 상태: OCR/추출 선행) → 벡터 임베딩 → 한국어 번역 → HTML 생성까지 한 번에 처리합니다.
+
+```bash
+curl -s -X POST http://localhost:26000/pipeline/ingest \
+  -H 'Content-Type: application/json' -H 'X-API-Key: <키>' \
+  -d '{
+    "doc_id": "doc-2025-08-13-001",
+    "text": "(여기에 문서 텍스트)",
+    "generate_html": true
+  }'
+```
+
+응답에 `html_path`가 포함됩니다. 한국어 번역본이 `outputs/html/<doc_id>.html`로 생성되고, 번역문은 인덱스에 추가되어 RAG로 검색됩니다.
+
 ### Paperless 배치 동기화(`/paperless/sync`)
 
 Paperless에서 다수 문서를 일괄 인덱싱합니다.
 
diff --git a/server/main.py b/server/main.py
index 0e6e649..3776fb5 100644
--- a/server/main.py
+++ b/server/main.py
@@ -11,6 +11,7 @@ from .index_store import JsonlIndex
 from .security import require_api_key
 from .paperless_client import PaperlessClient
 from .utils import chunk_text
+from .pipeline import DocumentPipeline
 
 app = FastAPI(title="Local AI Server", version="0.2.1")
 
@@ -28,6 +29,7 @@ app.add_middleware(
 )
 ollama = OllamaClient(settings.ollama_host)
 index = JsonlIndex(settings.index_path)
+pipeline = DocumentPipeline(ollama, settings.embedding_model, settings.boost_model)
 
 
 class ChatRequest(BaseModel):
@@ -55,6 +57,14 @@ class UpsertRequest(BaseModel):
     batch: int = 16
 
 
+class PipelineIngestRequest(BaseModel):
+    """Request body for POST /pipeline/ingest."""
+
+    doc_id: str
+    text: str
+    generate_html: bool = True
+
+
 @app.get("/health")
 def health() -> Dict[str, Any]:
     return {
@@ -152,6 +162,13 @@ def index_reload() -> Dict[str, Any]:
     return {"total": total}
 
 
+@app.post("/pipeline/ingest")
+def pipeline_ingest(req: PipelineIngestRequest, _: None = Depends(require_api_key)) -> Dict[str, Any]:
+    """Run the document pipeline on the posted text and report what was produced."""
+    result = pipeline.process(doc_id=req.doc_id, text=req.text, index=index, generate_html=req.generate_html)
+    return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path}
+
+
 # Paperless webhook placeholder (to be wired with user-provided details)
 class PaperlessHook(BaseModel):
     document_id: int
diff --git a/server/pipeline.py b/server/pipeline.py
new file mode 100644
index 0000000..71004c3
--- /dev/null
+++ b/server/pipeline.py
@@ -0,0 +1,112 @@
+from __future__ import annotations
+
+import html
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Dict, Any
+
+from .utils import chunk_text
+from .ollama_client import OllamaClient
+from .index_store import IndexRow
+
+
+@dataclass
+class PipelineResult:
+    """Outcome of one DocumentPipeline.process() run."""
+
+    doc_id: str
+    html_path: str | None  # None when HTML generation was skipped
+    added_chunks: int  # value returned by index.append(), 0 if nothing was appended
+    chunks: int  # number of translated chunks
+
+
+class DocumentPipeline:
+    """Translate text to Korean, index the translation, and render it as HTML."""
+
+    def __init__(self, ollama: OllamaClient, embedding_model: str, boost_model: str, output_dir: str = "outputs") -> None:
+        self.ollama = ollama
+        self.embedding_model = embedding_model
+        self.boost_model = boost_model
+        self.output_dir = Path(output_dir)
+        (self.output_dir / "html").mkdir(parents=True, exist_ok=True)
+
+    def translate_to_korean(self, parts: List[str]) -> List[str]:
+        """Translate each chunk with the chat model; output order matches input."""
+        translated: List[str] = []
+        sys_prompt = (
+            "당신은 전문 번역가입니다. 입력 텍스트를 자연스러운 한국어로 충실히 번역하세요. "
+            "의미를 임의로 축약하거나 추가하지 마세요. 코드/수식/표기는 가능한 유지하세요."
+        )
+        for p in parts:
+            messages = [
+                {"role": "system", "content": sys_prompt},
+                {"role": "user", "content": f"아래 텍스트를 한국어로 번역하세요:\n\n{p}"},
+            ]
+            resp = self.ollama.chat(self.boost_model, messages, stream=False, options={"temperature": 0.2, "num_ctx": 32768})
+            # Read "message.content" first, then fall back to "response";
+            # a missing or null field yields an empty translation, not a crash.
+            content = (resp.get("message") or {}).get("content") or resp.get("response") or ""
+            translated.append(content.strip())
+        return translated
+
+    @staticmethod
+    def _safe_filename(doc_id: str) -> str:
+        """Reduce a caller-supplied doc_id to a single safe path component."""
+        # doc_id arrives from the HTTP request body, so path separators and
+        # dot-segments must not reach the filesystem.
+        cleaned = re.sub(r"[^\w.-]+", "_", doc_id).strip("._")
+        return cleaned or "document"
+
+    def build_html(self, doc_id: str, title: str, ko_text: str) -> str:
+        """Write the translated text as an HTML page and return its path."""
+        html_path = self.output_dir / "html" / f"{self._safe_filename(doc_id)}.html"
+        safe_title = html.escape(title)
+        # Text is escaped before interpolation: it originates from request data
+        # and model output, neither of which is trusted markup.
+        paragraphs = "".join(
+            f"<p>{html.escape(para)}</p>\n"
+            for para in ko_text.split("\n\n")
+            if para.strip()
+        )
+        page = (
+            "<!doctype html>\n"
+            '<html lang="ko">\n'
+            "<head>\n"
+            '<meta charset="utf-8">\n'
+            f"<title>{safe_title}</title>\n"
+            "</head>\n"
+            "<body>\n"
+            "<article>\n"
+            f"<h1>{safe_title}</h1>\n"
+            f"{paragraphs}"
+            "</article>\n"
+            "</body>\n"
+            "</html>\n"
+        )
+        html_path.write_text(page, encoding="utf-8")
+        return str(html_path)
+
+    def process(self, *, doc_id: str, text: str, index, generate_html: bool = True) -> PipelineResult:
+        """Chunk, translate, embed and index *text*; optionally render HTML."""
+        parts = chunk_text(text, max_chars=1200, overlap=200)
+        translated = self.translate_to_korean(parts)
+
+        to_append: List[IndexRow] = [
+            IndexRow(
+                id=f"pipeline:{doc_id}:{i}",
+                text=t,
+                vector=self.ollama.embeddings(self.embedding_model, t),
+                source=f"pipeline/{doc_id}",
+            )
+            for i, t in enumerate(translated)
+        ]
+        added = index.append(to_append) if to_append else 0
+
+        html_path: str | None = None
+        if generate_html:
+            # NOTE(review): chunks were cut with overlap=200, so joining them
+            # presumably repeats text at chunk boundaries in the page - confirm.
+            html_path = self.build_html(doc_id, title=f"문서 {doc_id} (한국어 번역본)", ko_text="\n\n".join(translated))
+
+        return PipelineResult(doc_id=doc_id, html_path=html_path, added_chunks=added, chunks=len(translated))