# ai-server/server/pipeline.py

from __future__ import annotations

import html
from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict, Any

from .utils import chunk_text
from .ollama_client import OllamaClient
from .index_store import IndexRow


@dataclass
class PipelineResult:
    """Summary of one DocumentPipeline.process() run."""

    # Identifier of the processed document, as passed to process().
    doc_id: str
    # Path of the generated HTML file, or None when HTML generation was skipped.
    html_path: str | None
    # Value returned by index.append() for this run (0 when nothing was appended);
    # presumably the number of rows actually added -- confirm against the index store.
    added_chunks: int
    # Number of translated chunks produced for the document.
    chunks: int


class DocumentPipeline:
    """Translate a document into Korean, index the translation, and render it as HTML.

    The pipeline splits the input text into chunks, translates each chunk with
    the chat model, embeds every translated chunk and appends the resulting
    rows to the supplied index, and optionally writes the translation to
    ``<output_dir>/html/<doc_id>.html``.
    """

    # Chunking parameters handed to chunk_text() in process().
    CHUNK_MAX_CHARS = 1200
    CHUNK_OVERLAP = 200
    # Generation options sent with every translation request.
    TRANSLATE_OPTIONS = {"temperature": 0.2, "num_ctx": 32768}

    def __init__(self, ollama: OllamaClient, embedding_model: str, boost_model: str, output_dir: str = "outputs") -> None:
        """Store the client and model names and make sure the HTML output directory exists.

        Args:
            ollama: Client used for both chat (translation) and embedding calls.
            embedding_model: Model name passed to ``ollama.embeddings``.
            boost_model: Model name passed to ``ollama.chat`` for translation.
            output_dir: Base directory; HTML files go into its ``html`` subdirectory.
        """
        self.ollama = ollama
        self.embedding_model = embedding_model
        self.boost_model = boost_model
        self.output_dir = Path(output_dir)
        (self.output_dir / "html").mkdir(parents=True, exist_ok=True)

    def translate_to_korean(self, parts: List[str]) -> List[str]:
        """Translate each text chunk into Korean, one chat request per chunk.

        Args:
            parts: Source-language chunks, translated in order.

        Returns:
            The stripped translations, in the same order as ``parts``. A reply
            that carries no text yields an empty string for that chunk.
        """
        sys_prompt = (
            "당신은 전문 번역가입니다. 입력 텍스트를 자연스러운 한국어로 충실히 번역하세요. "
            "의미를 임의로 축약하거나 추가하지 마세요. 코드/수식/표기는 가능한 유지하세요."
        )
        translated: List[str] = []
        for p in parts:
            messages = [
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": f"아래 텍스트를 한국어로 번역하세요:\n\n{p}"},
            ]
            # Pass a copy so a client that mutates its options cannot alter the class default.
            resp = self.ollama.chat(self.boost_model, messages, stream=False, options=dict(self.TRANSLATE_OPTIONS))
            # Accept both reply shapes read by this code ({"message": {"content": ...}}
            # and {"response": ...}) and tolerate explicit None values in either.
            message = resp.get("message") or {}
            content = message.get("content") or resp.get("response") or ""
            translated.append(content.strip())
        return translated

    def build_html(self, doc_id: str, title: str, ko_text: str) -> str:
        """Write the translated text as a standalone HTML page and return its path.

        ``ko_text`` is split on blank lines; every non-blank paragraph becomes a
        ``<div class="chunk" id="cN">`` where ``N`` is the paragraph's position
        in the split (blank paragraphs are skipped but still consume an index).

        Args:
            doc_id: Used as the file stem of the generated page.
            title: Page title, rendered in ``<title>`` and ``<h1>``.
            ko_text: Translated text with paragraphs separated by blank lines.

        Returns:
            The path of the written HTML file as a string.
        """
        # NOTE(review): doc_id is used verbatim as a file name; if it can come from
        # untrusted input, validate it upstream to rule out path traversal.
        html_path = self.output_dir / "html" / f"{doc_id}.html"
        # Title and paragraphs are plain text (partly model output): escape them so
        # characters like '<' and '&' render literally and cannot inject markup.
        safe_title = html.escape(title)
        header = f"""
<!doctype html>
<html lang=\"ko\">\n<head>\n<meta charset=\"utf-8\"/>\n<title>{safe_title}</title>\n<style>
body{{font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Noto Sans KR', 'Apple SD Gothic Neo', Arial, sans-serif; line-height:1.6; margin:24px;}}
article{{max-width: 900px; margin: auto;}}
h1{{font-size: 1.6rem; margin-bottom: 1rem;}}
.chunk{{white-space: pre-wrap; margin: 1rem 0;}}
</style>\n</head>\n<body>\n<article>\n<h1>{safe_title}</h1>\n"""
        chunk_divs = [
            f'<div class="chunk" id="c{idx}">{html.escape(para)}</div>\n'
            for idx, para in enumerate(ko_text.split("\n\n"))
            if para.strip()
        ]
        document = "".join([header, *chunk_divs, "</article>\n</body>\n</html>\n"])
        html_path.write_text(document, encoding="utf-8")
        return str(html_path)

    def process(self, *, doc_id: str, text: str, index, generate_html: bool = True) -> PipelineResult:
        """Run the full pipeline for one document.

        Args:
            doc_id: Identifier used in index row ids, the row source, and the HTML file name.
            text: Raw document text to chunk and translate.
            index: Object exposing ``append(rows)``; its return value is reported as
                ``added_chunks`` (presumably the count of rows added -- confirm with the index store).
            generate_html: When true, also write the translation as an HTML page.

        Returns:
            A PipelineResult describing what was produced.
        """
        parts = chunk_text(text, max_chars=self.CHUNK_MAX_CHARS, overlap=self.CHUNK_OVERLAP)
        translated = self.translate_to_korean(parts)

        # One embedding request and one index row per translated chunk.
        to_append: List[IndexRow] = [
            IndexRow(
                id=f"pipeline:{doc_id}:{i}",
                text=t,
                vector=self.ollama.embeddings(self.embedding_model, t),
                source=f"pipeline/{doc_id}",
            )
            for i, t in enumerate(translated)
        ]
        # Skip the index call entirely when there is nothing to add.
        added = index.append(to_append) if to_append else 0

        html_path: str | None = None
        if generate_html:
            html_path = self.build_html(doc_id, title=f"문서 {doc_id} (한국어 번역본)", ko_text="\n\n".join(translated))

        return PipelineResult(doc_id=doc_id, html_path=html_path, added_chunks=added, chunks=len(translated))