from __future__ import annotations from dataclasses import dataclass from pathlib import Path from typing import List, Dict, Any from .utils import chunk_text from .ollama_client import OllamaClient from .index_store import IndexRow @dataclass class PipelineResult: doc_id: str html_path: str | None added_chunks: int chunks: int class DocumentPipeline: def __init__(self, ollama: OllamaClient, embedding_model: str, boost_model: str, output_dir: str = "outputs") -> None: self.ollama = ollama self.embedding_model = embedding_model self.boost_model = boost_model self.output_dir = Path(output_dir) (self.output_dir / "html").mkdir(parents=True, exist_ok=True) def translate(self, parts: List[str], target_language: str = "ko") -> List[str]: translated: List[str] = [] sys_prompt = ( "당신은 전문 번역가입니다. 입력 텍스트를 대상 언어로 자연스럽고 충실하게 번역하세요. " "의미를 임의로 축약하거나 추가하지 마세요. 코드/수식/표기는 가능한 유지하세요." ) for p in parts: messages = [ {"role": "system", "content": sys_prompt}, {"role": "user", "content": f"아래 텍스트를 {target_language}로 번역하세요.\n\n{p}"}, ] resp = self.ollama.chat(self.boost_model, messages, stream=False, options={"temperature": 0.2, "num_ctx": 32768}) content = resp.get("message", {}).get("content") or resp.get("response", "") translated.append(content.strip()) return translated def build_html(self, doc_id: str, title: str, ko_text: str) -> str: html_path = self.output_dir / "html" / f"{doc_id}.html" html = f""" \n
\n\n