from __future__ import annotations

from dataclasses import dataclass
from html import escape
from pathlib import Path
from typing import List, Dict, Any

from .utils import chunk_text
from .ollama_client import OllamaClient
from .index_store import IndexRow


@dataclass
class PipelineResult:
    """Summary of a single :meth:`DocumentPipeline.process` run."""

    doc_id: str
    # Path of the written HTML file, or None when HTML generation was skipped.
    html_path: str | None
    # Value returned by ``index.append`` (0 when there was nothing to append).
    added_chunks: int
    # Number of text chunks that were processed.
    chunks: int


class DocumentPipeline:
    """Chunk a document, optionally translate it, embed it, and render HTML.

    Translation and embedding are delegated to the supplied Ollama client;
    rendered pages are written below ``<output_dir>/html``.
    """

    def __init__(
        self,
        ollama: OllamaClient,
        embedding_model: str,
        boost_model: str,
        output_dir: str = "outputs",
    ) -> None:
        """Store the collaborators and make sure the HTML directory exists.

        Args:
            ollama: Client used for chat (translation) and embeddings.
            embedding_model: Model name passed to ``ollama.embeddings``.
            boost_model: Model name passed to ``ollama.chat`` for translation.
            output_dir: Root directory for generated artifacts.
        """
        self.ollama = ollama
        self.embedding_model = embedding_model
        self.boost_model = boost_model
        self.output_dir = Path(output_dir)
        (self.output_dir / "html").mkdir(parents=True, exist_ok=True)

    def translate(self, parts: List[str], target_language: str = "ko") -> List[str]:
        """Translate each text part with the chat model, one request per part.

        Args:
            parts: Text chunks to translate.
            target_language: Language name/code interpolated into the prompt.

        Returns:
            The translated chunks, stripped of surrounding whitespace, in the
            same order as ``parts``. A part whose response carries no text
            yields an empty string.
        """
        sys_prompt = (
            "당신은 전문 번역가입니다. 입력 텍스트를 대상 언어로 자연스럽고 충실하게 번역하세요. "
            "의미를 임의로 축약하거나 추가하지 마세요. 코드/수식/표기는 가능한 유지하세요."
        )
        translated: List[str] = []
        for p in parts:
            messages = [
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": f"아래 텍스트를 {target_language}로 번역하세요.\n\n{p}"},
            ]
            resp = self.ollama.chat(
                self.boost_model,
                messages,
                stream=False,
                options={"temperature": 0.2, "num_ctx": 32768},
            )
            # Prefer the chat-style "message.content" field and fall back to a
            # top-level "response" field. Either may be missing or None, so
            # coerce to "" before calling .strip().
            message = resp.get("message") or {}
            content = message.get("content") or resp.get("response") or ""
            translated.append(content.strip())
        return translated

    def build_html(self, doc_id: str, title: str, ko_text: str) -> str:
        """Render ``ko_text`` as a simple HTML page and write it to disk.

        Paragraphs are separated by blank lines (``"\\n\\n"``); whitespace-only
        paragraphs are dropped. The title and every paragraph are HTML-escaped
        because they originate from document text / model output.

        Args:
            doc_id: Used as the file stem: ``<output_dir>/html/<doc_id>.html``.
            title: Page title, used for both ``<title>`` and the ``<h1>``.
            ko_text: Body text to render.

        Returns:
            The path of the written file, as a string.
        """
        html_path = self.output_dir / "html" / f"{doc_id}.html"
        safe_title = escape(title)
        paragraphs = [
            f"<p>{escape(para)}</p>\n"
            for para in ko_text.split("\n\n")
            if para.strip()
        ]
        # NOTE(review): minimal HTML5 skeleton; add project styling/wrappers
        # here if the rendered pages are expected to carry any.
        document = [
            "<!DOCTYPE html>\n<html>\n<head>\n",
            # The file is written as UTF-8 below, so declare it for browsers.
            '<meta charset="utf-8">\n',
            f"<title>{safe_title}</title>\n",
            "</head>\n<body>\n",
            f"<h1>{safe_title}</h1>\n",
            *paragraphs,
            "</body>\n</html>\n",
        ]
        html_path.write_text("".join(document), encoding="utf-8")
        return str(html_path)

    def process(
        self,
        *,
        doc_id: str,
        text: str,
        index,
        generate_html: bool = True,
        translate: bool = True,
        target_language: str = "ko",
    ) -> PipelineResult:
        """Run the full pipeline for one document.

        Steps: split ``text`` with ``chunk_text`` (max 1200 chars, 200 overlap),
        optionally translate each chunk, embed every resulting chunk, append
        the rows to ``index``, and optionally render an HTML page.

        Args:
            doc_id: Identifier used in row ids, row sources and the HTML name.
            text: Raw document text.
            index: Object exposing ``append(rows)``; its return value is
                reported as ``added_chunks`` (presumably the number of rows
                added - confirm against the index implementation).
            generate_html: Whether to write the HTML page.
            translate: Whether to translate chunks before embedding.
            target_language: Forwarded to :meth:`translate`.

        Returns:
            A :class:`PipelineResult` describing what was produced.
        """
        parts = chunk_text(text, max_chars=1200, overlap=200)
        translated = (
            self.translate(parts, target_language=target_language)
            if translate
            else parts
        )

        to_append: List[IndexRow] = [
            IndexRow(
                id=f"pipeline:{doc_id}:{i}",
                text=chunk,
                vector=self.ollama.embeddings(self.embedding_model, chunk),
                source=f"pipeline/{doc_id}",
            )
            for i, chunk in enumerate(translated)
        ]
        # Skip the index call entirely when there is nothing to add.
        added = index.append(to_append) if to_append else 0

        html_path: str | None = None
        if generate_html:
            title_suffix = "번역본" if translate else "원문"
            html_path = self.build_html(
                doc_id,
                title=f"문서 {doc_id} ({title_suffix})",
                ko_text="\n\n".join(translated),
            )

        return PipelineResult(
            doc_id=doc_id,
            html_path=html_path,
            added_chunks=added,
            chunks=len(translated),
        )