from __future__ import annotations

from dataclasses import dataclass
from html import escape
from pathlib import Path
from typing import List, Dict, Any

from .utils import chunk_text
from .ollama_client import OllamaClient
from .index_store import IndexRow


@dataclass
class PipelineResult:
    """Outcome of a single :meth:`DocumentPipeline.process` run."""

    # Identifier of the processed document, exactly as passed to ``process``.
    doc_id: str
    # Path of the written HTML file, or ``None`` when HTML generation was off.
    html_path: str | None
    # Value returned by ``index.append`` (0 when nothing was appended).
    added_chunks: int
    # Number of final text chunks, including blank ones that were not indexed.
    chunks: int


class DocumentPipeline:
    """Chunk a document, optionally summarize/translate it, index it and render HTML.

    Chat requests go to ``boost_model`` and embedding requests go to
    ``embedding_model``, both through the supplied ``OllamaClient``.
    """

    # Generation options sent with every chat request issued by this class.
    _CHAT_OPTIONS = {"temperature": 0.2, "num_ctx": 32768}

    def __init__(
        self,
        ollama: OllamaClient,
        embedding_model: str,
        boost_model: str,
        output_dir: str = "outputs",
    ) -> None:
        """Store the client and model names and create ``<output_dir>/html``.

        Args:
            ollama: Client used for chat and embedding calls.
            embedding_model: Model name passed to ``ollama.embeddings``.
            boost_model: Model name passed to ``ollama.chat``.
            output_dir: Root directory for generated files.
        """
        self.ollama = ollama
        self.embedding_model = embedding_model
        self.boost_model = boost_model
        self.output_dir = Path(output_dir)
        (self.output_dir / "html").mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _reply_text(resp: Dict[str, Any]) -> str:
        """Return the stripped reply text of a chat response, or ``""``.

        Reads ``resp["message"]["content"]`` first and falls back to
        ``resp["response"]``. Missing or ``None`` fields are tolerated so a
        sparse response yields an empty string instead of raising.
        """
        message = resp.get("message") or {}
        content = message.get("content") or resp.get("response") or ""
        return content.strip()

    def _chat_text(self, system_prompt: str, user_prompt: str) -> str:
        """Send one system+user exchange to the boost model and return its text."""
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        resp = self.ollama.chat(
            self.boost_model,
            messages,
            stream=False,
            # Copy so a client that mutates its options cannot alter the shared dict.
            options=dict(self._CHAT_OPTIONS),
        )
        return self._reply_text(resp)

    def summarize(
        self,
        parts: List[str],
        target_language: str = "ko",
        sentences: int = 5,
    ) -> List[str]:
        """Summarize each part, then merge the partial summaries into one.

        Args:
            parts: Text chunks to summarize.
            target_language: Language name/code interpolated into the prompt.
            sentences: Sentence limit requested per chunk; the merged summary
                requests ``max(3, sentences)`` sentences.

        Returns:
            A single-element list holding the merged summary. When no part
            produced any summary text, the list of per-part summaries (one
            entry per input part) is returned instead.
        """
        sys_prompt = (
            "당신은 전문 요약가입니다. 핵심 내용만 간결하게 요약하세요."
        )
        summarized: List[str] = []
        for p in parts:
            if not p.strip():
                # Blank chunks are not sent to the model.
                summarized.append("")
                continue
            summarized.append(
                self._chat_text(
                    sys_prompt,
                    f"다음 텍스트를 {target_language}로 {sentences}문장 이내로 핵심만 요약하세요. 불필요한 수식어는 제거하고, 중요한 수치/용어는 보존하세요.\n\n{p}",
                )
            )

        # Final merge pass: combine the per-chunk summaries and shorten them again.
        joined = "\n\n".join(s for s in summarized if s)
        if not joined.strip():
            return summarized
        merged = self._chat_text(
            sys_prompt,
            f"아래 부분 요약들을 {target_language}로 {max(3, sentences)}문장 이내로 다시 한번 통합 요약하세요.\n\n{joined}",
        )
        return [merged]

    def translate(self, parts: List[str], target_language: str = "ko") -> List[str]:
        """Translate each part into ``target_language``.

        Blank parts are passed through as ``""`` without calling the model,
        matching what ``summarize`` does with blank chunks.

        Returns:
            One translated string per input part, in the same order.
        """
        sys_prompt = (
            "당신은 전문 번역가입니다. "
            "입력 텍스트를 대상 언어로 자연스럽고 충실하게 번역하세요. "
            "의미를 임의로 축약하거나 추가하지 마세요. 코드/수식/표기는 가능한 유지하세요."
        )
        translated: List[str] = []
        for p in parts:
            if not p.strip():
                translated.append("")
                continue
            translated.append(
                self._chat_text(
                    sys_prompt,
                    f"아래 텍스트를 {target_language}로 번역하세요.\n\n{p}",
                )
            )
        return translated

    def build_html(self, basename: str, title: str, ko_text: str) -> str:
        """Render ``ko_text`` as an HTML page and write it under ``<output_dir>/html``.

        Args:
            basename: Desired file name; only its stem is used and ``.html``
                is appended, so directory components are discarded.
            title: Page title, used for both ``<title>`` and the heading.
            ko_text: Body text; paragraphs are separated by blank lines.

        Returns:
            The path of the written file as a string.
        """
        # Ensure .html suffix and sanitize basename.
        safe_base = Path(basename).stem + ".html"
        html_path = self.output_dir / "html" / safe_base

        # Title and body text may be model output; escape them so stray
        # ``<``/``&`` or injected tags cannot break the page.
        safe_title = escape(title)
        paragraphs = [
            f"<p>{escape(para)}</p>\n"
            for para in ko_text.split("\n\n")
            if para.strip()
        ]
        document = (
            "<!DOCTYPE html>\n"
            "<html>\n"
            "<head>\n"
            '<meta charset="utf-8">\n'
            f"<title>{safe_title}</title>\n"
            "</head>\n"
            "<body>\n"
            f"<h1>{safe_title}</h1>\n"
            + "".join(paragraphs)
            + "</body>\n"
            "</html>\n"
        )
        html_path.write_text(document, encoding="utf-8")
        return str(html_path)

    def process(
        self,
        *,
        doc_id: str,
        text: str,
        index: Any,
        generate_html: bool = True,
        translate: bool = True,
        target_language: str = "ko",
        summarize: bool = False,
        summary_sentences: int = 5,
        summary_language: str | None = None,
        html_basename: str | None = None,
    ) -> PipelineResult:
        """Run the full pipeline for one document.

        Steps: chunk ``text`` -> optional summarize -> optional translate ->
        embed and append non-blank chunks to ``index`` -> optional HTML export.

        Args:
            doc_id: Identifier used in index row ids/sources and the HTML title.
            text: Raw document text.
            index: Object exposing ``append(rows)``; its return value is
                reported as ``added_chunks``.
            generate_html: Write an HTML rendering of the final text.
            translate: Translate the working chunks into ``target_language``.
            target_language: Language for translation.
            summarize: Summarize the chunks before translating.
            summary_sentences: Sentence limit passed to ``summarize``.
            summary_language: Language for the summary; see the default below.
            html_basename: File name for the HTML output; defaults to
                ``"<doc_id>.html"``.

        Returns:
            A :class:`PipelineResult` describing what was produced.
        """
        parts = chunk_text(text, max_chars=1200, overlap=200)

        if summarize:
            # Default summary language: same as the translation language, or
            # "ko" when translation is off and no language was given.
            sum_lang = summary_language or (target_language if translate else "ko")
            working_parts = self.summarize(
                parts,
                target_language=sum_lang,
                sentences=summary_sentences,
            )
        else:
            working_parts = parts

        if translate:
            final_parts = self.translate(working_parts, target_language=target_language)
        else:
            final_parts = working_parts

        to_append: List[IndexRow] = []
        for i, chunk in enumerate(final_parts):
            if not chunk.strip():
                # Nothing to embed or search for; the row id still reflects
                # the chunk's original position.
                continue
            vec = self.ollama.embeddings(self.embedding_model, chunk)
            to_append.append(
                IndexRow(
                    id=f"pipeline:{doc_id}:{i}",
                    text=chunk,
                    vector=vec,
                    source=f"pipeline/{doc_id}",
                )
            )
        added = index.append(to_append) if to_append else 0

        html_path: str | None = None
        if generate_html:
            if summarize and translate:
                title_suffix = "요약+번역본"
            elif summarize:
                title_suffix = "요약본"
            elif translate:
                title_suffix = "번역본"
            else:
                title_suffix = "원문"
            basename = html_basename or f"{doc_id}.html"
            html_path = self.build_html(
                basename,
                title=f"문서 {doc_id} ({title_suffix})",
                ko_text="\n\n".join(final_parts),
            )

        return PipelineResult(
            doc_id=doc_id,
            html_path=html_path,
            added_chunks=added,
            chunks=len(final_parts),
        )