feat: export pipeline outputs (HTML copy + upload archiving) via EXPORT_HTML_DIR / EXPORT_UPLOAD_DIR env vars

This commit is contained in:
hyungi
2025-08-13 08:53:05 +09:00
parent 8d87b1f46b
commit ef64aaec84
3 changed files with 30 additions and 3 deletions

View File

@@ -13,6 +13,11 @@ class Settings:
english_ratio_threshold: float = float(os.getenv("ENGLISH_RATIO_THRESHOLD", "0.65"))
embedding_model: str = os.getenv("EMBEDDING_MODEL", "nomic-embed-text")
index_path: str = os.getenv("INDEX_PATH", "data/index.jsonl")
output_dir: str = os.getenv("OUTPUT_DIR", "outputs")
# Optional export targets (e.g., Synology NAS shares); leave empty to disable the corresponding export
export_html_dir: str = os.getenv("EXPORT_HTML_DIR", "")
export_upload_dir: str = os.getenv("EXPORT_UPLOAD_DIR", "")
# Paperless (user will provide API details)
paperless_base_url: str = os.getenv("PAPERLESS_BASE_URL", "")

View File

@@ -4,6 +4,8 @@ from fastapi import FastAPI, HTTPException, Depends, UploadFile, File, Form
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Dict, Any
import shutil
from pathlib import Path
from .config import settings
from .ollama_client import OllamaClient
@@ -29,7 +31,7 @@ app.add_middleware(
)
ollama = OllamaClient(settings.ollama_host)
index = JsonlIndex(settings.index_path)
pipeline = DocumentPipeline(ollama, settings.embedding_model, settings.boost_model)
pipeline = DocumentPipeline(ollama, settings.embedding_model, settings.boost_model, output_dir=settings.output_dir)
class ChatRequest(BaseModel):
@@ -178,7 +180,13 @@ def pipeline_ingest(req: PipelineIngestRequest, _: None = Depends(require_api_ke
summary_sentences=req.summary_sentences,
summary_language=req.summary_language,
)
return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path}
exported_html: str | None = None
if result.html_path and settings.export_html_dir:
Path(settings.export_html_dir).mkdir(parents=True, exist_ok=True)
dst = str(Path(settings.export_html_dir) / Path(result.html_path).name)
shutil.copyfile(result.html_path, dst)
exported_html = dst
return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path, "exported_html": exported_html}
@app.post("/pipeline/ingest_file")
@@ -226,7 +234,18 @@ async def pipeline_ingest_file(
translate=translate,
target_language=target_language,
)
return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path}
exported_html: str | None = None
if result.html_path and settings.export_html_dir:
Path(settings.export_html_dir).mkdir(parents=True, exist_ok=True)
dst = str(Path(settings.export_html_dir) / Path(result.html_path).name)
shutil.copyfile(result.html_path, dst)
exported_html = dst
if settings.export_upload_dir:
Path(settings.export_upload_dir).mkdir(parents=True, exist_ok=True)
orig_name = f"{doc_id}__{file.filename}"
with open(str(Path(settings.export_upload_dir) / orig_name), "wb") as f:
f.write(raw)
return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path, "exported_html": exported_html}
# Paperless webhook placeholder (to be wired with user-provided details)