feat: export pipeline outputs (HTML copy + upload archiving) via EXPORT_* envs

This commit is contained in:
hyungi
2025-08-13 08:53:05 +09:00
parent 8d87b1f46b
commit ef64aaec84
3 changed files with 30 additions and 3 deletions

View File

@@ -287,6 +287,9 @@ curl -s -X POST http://localhost:26000/paperless/sync \
- `PAPERLESS_BASE_URL`, `PAPERLESS_TOKEN`(선택): Paperless API 연동 시 사용
- `PAPERLESS_VERIFY_SSL`(기본 `true`): Paperless HTTPS 인증서 검증 여부. 검증을 비활성화하려면 `false`로 설정
- `PAPERLESS_CA_BUNDLE`(선택): 신뢰할 CA 번들 경로 지정 시 해당 번들로 검증
- `OUTPUT_DIR`(기본 `outputs`): 파이프라인 산출물(HTML) 저장 루트
- `EXPORT_HTML_DIR`(선택): HTML 산출물 사본을 내보낼 디렉터리(예: 시놀로지 공유 폴더)
- `EXPORT_UPLOAD_DIR`(선택): `/pipeline/ingest_file`로 업로드된 원본 파일을 보관할 디렉터리(`{doc_id}__{원본 파일명}` 형식으로 저장)
- `API_KEY`(선택): 설정 시 모든 민감 엔드포인트 호출에 `X-API-Key` 헤더 필요
- `CORS_ORIGINS`(선택): CORS 허용 오리진 목록(쉼표로 구분). 미설정 시 모든 오리진(`*`) 허용

View File

@@ -13,6 +13,11 @@ class Settings:
english_ratio_threshold: float = float(os.getenv("ENGLISH_RATIO_THRESHOLD", "0.65"))
embedding_model: str = os.getenv("EMBEDDING_MODEL", "nomic-embed-text")
index_path: str = os.getenv("INDEX_PATH", "data/index.jsonl")
output_dir: str = os.getenv("OUTPUT_DIR", "outputs")
# Optional export targets (e.g., Synology NAS shares)
export_html_dir: str = os.getenv("EXPORT_HTML_DIR", "")
export_upload_dir: str = os.getenv("EXPORT_UPLOAD_DIR", "")
# Paperless (user will provide API details)
paperless_base_url: str = os.getenv("PAPERLESS_BASE_URL", "")

View File

@@ -4,6 +4,8 @@ from fastapi import FastAPI, HTTPException, Depends, UploadFile, File, Form
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Dict, Any
import shutil
from pathlib import Path
from .config import settings
from .ollama_client import OllamaClient
@@ -29,7 +31,7 @@ app.add_middleware(
)
ollama = OllamaClient(settings.ollama_host)
index = JsonlIndex(settings.index_path)
pipeline = DocumentPipeline(ollama, settings.embedding_model, settings.boost_model)
pipeline = DocumentPipeline(ollama, settings.embedding_model, settings.boost_model, output_dir=settings.output_dir)
class ChatRequest(BaseModel):
@@ -178,7 +180,13 @@ def pipeline_ingest(req: PipelineIngestRequest, _: None = Depends(require_api_ke
summary_sentences=req.summary_sentences,
summary_language=req.summary_language,
)
return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path}
exported_html: str | None = None
if result.html_path and settings.export_html_dir:
Path(settings.export_html_dir).mkdir(parents=True, exist_ok=True)
dst = str(Path(settings.export_html_dir) / Path(result.html_path).name)
shutil.copyfile(result.html_path, dst)
exported_html = dst
return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path, "exported_html": exported_html}
@app.post("/pipeline/ingest_file")
@@ -226,7 +234,18 @@ async def pipeline_ingest_file(
translate=translate,
target_language=target_language,
)
return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path}
exported_html: str | None = None
if result.html_path and settings.export_html_dir:
Path(settings.export_html_dir).mkdir(parents=True, exist_ok=True)
dst = str(Path(settings.export_html_dir) / Path(result.html_path).name)
shutil.copyfile(result.html_path, dst)
exported_html = dst
if settings.export_upload_dir:
Path(settings.export_upload_dir).mkdir(parents=True, exist_ok=True)
orig_name = f"{doc_id}__{file.filename}"
with open(str(Path(settings.export_upload_dir) / orig_name), "wb") as f:
f.write(raw)
return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path, "exported_html": exported_html}
# Paperless webhook placeholder (to be wired with user-provided details)