def _extract_upload_text(raw: bytes, content_type: str, filename: str) -> str:
    """Return the plain text contained in an uploaded ``.txt`` or ``.pdf`` payload.

    Args:
        raw: The complete uploaded file body.
        content_type: Lower-cased MIME type reported by the client ("" if absent).
        filename: Lower-cased client-supplied file name ("" if absent).

    Returns:
        The decoded text (for text files) or the page texts joined by blank
        lines (for PDFs). May be empty; the caller decides how to treat that.

    Raises:
        HTTPException: 400 for an unsupported type or an unreadable PDF,
            500 when PDF support (``pypdf``) is not installed on the server.
    """
    # Plain text is checked first so a ".txt" name wins over an odd MIME type.
    if "text/plain" in content_type or filename.endswith(".txt"):
        try:
            # "utf-8-sig" also accepts BOM-less UTF-8 and strips a leading BOM,
            # which would otherwise end up inside the indexed text.
            return raw.decode("utf-8-sig")
        except UnicodeDecodeError:
            # latin-1 maps every byte value, so this fallback cannot fail.
            return raw.decode("latin-1")

    if "pdf" in content_type or filename.endswith(".pdf"):
        try:
            from pypdf import PdfReader
        except ImportError as e:
            # A missing optional dependency is a server fault, not a bad upload.
            raise HTTPException(status_code=500, detail=f"pdf_extract_error: {e}") from e
        from io import BytesIO

        try:
            reader = PdfReader(BytesIO(raw))
            parts: List[str] = []
            for page in reader.pages:
                try:
                    parts.append(page.extract_text() or "")
                except Exception:
                    # Best effort: one unreadable page must not discard the
                    # rest of the document, so it contributes empty text.
                    parts.append("")
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"pdf_extract_error: {e}") from e
        return "\n\n".join(parts)

    raise HTTPException(status_code=400, detail="unsupported_file_type (only .txt/.pdf)")


@app.post("/pipeline/ingest_file")
async def pipeline_ingest_file(
    _: None = Depends(require_api_key),
    file: UploadFile = File(...),
    doc_id: str = Form(...),
    generate_html: bool = Form(True),
    translate: bool = Form(True),
    target_language: str = Form("ko"),
) -> Dict[str, Any]:
    """Ingest an uploaded ``.txt`` or ``.pdf`` file through the pipeline.

    The file type is detected from the reported content type or, failing
    that, from the file-name extension (case-insensitively). The extracted
    text is handed to ``pipeline.process`` together with the form options.

    Args:
        file: The uploaded document (multipart field ``file``).
        doc_id: Identifier passed to the pipeline for this document.
        generate_html: Forwarded to ``pipeline.process``.
        translate: Forwarded to ``pipeline.process``.
        target_language: Forwarded to ``pipeline.process``.

    Returns:
        A dict with ``status``, ``doc_id``, ``added``, ``chunks`` and
        ``html_path`` taken from the pipeline result.

    Raises:
        HTTPException: 400 when the type is unsupported, the PDF cannot be
            read, or no text remains after extraction; 500 when PDF support
            is unavailable on the server.
    """
    from fastapi.concurrency import run_in_threadpool

    content_type = (file.content_type or "").lower()
    # The file name may be missing and its extension may be upper-case.
    filename = (file.filename or "").lower()
    raw = await file.read()

    # PDF parsing is CPU-bound; keep it off the event loop.
    text = await run_in_threadpool(_extract_upload_text, raw, content_type, filename)
    if not text.strip():
        raise HTTPException(status_code=400, detail="empty_text_after_extraction")

    # NOTE(review): assumes pipeline.process is synchronous (it was called
    # without await); running it in the threadpool avoids blocking other
    # requests while the document is processed.
    result = await run_in_threadpool(
        pipeline.process,
        doc_id=doc_id,
        text=text,
        index=index,
        generate_html=generate_html,
        translate=translate,
        target_language=target_language,
    )
    return {
        "status": "ok",
        "doc_id": result.doc_id,
        "added": result.added_chunks,
        "chunks": result.chunks,
        "html_path": result.html_path,
    }