feat: add /pipeline/ingest_file endpoint for .txt/.pdf upload
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import FastAPI, HTTPException, Depends
|
||||
from fastapi import FastAPI, HTTPException, Depends, UploadFile, File, Form
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Dict, Any
|
||||
@@ -175,6 +175,54 @@ def pipeline_ingest(req: PipelineIngestRequest, _: None = Depends(require_api_ke
|
||||
return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path}
|
||||
|
||||
|
||||
@app.post("/pipeline/ingest_file")
async def pipeline_ingest_file(
    _: None = Depends(require_api_key),
    file: UploadFile = File(...),
    doc_id: str = Form(...),
    generate_html: bool = Form(True),
    translate: bool = Form(True),
    target_language: str = Form("ko"),
) -> Dict[str, Any]:
    """Extract text from an uploaded .txt or .pdf file and run the pipeline on it.

    The file kind is decided from the declared content type or, failing
    that, from the filename extension (compared case-insensitively).

    Args:
        _: Result of the ``require_api_key`` dependency (unused).
        file: The uploaded file (multipart form field).
        doc_id: Document identifier forwarded to ``pipeline.process``.
        generate_html: Forwarded to ``pipeline.process``.
        translate: Forwarded to ``pipeline.process``.
        target_language: Forwarded to ``pipeline.process``.

    Returns:
        A dict with ``status``, ``doc_id``, ``added``, ``chunks`` and
        ``html_path`` taken from the pipeline result.

    Raises:
        HTTPException: 400 when the file type is unsupported, PDF
            extraction fails, or no text remains after extraction.
    """
    content_type = (file.content_type or "").lower()
    # UploadFile.filename may be None when the client omits it; normalise to a
    # lowercase string so the extension checks can neither crash nor miss
    # upper-case extensions such as ".PDF".
    filename = (file.filename or "").lower()
    raw = await file.read()

    if "text/plain" in content_type or filename.endswith(".txt"):
        try:
            text = raw.decode("utf-8")
        except UnicodeDecodeError:
            # latin-1 maps every byte to a code point, so this fallback
            # always succeeds for non-UTF-8 uploads.
            text = raw.decode("latin-1", errors="ignore")
    elif "pdf" in content_type or filename.endswith(".pdf"):
        try:
            from pypdf import PdfReader
            from io import BytesIO

            reader = PdfReader(BytesIO(raw))
            parts: List[str] = []
            for page in reader.pages:
                # Best effort: a page that fails to extract contributes an
                # empty string instead of aborting the whole document.
                try:
                    parts.append(page.extract_text() or "")
                except Exception:
                    parts.append("")
            text = "\n\n".join(parts)
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"pdf_extract_error: {e}") from e
    else:
        raise HTTPException(status_code=400, detail="unsupported_file_type (only .txt/.pdf)")

    if not text.strip():
        raise HTTPException(status_code=400, detail="empty_text_after_extraction")

    result = pipeline.process(
        doc_id=doc_id,
        text=text,
        index=index,
        generate_html=generate_html,
        translate=translate,
        target_language=target_language,
    )
    return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path}
|
||||
|
||||
|
||||
# Paperless webhook placeholder (to be wired with user-provided details)
|
||||
class PaperlessHook(BaseModel):
|
||||
document_id: int
|
||||
|
||||
Reference in New Issue
Block a user