feat: Paperless integration (content endpoint, list+sync). Add /paperless/sync and docs

This commit is contained in:
hyungi
2025-08-13 08:02:59 +09:00
parent bcb1e543e6
commit 4c81686657
4 changed files with 107 additions and 8 deletions

View File

@@ -10,6 +10,7 @@ class Settings:
base_model: str = os.getenv("BASE_MODEL", "qwen2.5:7b-instruct")
boost_model: str = os.getenv("BOOST_MODEL", "qwen2.5:14b-instruct")
english_model: str = os.getenv("ENGLISH_MODEL", "llama3:8b-instruct")
english_ratio_threshold: float = float(os.getenv("ENGLISH_RATIO_THRESHOLD", "0.65"))
embedding_model: str = os.getenv("EMBEDDING_MODEL", "nomic-embed-text")
index_path: str = os.getenv("INDEX_PATH", "data/index.jsonl")

View File

@@ -90,7 +90,7 @@ def chat(req: ChatRequest) -> Dict[str, Any]:
non_ascii_letters = sum((not ch.isascii()) and ch.isalpha() for ch in user_text)
english_ratio = ascii_letters / max(ascii_letters + non_ascii_letters, 1)
total_chars = len(user_text)
if english_ratio > 0.8:
if english_ratio > settings.english_ratio_threshold:
model = settings.english_model
else:
model = settings.boost_model if (req.force_boost or total_chars > 2000) else settings.base_model
@@ -175,6 +175,55 @@ def paperless_hook(hook: PaperlessHook, _: None = Depends(require_api_key)) -> D
return {"status": "indexed", "document_id": hook.document_id, "chunks": added}
class PaperlessSyncRequest(BaseModel):
    """Request body accepted by ``POST /paperless/sync``.

    All fields have defaults, so an empty JSON object is a valid request.
    """

    # Page size forwarded to the Paperless document-list call.
    page_size: int = 50
    # Ordering expression forwarded to the document-list call.
    ordering: str = "-created"
    # Optional tag ids forwarded as a list filter; None (or empty) applies no tag filter.
    tags: List[int] | None = None
    # Optional search string forwarded to the document-list call.
    query: str | None = None
    # Fetched-document threshold after which the sync stops requesting further pages.
    limit: int = 200
@app.post("/paperless/sync")
def paperless_sync(req: PaperlessSyncRequest, _: None = Depends(require_api_key)) -> Dict[str, Any]:
    """Bulk-index Paperless documents into the local index.

    Walks the Paperless document list page by page. For each listed document
    it fetches the text, splits it with ``chunk_text``, embeds every chunk
    with the configured embedding model, and appends the resulting rows to
    the index one page at a time.

    Args:
        req: Paging, filter, and limit options for the run.
        _: Unused; present only so the API-key dependency is evaluated.

    Returns:
        ``{"status": "synced", "added": n}`` where ``n`` is the sum of the
        values returned by ``index.append`` for each page.

    Raises:
        Errors from the Paperless client, the embedding call, or the HTTP
        layer (for example ``raise_for_status`` on a follow-up page) are
        not caught here and propagate to the caller.
    """
    import requests as _rq

    from .index_store import IndexRow

    client = PaperlessClient(settings.paperless_base_url, settings.paperless_token)
    added_total = 0
    fetched = 0
    next_url: str | None = None

    # `limit` is a hard cap on listed documents: a non-positive limit does
    # nothing, and a page is never processed past the remaining allowance.
    while fetched < req.limit:
        if next_url:
            # Follow the `next` link from the previous page verbatim, reusing
            # the client's auth headers.
            resp = _rq.get(next_url, headers=client._headers(), timeout=60)
            resp.raise_for_status()
            data = resp.json()
        else:
            data = client.list_documents(
                page_size=req.page_size,
                ordering=req.ordering,
                tags=req.tags,
                query=req.query,
            )

        results = data.get("results", [])
        # Truncate the page so the run cannot overshoot `limit` when
        # page_size does not divide it evenly (or exceeds it).
        page = results[: req.limit - fetched]
        fetched += len(page)

        to_append: List[IndexRow] = []
        for doc in page:
            doc_id = doc.get("id")
            if not doc_id:
                continue
            text = client.get_document_text(int(doc_id))
            if not text:
                continue
            # One index row per chunk; the chunk position is part of the row id.
            to_append.extend(
                IndexRow(
                    id=f"paperless:{doc_id}:{i}",
                    text=chunk,
                    vector=ollama.embeddings(settings.embedding_model, chunk),
                    source="paperless",
                )
                for i, chunk in enumerate(chunk_text(text))
            )
        if to_append:
            added_total += index.append(to_append)

        next_url = data.get("next")
        if not next_url:
            break

    return {"status": "synced", "added": added_total}
# OpenAI-compatible chat completions (minimal)
class ChatCompletionsRequest(BaseModel):
model: str | None = None

View File

@@ -1,7 +1,7 @@
from __future__ import annotations
import os
from typing import Any, Dict
from typing import Any, Dict, List, Optional
import requests
@@ -16,15 +16,47 @@ class PaperlessClient:
headers["Authorization"] = f"Token {self.token}"
return headers
def get_document_text(self, doc_id: int) -> str:
def get_document(self, doc_id: int) -> Dict[str, Any]:
if not self.base_url:
raise RuntimeError("PAPERLESS_BASE_URL not configured")
# Example endpoint; adjust to real Paperless API
url = f"{self.base_url}/api/documents/{doc_id}/"
resp = requests.get(url, headers=self._headers(), timeout=60)
resp.raise_for_status()
data = resp.json()
# Prefer content field if available; else title
text = data.get("content", "") or data.get("notes", "") or data.get("title", "")
return text
return resp.json()
def get_document_text(self, doc_id: int) -> str:
    """Return the best available plain text for a Paperless document.

    Sources are tried in order:

    1. ``/api/documents/{id}/content/``
    2. ``/api/documents/{id}/download/?format=txt``
    3. the ``content``, ``notes``, then ``title`` fields of the document
       metadata returned by ``get_document``.

    The first two are best-effort: a network error, a non-200 status, an
    empty body, or a body not served as ``text/plain`` moves on to the
    next source.

    Args:
        doc_id: Paperless document id.

    Returns:
        The first non-empty text found, or ``""`` when no source yields a
        non-empty string.

    Raises:
        RuntimeError: ``base_url`` is not configured.
        Errors raised by ``get_document`` (the final fallback) propagate.
    """
    if not self.base_url:
        raise RuntimeError("PAPERLESS_BASE_URL not configured")

    headers = self._headers()
    doc_url = f"{self.base_url}/api/documents/{doc_id}"
    candidates = (
        f"{doc_url}/content/",
        f"{doc_url}/download/?format=txt",
    )
    for url in candidates:
        try:
            r = requests.get(url, headers=headers, timeout=60)
        except requests.RequestException:
            # Best-effort source: a transport failure just moves on.
            continue
        # NOTE(review): these endpoints may answer 200 with a non-text body
        # (e.g. the stored file or an HTML page) — confirm against the
        # Paperless API. The Content-Type guard keeps such a body from
        # being returned, and later indexed, as document text.
        content_type = r.headers.get("Content-Type", "").lower()
        if r.status_code == 200 and content_type.startswith("text/plain") and r.text:
            return r.text

    # Fallback to metadata fields. Only non-empty strings are accepted so
    # the declared `str` return type holds even if a field is null or
    # structured (presumably possible for `notes` — TODO confirm).
    data = self.get_document(doc_id)
    for key in ("content", "notes", "title"):
        value = data.get(key)
        if isinstance(value, str) and value:
            return value
    return ""
def list_documents(self, page_size: int = 50, ordering: str = "-created", tags: Optional[List[int]] = None, query: Optional[str] = None) -> Dict[str, Any]:
    """Fetch one page of the Paperless document list.

    Args:
        page_size: Sent as the ``page_size`` query parameter.
        ordering: Sent as the ``ordering`` query parameter.
        tags: When non-empty, sent comma-joined as ``tags__id__in``.
        query: When non-empty, sent as the ``query`` parameter.

    Returns:
        The decoded JSON body of the list response.

    Raises:
        RuntimeError: ``base_url`` is not configured.
        requests.HTTPError: the server answered with an error status.
    """
    if not self.base_url:
        raise RuntimeError("PAPERLESS_BASE_URL not configured")

    filters: Dict[str, Any] = dict(page_size=page_size, ordering=ordering)
    # Optional filters are only sent when the caller supplied a value.
    if tags:
        filters["tags__id__in"] = ",".join(map(str, tags))
    if query:
        filters["query"] = query

    response = requests.get(
        f"{self.base_url}/api/documents/",
        headers=self._headers(),
        params=filters,
        timeout=60,
    )
    response.raise_for_status()
    return response.json()