From 4c8168665745638aa97f5df492ff75cbefbb2e4f Mon Sep 17 00:00:00 2001
From: hyungi
Date: Wed, 13 Aug 2025 08:02:59 +0900
Subject: [PATCH] feat: Paperless integration (content endpoint, list+sync).
 Add /paperless/sync and docs

---
Notes (not part of the commit message):
- /paperless/sync pages by number via PaperlessClient.list_documents(page=...)
  rather than following the server-supplied "next" URL, and visits at most
  `limit` documents.
- get_document_text treats only requests transport errors as "try the next
  source"; other exceptions propagate.
- Blob ids on the `index` lines are informational and were not regenerated.

 README.md                  | 17 +++++++++++++
 server/config.py           |  1 +
 server/main.py             | 51 +++++++++++++++++++++++++++++++++++++-
 server/paperless_client.py | 51 ++++++++++++++++++++++++++++++++------
 4 files changed, 112 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 6ff654b..86be96b 100644
--- a/README.md
+++ b/README.md
@@ -212,6 +212,22 @@ curl -s -X POST http://localhost:26000/paperless/hook \
 
 해당 훅은 문서 도착을 통지받는 용도로 제공됩니다. 실제 본문 텍스트는 Paperless API로 조회해 `/index/upsert`로 추가하세요.
 
+### Paperless 배치 동기화(`/paperless/sync`)
+
+Paperless에서 다수 문서를 일괄 인덱싱합니다.
+
+```bash
+curl -s -X POST http://localhost:26000/paperless/sync \
+  -H 'Content-Type: application/json' -H 'X-API-Key: <키>' \
+  -d '{
+    "page_size": 50,
+    "ordering": "-created",
+    "tags": null,
+    "query": null,
+    "limit": 200
+  }'
+```
+
 ## 시놀로지 메일/오피스 연동 가이드(요약)
 
 - **검색/QA 호출 엔드포인트**: `http://:26000/search`, `http://:26000/chat`
@@ -229,6 +245,7 @@ curl -s -X POST http://localhost:26000/paperless/hook \
 - `BASE_MODEL`(기본 `qwen2.5:7b-instruct`)
 - `BOOST_MODEL`(기본 `qwen2.5:14b-instruct`)
 - `ENGLISH_MODEL`(기본 `llama3:8b-instruct`): 영어 감지 시 라우팅 대상
+- `ENGLISH_RATIO_THRESHOLD`(기본 `0.65`): 영어 비율 임계값(초과 시 영어 모델)
 - `EMBEDDING_MODEL`(기본 `nomic-embed-text`)
 - `INDEX_PATH`(기본 `data/index.jsonl`)
 - `PAPERLESS_BASE_URL`, `PAPERLESS_TOKEN`(선택): Paperless API 연동 시 사용
diff --git a/server/config.py b/server/config.py
index c7afe5d..7939b88 100644
--- a/server/config.py
+++ b/server/config.py
@@ -10,6 +10,7 @@ class Settings:
     base_model: str = os.getenv("BASE_MODEL", "qwen2.5:7b-instruct")
     boost_model: str = os.getenv("BOOST_MODEL", "qwen2.5:14b-instruct")
     english_model: str = os.getenv("ENGLISH_MODEL", "llama3:8b-instruct")
+    english_ratio_threshold: float = float(os.getenv("ENGLISH_RATIO_THRESHOLD", "0.65"))
     embedding_model: str = os.getenv("EMBEDDING_MODEL", "nomic-embed-text")
     index_path: str = os.getenv("INDEX_PATH", "data/index.jsonl")
 
diff --git a/server/main.py b/server/main.py
index 58893bd..0ca0be4 100644
--- a/server/main.py
+++ b/server/main.py
@@ -90,7 +90,7 @@ def chat(req: ChatRequest) -> Dict[str, Any]:
     non_ascii_letters = sum((not ch.isascii()) and ch.isalpha() for ch in user_text)
     english_ratio = ascii_letters / max(ascii_letters + non_ascii_letters, 1)
     total_chars = len(user_text)
-    if english_ratio > 0.8:
+    if english_ratio > settings.english_ratio_threshold:
         model = settings.english_model
     else:
         model = settings.boost_model if (req.force_boost or total_chars > 2000) else settings.base_model
@@ -175,6 +175,55 @@ def paperless_hook(hook: PaperlessHook, _: None = Depends(require_api_key)) -> D
     return {"status": "indexed", "document_id": hook.document_id, "chunks": added}
 
 
+class PaperlessSyncRequest(BaseModel):
+    """Request body for ``POST /paperless/sync``."""
+
+    page_size: int = 50  # documents requested per Paperless list page
+    ordering: str = "-created"  # forwarded as the list ``ordering`` parameter
+    tags: List[int] | None = None  # forwarded as a tag-id filter when non-empty
+    query: str | None = None  # forwarded as the list ``query`` parameter when non-empty
+    limit: int = 200  # maximum number of listed documents visited per call
+
+
+@app.post("/paperless/sync")
+def paperless_sync(req: PaperlessSyncRequest, _: None = Depends(require_api_key)) -> Dict[str, Any]:
+    """Index Paperless documents in bulk, visiting at most ``req.limit`` of them.
+
+    Returns ``{"status": "synced", "added": n}`` where ``n`` sums what ``index.append`` reported.
+    """
+    client = PaperlessClient(settings.paperless_base_url, settings.paperless_token)
+    from .index_store import IndexRow
+    added_total = 0
+    fetched = 0
+    page = 1
+
+    while fetched < req.limit:
+        # Page by number instead of following the server-supplied "next" URL,
+        # so the API token is only ever sent to the configured base URL.
+        data = client.list_documents(page_size=req.page_size, ordering=req.ordering, tags=req.tags, query=req.query, page=page)
+        # Trim the page so the limit is honoured exactly rather than rounded up to a full page.
+        results = data.get("results", [])[: req.limit - fetched]
+        to_append: List[IndexRow] = []
+        for doc in results:
+            doc_id = doc.get("id")
+            if not doc_id:
+                continue
+            text = client.get_document_text(int(doc_id))
+            if not text:
+                continue
+            for i, t in enumerate(chunk_text(text)):
+                vec = ollama.embeddings(settings.embedding_model, t)
+                to_append.append(IndexRow(id=f"paperless:{doc_id}:{i}", text=t, vector=vec, source="paperless"))
+        if to_append:
+            added_total += index.append(to_append)
+        fetched += len(results)
+        if not results or not data.get("next"):
+            break
+        page += 1
+
+    return {"status": "synced", "added": added_total}
+
+
 # OpenAI-compatible chat completions (minimal)
 class ChatCompletionsRequest(BaseModel):
     model: str | None = None
diff --git a/server/paperless_client.py b/server/paperless_client.py
index 9b53369..f3e3185 100644
--- a/server/paperless_client.py
+++ b/server/paperless_client.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import os
-from typing import Any, Dict
+from typing import Any, Dict, List, Optional
 
 import requests
 
@@ -16,15 +16,52 @@ class PaperlessClient:
             headers["Authorization"] = f"Token {self.token}"
         return headers
 
-    def get_document_text(self, doc_id: int) -> str:
+    def get_document(self, doc_id: int) -> Dict[str, Any]:
+        """Return the decoded JSON of ``/api/documents/<doc_id>/``; raises on HTTP errors."""
         if not self.base_url:
             raise RuntimeError("PAPERLESS_BASE_URL not configured")
-        # Example endpoint; adjust to real Paperless API
         url = f"{self.base_url}/api/documents/{doc_id}/"
         resp = requests.get(url, headers=self._headers(), timeout=60)
         resp.raise_for_status()
-        data = resp.json()
-        # Prefer content field if available; else title
-        text = data.get("content", "") or data.get("notes", "") or data.get("title", "")
-        return text
+        return resp.json()
+
+    def get_document_text(self, doc_id: int) -> str:
+        """Return the best available text for a document.
+
+        Tries the content endpoint, then the txt download, and finally falls
+        back to the ``content``/``notes``/``title`` fields of the document JSON.
+        """
+        if not self.base_url:
+            raise RuntimeError("PAPERLESS_BASE_URL not configured")
+        # Both text endpoints are best-effort: a transport error or a non-200/empty
+        # response just moves on to the next source.
+        for path in ("content/", "download/?format=txt"):
+            url = f"{self.base_url}/api/documents/{doc_id}/{path}"
+            try:
+                r = requests.get(url, headers=self._headers(), timeout=60)
+            except requests.RequestException:
+                continue
+            if r.status_code == 200 and r.text:
+                return r.text
+        # Fallback to metadata fields
+        data = self.get_document(doc_id)
+        return data.get("content", "") or data.get("notes", "") or data.get("title", "")
+
+    def list_documents(self, page_size: int = 50, ordering: str = "-created", tags: Optional[List[int]] = None, query: Optional[str] = None, page: int = 1) -> Dict[str, Any]:
+        """Return one page of ``/api/documents/`` as the decoded JSON response.
+
+        ``tags`` and ``query`` are only sent when non-empty; ``page`` is sent as
+        the ``page`` query parameter. Raises on HTTP errors.
+        """
+        if not self.base_url:
+            raise RuntimeError("PAPERLESS_BASE_URL not configured")
+        params: Dict[str, Any] = {"page_size": page_size, "ordering": ordering, "page": page}
+        if tags:
+            params["tags__id__in"] = ",".join(str(t) for t in tags)
+        if query:
+            params["query"] = query
+        url = f"{self.base_url}/api/documents/"
+        resp = requests.get(url, headers=self._headers(), params=params, timeout=60)
+        resp.raise_for_status()
+        return resp.json()
 