Files
hyungi_document_server/app/api/documents.py
Hyungi Ahn 4b695332b9 feat: implement Phase 2 core features
- Add document CRUD API (list/get/upload/update/delete with auth)
  - Upload saves to Inbox + auto-enqueues processing pipeline
  - Delete defaults to DB-only, explicit flag for file deletion
- Add hybrid search API (FTS 0.4 + trigram 0.2 + vector 0.4 weighted)
  - Modes: fts, trgm, vector, hybrid (default)
  - Vector search gracefully degrades if GPU unavailable
- Add Inbox file watcher (5min interval, new file + hash change detection)
- Register documents/search routers and file_watcher scheduler in main.py
- Add IVFFLAT vector index migration (lists=50, with tuning guide)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 14:49:12 +09:00

216 lines
6.2 KiB
Python

"""문서 CRUD API"""
import shutil
from datetime import datetime, timezone
from pathlib import Path
from typing import Annotated
from fastapi import APIRouter, Depends, HTTPException, Query, UploadFile, status
from pydantic import BaseModel
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user
from core.config import settings
from core.database import get_session
from core.utils import file_hash
from models.document import Document
from models.queue import ProcessingQueue
from models.user import User
router = APIRouter()
# ─── 스키마 ───
class DocumentResponse(BaseModel):
id: int
file_path: str
file_format: str
file_size: int | None
file_type: str
title: str | None
ai_domain: str | None
ai_sub_group: str | None
ai_tags: list | None
ai_summary: str | None
source_channel: str | None
data_origin: str | None
extracted_at: datetime | None
ai_processed_at: datetime | None
embedded_at: datetime | None
created_at: datetime
updated_at: datetime
class Config:
from_attributes = True
class DocumentListResponse(BaseModel):
items: list[DocumentResponse]
total: int
page: int
page_size: int
class DocumentUpdate(BaseModel):
title: str | None = None
ai_domain: str | None = None
ai_sub_group: str | None = None
ai_tags: list | None = None
source_channel: str | None = None
data_origin: str | None = None
# ─── 엔드포인트 ───
@router.get("/", response_model=DocumentListResponse)
async def list_documents(
user: Annotated[User, Depends(get_current_user)],
session: Annotated[AsyncSession, Depends(get_session)],
page: int = Query(1, ge=1),
page_size: int = Query(20, ge=1, le=100),
domain: str | None = None,
source: str | None = None,
format: str | None = None,
):
"""문서 목록 조회 (페이지네이션 + 필터)"""
query = select(Document)
if domain:
query = query.where(Document.ai_domain == domain)
if source:
query = query.where(Document.source_channel == source)
if format:
query = query.where(Document.file_format == format)
# 전체 건수
count_query = select(func.count()).select_from(query.subquery())
total = (await session.execute(count_query)).scalar()
# 페이지네이션
query = query.order_by(Document.created_at.desc())
query = query.offset((page - 1) * page_size).limit(page_size)
result = await session.execute(query)
items = result.scalars().all()
return DocumentListResponse(
items=[DocumentResponse.model_validate(doc) for doc in items],
total=total,
page=page,
page_size=page_size,
)
@router.get("/{doc_id}", response_model=DocumentResponse)
async def get_document(
doc_id: int,
user: Annotated[User, Depends(get_current_user)],
session: Annotated[AsyncSession, Depends(get_session)],
):
"""문서 단건 조회"""
doc = await session.get(Document, doc_id)
if not doc:
raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")
return DocumentResponse.model_validate(doc)
@router.post("/", response_model=DocumentResponse, status_code=201)
async def upload_document(
file: UploadFile,
user: Annotated[User, Depends(get_current_user)],
session: Annotated[AsyncSession, Depends(get_session)],
):
"""파일 업로드 → Inbox 저장 + DB 등록 + 처리 큐 등록"""
if not file.filename:
raise HTTPException(status_code=400, detail="파일명이 필요합니다")
# Inbox에 파일 저장
inbox_dir = Path(settings.nas_mount_path) / "PKM" / "Inbox"
inbox_dir.mkdir(parents=True, exist_ok=True)
target = inbox_dir / file.filename
# 중복 파일명 처리
counter = 1
stem, suffix = target.stem, target.suffix
while target.exists():
target = inbox_dir / f"{stem}_{counter}{suffix}"
counter += 1
content = await file.read()
target.write_bytes(content)
# 상대 경로 (NAS 루트 기준)
rel_path = str(target.relative_to(Path(settings.nas_mount_path)))
fhash = file_hash(target)
ext = target.suffix.lstrip(".").lower() or "unknown"
# DB 등록
doc = Document(
file_path=rel_path,
file_hash=fhash,
file_format=ext,
file_size=len(content),
file_type="immutable",
title=target.stem,
source_channel="manual",
)
session.add(doc)
await session.flush()
# 처리 큐 등록
session.add(ProcessingQueue(
document_id=doc.id,
stage="extract",
status="pending",
))
await session.commit()
return DocumentResponse.model_validate(doc)
@router.patch("/{doc_id}", response_model=DocumentResponse)
async def update_document(
doc_id: int,
body: DocumentUpdate,
user: Annotated[User, Depends(get_current_user)],
session: Annotated[AsyncSession, Depends(get_session)],
):
"""문서 메타데이터 수정 (수동 오버라이드)"""
doc = await session.get(Document, doc_id)
if not doc:
raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")
update_data = body.model_dump(exclude_unset=True)
for field, value in update_data.items():
setattr(doc, field, value)
doc.updated_at = datetime.now(timezone.utc)
await session.commit()
return DocumentResponse.model_validate(doc)
@router.delete("/{doc_id}")
async def delete_document(
doc_id: int,
user: Annotated[User, Depends(get_current_user)],
session: Annotated[AsyncSession, Depends(get_session)],
delete_file: bool = Query(False, description="NAS 파일도 함께 삭제"),
):
"""문서 삭제 (기본: DB만 삭제, 파일 유지)"""
doc = await session.get(Document, doc_id)
if not doc:
raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")
if delete_file:
file_path = Path(settings.nas_mount_path) / doc.file_path
if file_path.exists():
file_path.unlink()
await session.delete(doc)
await session.commit()
return {"message": f"문서 {doc_id} 삭제됨", "file_deleted": delete_file}