d968b2d901
- 라벨 "복습 시작" → "문제풀이" - attempts.outcome 컬럼 + selected_choice nullable (correct/wrong/unsure) - 풀이 중 정답·해설·AI·비슷한 문제 모두 비노출, 답 클릭 시 자동 진행 - "모르겠음" 5번째 옵션 추가 - 결과 화면 = 정답/틀린/모르겠음 3 카테고리 탭, 카드 클릭 expand - 틀린 → PR-3 AI 해설 (RAG) - 모르겠음 → 분야(subject+scope) 설명 AI 즉석 생성 + 캐시 (PR-9 신규) - 분야 설명 RAG: 매핑 documents 청크 + 같은 분야 다른 문제·해설 → bge-reranker - 마이그레이션 200~205 (single-statement, asyncpg 호환) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
258 lines
7.4 KiB
Python
258 lines
7.4 KiB
Python
"""study_topic_subject_notes 분야 설명 RAG (PR-9).
|
|
|
|
PR-3 explanation_rag.py 와 비슷한 패턴 — 매핑 documents 청크 + 같은 토픽 같은 subject 다른 문제·해설 → bge-reranker 로 top-K 줄임.
|
|
|
|
쿼리는 "subject + scope" 키워드 기반 (단일 문제가 아니라 분야 키워드).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
from dataclasses import dataclass
|
|
|
|
import httpx
|
|
from sqlalchemy import select
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from ai.client import AIClient
|
|
from models.chunk import DocumentChunk
|
|
from models.document import Document
|
|
from models.study_question import StudyQuestion
|
|
from models.study_topic import StudyTopicDocument
|
|
|
|
logger = logging.getLogger(__name__)

# Retrieval limits and tuning knobs for subject-note evidence gathering.
DOC_TOPK = 5  # max document evidence items returned after reranking
Q_TOPK = 3  # max question evidence items returned after reranking
MAX_RERANK_INPUT = 60  # candidate texts beyond this count are dropped before reranking
DOC_SNIPPET_LEN = 400  # character cap applied to a document evidence snippet
Q_SNIPPET_LEN = 300  # character cap applied to a question evidence snippet
MAX_QUESTION_CANDIDATES = 30  # row limit used when querying candidate questions
RERANK_TIMEOUT_S = 5.0  # seconds allowed for one rerank call before falling back
|
|
|
|
|
|
@dataclass
class EvidenceItem:
    """One piece of retrieved evidence that backs a subject note."""

    # Kind of source; this module produces "document" and "question".
    source_type: str
    # Primary key of the originating document or question row.
    source_id: int
    # Human-readable label for the source.
    title: str
    # Shortened text excerpt taken from the source.
    snippet: str

    def to_dict(self) -> dict:
        """Return the item as a plain dict keyed by field name."""
        return dict(
            source_type=self.source_type,
            source_id=self.source_id,
            title=self.title,
            snippet=self.snippet,
        )
|
|
|
|
|
|
@dataclass
class SubjectNoteContext:
    """Evidence gathered for one subject note, split by source kind."""

    # Evidence items that came from mapped documents.
    documents: list[EvidenceItem]
    # Evidence items that came from study questions.
    questions: list[EvidenceItem]

    @property
    def all(self) -> list[EvidenceItem]:
        """Return a new list holding document items followed by question items."""
        combined = list(self.documents)
        combined.extend(self.questions)
        return combined
|
|
|
|
|
|
def _truncate(text: str, n: int) -> str:
|
|
if not text:
|
|
return ""
|
|
s = text.strip()
|
|
return s if len(s) <= n else s[:n].rstrip() + "…"
|
|
|
|
|
|
def _build_query(subject: str, scope: str) -> str:
|
|
parts = [subject]
|
|
if scope:
|
|
parts.append(scope)
|
|
parts.append("학습 자료")
|
|
return " ".join(parts)
|
|
|
|
|
|
async def _rerank(client: AIClient, query: str, texts: list[str], top_k: int) -> list[int]:
    """Return up to *top_k* indices into *texts* as ordered by the reranker.

    Only the first ``MAX_RERANK_INPUT`` texts are sent to ``client.rerank``.
    Each result is read as a mapping with an ``"index"`` entry; results whose
    index is missing or outside the submitted range are skipped. The order of
    the returned indices is whatever order the client returned them in
    (presumably by relevance — confirm against ``AIClient.rerank``).

    If the call times out after ``RERANK_TIMEOUT_S`` seconds or raises an
    ``httpx.HTTPError``, a warning is logged and the leading indices
    ``0..top_k-1`` (bounded by the number of submitted texts) are returned.
    """
    if not texts:
        return []

    pool = texts if len(texts) <= MAX_RERANK_INPUT else texts[:MAX_RERANK_INPUT]
    pool_size = len(pool)

    try:
        async with asyncio.timeout(RERANK_TIMEOUT_S):
            ranked = await client.rerank(query, pool)
    except (asyncio.TimeoutError, httpx.HTTPError) as e:
        logger.warning("subject_note_rerank_fallback: %s: %s", type(e).__name__, e)
        # Best-effort fallback: keep the original candidate order.
        return list(range(min(top_k, pool_size)))

    in_range = [r["index"] for r in ranked if 0 <= r.get("index", -1) < pool_size]
    return in_range[:top_k]
|
|
|
|
|
|
async def _gather_document_evidence(
    session: AsyncSession,
    user_id: int,
    study_topic_id: int,
    query: str,
    client: AIClient,
) -> list[EvidenceItem]:
    """Collect reranked evidence from documents mapped to a study topic.

    Steps:
      1. Look up the document ids mapped to ``study_topic_id`` for ``user_id``.
      2. Load title and AI summary for those documents that are not
         soft-deleted (``deleted_at IS NULL``).
      3. Build rerank candidates from chunks whose ``chunk_index`` is below 4
         plus each document's AI summary, when present.
      4. Rerank the candidates against ``query`` and keep at most one
         evidence item per document, up to ``DOC_TOPK`` items.

    Returns:
        Up to ``DOC_TOPK`` ``EvidenceItem`` objects with
        ``source_type="document"``; an empty list when the topic has no
        usable documents or candidate texts.
    """
    doc_id_rows = (
        await session.execute(
            select(StudyTopicDocument.document_id).where(
                StudyTopicDocument.study_topic_id == study_topic_id,
                StudyTopicDocument.user_id == user_id,
            )
        )
    ).scalars().all()
    doc_ids = list(doc_id_rows)
    if not doc_ids:
        return []

    doc_meta_rows = (
        await session.execute(
            select(Document.id, Document.title, Document.ai_summary).where(
                Document.id.in_(doc_ids),
                Document.deleted_at.is_(None),
            )
        )
    ).all()
    doc_meta: dict[int, tuple[str | None, str | None]] = {
        r.id: (r.title, r.ai_summary) for r in doc_meta_rows
    }
    if not doc_meta:
        return []
    valid_doc_ids = list(doc_meta.keys())

    chunk_rows = (
        await session.execute(
            select(DocumentChunk.doc_id, DocumentChunk.chunk_index, DocumentChunk.text)
            .where(
                DocumentChunk.doc_id.in_(valid_doc_ids),
                DocumentChunk.chunk_index < 4,
            )
            .order_by(DocumentChunk.doc_id, DocumentChunk.chunk_index)
        )
    ).all()

    # Candidates are (doc_id, text) pairs: chunk texts first, then summaries.
    candidates: list[tuple[int, str]] = [
        (r.doc_id, r.text) for r in chunk_rows if r.text
    ]
    candidates.extend(
        (did, summary) for did, (_title, summary) in doc_meta.items() if summary
    )
    if not candidates:
        return []

    texts = [_truncate(t, 800) for _, t in candidates]
    # Ask for the full ranking rather than only DOC_TOPK hits. Several top
    # hits frequently belong to the same document, and the per-document
    # de-duplication below would otherwise return fewer than DOC_TOPK
    # documents even though more are available. The rerank call itself is
    # the same either way; only the slice of returned indices differs.
    ranked_idxs = await _rerank(client, query, texts, len(texts))

    seen_doc_ids: set[int] = set()
    out: list[EvidenceItem] = []
    for i in ranked_idxs:
        did, text = candidates[i]
        if did in seen_doc_ids:
            # Keep only the best-ranked candidate of each document.
            continue
        seen_doc_ids.add(did)
        title = doc_meta.get(did, (None, None))[0] or f"문서 #{did}"
        out.append(EvidenceItem(
            source_type="document",
            source_id=did,
            title=title,
            snippet=_truncate(text, DOC_SNIPPET_LEN),
        ))
        if len(out) >= DOC_TOPK:
            break
    return out
|
|
|
|
|
|
async def _gather_question_evidence(
    session: AsyncSession,
    user_id: int,
    study_topic_id: int,
    subject: str,
    scope: str,
    query: str,
    client: AIClient,
) -> list[EvidenceItem]:
    """Collect reranked question/explanation evidence from the same topic.

    Candidate questions are the user's non-deleted questions in
    ``study_topic_id`` whose ``subject`` matches, newest first, limited to
    ``MAX_QUESTION_CANDIDATES``. When no question has that subject, the
    newest questions of the whole topic are used instead.

    NOTE(review): ``scope`` is accepted but not used for filtering or
    ordering in this function; it only reaches the reranker through
    ``query`` as built by the caller.

    Returns:
        Up to ``Q_TOPK`` ``EvidenceItem`` objects with
        ``source_type="question"``; an empty list when the topic has no
        questions.
    """
    topic_questions = select(StudyQuestion).where(
        StudyQuestion.user_id == user_id,
        StudyQuestion.study_topic_id == study_topic_id,
        StudyQuestion.deleted_at.is_(None),
    )

    # Prefer questions of the same subject.
    subject_stmt = (
        topic_questions.where(StudyQuestion.subject == subject)
        .order_by(StudyQuestion.created_at.desc())
        .limit(MAX_QUESTION_CANDIDATES)
    )
    result = await session.execute(subject_stmt)
    questions = result.scalars().all()

    if not questions:
        # Nothing for this subject: fall back to the whole topic.
        topic_stmt = (
            topic_questions.order_by(StudyQuestion.created_at.desc())
            .limit(MAX_QUESTION_CANDIDATES)
        )
        result = await session.execute(topic_stmt)
        questions = result.scalars().all()

    if not questions:
        return []

    # One rerank text per question: question text, then its explanation,
    # then the AI explanation when one exists and its status is "ready".
    joined_texts: list[str] = []
    for item in questions:
        pieces = [item.question_text or ""]
        if item.explanation:
            pieces.append(item.explanation)
        if item.ai_explanation and item.ai_explanation_status == "ready":
            pieces.append(item.ai_explanation)
        joined_texts.append(" | ".join(pieces))

    picked = await _rerank(client, query, joined_texts, Q_TOPK)

    return [
        EvidenceItem(
            source_type="question",
            source_id=questions[i].id,
            title=f"Q{questions[i].id}: {_truncate(questions[i].question_text or '', 40)}",
            snippet=_truncate(joined_texts[i], Q_SNIPPET_LEN),
        )
        for i in picked
    ]
|
|
|
|
|
|
async def gather_subject_note_context(
    session: AsyncSession,
    user_id: int,
    study_topic_id: int,
    subject: str,
    scope: str,
) -> SubjectNoteContext:
    """Gather the RAG evidence for one subject area (subject + scope).

    Builds a keyword query from ``subject`` and ``scope``, then collects
    document evidence and question evidence for ``study_topic_id`` using a
    freshly created ``AIClient`` that is closed before returning.

    Returns:
        A ``SubjectNoteContext`` holding the document and question evidence.
    """
    # Build the query before creating the client so a failure here cannot
    # leave an unclosed client behind.
    query = _build_query(subject, scope)
    client = AIClient()
    try:
        # Both gatherers issue queries on the same AsyncSession. SQLAlchemy
        # does not support using one AsyncSession from concurrently running
        # tasks, so they are awaited one after the other instead of being
        # scheduled together with asyncio.gather.
        docs = await _gather_document_evidence(
            session, user_id, study_topic_id, query, client
        )
        questions = await _gather_question_evidence(
            session, user_id, study_topic_id, subject, scope, query, client
        )
        return SubjectNoteContext(documents=docs, questions=questions)
    finally:
        await client.close()
|
|
|
|
|
|
def render_evidence_block(items: list[EvidenceItem]) -> str:
    """Render evidence items as a newline-separated bullet list.

    Document items get a labelled title prefix; every other item is
    rendered with its bare title. An empty or missing list renders as a
    fixed placeholder string.
    """
    if not items:
        return "(없음)"
    return "\n".join(
        f"- [자료: {entry.title}] {entry.snippet}"
        if entry.source_type == "document"
        else f"- [{entry.title}] {entry.snippet}"
        for entry in items
    )
|