"""study_topic_subject_notes 분야 설명 RAG (PR-9). PR-3 explanation_rag.py 와 비슷한 패턴 — 매핑 documents 청크 + 같은 토픽 같은 subject 다른 문제·해설 → bge-reranker 로 top-K 줄임. 쿼리는 "subject + scope" 키워드 기반 (단일 문제가 아니라 분야 키워드). """ from __future__ import annotations import asyncio import logging from dataclasses import dataclass import httpx from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from ai.client import AIClient from models.chunk import DocumentChunk from models.document import Document from models.study_question import StudyQuestion from models.study_topic import StudyTopicDocument from services.search.license_filter import restricted_exclude_orm logger = logging.getLogger(__name__) # top-K DOC_TOPK = 5 Q_TOPK = 3 MAX_RERANK_INPUT = 60 DOC_SNIPPET_LEN = 400 Q_SNIPPET_LEN = 300 MAX_QUESTION_CANDIDATES = 30 RERANK_TIMEOUT_S = 5.0 @dataclass class EvidenceItem: source_type: str source_id: int title: str snippet: str def to_dict(self) -> dict: return { "source_type": self.source_type, "source_id": self.source_id, "title": self.title, "snippet": self.snippet, } @dataclass class SubjectNoteContext: documents: list[EvidenceItem] questions: list[EvidenceItem] @property def all(self) -> list[EvidenceItem]: return [*self.documents, *self.questions] def _truncate(text: str, n: int) -> str: if not text: return "" s = text.strip() return s if len(s) <= n else s[:n].rstrip() + "…" def _build_query(subject: str, scope: str) -> str: parts = [subject] if scope: parts.append(scope) parts.append("학습 자료") return " ".join(parts) async def _rerank(client: AIClient, query: str, texts: list[str], top_k: int) -> list[int]: if not texts: return [] if len(texts) > MAX_RERANK_INPUT: texts = texts[:MAX_RERANK_INPUT] try: async with asyncio.timeout(RERANK_TIMEOUT_S): results = await client.rerank(query, texts) idxs = [r["index"] for r in results if 0 <= r.get("index", -1) < len(texts)] return idxs[:top_k] except (asyncio.TimeoutError, httpx.HTTPError) as e: logger.warning("subject_note_rerank_fallback: %s: %s", type(e).__name__, e) return list(range(min(top_k, len(texts)))) async def _gather_document_evidence( session: AsyncSession, user_id: int, study_topic_id: int, query: str, client: AIClient, ) -> list[EvidenceItem]: doc_id_rows = ( await session.execute( select(StudyTopicDocument.document_id).where( StudyTopicDocument.study_topic_id == study_topic_id, StudyTopicDocument.user_id == user_id, ) ) ).scalars().all() doc_ids = list(doc_id_rows) if not doc_ids: return [] doc_meta_rows = ( await session.execute( select(Document.id, Document.title, Document.ai_summary).where( Document.id.in_(doc_ids), Document.deleted_at.is_(None), # B-4: licensed_restricted 제외 — explanation_rag 와 동일 술어(a안 U-2①). 누락 시 # 구매 자료 verbatim 이 분야노트 RAG 로 새던 보안 drift(복제 과정 누락). restricted_exclude_orm(), ) ) ).all() doc_meta: dict[int, tuple[str | None, str | None]] = { r.id: (r.title, r.ai_summary) for r in doc_meta_rows } if not doc_meta: return [] valid_doc_ids = list(doc_meta.keys()) chunk_rows = ( await session.execute( select(DocumentChunk.doc_id, DocumentChunk.chunk_index, DocumentChunk.text) .where( DocumentChunk.doc_id.in_(valid_doc_ids), DocumentChunk.chunk_index < 4, # Hier-Decomp-1 c2: 교체된 doc 의 legacy(in_corpus=false) chunk 중복 로드 방지. DocumentChunk.in_corpus.is_(True), ) .order_by(DocumentChunk.doc_id, DocumentChunk.chunk_index) ) ).all() candidates: list[tuple[int, str]] = [] for r in chunk_rows: if r.text: candidates.append((r.doc_id, r.text)) for did, (_title, summary) in doc_meta.items(): if summary: candidates.append((did, summary)) if not candidates: return [] texts = [_truncate(t, 800) for _, t in candidates] top_idxs = await _rerank(client, query, texts, DOC_TOPK) seen_doc_ids: set[int] = set() out: list[EvidenceItem] = [] for i in top_idxs: did, text = candidates[i] if did in seen_doc_ids: continue seen_doc_ids.add(did) title = doc_meta.get(did, (None, None))[0] or f"문서 #{did}" out.append(EvidenceItem( source_type="document", source_id=did, title=title, snippet=_truncate(text, DOC_SNIPPET_LEN), )) if len(out) >= DOC_TOPK: break return out async def _gather_question_evidence( session: AsyncSession, user_id: int, study_topic_id: int, subject: str, scope: str, query: str, client: AIClient, ) -> list[EvidenceItem]: """같은 토픽 같은 subject (+scope 일치 우선) 의 문제·해설.""" base_q = ( select(StudyQuestion) .where( StudyQuestion.user_id == user_id, StudyQuestion.study_topic_id == study_topic_id, StudyQuestion.deleted_at.is_(None), ) ) # 같은 subject 우선. 없으면 같은 토픽 전체로 fallback. rows = ( await session.execute( base_q.where(StudyQuestion.subject == subject) .order_by(StudyQuestion.created_at.desc()) .limit(MAX_QUESTION_CANDIDATES) ) ).scalars().all() if not rows: rows = ( await session.execute( base_q.order_by(StudyQuestion.created_at.desc()) .limit(MAX_QUESTION_CANDIDATES) ) ).scalars().all() if not rows: return [] candidates_text: list[str] = [] for q in rows: parts = [q.question_text or ""] if q.explanation: parts.append(q.explanation) if q.ai_explanation and q.ai_explanation_status == "ready": parts.append(q.ai_explanation) candidates_text.append(" | ".join(parts)) top_idxs = await _rerank(client, query, candidates_text, Q_TOPK) out: list[EvidenceItem] = [] for i in top_idxs: q = rows[i] title_head = _truncate(q.question_text or "", 40) out.append(EvidenceItem( source_type="question", source_id=q.id, title=f"Q{q.id}: {title_head}", snippet=_truncate(candidates_text[i], Q_SNIPPET_LEN), )) return out async def gather_subject_note_context( session: AsyncSession, user_id: int, study_topic_id: int, subject: str, scope: str, ) -> SubjectNoteContext: """분야 (subject, scope) 의 RAG 근거 수집.""" client = AIClient() query = _build_query(subject, scope) try: # 같은 AsyncSession 동시 execute 회피 — 순차 직렬화(백그라운드 prefetch). # explanation_rag.gather_explanation_context 와 동형(R2 공유세션 동시성 수정). docs = await _gather_document_evidence( session, user_id, study_topic_id, query, client ) questions = await _gather_question_evidence( session, user_id, study_topic_id, subject, scope, query, client ) return SubjectNoteContext(documents=docs, questions=questions) finally: await client.close() def render_evidence_block(items: list[EvidenceItem]) -> str: if not items: return "(없음)" lines = [] for it in items: if it.source_type == "document": lines.append(f"- [자료: {it.title}] {it.snippet}") else: lines.append(f"- [{it.title}] {it.snippet}") return "\n".join(lines)