Files
hyungi_document_server/app/services/study/subject_note_rag.py
T
hyungi f940f50c60 feat(search): route retrieval through corpus_chunks view (Hier-Decomp-1 c2)
baseline chunk 벡터검색을 document_chunks → corpus_chunks 뷰(in_corpus=true)로 rewire.
in_corpus=false(비활성 hier leaf 등) 자동 제외 = 검색 오염 구조적 차단(B choke point).

- retrieval_service: baseline chunks_table=corpus_chunks, _VALID_CHUNKS_TABLE 에 corpus_chunks 허용,
  snapshot_clause 조건 corpus_chunks 포함(eval snapshot 보존). candidate(cand_*) 경로 불변.
  documents 측(FTS+doc embedding) 무변경 — doc row 는 교체 무관.
- models/chunk: 5 신규 컬럼 매핑(parent_id/level/node_type/is_leaf/in_corpus). server_default 로
  기존 chunk_worker INSERT 무영향(legacy=in_corpus true/is_leaf false).
- subject_note_rag/explanation_rag: RAG chunk 로드에 in_corpus=true 필터(교체 doc legacy 중복 방지).

게이트: G4b(rewire 불변) before/after IDENTICAL(현재 view==table no-op) / G4a(누출) synthetic
in_corpus=false leaf 가 corpus_chunks 0건·document_chunks raw top(dist 0.0) 양방향 증명. /health 200.

plan: hierarchical-decomposition-tiered-nesting-marmot.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-24 12:58:28 +00:00

260 lines
7.5 KiB
Python

"""study_topic_subject_notes 분야 설명 RAG (PR-9).
PR-3 explanation_rag.py 와 비슷한 패턴 — 매핑 documents 청크 + 같은 토픽 같은 subject 다른 문제·해설 → bge-reranker 로 top-K 줄임.
쿼리는 "subject + scope" 키워드 기반 (단일 문제가 아니라 분야 키워드).
"""
from __future__ import annotations
import asyncio
import logging
from dataclasses import dataclass
import httpx
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from ai.client import AIClient
from models.chunk import DocumentChunk
from models.document import Document
from models.study_question import StudyQuestion
from models.study_topic import StudyTopicDocument
logger = logging.getLogger(__name__)
# top-K
DOC_TOPK = 5
Q_TOPK = 3
MAX_RERANK_INPUT = 60
DOC_SNIPPET_LEN = 400
Q_SNIPPET_LEN = 300
MAX_QUESTION_CANDIDATES = 30
RERANK_TIMEOUT_S = 5.0
@dataclass
class EvidenceItem:
source_type: str
source_id: int
title: str
snippet: str
def to_dict(self) -> dict:
return {
"source_type": self.source_type,
"source_id": self.source_id,
"title": self.title,
"snippet": self.snippet,
}
@dataclass
class SubjectNoteContext:
documents: list[EvidenceItem]
questions: list[EvidenceItem]
@property
def all(self) -> list[EvidenceItem]:
return [*self.documents, *self.questions]
def _truncate(text: str, n: int) -> str:
if not text:
return ""
s = text.strip()
return s if len(s) <= n else s[:n].rstrip() + ""
def _build_query(subject: str, scope: str) -> str:
parts = [subject]
if scope:
parts.append(scope)
parts.append("학습 자료")
return " ".join(parts)
async def _rerank(client: AIClient, query: str, texts: list[str], top_k: int) -> list[int]:
if not texts:
return []
if len(texts) > MAX_RERANK_INPUT:
texts = texts[:MAX_RERANK_INPUT]
try:
async with asyncio.timeout(RERANK_TIMEOUT_S):
results = await client.rerank(query, texts)
idxs = [r["index"] for r in results if 0 <= r.get("index", -1) < len(texts)]
return idxs[:top_k]
except (asyncio.TimeoutError, httpx.HTTPError) as e:
logger.warning("subject_note_rerank_fallback: %s: %s", type(e).__name__, e)
return list(range(min(top_k, len(texts))))
async def _gather_document_evidence(
session: AsyncSession,
user_id: int,
study_topic_id: int,
query: str,
client: AIClient,
) -> list[EvidenceItem]:
doc_id_rows = (
await session.execute(
select(StudyTopicDocument.document_id).where(
StudyTopicDocument.study_topic_id == study_topic_id,
StudyTopicDocument.user_id == user_id,
)
)
).scalars().all()
doc_ids = list(doc_id_rows)
if not doc_ids:
return []
doc_meta_rows = (
await session.execute(
select(Document.id, Document.title, Document.ai_summary).where(
Document.id.in_(doc_ids),
Document.deleted_at.is_(None),
)
)
).all()
doc_meta: dict[int, tuple[str | None, str | None]] = {
r.id: (r.title, r.ai_summary) for r in doc_meta_rows
}
if not doc_meta:
return []
valid_doc_ids = list(doc_meta.keys())
chunk_rows = (
await session.execute(
select(DocumentChunk.doc_id, DocumentChunk.chunk_index, DocumentChunk.text)
.where(
DocumentChunk.doc_id.in_(valid_doc_ids),
DocumentChunk.chunk_index < 4,
# Hier-Decomp-1 c2: 교체된 doc 의 legacy(in_corpus=false) chunk 중복 로드 방지.
DocumentChunk.in_corpus.is_(True),
)
.order_by(DocumentChunk.doc_id, DocumentChunk.chunk_index)
)
).all()
candidates: list[tuple[int, str]] = []
for r in chunk_rows:
if r.text:
candidates.append((r.doc_id, r.text))
for did, (_title, summary) in doc_meta.items():
if summary:
candidates.append((did, summary))
if not candidates:
return []
texts = [_truncate(t, 800) for _, t in candidates]
top_idxs = await _rerank(client, query, texts, DOC_TOPK)
seen_doc_ids: set[int] = set()
out: list[EvidenceItem] = []
for i in top_idxs:
did, text = candidates[i]
if did in seen_doc_ids:
continue
seen_doc_ids.add(did)
title = doc_meta.get(did, (None, None))[0] or f"문서 #{did}"
out.append(EvidenceItem(
source_type="document",
source_id=did,
title=title,
snippet=_truncate(text, DOC_SNIPPET_LEN),
))
if len(out) >= DOC_TOPK:
break
return out
async def _gather_question_evidence(
session: AsyncSession,
user_id: int,
study_topic_id: int,
subject: str,
scope: str,
query: str,
client: AIClient,
) -> list[EvidenceItem]:
"""같은 토픽 같은 subject (+scope 일치 우선) 의 문제·해설."""
base_q = (
select(StudyQuestion)
.where(
StudyQuestion.user_id == user_id,
StudyQuestion.study_topic_id == study_topic_id,
StudyQuestion.deleted_at.is_(None),
)
)
# 같은 subject 우선. 없으면 같은 토픽 전체로 fallback.
rows = (
await session.execute(
base_q.where(StudyQuestion.subject == subject)
.order_by(StudyQuestion.created_at.desc())
.limit(MAX_QUESTION_CANDIDATES)
)
).scalars().all()
if not rows:
rows = (
await session.execute(
base_q.order_by(StudyQuestion.created_at.desc())
.limit(MAX_QUESTION_CANDIDATES)
)
).scalars().all()
if not rows:
return []
candidates_text: list[str] = []
for q in rows:
parts = [q.question_text or ""]
if q.explanation:
parts.append(q.explanation)
if q.ai_explanation and q.ai_explanation_status == "ready":
parts.append(q.ai_explanation)
candidates_text.append(" | ".join(parts))
top_idxs = await _rerank(client, query, candidates_text, Q_TOPK)
out: list[EvidenceItem] = []
for i in top_idxs:
q = rows[i]
title_head = _truncate(q.question_text or "", 40)
out.append(EvidenceItem(
source_type="question",
source_id=q.id,
title=f"Q{q.id}: {title_head}",
snippet=_truncate(candidates_text[i], Q_SNIPPET_LEN),
))
return out
async def gather_subject_note_context(
session: AsyncSession,
user_id: int,
study_topic_id: int,
subject: str,
scope: str,
) -> SubjectNoteContext:
"""분야 (subject, scope) 의 RAG 근거 수집."""
client = AIClient()
query = _build_query(subject, scope)
try:
docs, questions = await asyncio.gather(
_gather_document_evidence(session, user_id, study_topic_id, query, client),
_gather_question_evidence(session, user_id, study_topic_id, subject, scope, query, client),
)
return SubjectNoteContext(documents=docs, questions=questions)
finally:
await client.close()
def render_evidence_block(items: list[EvidenceItem]) -> str:
if not items:
return "(없음)"
lines = []
for it in items:
if it.source_type == "document":
lines.append(f"- [자료: {it.title}] {it.snippet}")
else:
lines.append(f"- [{it.title}] {it.snippet}")
return "\n".join(lines)