# hyungi_document_server/app/services/study/study_memo_card_guards.py

"""공부 암기노트 카드별 가드 — 추출된 카드 1장 검증 파이프라인.

explanation 워커의 단일 answer_choice 환각가드를 카드 배열로 확장한다. 가드 4종:
  1. 형식 유효성 — format in {qa, cloze}, cue/fact 비공백, cloze 는 cloze_text + 빈칸 마커 필요.
  2. 근거(hallucination) — 정답토큰(fact)이 신뢰 텍스트에 등장해야 채택.
       정량 토큰(숫자 포함): evidence 원문 snippet 에 등장 필수 (평문화된 ai_explanation 만으론 불충분).
       비정량(개념): ai_explanation 또는 evidence snippet 에 등장.
  3. 누출 — cue 에 정답 노출 / cloze 평문에 정답 노출 시 drop.
  4. dedup — (source_question_id, format, normalize(정답토큰)) hash. 배치 내 중복 1장.

무결성은 구조로(메모리 규칙): dedup_hash PARTIAL UNIQUE(migration 288)가 DB 최종 방어선,
본 가드는 1차. 전부 drop 이면 빈 리스트 → 워커가 all_dropped 로 종결.
"""

from __future__ import annotations

from dataclasses import dataclass, field

from services.study import card_normalize as cn

# Card formats accepted by guard_card; a card whose "format" is not in this set is dropped.
_VALID_FORMATS = {"qa", "cloze"}


@dataclass
class GuardedCard:
    """A card that passed every check in ``guard_card``.

    ``guard_card`` builds instances from the stripped field values of the raw
    card dict once the format, leak and evidence checks have all succeeded.
    """

    # Card format; guard_card only emits values found in _VALID_FORMATS.
    format: str
    # Prompt side of the card (stripped, non-empty).
    cue: str
    # Answer token of the card (stripped, non-empty).
    fact: str
    # Stripped cloze sentence when format == "cloze"; None for any other format.
    cloze_text: str | None
    # Result of cn.compute_dedup_hash(source_question_id, format, fact);
    # guard_cards uses it to keep a single card per hash within a batch.
    dedup_hash: str
    # Result of cn.matching_evidence(fact, evidence_refs). May be empty for a
    # non-quantitative fact that was accepted through ai_explanation alone.
    matched_evidence: list[dict] = field(default_factory=list)


def _as_stripped_text(value: object) -> str:
    """Return ``value`` stripped if it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ""


def guard_card(
    card: dict,
    *,
    source_question_id: int | None,
    ai_explanation: str | None,
    evidence_refs: list[dict],
) -> GuardedCard | None:
    """Validate a single extracted card.

    Checks run in this order and the first failure drops the card:

    1. Format validity: ``format`` must be in ``_VALID_FORMATS`` and ``cue`` /
       ``fact`` must be non-blank strings; a ``cloze`` card additionally needs
       a ``cloze_text`` that matches ``cn._BLANK``.
    2. Answer leak: ``cn.is_cue_leak(cue, fact)`` and, for ``cloze`` cards,
       ``cn.is_cloze_self_leak(cloze_text, fact)`` must both be false.
    3. Evidence (hallucination guard): a fact for which
       ``cn.is_quantitative`` is true must have a non-empty
       ``cn.matching_evidence`` result; any other fact is also accepted when
       ``cn.text_contains(ai_explanation, fact)`` is true.

    Field values that are not strings (for example a numeric ``fact`` decoded
    from JSON) are treated as missing, so the card is dropped instead of
    raising ``AttributeError``.

    Args:
        card: Raw card mapping; the keys read are ``format``, ``cue``,
            ``fact`` and ``cloze_text``.
        source_question_id: Passed through to ``cn.compute_dedup_hash``.
        ai_explanation: Explanation text used as a fallback source for
            non-quantitative facts.
        evidence_refs: Evidence entries handed to ``cn.matching_evidence``.

    Returns:
        A ``GuardedCard`` when every check passes, otherwise ``None``.
    """
    fmt = _as_stripped_text(card.get("format"))
    cue = _as_stripped_text(card.get("cue"))
    fact = _as_stripped_text(card.get("fact"))
    # A missing, non-string or blank cloze_text all collapse to None.
    cloze_text = _as_stripped_text(card.get("cloze_text")) or None

    # 1. Format validity (emptiness is checked first; it is the cheapest test).
    if not (fmt and cue and fact) or fmt not in _VALID_FORMATS:
        return None
    if fmt == "cloze":
        if not cloze_text or not cn._BLANK.search(cloze_text):
            return None

    # 2. Leak: the answer must not be exposed by the cue or the cloze text.
    if cn.is_cue_leak(cue, fact):
        return None
    if fmt == "cloze" and cn.is_cloze_self_leak(cloze_text, fact):
        return None

    # 3. Evidence (hallucination guard).
    matched = cn.matching_evidence(fact, evidence_refs)
    if cn.is_quantitative(fact):
        # Quantitative facts must be backed by the evidence itself.
        if not matched:
            return None
    else:
        # Other facts may be backed by either the evidence or ai_explanation.
        if not matched and not cn.text_contains(ai_explanation, fact):
            return None

    return GuardedCard(
        format=fmt,
        cue=cue,
        fact=fact,
        cloze_text=cloze_text if fmt == "cloze" else None,
        dedup_hash=cn.compute_dedup_hash(source_question_id, fmt, fact),
        matched_evidence=matched,
    )


def guard_cards(
    cards: list[dict],
    *,
    source_question_id: int | None,
    ai_explanation: str | None,
    evidence_refs: list[dict],
) -> list[GuardedCard]:
    """Validate a batch of cards and keep one card per ``dedup_hash``.

    Entries that are not dicts are skipped, every remaining entry goes through
    ``guard_card`` with the same keyword arguments, and when several passing
    cards share a ``dedup_hash`` only the first one is kept. A falsy ``cards``
    value yields an empty list.

    Returns:
        The passing cards in their original order.
    """
    # Insertion-ordered mapping: hash -> first card that produced it.
    accepted: dict[str, GuardedCard] = {}
    for raw in cards or []:
        if isinstance(raw, dict):
            checked = guard_card(
                raw,
                source_question_id=source_question_id,
                ai_explanation=ai_explanation,
                evidence_refs=evidence_refs,
            )
            if checked is not None and checked.dedup_hash not in accepted:
                accepted[checked.dedup_hash] = checked
    return list(accepted.values())