From 8074be6b6d3640d58a265aed6f0b6b44f164963d Mon Sep 17 00:00:00 2001
From: Hyungi Ahn
Date: Sat, 2 May 2026 07:33:57 +0900
Subject: [PATCH] =?UTF-8?q?feat(study):=20Phase=204-D=20=EC=9A=B4=EC=98=81?=
 =?UTF-8?q?=20=EA=B4=80=EC=B0=B0=20+=20confidence=20calibration?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 4-B v1 첫 검증 결과 자료 부족 토픽인데도 모델이 confidence='high' 박는 케이스 발견.
정의 (high = 자료 + 다른 ai_explanation 으로 패턴 명확) 보다 과신 — UX 신뢰도 위험.
자동 cap 보정 + 운영 관찰 SQL 추가.

confidence calibration (services/study/session_summary_guard):
- calibrate_confidence(c, ctx_docs_count, ready_explanation_count) 신규
  · ctx_docs_count == 0 AND ready_explanation_count == 0 → 'low' cap
  · ctx_docs_count == 0 (ready 만 있음) → 'medium' cap
  · ctx_docs_count >= 1 → 모델 값 그대로
- 모델이 정의보다 더 보수적인 값 박은 경우 (모델 'low' + cap 'medium') 는 보존 — 더 보수적인 값을 절대 올리지 않음

worker 적용 (study_session_analysis_worker):
- ctx_docs_count = len(ctx_docs)
- ready_explanation_count = sum(1 for a in prompt_attempts if a.get('ai_explanation'))
- calibrate_confidence 호출 → study_quiz_session_analysis.confidence 박힘
- job.payload 에 운영 분석 metadata 보존:
  · ctx_docs_count / ready_explanation_count
  · model_confidence_raw (모델 응답) vs calibrated_confidence (cap 후)
  · prompt_attempts / valid_attempts_total / summary_len
  → SQL 4 번 쿼리가 cap 작동 빈도 측정

scripts/phase4_health.sql (신규 운영 점검 SQL 7 섹션):
  1. 4-A study_question_jobs status × error_code 분포
  2. 4-B study_quiz_session_jobs status × error_code 분포
  3. 4-B confidence 분포 (calibrated)
  4. 4-B model_confidence_raw vs calibrated 차이 (cap 작동 빈도)
  5. 4-A/4-B 최근 7일 처리 지연 p50/p95/max/avg
  6. 4-A/4-B skipped 사유 분포
  7. 4-B guard_fail / parse_fail / llm_timeout 비율

ship gate (단위 테스트):
- test_calibrate_confidence_no_evidence_caps_to_low (3 케이스)
- test_calibrate_confidence_only_explanations_caps_to_medium (3 케이스)
- test_calibrate_confidence_with_documents_passthrough (3 케이스)
- test_calibrate_confidence_normalizes_invalid_first (2 케이스)

Plan: ~/.claude/plans/nifty-sparking-spindle.md (Phase 4-B v1 후속)

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 app/services/study/session_summary_guard.py | 29 +++++
 app/workers/study_session_analysis_worker.py | 32 ++++-
 scripts/phase4_health.sql | 119 +++++++++++++++++++
 tests/test_session_summary_guard_pattern.py | 39 +++++-
 4 files changed, 215 insertions(+), 4 deletions(-)
 create mode 100644 scripts/phase4_health.sql

diff --git a/app/services/study/session_summary_guard.py b/app/services/study/session_summary_guard.py
index 7e961b3..449437d 100644
--- a/app/services/study/session_summary_guard.py
+++ b/app/services/study/session_summary_guard.py
@@ -26,6 +26,7 @@ GUARD_PATTERN = re.compile(
 )
 
 _VALID_CONFIDENCE = {"high", "medium", "low"}
+_CONFIDENCE_ORD = {"low": 0, "medium": 1, "high": 2}
 
 
 def normalize_confidence(value: object) -> str:
@@ -37,3 +38,31 @@ def normalize_confidence(value: object) -> str:
         return "low"
     v = value.strip().lower()
     return v if v in _VALID_CONFIDENCE else "low"
+
+
+def calibrate_confidence(
+    confidence: object,
+    *,
+    ctx_docs_count: int,
+    ready_explanation_count: int,
+) -> str:
+    """Phase 4-D: 자료 부족 토픽에서 모델이 high 박는 과신 방지.
+
+    cap 정책 (보수적):
+    - 문서 evidence 0건 + ready ai_explanation 0건 → 'low' 로 cap
+    - 문서 evidence 0건 (ready ai_explanation 만 있음) → 'medium' 으로 cap
+    - 그 외 (문서 evidence 1건 이상) → 모델이 박은 값 그대로
+
+    모델이 정의보다 더 보수적인 값 박은 경우 (예: 모델 'low' 인데 cap 'medium')
+    는 그대로 유지 — 더 보수적인 값을 절대 올리지 않음.
+    """
+    base = normalize_confidence(confidence)
+    if ctx_docs_count == 0 and ready_explanation_count == 0:
+        cap = "low"
+    elif ctx_docs_count == 0:
+        cap = "medium"
+    else:
+        return base
+    if _CONFIDENCE_ORD[base] > _CONFIDENCE_ORD[cap]:
+        return cap
+    return base
diff --git a/app/workers/study_session_analysis_worker.py b/app/workers/study_session_analysis_worker.py
index 361a920..c073812 100644
--- a/app/workers/study_session_analysis_worker.py
+++ b/app/workers/study_session_analysis_worker.py
@@ -33,7 +33,11 @@ from models.study_quiz_session import StudyQuizSession
 from models.study_quiz_session_analysis import StudyQuizSessionAnalysis
 from models.study_quiz_session_job import StudyQuizSessionJob
 from services.search.llm_gate import get_mlx_gate
-from services.study.session_summary_guard import GUARD_PATTERN, normalize_confidence
+from services.study.session_summary_guard import (
+    GUARD_PATTERN,
+    calibrate_confidence,
+    normalize_confidence,
+)
 from services.study.session_summary_rag import gather_session_summary_context
 
 logger = logging.getLogger(__name__)
@@ -272,8 +276,30 @@ async def run_session_analysis_job(session: AsyncSession, job: StudyQuizSessionJ
     if len(summary_md) > SUMMARY_MAX_CHARS:
         summary_md = summary_md[:SUMMARY_MAX_CHARS].rstrip() + "…"
 
-    # 8. confidence normalize
-    confidence = normalize_confidence(confidence_raw)
+    # 8. Phase 4-D: confidence calibrate — 자료 부족 토픽 과신 방지.
+    # 문서 evidence 0 + ready ai_explanation 0 → 'low' cap.
+    # 문서 0 + ready 만 있음 → 'medium' cap.
+    # 문서 1+ → 모델 값 그대로.
+    ctx_docs_count = len(ctx_docs)
+    ready_expl_count = sum(1 for a in prompt_attempts if a.get("ai_explanation"))
+    confidence = calibrate_confidence(
+        confidence_raw,
+        ctx_docs_count=ctx_docs_count,
+        ready_explanation_count=ready_expl_count,
+    )
+
+    # job.payload 에 evidence count + 모델 raw confidence 보존 (운영 분석)
+    job_payload = dict(job.payload or {})
+    job_payload.update({
+        "ctx_docs_count": ctx_docs_count,
+        "ready_explanation_count": ready_expl_count,
+        "model_confidence_raw": normalize_confidence(confidence_raw),
+        "calibrated_confidence": confidence,
+        "prompt_attempts": len(prompt_attempts),
+        "valid_attempts_total": len(valid_attempts),
+        "summary_len": len(summary_md),
+    })
+    job.payload = job_payload
 
     # 9. UPSERT
     ts = now()
diff --git a/scripts/phase4_health.sql b/scripts/phase4_health.sql
new file mode 100644
index 0000000..f575836
--- /dev/null
+++ b/scripts/phase4_health.sql
@@ -0,0 +1,119 @@
+-- Phase 4 운영 점검 SQL — 4-A (study_question_jobs) + 4-B (study_quiz_session_jobs)
+-- 사용:
+-- ssh gpu 'docker exec -i hyungi_document_server-postgres-1 psql -U pkm pkm' < scripts/phase4_health.sql
+-- 또는 개별 SECTION 만 골라 실행. 모든 섹션은 read-only.
+
+\echo '── 1. 4-A study_question_jobs status × error_code 분포 ──'
+SELECT
+    status,
+    COALESCE(error_code, '(none)') AS error_code,
+    COUNT(*) AS cnt
+FROM study_question_jobs
+GROUP BY status, error_code
+ORDER BY status, error_code;
+
+\echo ''
+\echo '── 2. 4-B study_quiz_session_jobs status × error_code 분포 ──'
+SELECT
+    status,
+    COALESCE(error_code, '(none)') AS error_code,
+    COUNT(*) AS cnt
+FROM study_quiz_session_jobs
+GROUP BY status, error_code
+ORDER BY status, error_code;
+
+\echo ''
+\echo '── 3. 4-B study_quiz_session_analysis confidence 분포 (calibrated) ──'
+SELECT
+    COALESCE(confidence, '(null)') AS confidence,
+    COUNT(*) AS cnt,
+    COUNT(*) FILTER (WHERE is_stale) AS stale_count
+FROM study_quiz_session_analysis
+GROUP BY confidence
+ORDER BY
+    CASE COALESCE(confidence, '(null)')
+        WHEN 'high' THEN 0
+        WHEN 'medium' THEN 1
+        WHEN 'low' THEN 2
+        ELSE 3
+    END;
+
+\echo ''
+\echo '── 4. 4-B confidence calibration 차이 (job.payload 기반) ──'
+\echo ' model_confidence_raw vs calibrated_confidence — 자료 부족 cap 작동 빈도 측정'
+SELECT
+    payload->>'model_confidence_raw' AS model_raw,
+    payload->>'calibrated_confidence' AS calibrated,
+    (payload->>'ctx_docs_count')::int AS docs_n,
+    (payload->>'ready_explanation_count')::int AS ready_n,
+    COUNT(*) AS cnt
+FROM study_quiz_session_jobs
+WHERE status = 'completed'
+  AND payload IS NOT NULL
+  AND payload ? 'model_confidence_raw'
+GROUP BY model_raw, calibrated, docs_n, ready_n
+ORDER BY cnt DESC
+LIMIT 20;
+
+\echo ''
+\echo '── 5. 4-A/4-B 최근 7일 처리 지연 (created_at → completed_at) ──'
+\echo ' p50/p95/max 단순 ROUND(EXTRACT). 4-A 와 4-B 분리.'
+SELECT
+    'study_question_jobs' AS source,
+    COUNT(*) AS terminal_n,
+    ROUND(AVG(EXTRACT(EPOCH FROM (completed_at - created_at)))::numeric, 1) AS avg_sec,
+    ROUND(MAX(EXTRACT(EPOCH FROM (completed_at - created_at)))::numeric, 1) AS max_sec,
+    ROUND((PERCENTILE_CONT(0.5) WITHIN GROUP (
+        ORDER BY EXTRACT(EPOCH FROM (completed_at - created_at))
+    ))::numeric, 1) AS p50_sec,
+    ROUND((PERCENTILE_CONT(0.95) WITHIN GROUP (
+        ORDER BY EXTRACT(EPOCH FROM (completed_at - created_at))
+    ))::numeric, 1) AS p95_sec
+FROM study_question_jobs
+WHERE created_at >= NOW() - INTERVAL '7 days'
+  AND completed_at IS NOT NULL
+UNION ALL
+SELECT
+    'study_quiz_session_jobs',
+    COUNT(*),
+    ROUND(AVG(EXTRACT(EPOCH FROM (completed_at - created_at)))::numeric, 1),
+    ROUND(MAX(EXTRACT(EPOCH FROM (completed_at - created_at)))::numeric, 1),
+    ROUND((PERCENTILE_CONT(0.5) WITHIN GROUP (
+        ORDER BY EXTRACT(EPOCH FROM (completed_at - created_at))
+    ))::numeric, 1),
+    ROUND((PERCENTILE_CONT(0.95) WITHIN GROUP (
+        ORDER BY EXTRACT(EPOCH FROM (completed_at - created_at))
+    ))::numeric, 1)
+FROM study_quiz_session_jobs
+WHERE created_at >= NOW() - INTERVAL '7 days'
+  AND completed_at IS NOT NULL;
+
+\echo ''
+\echo '── 6. 4-A/4-B skipped 사유 분포 (어떤 데이터 부족이 가장 많이 막는가) ──'
+SELECT
+    'study_question_jobs' AS source,
+    error_code,
+    COUNT(*) AS cnt
+FROM study_question_jobs
+WHERE status = 'skipped'
+GROUP BY error_code
+UNION ALL
+SELECT
+    'study_quiz_session_jobs',
+    error_code,
+    COUNT(*)
+FROM study_quiz_session_jobs
+WHERE status = 'skipped'
+GROUP BY error_code
+ORDER BY source, cnt DESC;
+
+\echo ''
+\echo '── 7. 4-B guard_fail / parse_fail / llm_timeout 비율 (전체 job 대비) ──'
+SELECT
+    error_code,
+    COUNT(*) AS cnt,
+    ROUND(100.0 * COUNT(*) / NULLIF((SELECT COUNT(*) FROM study_quiz_session_jobs), 0), 1) AS pct
+FROM study_quiz_session_jobs
+WHERE error_code IN ('guard_fail', 'parse_fail', 'llm_timeout', 'unknown')
+GROUP BY error_code
+ORDER BY cnt DESC;
diff --git a/tests/test_session_summary_guard_pattern.py b/tests/test_session_summary_guard_pattern.py
index a31ca71..a23cb32 100644
--- a/tests/test_session_summary_guard_pattern.py
+++ b/tests/test_session_summary_guard_pattern.py
@@ -12,7 +12,11 @@ from pathlib import Path
 ROOT = Path(__file__).resolve().parent.parent
 sys.path.insert(0, str(ROOT / "app"))
 
-from services.study.session_summary_guard import GUARD_PATTERN, normalize_confidence # noqa: E402
+from services.study.session_summary_guard import ( # noqa: E402
+    GUARD_PATTERN,
+    calibrate_confidence,
+    normalize_confidence,
+)
 
 
 # ─── GUARD_PATTERN 허용 케이스 (search() == None 이어야 함) ───
@@ -63,10 +67,43 @@ def test_normalize_confidence_nonstandard_values():
         assert normalize_confidence(v) == "low"
 
 
+# ─── Phase 4-D calibrate_confidence — 자료 부족 토픽 cap ───
+
+def test_calibrate_confidence_no_evidence_caps_to_low():
+    # 문서 0 + ready ai_explanation 0 → cap 'low'
+    assert calibrate_confidence("high", ctx_docs_count=0, ready_explanation_count=0) == "low"
+    assert calibrate_confidence("medium", ctx_docs_count=0, ready_explanation_count=0) == "low"
+    assert calibrate_confidence("low", ctx_docs_count=0, ready_explanation_count=0) == "low"
+
+
+def test_calibrate_confidence_only_explanations_caps_to_medium():
+    # 문서 0 + ready ai_explanation 있음 → cap 'medium'
+    assert calibrate_confidence("high", ctx_docs_count=0, ready_explanation_count=3) == "medium"
+    assert calibrate_confidence("medium", ctx_docs_count=0, ready_explanation_count=3) == "medium"
+    assert calibrate_confidence("low", ctx_docs_count=0, ready_explanation_count=3) == "low"
+
+
+def test_calibrate_confidence_with_documents_passthrough():
+    # 문서 1건 이상 → 모델 값 그대로 통과
+    assert calibrate_confidence("high", ctx_docs_count=2, ready_explanation_count=0) == "high"
+    assert calibrate_confidence("medium", ctx_docs_count=1, ready_explanation_count=0) == "medium"
+    assert calibrate_confidence("low", ctx_docs_count=5, ready_explanation_count=10) == "low"
+
+
+def test_calibrate_confidence_normalizes_invalid_first():
+    # 비표준 값은 normalize_confidence 가 'low' 로 박은 후 cap 적용 — 결과는 'low'
+    assert calibrate_confidence("unknown", ctx_docs_count=0, ready_explanation_count=0) == "low"
+    assert calibrate_confidence(None, ctx_docs_count=5, ready_explanation_count=5) == "low"
+
+
 if __name__ == "__main__":
     # 직접 실행 시 모든 케이스 빠른 점검
     test_guard_pattern_allows_normal_summary()
     test_guard_pattern_blocks_numeric_hallucination()
     test_normalize_confidence_standard_values()
     test_normalize_confidence_nonstandard_values()
+    test_calibrate_confidence_no_evidence_caps_to_low()
+    test_calibrate_confidence_only_explanations_caps_to_medium()
+    test_calibrate_confidence_with_documents_passthrough()
+    test_calibrate_confidence_normalizes_invalid_first()
     print("OK")