From 59e38d80b04d5b4724c2438c43773ca46fc67a57 Mon Sep 17 00:00:00 2001
From: Hyungi Ahn
Date: Thu, 16 Apr 2026 13:52:14 +0900
Subject: [PATCH] =?UTF-8?q?feat(api):=20Phase=20E.1=20=E2=80=94=20ask=5Fev?=
 =?UTF-8?q?ents=20=EC=B8=A1=EC=A0=95=20=ED=95=84=EB=93=9C=20=ED=99=95?=
 =?UTF-8?q?=EC=9E=A5=20(answer=5Flength/prompt=5Fversion)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

E.3 400→600자 튜닝 전후 비교 + 단계 5 failure mode 분석의 기준 필드 추가.

- migrations/135: answer_length/covered_aspects/missing_aspects/model_name/prompt_version 컬럼 + prompt_version 인덱스
- ORM: ask_event.py에 동일 5개 필드 매핑
- prompt_versions.py: ASK_PROMPT_VERSION="search_synthesis.v1-400char" 상수 + resolve_primary_model() helper
- search_telemetry.record_ask_event: 시그니처에 기본값 None인 선택 필드 5개 추가 (keyword-only는 아님 — 키워드 전달 권장, 하위 호환)
- search.py: refused + success 두 호출사이트에서 새 필드 전달.
  answer_length는 len(sr.answer or ""), model_name/prompt_version은 상수 모듈 기반

기존 호출 구조(이미 search_telemetry+background_tasks로 DB insert 중)는 유지. 순수 확장 커밋.
Co-Authored-By: Claude Opus 4.6 (1M context) --- app/api/search.py | 13 +++++++ app/models/ask_event.py | 6 ++++ app/services/prompt_versions.py | 40 +++++++++++++++++++++ app/services/search_telemetry.py | 20 ++++++++++- migrations/135_ask_events_observability.sql | 12 +++++++ 5 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 app/services/prompt_versions.py create mode 100644 migrations/135_ask_events_observability.sql diff --git a/app/api/search.py b/app/api/search.py index 39acca1..d77b6a1 100644 --- a/app/api/search.py +++ b/app/api/search.py @@ -29,6 +29,7 @@ from services.search.refusal_gate import RefusalDecision, decide as refusal_deci from services.search.search_pipeline import PipelineResult, run_search from services.search.synthesis_service import SynthesisResult, synthesize from services.search.verifier_service import VerifierResult, verify +from services.prompt_versions import ASK_PROMPT_VERSION, resolve_primary_model from services.search_telemetry import record_ask_event, record_search_event # logs/search.log + stdout 동시 출력 (Phase 0.4) @@ -493,6 +494,12 @@ async def ask( sum(sorted(all_rerank_scores, reverse=True)[:3]), [], len(evidence), 0, defense_log, int(total_ms), + # Phase E.1 측정 필드 + answer_length=0, + covered_aspects=classifier_result.covered_aspects or None, + missing_aspects=classifier_result.missing_aspects or None, + model_name=resolve_primary_model(), + prompt_version=ASK_PROMPT_VERSION, ) debug_obj = None if debug: @@ -684,6 +691,12 @@ async def ask( sum(sorted(all_rerank_scores, reverse=True)[:3]), sr.hallucination_flags, len(evidence), len(citations), defense_log, int(total_ms), + # Phase E.1 측정 필드 + answer_length=len(sr.answer or ""), + covered_aspects=covered_aspects, + missing_aspects=missing_aspects, + model_name=resolve_primary_model(), + prompt_version=ASK_PROMPT_VERSION, ) debug_obj = None diff --git a/app/models/ask_event.py b/app/models/ask_event.py index ebcc84b..b70c061 100644 --- a/app/models/ask_event.py 
+++ b/app/models/ask_event.py @@ -33,6 +33,12 @@ class AskEvent(Base): citation_count: Mapped[int | None] = mapped_column(Integer) defense_layers: Mapped[dict[str, Any] | None] = mapped_column(JSONB) total_ms: Mapped[int | None] = mapped_column(Integer) + # Phase E.1: 측정 필드 확장 (answer_length가 E.3 400→600자 비교 핵심) + answer_length: Mapped[int | None] = mapped_column(Integer) + covered_aspects: Mapped[list[Any] | None] = mapped_column(JSONB) + missing_aspects: Mapped[list[Any] | None] = mapped_column(JSONB) + model_name: Mapped[str | None] = mapped_column(Text) + prompt_version: Mapped[str | None] = mapped_column(Text) created_at: Mapped[datetime] = mapped_column( DateTime(timezone=True), default=datetime.now, nullable=False ) diff --git a/app/services/prompt_versions.py b/app/services/prompt_versions.py new file mode 100644 index 0000000..759c134 --- /dev/null +++ b/app/services/prompt_versions.py @@ -0,0 +1,40 @@ +"""프롬프트/모델 버전 상수 — telemetry 기록용 (Phase E.1) + +목적: ask_events / analyze_events 에 prompt_version 과 model_name 을 기록해서 +튜닝 전/후 비교와 실험 분기를 식별 가능하게 함. + +규칙: + - 프롬프트 파일이 의미 있게 바뀌면 해당 상수 문자열을 bump (예: v1-400char → v2-600char) + - 하드코딩 금지. 파이프라인은 여기 상수만 참조. + - 모델명은 런타임 config(settings.ai.primary.model)에서 읽어서 resolve_primary_model() 사용. + +E.3 배포 타임라인: + - v1-400char → 현재 (search_synthesis.txt 17행 "400 characters max") + - v2-600char → E.3 배포 시 bump (동일 파일 "600 characters max") +""" + +from __future__ import annotations + +# ─── ask (/search/ask) 프롬프트 버전 ───────────────────────── +# synthesis_service.py 가 로드하는 app/prompts/search_synthesis.txt 기준 +ASK_PROMPT_VERSION: str = "search_synthesis.v1-400char" + +# ─── /analyze 프롬프트 버전 ────────────────────────────────── +# documents.py analyze 라우트가 로드하는 app/prompts/document_analyze.txt 기준 +ANALYZE_PROMPT_VERSION: str = "document_analyze.v1" + + +def resolve_primary_model() -> str | None: + """런타임 config에서 primary 모델명을 resolve. + + settings.ai 가 미구성이면 None. + telemetry 기록은 None 허용 (측정 필드는 nullable). 
+ """ + try: + from core.config import settings + + if settings.ai and settings.ai.primary: + return settings.ai.primary.model + except Exception: + pass + return None diff --git a/app/services/search_telemetry.py b/app/services/search_telemetry.py index 2f22db7..178cc6f 100644 --- a/app/services/search_telemetry.py +++ b/app/services/search_telemetry.py @@ -327,8 +327,21 @@ async def record_ask_event( citation_count: int, defense_layers: dict[str, Any], total_ms: int, + # Phase E.1: 측정 필드 확장 + answer_length: int | None = None, + covered_aspects: list[str] | None = None, + missing_aspects: list[str] | None = None, + model_name: str | None = None, + prompt_version: str | None = None, ) -> None: - """ask_events INSERT. background task에서 호출 — 에러 삼킴.""" + """ask_events INSERT. background task에서 호출 — 에러 삼킴. + + Phase E.1 확장 필드(키워드 전달 권장): + - answer_length: len(ai_answer or "") — 400→600자 효과 측정 핵심 + - covered_aspects / missing_aspects: classifier 결과 그대로 + - model_name: resolve_primary_model() 또는 호출사이트 명시 + - prompt_version: ASK_PROMPT_VERSION 상수 + """ try: async with async_session() as session: row = AskEvent( @@ -346,6 +359,11 @@ async def record_ask_event( citation_count=citation_count, defense_layers=defense_layers, total_ms=total_ms, + answer_length=answer_length, + covered_aspects=covered_aspects, + missing_aspects=missing_aspects, + model_name=model_name, + prompt_version=prompt_version, ) session.add(row) await session.commit() diff --git a/migrations/135_ask_events_observability.sql b/migrations/135_ask_events_observability.sql new file mode 100644 index 0000000..884c80f --- /dev/null +++ b/migrations/135_ask_events_observability.sql @@ -0,0 +1,12 @@ +-- Phase E.1: ask_events 측정 필드 확장 +-- answer_length/covered_aspects/missing_aspects/model_name/prompt_version 추가 +-- E.3 (400→600자) 전후 비교 기준 + 단계 5 2-모델 체인 failure mode 분석 근거 + +ALTER TABLE ask_events + ADD COLUMN IF NOT EXISTS answer_length INT, + ADD COLUMN IF NOT EXISTS covered_aspects JSONB, + ADD COLUMN IF NOT 
EXISTS missing_aspects JSONB, + ADD COLUMN IF NOT EXISTS model_name TEXT, + ADD COLUMN IF NOT EXISTS prompt_version TEXT; + +CREATE INDEX IF NOT EXISTS idx_ask_events_prompt_version ON ask_events(prompt_version);