From e405ed3414b5ea17daa10f1364b62647f18b9a96 Mon Sep 17 00:00:00 2001
From: Hyungi Ahn <hyungiahn@Hyungiui-MacBookPro.local>
Date: Fri, 10 Apr 2026 16:11:57 +0900
Subject: [PATCH] =?UTF-8?q?fix(ask):=20evidence=20sparse=20=EB=AC=B8?=
 =?UTF-8?q?=EC=A0=9C=20=ED=95=B4=EA=B2=B0=20=E2=80=94=20=ED=94=84=EB=A1=AC?=
 =?UTF-8?q?=ED=94=84=ED=8A=B8=20+=20supplement=20+=20source=20=EB=B6=84?=
 =?UTF-8?q?=EB=A6=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

근본 원인: evidence 프롬프트가 "<0.5 = 탈락" 명시 → LLM 하향 편향 →
candidates 5개 중 4개 탈락 → synthesis 자체 거부.

Change 2: evidence_extract.txt
- relevance 스케일 재정의: "탈락" 라벨 제거
- 0.3~0.5 약한 부분 연관 / 0.5~0.7 명확한 부분 연관 구간 세분화, <0.3 은 무관
  (참고: evidence_service.py 의 MIN_RELEVANCE_KEEP=0.5 는 이 패치에서 변경하지 않음)
- "directly answer" → "no connection at all" 완화

Change 3: search_synthesis.txt
- refused 조건: "직접 답 아니면 거부" → "완전 무관일 때만 거부"
- "covered only" 제한: partial evidence로 missing part 추론 금지
- supplement evidence weight 지시 추가 (보조 취급)

Change 1: evidence_service.py
- sparse evidence supplement: LLM kept 1~2 + candidates 3+ → rule-only 보충 (합계 MIN_EVIDENCE_FOR_SYNTHESIS=3개까지)
- substring + critical token 필터 (recall+precision)
- critical token: 길이 3자+ OR 핵심어(조건/기준/처벌 등)를 포함하는 토큰 (접미 위치 한정이 아닌 substring 포함 검사)
- EvidenceItem.source 필드 ("llm"|"supplement"|"rule_fallback")

Change 4: search.py
- defense_log["evidence"] 추가 (skip_reason, kept_count)

synthesis_service.py
- supplement evidence [n] (보충) 마킹

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 app/api/search.py                        |  4 ++
 app/prompts/evidence_extract.txt         |  7 +--
 app/prompts/search_synthesis.txt         | 10 ++--
 app/services/search/evidence_service.py  | 63 +++++++++++++++++++++++-
 app/services/search/synthesis_service.py | 11 +++--
 5 files changed, 84 insertions(+), 11 deletions(-)

diff --git a/app/api/search.py b/app/api/search.py
index bcfcde1..da46c34 100644
--- a/app/api/search.py
+++ b/app/api/search.py
@@ -538,6 +538,10 @@ async def ask(
         else:
             grounding.weak_flags.append(f"verifier_{c.type}:{c.claim[:30]}")
 
+    defense_log["evidence"] = {
+        "skip_reason": ev_skip,
+        "kept_count": len(evidence),
+    }
     defense_log["grounding"] = {
         "strong": grounding.strong_flags,
         "weak": grounding.weak_flags,
diff --git a/app/prompts/evidence_extract.txt b/app/prompts/evidence_extract.txt
index a231f1c..7e196ca 100644
--- a/app/prompts/evidence_extract.txt
+++ b/app/prompts/evidence_extract.txt
@@ -2,7 +2,7 @@ You are an evidence span extractor. Respond ONLY in JSON. No markdown, no explan
 
 ## Task
 
-For each numbered candidate, extract the most query-relevant span from the original text (copy verbatim, 50-200 chars) and rate relevance 0.0~1.0. If the candidate does not directly answer the query, set span=null, relevance=0.0, skip_reason.
+For each numbered candidate, extract the most query-relevant span from the original text (copy verbatim, 50-200 chars) and rate relevance 0.0~1.0. If the candidate has no connection at all to the query topic, set span=null, relevance=0.0, skip_reason. Partial or indirect relevance should still get a span and relevance >= 0.3.
 
 ## Output Schema
 {
@@ -23,8 +23,9 @@ For each numbered candidate, extract the most query-relevant span from the origi
 - `relevance`: 0.0~1.0 float
   - 0.9+ query에 직접 답함
   - 0.7~0.9 강한 연관
-  - 0.5~0.7 부분 연관
-  - <0.5 약한/무관 (fallback에서 탈락)
+  - 0.5~0.7 명확한 부분 연관 (query의 핵심 측면 일부를 커버)
+  - 0.3~0.5 약한 부분 연관 (query 주제에 관련되나 직접 답은 아님)
+  - <0.3 무관
 - `skip_reason`: span=null 일 때만 필수. 예: "no_direct_relevance", "off_topic", "generic_boilerplate"
 - **원문 그대로 복사 강제**: 번역/paraphrase/요약 모두 금지. evidence span은 citation 원문이 되어야 한다.
 
diff --git a/app/prompts/search_synthesis.txt b/app/prompts/search_synthesis.txt
index 927c64c..45737d9 100644
--- a/app/prompts/search_synthesis.txt
+++ b/app/prompts/search_synthesis.txt
@@ -17,16 +17,18 @@ Given a query and numbered evidence spans, write a short answer that cites speci
 - `answer`: **400 characters max**. Must contain inline `[n]` citations. Every claim sentence ends with at least one `[n]`. Multiple sources: `[1][3]`. **Only use facts present in evidence. No outside knowledge, no guessing, no paraphrasing what is not there.**
 - `used_citations`: integer list of `n` values that actually appear in `answer` (for cross-check). Must be sorted ascending, no duplicates.
 - `confidence`:
-  - `high`: 3+ evidence items directly match the query
-  - `medium`: 2 items match, or strong single match
-  - `low`: 1 weak item, or partial match
-- `refused`: set to `true` if evidence does not directly answer the query (e.g. off-topic, too generic, missing key facts). When refused:
+  - `high`: 3+ evidence items with strong relevance
+  - `medium`: 2 items match, or 1 strong match
+  - `low`: 1-2 weak/partial items
+- `refused`: set to `true` ONLY if evidence is completely off-topic (e.g., query about 연차휴가 but evidence only about 산업안전). If evidence is partially relevant or covers a related aspect, attempt an answer with low confidence instead of refusing. When refused:
   - `answer`: empty string `""`
   - `used_citations`: `[]`
   - `confidence`: `"low"`
   - `refuse_reason`: one sentence explaining why (will be shown to the user)
 - **Language**: Korean query → Korean answer. English query → English answer. Match query language.
 - **Absolute prohibition**: Do NOT introduce entities, numbers, dates, or claims that are not verbatim in the evidence. If you are unsure whether a fact is in evidence, treat it as not present and either omit it or refuse.
+- **Partial coverage**: If evidence covers only PART of the query, answer ONLY the covered part. Do NOT infer or guess missing parts. Explicitly state what the evidence covers.
+- **Supplementary evidence**: Evidence marked (보충) is supplementary context, less reliable than primary evidence. Use it only as supporting detail. If primary and supplementary evidence conflict, trust primary.
 
 ## Example 1 (happy path, high confidence)
 query: `산업안전보건법 제6장 주요 내용`
diff --git a/app/services/search/evidence_service.py b/app/services/search/evidence_service.py
index 18f3542..ba9dbf9 100644
--- a/app/services/search/evidence_service.py
+++ b/app/services/search/evidence_service.py
@@ -49,6 +49,7 @@ top1 p50 > 0.75. 셋 다 충족해야 켠다.
 from __future__ import annotations
 
 import asyncio
+import re
 import time
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING
@@ -69,6 +70,7 @@ EVIDENCE_MIN_RERANK = 0.25  # 1차 rule cut — rerank score 이 미만은 제
 MAX_EVIDENCE_CANDIDATES = 6  # LLM 입력 상한
 MAX_PER_DOC = 2
 CANDIDATE_SNIPPET_CHARS = 800  # LLM 이 볼 원문 창 크기
+MIN_EVIDENCE_FOR_SYNTHESIS = 3  # sparse evidence supplement 임계값
 
 MIN_RELEVANCE_KEEP = 0.5  # LLM 출력 필터
 SPAN_MIN_CHARS = 80  # 이 미만이면 window enlarge
@@ -103,6 +105,7 @@ class EvidenceItem:
     relevance: float  # LLM 0~1 (fallback 시 rerank_score 복사)
     rerank_score: float  # raw reranker 점수
     full_snippet: str  # 원본 800자 (debug/citation 전용, synthesis 금지)
+    source: str = "llm"  # "llm" | "supplement" | "rule_fallback"
 
 
 # ─── 프롬프트 로딩 (module 초기화 1회) ───────────────────
@@ -216,6 +219,7 @@ def _build_rule_only_evidence(
                 relevance=float(c.rerank_score or c.score or 0.0),
                 rerank_score=float(c.rerank_score or c.score or 0.0),
                 full_snippet=full,
+                source="rule_fallback",
             )
         )
     return items
@@ -397,6 +401,63 @@ async def extract_evidence(
         )
         return items, "all_llm_rejected"
 
+    # ── Sparse evidence supplement (Phase 3.5b) ────────
+    # dead zone 해소: LLM kept 1~2 + candidates 충분 → rule-only 보충
+    supplement_skip = None
+    if 0 < len(llm_items) < MIN_EVIDENCE_FOR_SYNTHESIS and len(candidates) >= MIN_EVIDENCE_FOR_SYNTHESIS:
+        llm_n_set = {it.n for it in llm_items}
+        supplement_count = MIN_EVIDENCE_FOR_SYNTHESIS - len(llm_items)
+        supplemented = 0
+
+        # substring + critical token 필터 준비
+        query_tokens = re.findall(r'[가-힣]{2,}|[a-zA-Z]{3,}', query)
+        _IMPORTANT_SUFFIXES = {"조건", "기준", "요건", "처벌", "벌칙",
+                               "정의", "차이", "절차", "방법", "계산"}
+        critical_tokens = [
+            t for t in query_tokens
+            if len(t) >= 3 or any(s in t for s in _IMPORTANT_SUFFIXES)
+        ]
+
+        for idx, (c, full) in enumerate(zip(candidates, full_snippets), 1):
+            if idx in llm_n_set or supplement_count <= 0:
+                continue
+            span = _extract_window(full, query, target_chars=200)
+            span, _ = _normalize_span(span, full, query)
+
+            # substring match (recall)
+            has_match = any(qt in span for qt in query_tokens)
+            # critical token check (precision)
+            has_critical = (
+                any(ct in span for ct in critical_tokens)
+                if critical_tokens else has_match
+            )
+            if not (has_match and has_critical):
+                continue
+
+            llm_items.append(
+                EvidenceItem(
+                    n=idx,
+                    chunk_id=c.chunk_id,
+                    doc_id=c.id,
+                    title=c.title,
+                    section_title=c.section_title,
+                    span_text=span,
+                    relevance=float(c.rerank_score or c.score or 0.0) * 0.8,
+                    rerank_score=float(c.rerank_score or c.score or 0.0),
+                    full_snippet=full,
+                    source="supplement",
+                )
+            )
+            supplemented += 1
+            supplement_count -= 1
+
+        if supplemented > 0:
+            supplement_skip = "sparse_evidence_supplemented"
+            logger.info(
+                "evidence sparse_supplement query=%r llm_kept=%d supplemented=%d total=%d",
+                query[:80], len(llm_items) - supplemented, supplemented, len(llm_items),
+            )
+
     # ── doc-group ordering + n 재부여 ───────────────────
     llm_items = _apply_doc_group_ordering(llm_items, results)
 
@@ -404,4 +465,4 @@ async def extract_evidence(
         "evidence ok query=%r candidates=%d kept=%d short_span_expanded=%d elapsed_ms=%.0f",
         query[:80], len(candidates), len(llm_items), short_span_expanded, elapsed_ms,
     )
-    return llm_items, None
+    return llm_items, supplement_skip
diff --git a/app/services/search/synthesis_service.py b/app/services/search/synthesis_service.py
index 3d70589..74ac125 100644
--- a/app/services/search/synthesis_service.py
+++ b/app/services/search/synthesis_service.py
@@ -159,10 +159,15 @@ def _render_prompt(query: str, evidence: list["EvidenceItem"]) -> str:
     제한 뷰만 만들어서 full_snippet 접근을 문법적으로 어렵게 만든다.
     """
     # 제한 뷰 — 이 튜플에는 span_text 외의 snippet 필드가 없다
-    spans: list[tuple[int, str, str]] = [
-        (i.n, (i.title or "").strip(), i.span_text) for i in evidence
+    # source="supplement" 항목은 (보충) 마킹 — synthesis 가 보조 취급하도록
+    spans: list[tuple[int, str, str, str]] = [
+        (i.n, (i.title or "").strip(), i.span_text, getattr(i, "source", "llm"))
+        for i in evidence
+    ]
+    lines = [
+        f"[{n}] {'(보충) ' if src == 'supplement' else ''}{title}\n{span}"
+        for n, title, span, src in spans
     ]
-    lines = [f"[{n}] {title}\n{span}" for n, title, span in spans]
     numbered_block = "\n\n".join(lines)
     return SYNTHESIS_PROMPT.replace("{query}", query).replace(
         "{numbered_evidence}", numbered_block