99672292d3
프로덕션 컨테이너는 /app 을 cwd 로 실행하고 import 는 `from api...`, `from core...`, `from workers...` 처럼 무접두 스타일을 사용한다. PR-A 내부 import 가 `from app.policy...`, `from app.ai.envelope` 로 되어 있어서 컨테이너에서 ModuleNotFoundError 발생. 변경: - app/policy/*.py: `from app.policy.X` → `from policy.X` - app/services/prompt_versions.py: lazy import 도 `from policy.prompt_render` - app/ai/envelope.py: 영향 없음 (내부 import 없음) - tests/policy/*.py: 모두 `from policy.X` / `from ai.envelope` 로 통일 - tests/policy/conftest.py: 로컬 pytest 용 sys.path.insert(app/) 추가 (MacBook 에서 repo-root 기준 실행 시 app/ 를 package root 로 취급) CI: pytest tests/policy/ -q → 98 passed (로컬, 동일 결과) 프로덕션: docker exec fastapi python -c "from policy.loader import load_policy" → OK Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
291 lines
11 KiB
Python
291 lines
11 KiB
Python
"""INV-2, INV-3, INV-4, INV-5, INV-6 — 결정론적 불변식 검증."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from policy.routing import (
|
|
REASON_FALLBACK_DOMAIN,
|
|
REASON_HIGH_IMPACT,
|
|
REASON_LONG_CONTEXT,
|
|
REASON_LOW_CONFIDENCE,
|
|
REASON_MULTI_DOC,
|
|
REASON_RISK_FLAG,
|
|
decide_routing,
|
|
)
|
|
|
|
|
|
# =====================================================================
|
|
# INV-2: risk_flag_requires_26b_forces_escalation
|
|
# =====================================================================
|
|
|
|
|
|
def test_risk_flag_forces_escalation(policy):
    """INV-2: a risk flag marked requires_26b=True always forces escalation."""
    # Everything else is chosen so that it would NOT escalate by itself:
    # news_item is not high-impact and the confidence is high. The only
    # trigger left is safety_legal_interpretation (requires_26b=true).
    routing_kwargs = dict(
        subject_domain="news_item",
        content_chars=500,
        self_declared_high_impact=False,
        self_declared_risk_flags=["safety_legal_interpretation"],
        confidence=0.95,
        policy=policy,
    )
    outcome = decide_routing(**routing_kwargs)

    assert outcome.escalate_to_26b is True
    assert REASON_RISK_FLAG in outcome.escalation_reasons
    assert "safety_legal_interpretation" in outcome.risk_flags
|
|
|
|
|
|
def test_pii_flag_does_not_force_escalation_on_its_own(policy):
    """pii_present has requires_26b=false, so alone it must not escalate."""
    result = decide_routing(
        policy=policy,
        subject_domain="news_item",
        content_chars=500,
        confidence=0.95,
        self_declared_high_impact=False,
        self_declared_risk_flags=["pii_present"],
    )

    # The flag is still recorded on the decision ...
    assert "pii_present" in result.risk_flags
    # ... but with no other escalation condition present, routing stays put.
    assert result.escalate_to_26b is False
|
|
|
|
|
|
# =====================================================================
|
|
# INV-3: context_cap_forces_escalation
|
|
# =====================================================================
|
|
|
|
|
|
def test_context_cap_forces_escalation(policy):
    """INV-3: content_chars > context_char_cap_4b triggers long_context escalation."""
    # One character past the cap read from the policy.
    over_cap = policy.escalation.context_char_cap_4b + 1

    routed = decide_routing(
        subject_domain="news_item",
        content_chars=over_cap,
        self_declared_high_impact=False,
        confidence=0.95,
        policy=policy,
    )

    assert routed.escalate_to_26b is True
    assert REASON_LONG_CONTEXT in routed.escalation_reasons
|
|
|
|
|
|
def test_context_at_cap_does_not_escalate(policy):
    """Boundary: content_chars == cap must not escalate (comparison is strict >)."""
    exactly_at_cap = policy.escalation.context_char_cap_4b

    routed = decide_routing(
        subject_domain="news_item",
        content_chars=exactly_at_cap,
        self_declared_high_impact=False,
        confidence=0.95,
        policy=policy,
    )

    # news_item is not high-impact and no other condition is supplied, so the
    # long-context reason is the only one this boundary could have produced.
    assert REASON_LONG_CONTEXT not in routed.escalation_reasons
|
|
|
|
|
|
# =====================================================================
|
|
# INV-4: multi_doc_forces_escalation
|
|
# =====================================================================
|
|
|
|
|
|
def test_multi_doc_forces_escalation(policy):
    """INV-4: evidence_doc_count >= threshold gives multi_doc escalation + derived flag."""
    # Exactly at the policy threshold: the comparison is inclusive (>=).
    doc_count = policy.escalation.escalate_on_multi_doc_count

    outcome = decide_routing(
        subject_domain="news_item",
        content_chars=500,
        evidence_doc_count=doc_count,
        self_declared_high_impact=False,
        confidence=0.95,
        policy=policy,
    )

    assert outcome.escalate_to_26b is True
    assert REASON_MULTI_DOC in outcome.escalation_reasons
    # The derived risk flag must accompany the escalation reason.
    assert "multi_doc_dependency" in outcome.risk_flags
|
|
|
|
|
|
def test_multi_doc_below_threshold_no_escalation(policy):
    """Boundary: one document fewer than the multi-doc threshold does not escalate.

    The document count is derived from
    ``policy.escalation.escalate_on_multi_doc_count`` instead of being
    hard-coded, so the test keeps probing the real boundary (threshold - 1)
    even if the policy threshold is retuned.
    """
    threshold = policy.escalation.escalate_on_multi_doc_count
    decision = decide_routing(
        subject_domain="news_item",
        content_chars=500,
        evidence_doc_count=threshold - 1,  # just below the inclusive threshold
        self_declared_high_impact=False,
        confidence=0.95,
        policy=policy,
    )
    # Neither the escalation reason nor the derived flag may appear.
    assert REASON_MULTI_DOC not in decision.escalation_reasons
    assert "multi_doc_dependency" not in decision.risk_flags
|
|
|
|
|
|
# =====================================================================
|
|
# INV-5: risk_flags_union
|
|
# =====================================================================
|
|
|
|
|
|
def test_risk_flags_union_default_plus_self_declared(policy):
    """INV-5: domain-default and self-declared risk flags are UNIONed."""
    # safety_reference carries safety_legal_interpretation as a default flag;
    # pii_present is added on top as a self-declared flag.
    outcome = decide_routing(
        policy=policy,
        subject_domain="safety_reference",
        content_chars=1000,
        confidence=0.95,
        self_declared_high_impact=False,
        self_declared_risk_flags=["pii_present"],
    )

    flags = outcome.risk_flags
    # Both sources present at once is what "union" means for this test.
    assert "safety_legal_interpretation" in flags  # from the domain default
    assert "pii_present" in flags  # from the self-declaration
|
|
|
|
|
|
def test_risk_flags_union_with_derived_flags(policy):
    """Default, self-declared and derived risk flags are all merged together.

    The trigger values are derived from ``policy.escalation`` rather than
    hard-coded, so the test keeps exercising the derived-flag paths if the
    thresholds are retuned.
    """
    escalation = policy.escalation
    decision = decide_routing(
        subject_domain="safety_reference",
        # Past the 4B context cap; no flag derived from this is asserted below.
        content_chars=escalation.context_char_cap_4b + 1,
        # At the inclusive multi-doc threshold -> multi_doc_dependency.
        evidence_doc_count=escalation.escalate_on_multi_doc_count,
        self_declared_high_impact=False,
        self_declared_risk_flags=["pii_present"],
        # Just below the confidence threshold -> low_confidence_reasoning.
        confidence=escalation.confidence_threshold - 0.01,
        policy=policy,
    )
    # All four flags must be present.
    assert "safety_legal_interpretation" in decision.risk_flags  # domain default
    assert "pii_present" in decision.risk_flags  # self-declared
    assert "multi_doc_dependency" in decision.risk_flags  # derived (INV-4)
    assert "low_confidence_reasoning" in decision.risk_flags  # derived (low confidence)
|
|
|
|
|
|
def test_risk_flags_is_sorted_tuple(policy):
    """RoutingDecision.risk_flags is a sorted tuple (for reproducibility).

    The self-declared flags are passed in reverse alphabetical order on
    purpose: with already-sorted input, the sortedness check would also pass
    for an implementation that merely preserves insertion order.
    """
    decision = decide_routing(
        subject_domain="news_item",
        content_chars=500,
        self_declared_risk_flags=["safety_legal_interpretation", "pii_present"],
        confidence=0.95,
        policy=policy,
    )
    assert isinstance(decision.risk_flags, tuple)
    # Both flags must be present, otherwise the ordering check is vacuous.
    assert "pii_present" in decision.risk_flags
    assert "safety_legal_interpretation" in decision.risk_flags
    assert list(decision.risk_flags) == sorted(decision.risk_flags)
|
|
|
|
|
|
# =====================================================================
|
|
# INV-6: fallback_domain for unknown
|
|
# =====================================================================
|
|
|
|
|
|
def test_fallback_domain_used_for_unknown(policy):
    """INV-6: an undefined subject_domain is routed through the fallback domain."""
    unknown_domain = "__nonexistent_domain__"

    outcome = decide_routing(
        subject_domain=unknown_domain,
        content_chars=500,
        confidence=0.95,
        policy=policy,
    )

    assert outcome is not None
    assert outcome.used_fallback is True
    # The decision must report the policy's fallback domain, not the input.
    assert outcome.subject_domain_used == policy.fallback_domain.name
    assert REASON_FALLBACK_DOMAIN in outcome.escalation_reasons
|
|
|
|
|
|
def test_fallback_still_respects_other_invariants(policy):
    """Even on the fallback path, INV-3 (long_context) keeps working."""
    # Unknown domain combined with a body one character past the 4B cap.
    over_cap = policy.escalation.context_char_cap_4b + 1

    routed = decide_routing(
        subject_domain="__nonexistent__",
        content_chars=over_cap,
        confidence=0.95,
        policy=policy,
    )

    assert routed.used_fallback is True
    assert REASON_LONG_CONTEXT in routed.escalation_reasons
    assert routed.escalate_to_26b is True
|
|
|
|
|
|
def test_fallback_default_risk_flags_applied(policy):
    """fallback.default_risk_flags = [low_confidence_reasoning] shows up in the result."""
    routing_kwargs = dict(
        subject_domain="__unknown__",
        content_chars=500,
        confidence=0.95,
        policy=policy,
    )
    outcome = decide_routing(**routing_kwargs)

    # The fallback default flag low_confidence_reasoning is described as
    # requires_26b=true (so it should escalate); only the presence of the
    # flag is asserted here.
    assert "low_confidence_reasoning" in outcome.risk_flags
|
|
|
|
|
|
# =====================================================================
|
|
# low_confidence escalation (not a numbered invariant but required)
|
|
# =====================================================================
|
|
|
|
|
|
def test_low_confidence_forces_escalation(policy):
    """confidence < threshold gives low_confidence escalation + derived flag."""
    # Slightly under the policy's confidence threshold.
    just_below = policy.escalation.confidence_threshold - 0.01

    routed = decide_routing(
        subject_domain="news_item",
        content_chars=500,
        self_declared_high_impact=False,
        confidence=just_below,
        policy=policy,
    )

    assert routed.escalate_to_26b is True
    assert REASON_LOW_CONFIDENCE in routed.escalation_reasons
    assert "low_confidence_reasoning" in routed.risk_flags
|
|
|
|
|
|
# =====================================================================
|
|
# 도메인 × 시나리오 스냅샷 (테이블 드리븐)
|
|
# =====================================================================
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("domain", "expected_escalate", "expected_high_impact"),
    [
        ("safety_reference", True, True),
        ("safety_operational", True, True),
        ("msds", True, True),
        ("hazard_specific", True, True),
        ("incident_report", True, True),
        ("health_record", True, True),
        ("safety_video", False, False),
        ("news_item", False, False),
        ("news_digest_request", True, True),
    ],
)
def test_default_escalation_per_domain(policy, domain, expected_escalate, expected_high_impact):
    """Snapshot of each domain's baseline routing.

    Baseline means: high confidence, short body, nothing self-declared. The
    table pins both the high-impact classification and the escalation outcome.
    """
    baseline_inputs = dict(
        subject_domain=domain,
        content_chars=1000,
        self_declared_high_impact=False,
        self_declared_risk_flags=[],
        confidence=0.95,
        policy=policy,
    )
    decision = decide_routing(**baseline_inputs)

    assert decision.high_impact_task is expected_high_impact, (
        f"domain={domain}: high_impact expected={expected_high_impact}, got={decision.high_impact_task}"
    )
    assert decision.escalate_to_26b is expected_escalate, (
        f"domain={domain}: escalate expected={expected_escalate}, got={decision.escalate_to_26b}, "
        f"reasons={decision.escalation_reasons}"
    )
|
|
|
|
|
|
def test_synthesis_directives_collected(policy):
    """synthesis_directive entries of requires_26b flags are gathered on the decision."""
    # msds defaults to [chemical_hazard, safety_legal_interpretation].
    outcome = decide_routing(
        policy=policy,
        subject_domain="msds",
        content_chars=1000,
        confidence=0.95,
        self_declared_high_impact=False,
    )

    directives = outcome.synthesis_directives
    # Both default flags carry a synthesis_directive in the policy YAML.
    assert len(directives) >= 2
    # No directive may be an empty string.
    assert all(len(text) > 0 for text in directives)
|