976 lines
47 KiB
Python
976 lines
47 KiB
Python
from __future__ import annotations
|
||
|
||
import sys
|
||
import unittest
|
||
from pathlib import Path
|
||
|
||
|
||
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
||
|
||
import domain_case_loop as dcl
|
||
import domain_truth_harness as dth
|
||
|
||
|
||
class DomainCaseLoopStepStateTests(unittest.TestCase):
|
||
def test_preserves_mcp_catalog_alignment_debug_fields(self) -> None:
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="planner_alignment_demo",
|
||
domain="planner_autonomy",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "Alignment visibility",
|
||
"depends_on": [],
|
||
"question_template": "show planner alignment",
|
||
},
|
||
step_index=1,
|
||
question_resolved="show planner alignment",
|
||
analysis_context={},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "factual",
|
||
"text": "Confirmed answer",
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {
|
||
"detected_mode": "address_query",
|
||
"detected_intent": "counterparty_turnover",
|
||
"selected_recipe": "counterparty_turnover_by_period",
|
||
"capability_id": "confirmed_counterparty_turnover",
|
||
"mcp_discovery_catalog_chain_alignment_status": "selected_matches_top",
|
||
"mcp_discovery_catalog_chain_top_match": "value_flow",
|
||
"mcp_discovery_catalog_chain_selected_matches_top": True,
|
||
"mcp_discovery_route_candidate_status": "needs_user_scope",
|
||
"mcp_discovery_route_candidate_fact_family": "value_flow",
|
||
"mcp_discovery_route_candidate_action_family": "turnover",
|
||
"mcp_discovery_route_candidate_missing_axes": ["organization"],
|
||
"mcp_discovery_route_candidate_provided_axes": ["period"],
|
||
"mcp_discovery_route_candidate_executable_now": False,
|
||
"mcp_discovery_route_candidate_enablement_reason": "Missing scope axes: organization",
|
||
"mcp_discovery_route_candidate_next_action": "Ask the user for the missing scope axes before MCP execution.",
|
||
},
|
||
"session_summary": {},
|
||
},
|
||
entries=[],
|
||
)
|
||
|
||
self.assertEqual(step_state["mcp_discovery_catalog_chain_alignment_status"], "selected_matches_top")
|
||
self.assertEqual(step_state["mcp_discovery_catalog_chain_top_match"], "value_flow")
|
||
self.assertTrue(step_state["mcp_discovery_catalog_chain_selected_matches_top"])
|
||
self.assertEqual(step_state["mcp_discovery_route_candidate_status"], "needs_user_scope")
|
||
self.assertEqual(step_state["mcp_discovery_route_candidate_missing_axes"], ["organization"])
|
||
self.assertEqual(step_state["mcp_discovery_route_candidate_provided_axes"], ["period"])
|
||
self.assertFalse(step_state["mcp_discovery_route_candidate_executable_now"])
|
||
|
||
def test_repair_targets_promote_route_candidate_enablement_gaps(self) -> None:
|
||
repair_targets = dcl.build_deterministic_repair_targets(
|
||
{"pack_id": "route_candidate_pack", "domain": "open_world", "final_status": "accepted"},
|
||
[
|
||
{
|
||
"scenario_id": "route_candidate_demo",
|
||
"title": "Route candidate demo",
|
||
"artifact_dir": "artifacts/domain_runs/route_candidate_demo",
|
||
"scenario_state": {
|
||
"step_outputs": {
|
||
"step_01": {
|
||
"status": "accepted",
|
||
"acceptance_status": "accepted",
|
||
"execution_status": "validated",
|
||
"question_resolved": "можно ли построить анализ качества поставщиков за 2020?",
|
||
"reply_type": "partial_coverage",
|
||
"mcp_discovery_selected_chain_id": "business_overview",
|
||
"mcp_discovery_route_candidate_status": "needs_route_enablement",
|
||
"mcp_discovery_route_candidate_fact_family": "supplier_quality",
|
||
"mcp_discovery_route_candidate_action_family": "risk_or_quality_analysis",
|
||
"mcp_discovery_route_candidate_missing_axes": [],
|
||
"mcp_discovery_route_candidate_executable_now": False,
|
||
"mcp_discovery_route_candidate_enablement_reason": "Reviewed supplier-quality route is not wired yet.",
|
||
"mcp_discovery_route_candidate_next_action": "Enable a reviewed supplier-quality route before claiming this fact.",
|
||
}
|
||
}
|
||
},
|
||
}
|
||
],
|
||
)
|
||
|
||
self.assertEqual(repair_targets["target_count"], 1)
|
||
self.assertEqual(repair_targets["severity_counts"]["P1"], 1)
|
||
target = repair_targets["targets"][0]
|
||
self.assertEqual(target["problem_type"], "route_candidate_enablement_gap")
|
||
self.assertEqual(target["target_source"], "route_candidate_enablement")
|
||
self.assertEqual(target["route_candidate"]["candidate_status"], "needs_route_enablement")
|
||
self.assertEqual(repair_targets["route_candidate_status_counts"], {"needs_route_enablement": 1})
|
||
self.assertEqual(repair_targets["route_candidate_groups"][0]["selected_chain_id"], "business_overview")
|
||
|
||
def test_lead_handoff_surfaces_route_candidate_groups(self) -> None:
|
||
repair_targets = dcl.build_deterministic_repair_targets(
|
||
{"pack_id": "route_candidate_pack", "domain": "open_world", "final_status": "accepted"},
|
||
[
|
||
{
|
||
"scenario_id": "route_candidate_demo",
|
||
"title": "Route candidate demo",
|
||
"artifact_dir": "artifacts/domain_runs/route_candidate_demo",
|
||
"scenario_state": {
|
||
"step_outputs": {
|
||
"step_01": {
|
||
"status": "accepted",
|
||
"acceptance_status": "accepted",
|
||
"execution_status": "validated",
|
||
"question_resolved": "можно ли построить анализ качества поставщиков за 2020?",
|
||
"reply_type": "partial_coverage",
|
||
"mcp_discovery_selected_chain_id": "business_overview",
|
||
"mcp_discovery_route_candidate_status": "needs_route_enablement",
|
||
"mcp_discovery_route_candidate_fact_family": "supplier_quality",
|
||
"mcp_discovery_route_candidate_action_family": "risk_or_quality_analysis",
|
||
"mcp_discovery_route_candidate_missing_axes": [],
|
||
"mcp_discovery_route_candidate_executable_now": False,
|
||
"mcp_discovery_route_candidate_enablement_reason": "Reviewed supplier-quality route is not wired yet.",
|
||
"mcp_discovery_route_candidate_next_action": "Enable a reviewed supplier-quality route before claiming this fact.",
|
||
}
|
||
}
|
||
},
|
||
}
|
||
],
|
||
)
|
||
handoff = dcl.build_lead_coder_handoff(
|
||
loop_state={"loop_id": "route_candidate_loop"},
|
||
iteration_id="iteration_00",
|
||
pack_dir=Path("artifacts/domain_runs/route_candidate_pack"),
|
||
analyst_verdict_path=Path("artifacts/domain_runs/route_candidate_pack/analyst_verdict.json"),
|
||
repair_targets_path=Path("artifacts/domain_runs/route_candidate_pack/repair_targets.json"),
|
||
business_audit_path=Path("artifacts/domain_runs/route_candidate_pack/business_audit.md"),
|
||
analyst_verdict={"quality_score": 75},
|
||
repair_targets=repair_targets,
|
||
target_score=88,
|
||
loop_decision="needs_exact_capability",
|
||
analyst_accepted_gate=False,
|
||
accepted_gate=False,
|
||
deterministic_gate_ok=False,
|
||
deterministic_gate_reason="repair_targets_remaining=P0:0,P1:1",
|
||
requires_user_decision=False,
|
||
user_decision_type="none",
|
||
user_decision_prompt=None,
|
||
)
|
||
markdown = dcl.build_lead_coder_handoff_markdown(handoff)
|
||
|
||
self.assertEqual(handoff["route_candidate_groups"][0]["candidate_status"], "needs_route_enablement")
|
||
self.assertEqual(handoff["route_candidate_enablement_targets"][0]["problem_type"], "route_candidate_enablement_gap")
|
||
self.assertIn("## Route Candidate Handoff Groups", markdown)
|
||
self.assertIn("route_candidate_demo:step_01", markdown)
|
||
|
||
def test_analysis_context_date_is_not_implicit_business_filter(self) -> None:
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="stage_pack_demo",
|
||
domain="agentic_loop",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "All-time summary",
|
||
"depends_on": [],
|
||
"question_template": "all-time money summary",
|
||
},
|
||
step_index=1,
|
||
question_resolved="all-time money summary",
|
||
analysis_context={"as_of_date": "2026-05-09", "source": "stage_pack"},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "factual_with_explanation",
|
||
"text": "Short: all-time confirmed money summary.",
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {},
|
||
"session_summary": {},
|
||
},
|
||
entries=[],
|
||
)
|
||
|
||
self.assertNotIn("missing_required_filter", step_state["violated_invariants"])
|
||
self.assertNotIn("wrong_as_of_date", step_state["violated_invariants"])
|
||
|
||
def test_analysis_context_date_is_required_for_explicit_date_carryover(self) -> None:
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="date_carryover_demo",
|
||
domain="inventory",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "Date carryover",
|
||
"depends_on": [],
|
||
"question_template": "stock on that date",
|
||
"required_carryover_invariants": ["date_scope"],
|
||
},
|
||
step_index=1,
|
||
question_resolved="stock on that date",
|
||
analysis_context={"as_of_date": "2021-03-31"},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "factual",
|
||
"text": "Short: stock confirmed.",
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {
|
||
"detected_mode": "address_query",
|
||
"detected_intent": "inventory_on_hand_as_of_date",
|
||
"selected_recipe": "address_inventory_on_hand_as_of_date_v1",
|
||
"capability_id": "confirmed_inventory_on_hand_as_of_date",
|
||
"capability_route_mode": "exact",
|
||
"fallback_type": "none",
|
||
"extracted_filters": {"as_of_date": "2020-03-31"},
|
||
},
|
||
"session_summary": {},
|
||
},
|
||
entries=[],
|
||
)
|
||
|
||
self.assertIn("wrong_as_of_date", step_state["violated_invariants"])
|
||
|
||
def test_temporal_reset_question_skips_carried_date_scope(self) -> None:
|
||
self.assertTrue(dcl.question_resets_temporal_scope("show money za all time"))
|
||
self.assertTrue(dcl.question_resets_temporal_scope("сколько всего денег за все доступное время"))
|
||
|
||
carried = dcl.carry_forward_analysis_context(
|
||
{
|
||
"semantic_memory": {
|
||
"date_scope": {
|
||
"as_of_date": "2020-12-31",
|
||
"period_from": "2020-10-01",
|
||
"period_to": "2020-12-31",
|
||
},
|
||
"organization_scope": {"label": "ООО Альтернатива Плюс"},
|
||
}
|
||
},
|
||
{},
|
||
prefer_carryover=True,
|
||
carry_date_scope=False,
|
||
)
|
||
|
||
self.assertNotIn("as_of_date", carried)
|
||
self.assertEqual(carried["organization_scope"], {"label": "ООО Альтернатива Плюс"})
|
||
|
||
def test_merge_scenario_date_scope_keeps_current_scope_over_stale_previous(self) -> None:
|
||
merged = dcl.merge_scenario_date_scope(
|
||
{
|
||
"as_of_date": "2020-12-31",
|
||
"period_from": "2020-10-01",
|
||
"period_to": "2020-12-31",
|
||
"source": "scenario_state_carryover",
|
||
},
|
||
{
|
||
"as_of_date": "2021-03-31",
|
||
"period_from": "2021-03-01",
|
||
"period_to": "2021-03-31",
|
||
"source": "current_turn",
|
||
},
|
||
depends_on=["previous_step"],
|
||
)
|
||
|
||
self.assertEqual(merged["as_of_date"], "2021-03-31")
|
||
self.assertEqual(merged["source"], "current_turn")
|
||
|
||
def test_mcp_business_overview_all_time_scope_overrides_stale_session_date(self) -> None:
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="business_overview_demo",
|
||
domain="agentic_loop",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "All-time money",
|
||
"depends_on": ["previous_step"],
|
||
"question_template": "all-time money summary",
|
||
"expected_intents": ["business_overview"],
|
||
},
|
||
step_index=1,
|
||
question_resolved="all-time money summary",
|
||
analysis_context={},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "partial_coverage",
|
||
"text": "Short: all-time confirmed money summary.",
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {
|
||
"detected_mode": "address_query",
|
||
"detected_intent": "inventory_supplier_stock_overlap_as_of_date",
|
||
"selected_recipe": "address_inventory_supplier_stock_overlap_as_of_date_v1",
|
||
"capability_id": "inventory_inventory_supplier_stock_overlap_as_of_date",
|
||
"mcp_discovery_response_applied": True,
|
||
"mcp_discovery_selected_chain_id": "business_overview",
|
||
"mcp_discovery_catalog_chain_top_match": "business_overview",
|
||
"mcp_discovery_response_candidate_v1": {
|
||
"candidate_status": "ready_for_guarded_use",
|
||
"reply_type": "partial_coverage",
|
||
},
|
||
"assistant_mcp_discovery_entry_point_v1": {
|
||
"bridge": {
|
||
"pilot": {
|
||
"derived_business_overview": {
|
||
"period_scope": None,
|
||
}
|
||
}
|
||
}
|
||
},
|
||
},
|
||
"session_summary": {
|
||
"address_navigation_state": {
|
||
"session_context": {
|
||
"active_result_set_id": "rs-stale",
|
||
"date_scope": {
|
||
"as_of_date": "2020-12-31",
|
||
"period_from": "2020-10-01",
|
||
"period_to": "2020-12-31",
|
||
},
|
||
}
|
||
}
|
||
},
|
||
},
|
||
entries=[],
|
||
)
|
||
|
||
self.assertEqual(step_state["date_scope"]["scope"], "all_time")
|
||
self.assertIsNone(step_state["date_scope"]["as_of_date"])
|
||
self.assertEqual(step_state["active_result_set_id"], "mcp-discovery-msg-1")
|
||
self.assertNotIn("wrong_date_scope_state", step_state["violated_invariants"])
|
||
|
||
def test_applied_ready_mcp_discovery_chain_satisfies_expected_intent(self) -> None:
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="business_overview_demo",
|
||
domain="agentic_loop",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "Business overview",
|
||
"depends_on": [],
|
||
"question_template": "business overview for 2020",
|
||
"expected_intents": ["business_overview"],
|
||
},
|
||
step_index=1,
|
||
question_resolved="business overview for 2020",
|
||
analysis_context={},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "partial_coverage",
|
||
"text": "Short: business overview from confirmed 1C rows.",
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {
|
||
"detected_mode": "address_query",
|
||
"detected_intent": "inventory_supplier_stock_overlap_as_of_date",
|
||
"selected_recipe": "address_inventory_supplier_stock_overlap_as_of_date_v1",
|
||
"capability_id": "inventory_inventory_supplier_stock_overlap_as_of_date",
|
||
"mcp_discovery_response_applied": True,
|
||
"mcp_discovery_selected_chain_id": "business_overview",
|
||
"mcp_discovery_catalog_chain_top_match": "business_overview",
|
||
"mcp_discovery_response_candidate_v1": {
|
||
"candidate_status": "ready_for_guarded_use",
|
||
"reply_type": "partial_coverage",
|
||
},
|
||
},
|
||
"session_summary": {},
|
||
},
|
||
entries=[],
|
||
)
|
||
|
||
self.assertEqual(step_state["mcp_discovery_effective_intents"], ["business_overview"])
|
||
self.assertNotIn("wrong_intent", step_state["violated_invariants"])
|
||
|
||
def test_ready_bounded_mcp_answer_can_validate_without_exact_route(self) -> None:
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="business_overview_demo",
|
||
domain="agentic_loop",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "Business overview",
|
||
"depends_on": [],
|
||
"question_template": "business overview for 2020",
|
||
"expected_intents": ["business_overview"],
|
||
"required_answer_shape": "direct_answer_first",
|
||
},
|
||
step_index=1,
|
||
question_resolved="business overview for 2020",
|
||
analysis_context={},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "partial_coverage",
|
||
"text": "Short: confirmed bounded business overview from 1C rows.",
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {
|
||
"detected_mode": "address_query",
|
||
"detected_intent": "inventory_supplier_stock_overlap_as_of_date",
|
||
"selected_recipe": "address_inventory_supplier_stock_overlap_as_of_date_v1",
|
||
"capability_id": "inventory_inventory_supplier_stock_overlap_as_of_date",
|
||
"mcp_discovery_response_applied": True,
|
||
"mcp_discovery_selected_chain_id": "business_overview",
|
||
"mcp_discovery_catalog_chain_top_match": "business_overview",
|
||
"mcp_discovery_response_candidate_v1": {
|
||
"candidate_status": "ready_for_guarded_use",
|
||
"reply_type": "partial_coverage",
|
||
},
|
||
},
|
||
"session_summary": {},
|
||
},
|
||
entries=[],
|
||
)
|
||
|
||
self.assertEqual(step_state["execution_status"], "partial")
|
||
self.assertTrue(step_state["bounded_mcp_answer_validated"])
|
||
self.assertEqual(step_state["acceptance_status"], "validated")
|
||
|
||
def test_required_answer_patterns_block_generic_bounded_mcp_summary(self) -> None:
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="summary_demo",
|
||
domain="agentic_loop",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "Summary",
|
||
"depends_on": [],
|
||
"question_template": "summarize company and SVK separately",
|
||
"required_answer_shape": "direct_answer_first",
|
||
"required_answer_patterns_all": ["SVK", "company"],
|
||
},
|
||
step_index=1,
|
||
question_resolved="summarize company and SVK separately",
|
||
analysis_context={},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "partial_coverage",
|
||
"text": "Short: company money summary only.",
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {
|
||
"mcp_discovery_response_applied": True,
|
||
"mcp_discovery_selected_chain_id": "business_overview",
|
||
"mcp_discovery_catalog_chain_top_match": "business_overview",
|
||
"mcp_discovery_response_candidate_v1": {
|
||
"candidate_status": "ready_for_guarded_use",
|
||
"reply_type": "partial_coverage",
|
||
},
|
||
},
|
||
"session_summary": {},
|
||
},
|
||
entries=[],
|
||
)
|
||
|
||
self.assertIn("required_answer_patterns_all_missing", step_state["violated_invariants"])
|
||
self.assertFalse(step_state["bounded_mcp_answer_validated"])
|
||
self.assertEqual(step_state["acceptance_status"], "rejected")
|
||
|
||
def test_memory_checkpoint_can_validate_honest_no_scope_answer(self) -> None:
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="memory_demo",
|
||
domain="agentic_loop",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "Memory checkpoint",
|
||
"depends_on": [],
|
||
"question_template": "is any company or counterparty selected in the current dialog?",
|
||
"semantic_tags": ["memory", "scope_guard"],
|
||
"required_answer_shape": "direct_answer_first",
|
||
},
|
||
step_index=1,
|
||
question_resolved="is any company or counterparty selected in the current dialog?",
|
||
analysis_context={},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "partial_coverage",
|
||
"text": "В текущем диалоге не выбрана компания или контрагент; память не выдумываю.",
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {
|
||
"detected_mode": "address_query",
|
||
"detected_intent": "customer_revenue_and_payments",
|
||
"fallback_type": "no_rows",
|
||
},
|
||
"session_summary": {},
|
||
},
|
||
entries=[],
|
||
)
|
||
|
||
self.assertEqual(step_state["execution_status"], "partial")
|
||
self.assertTrue(step_state["memory_checkpoint_validated"])
|
||
self.assertEqual(step_state["acceptance_status"], "validated")
|
||
|
||
def test_deterministic_chat_memory_checkpoint_validates_without_exact_capability(self) -> None:
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="memory_demo",
|
||
domain="agentic_loop",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "Memory checkpoint",
|
||
"depends_on": [],
|
||
"question_template": "current dialog memory checkpoint",
|
||
"semantic_tags": ["memory", "scope_guard"],
|
||
"required_answer_shape": "direct_answer_first",
|
||
},
|
||
step_index=1,
|
||
question_resolved="current dialog memory checkpoint",
|
||
analysis_context={},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "factual_with_explanation",
|
||
"text": (
|
||
"Коротко: в текущем диалоге я не вижу выбранной компании, контрагента или позиции. "
|
||
"Память про «Группа СВК» в этом диалоге не подтверждена."
|
||
),
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {
|
||
"detected_mode": "chat",
|
||
"fallback_type": "none",
|
||
"living_router_reason": "memory_recap_followup_detected",
|
||
"living_chat_response_source": "deterministic_memory_recap_contract",
|
||
},
|
||
"session_summary": {},
|
||
},
|
||
entries=[],
|
||
)
|
||
|
||
self.assertEqual(step_state["execution_status"], "partial")
|
||
self.assertTrue(step_state["memory_checkpoint_validated"])
|
||
self.assertEqual(step_state["acceptance_status"], "validated")
|
||
|
||
def test_confirmed_runtime_factual_answer_can_validate_without_exact_route_mode(self) -> None:
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="runtime_factual_demo",
|
||
domain="agentic_loop",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "Account 60 tails",
|
||
"depends_on": [],
|
||
"question_template": "show account 60 tails",
|
||
"required_answer_shape": "direct_answer_first",
|
||
},
|
||
step_index=1,
|
||
question_resolved="show account 60 tails",
|
||
analysis_context={},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "factual",
|
||
"text": "Коротко: по счету 60 найдено 8 строк хвостов; контрагентов с сигналом: 6.",
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {
|
||
"detected_mode": "address_query",
|
||
"detected_intent": "open_items_by_counterparty_or_contract",
|
||
"selected_recipe": "address_open_items_by_party_or_contract_v1",
|
||
"capability_id": "address_open_items_by_counterparty_or_contract",
|
||
"capability_route_mode": "heuristic",
|
||
"fallback_type": "none",
|
||
"mcp_call_status": "matched_non_empty",
|
||
"response_type": "FACTUAL_LIST",
|
||
"result_mode": "confirmed_balance",
|
||
},
|
||
"session_summary": {},
|
||
},
|
||
entries=[],
|
||
)
|
||
|
||
self.assertEqual(step_state["execution_status"], "partial")
|
||
self.assertTrue(step_state["runtime_factual_answer_validated"])
|
||
self.assertEqual(step_state["acceptance_status"], "validated")
|
||
|
||
def test_exact_confirmed_document_followup_sets_runtime_factual_validation(self) -> None:
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="svk_pivot",
|
||
domain="agentic_loop",
|
||
step={
|
||
"step_id": "s02_svk_docs",
|
||
"title": "Counterparty documents follow-up",
|
||
"depends_on": ["s01_svk_money"],
|
||
"question_template": "show documents by this chain",
|
||
"semantic_tags": ["counterparty", "documents", "scope_guard"],
|
||
"required_answer_shape": "direct_answer_first",
|
||
},
|
||
step_index=2,
|
||
question_resolved="show documents by this chain",
|
||
analysis_context={"as_of_date": "2026-05-09"},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "factual",
|
||
"text": "Контрагент: Группа СВК. Найдено документов: 19.",
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {
|
||
"detected_mode": "address_query",
|
||
"detected_intent": "list_documents_by_counterparty",
|
||
"selected_recipe": "address_documents_by_counterparty_v1",
|
||
"capability_id": "documents_drilldown",
|
||
"capability_route_mode": "exact",
|
||
"fallback_type": "none",
|
||
"mcp_call_status": "matched_non_empty",
|
||
"response_type": "FACTUAL_LIST",
|
||
"truth_mode": "confirmed",
|
||
"answer_shape": "confirmed_factual",
|
||
"coverage_status": "full",
|
||
"evidence_grade": "strong",
|
||
"extracted_filters": {"counterparty": "Группа СВК", "as_of_date": "2026-05-09"},
|
||
"focus_object": {
|
||
"object_type": "counterparty",
|
||
"object_id": "counterparty:группа свк",
|
||
"label": "Группа СВК",
|
||
},
|
||
},
|
||
"session_summary": {},
|
||
},
|
||
entries=[{"item": "2021-11-10T12:00:07Z"}],
|
||
)
|
||
|
||
self.assertEqual(step_state["execution_status"], "exact")
|
||
self.assertTrue(step_state["runtime_factual_answer_validated"])
|
||
self.assertEqual(step_state["acceptance_status"], "validated")
|
||
|
||
def test_heuristic_open_items_guarded_insufficiency_validates_separately(self) -> None:
|
||
answer_text = (
|
||
"\u041a\u043e\u0440\u043e\u0442\u043a\u043e: \u0442\u043e\u0447\u043d\u044b\u0439 "
|
||
"\u043e\u0442\u043a\u0440\u044b\u0442\u044b\u0439 \u043e\u0441\u0442\u0430\u0442\u043e\u043a "
|
||
"\u043f\u043e \u0441\u0447\u0435\u0442\u0443 60 \u043d\u0435 "
|
||
"\u043f\u043e\u0434\u0442\u0432\u0435\u0440\u0436\u0434\u0435\u043d; \u043d\u0438\u0436\u0435 "
|
||
"\u0442\u043e\u043b\u044c\u043a\u043e \u043f\u0440\u0435\u0434\u0432\u0430\u0440\u0438\u0442\u0435\u043b\u044c\u043d\u044b\u0435 "
|
||
"\u0441\u0438\u0433\u043d\u0430\u043b\u044b \u043f\u043e \u0434\u0432\u0438\u0436\u0435\u043d\u0438\u044f\u043c: 8 "
|
||
"\u0441\u0442\u0440\u043e\u043a.\n"
|
||
"\u042d\u0442\u043e \u043d\u0435 \u043f\u043e\u0434\u0442\u0432\u0435\u0440\u0436\u0434\u0435\u043d\u043d\u043e\u0435 "
|
||
"\u0441\u0430\u043b\u044c\u0434\u043e: \u0442\u0435\u043a\u0443\u0449\u0438\u0439 "
|
||
"\u043a\u043e\u043d\u0442\u0443\u0440 \u0432\u0438\u0434\u0438\u0442 "
|
||
"\u0434\u0432\u0438\u0436\u0435\u043d\u0438\u044f-\u043a\u0430\u043d\u0434\u0438\u0434\u0430\u0442\u044b, "
|
||
"\u043d\u043e \u043d\u0435 \u0434\u043e\u043a\u0430\u0437\u044b\u0432\u0430\u0435\u0442 "
|
||
"\u043e\u0441\u0442\u0430\u0442\u043e\u043a."
|
||
)
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="runtime_factual_demo",
|
||
domain="agentic_loop",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "Account 60 limited tails",
|
||
"depends_on": [],
|
||
"question_template": "show account 60 tails; say if exact data is unavailable",
|
||
"required_answer_shape": "direct_answer_first",
|
||
},
|
||
step_index=1,
|
||
question_resolved="show account 60 tails; say if exact data is unavailable",
|
||
analysis_context={},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "factual",
|
||
"text": answer_text,
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {
|
||
"detected_mode": "address_query",
|
||
"detected_intent": "open_items_by_counterparty_or_contract",
|
||
"selected_recipe": "address_open_items_by_party_or_contract_v1",
|
||
"capability_id": "address_open_items_by_counterparty_or_contract",
|
||
"capability_route_mode": "heuristic",
|
||
"fallback_type": "none",
|
||
"mcp_call_status": "matched_non_empty",
|
||
"response_type": "FACTUAL_LIST",
|
||
"result_mode": "heuristic_candidates",
|
||
"balance_confirmed": False,
|
||
"truth_mode": "limited",
|
||
"answer_shape": "limited_with_reason",
|
||
},
|
||
"session_summary": {},
|
||
},
|
||
entries=[],
|
||
)
|
||
|
||
self.assertEqual(step_state["execution_status"], "partial")
|
||
self.assertEqual(step_state["truth_mode"], "limited")
|
||
self.assertEqual(step_state["answer_shape"], "limited_with_reason")
|
||
self.assertFalse(step_state["runtime_factual_answer_validated"])
|
||
self.assertTrue(step_state["guarded_insufficiency_validated"])
|
||
self.assertEqual(step_state["acceptance_status"], "validated")
|
||
|
||
def test_heuristic_open_items_without_limitation_is_rejected(self) -> None:
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="runtime_factual_demo",
|
||
domain="agentic_loop",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "Account 60 unguarded tails",
|
||
"depends_on": [],
|
||
"question_template": "show account 60 tails",
|
||
"required_answer_shape": "direct_answer_first",
|
||
},
|
||
step_index=1,
|
||
question_resolved="show account 60 tails",
|
||
analysis_context={},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "factual",
|
||
"text": "Short: account 60 has 8 open-item rows and 6 counterparties.",
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {
|
||
"detected_mode": "address_query",
|
||
"detected_intent": "open_items_by_counterparty_or_contract",
|
||
"selected_recipe": "address_open_items_by_party_or_contract_v1",
|
||
"capability_id": "address_open_items_by_counterparty_or_contract",
|
||
"capability_route_mode": "heuristic",
|
||
"fallback_type": "none",
|
||
"mcp_call_status": "matched_non_empty",
|
||
"response_type": "FACTUAL_LIST",
|
||
"result_mode": "heuristic_candidates",
|
||
"balance_confirmed": False,
|
||
"truth_mode": "limited",
|
||
"answer_shape": "limited_with_reason",
|
||
},
|
||
"session_summary": {},
|
||
},
|
||
entries=[],
|
||
)
|
||
|
||
self.assertEqual(step_state["execution_status"], "partial")
|
||
self.assertFalse(step_state["runtime_factual_answer_validated"])
|
||
self.assertFalse(step_state["guarded_insufficiency_validated"])
|
||
self.assertEqual(step_state["acceptance_status"], "rejected")
|
||
|
||
def test_truth_harness_warns_on_catalog_alignment_divergence(self) -> None:
|
||
reviewed = dth.evaluate_truth_step(
|
||
step={
|
||
"step_id": "step_01",
|
||
"question_template": "show planner alignment",
|
||
"criticality": "critical",
|
||
"allowed_reply_types": [],
|
||
},
|
||
step_state={
|
||
"question_resolved": "show planner alignment",
|
||
"reply_type": "factual",
|
||
"assistant_text": "Confirmed answer",
|
||
"actual_direct_answer": "Confirmed answer",
|
||
"detected_intent": "counterparty_turnover",
|
||
"selected_recipe": "counterparty_turnover_by_period",
|
||
"capability_id": "confirmed_counterparty_turnover",
|
||
"mcp_discovery_catalog_chain_alignment_status": "selected_outside_match_set",
|
||
"mcp_discovery_catalog_chain_top_match": "value_flow_comparison",
|
||
"mcp_discovery_catalog_chain_selected_matches_top": False,
|
||
"extracted_filters": {},
|
||
},
|
||
step_results={},
|
||
bindings={},
|
||
runtime_bindings={},
|
||
)
|
||
|
||
self.assertEqual(reviewed["review_status"], "warning")
|
||
self.assertEqual(reviewed["warning_findings_count"], 1)
|
||
self.assertEqual(reviewed["review_findings"][0]["code"], "catalog_alignment_divergence")
|
||
self.assertEqual(reviewed["review_findings"][0]["severity"], "warning")
|
||
|
||
def test_truth_harness_checks_expected_catalog_alignment_fields(self) -> None:
|
||
reviewed = dth.evaluate_truth_step(
|
||
step={
|
||
"step_id": "step_01",
|
||
"question_template": "show planner alignment",
|
||
"criticality": "critical",
|
||
"allowed_reply_types": [],
|
||
"expected_catalog_alignment_status": "selected_matches_top",
|
||
"expected_catalog_chain_top_match": "value_flow_comparison",
|
||
"expected_catalog_selected_matches_top": True,
|
||
},
|
||
step_state={
|
||
"question_resolved": "show planner alignment",
|
||
"reply_type": "factual",
|
||
"assistant_text": "Confirmed answer",
|
||
"actual_direct_answer": "Confirmed answer",
|
||
"detected_intent": "counterparty_turnover",
|
||
"selected_recipe": "counterparty_turnover_by_period",
|
||
"capability_id": "confirmed_counterparty_turnover",
|
||
"mcp_discovery_catalog_chain_alignment_status": "selected_matches_top",
|
||
"mcp_discovery_catalog_chain_top_match": "value_flow",
|
||
"mcp_discovery_catalog_chain_selected_matches_top": True,
|
||
"extracted_filters": {},
|
||
},
|
||
step_results={},
|
||
bindings={},
|
||
runtime_bindings={},
|
||
)
|
||
|
||
self.assertEqual(reviewed["review_status"], "fail")
|
||
self.assertEqual(reviewed["critical_findings_count"], 1)
|
||
self.assertEqual(reviewed["review_findings"][0]["code"], "wrong_catalog_chain_top_match")
|
||
|
||
def test_truth_harness_checks_expected_route_candidate_fields(self) -> None:
|
||
reviewed = dth.evaluate_truth_step(
|
||
step={
|
||
"step_id": "step_01",
|
||
"question_template": "show route candidate",
|
||
"criticality": "critical",
|
||
"allowed_reply_types": [],
|
||
"expected_route_candidate_status": "needs_user_scope",
|
||
"expected_route_candidate_executable_now": False,
|
||
"expected_route_candidate_missing_axes": ["organization", "period"],
|
||
},
|
||
step_state={
|
||
"question_resolved": "show route candidate",
|
||
"reply_type": "clarification_required",
|
||
"assistant_text": "Please choose organization.",
|
||
"actual_direct_answer": "Please choose organization.",
|
||
"detected_intent": "counterparty_turnover",
|
||
"selected_recipe": None,
|
||
"capability_id": None,
|
||
"mcp_discovery_route_candidate_status": "needs_user_scope",
|
||
"mcp_discovery_route_candidate_missing_axes": ["organization"],
|
||
"mcp_discovery_route_candidate_executable_now": False,
|
||
"extracted_filters": {},
|
||
},
|
||
step_results={},
|
||
bindings={},
|
||
runtime_bindings={},
|
||
)
|
||
|
||
self.assertEqual(reviewed["review_status"], "fail")
|
||
self.assertEqual(reviewed["critical_findings_count"], 1)
|
||
self.assertEqual(reviewed["review_findings"][0]["code"], "missing_route_candidate_axes")
|
||
|
||
def test_business_first_review_flags_dirty_direct_answer_surface(self) -> None:
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="business_surface_demo",
|
||
domain="business_overview",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "Top year",
|
||
"depends_on": [],
|
||
"question_template": "какой у нас самый доходный год",
|
||
},
|
||
step_index=1,
|
||
question_resolved="какой у нас самый доходный год",
|
||
analysis_context={},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "partial_coverage",
|
||
"text": "Коротко: Ограниченный бизнес-обзор по подтвержденным строкам 1С. " + ("лишний текст " * 220),
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {},
|
||
"session_summary": {},
|
||
},
|
||
entries=[],
|
||
)
|
||
|
||
review = step_state["business_first_review"]
|
||
self.assertFalse(review["direct_answer_first_ok"])
|
||
self.assertFalse(review["business_usefulness_ok"])
|
||
self.assertIn("business_direct_answer_missing", review["issue_codes"])
|
||
self.assertIn("answer_layering_noise", review["issue_codes"])
|
||
self.assertIn("business_answer_too_verbose", review["issue_codes"])
|
||
self.assertIn("business_direct_answer_missing", step_state["violated_invariants"])
|
||
|
||
def test_business_first_review_accepts_compact_direct_answer_surface(self) -> None:
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="business_surface_demo",
|
||
domain="business_overview",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "Top year",
|
||
"depends_on": [],
|
||
"question_template": "какой у нас самый доходный год",
|
||
},
|
||
step_index=1,
|
||
question_resolved="какой у нас самый доходный год",
|
||
analysis_context={},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "partial_coverage",
|
||
"text": "Коротко: самый доходный год в доступном денежном контуре 1С — 2015: 136 723 459,73 руб.\nМетод: считаю по подтвержденным входящим поступлениям.",
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {},
|
||
"session_summary": {},
|
||
},
|
||
entries=[],
|
||
)
|
||
|
||
review = step_state["business_first_review"]
|
||
self.assertTrue(review["direct_answer_first_ok"])
|
||
self.assertTrue(review["business_usefulness_ok"])
|
||
self.assertEqual(review["issue_codes"], [])
|
||
|
||
def test_business_first_review_separates_direct_answer_from_later_technical_leak(self) -> None:
|
||
question = "\u043a\u0430\u043a\u043e\u0439 \u0443 \u043d\u0430\u0441 \u0441\u0430\u043c\u044b\u0439 \u0434\u043e\u0445\u043e\u0434\u043d\u044b\u0439 \u0433\u043e\u0434"
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="business_surface_demo",
|
||
domain="business_overview",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "Top year",
|
||
"depends_on": [],
|
||
"question_template": question,
|
||
},
|
||
step_index=1,
|
||
question_resolved=question,
|
||
analysis_context={},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "partial_coverage",
|
||
"text": "2015 \u2014 \u0441\u0430\u043c\u044b\u0439 \u0434\u043e\u0445\u043e\u0434\u043d\u044b\u0439 \u0433\u043e\u0434 \u043f\u043e \u043f\u043e\u0434\u0442\u0432\u0435\u0440\u0436\u0434\u0435\u043d\u043d\u044b\u043c \u0432\u0445\u043e\u0434\u044f\u0449\u0438\u043c \u0434\u0435\u043d\u044c\u0433\u0430\u043c.\nservice: capability_id=business_overview_route_template_v1",
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {},
|
||
"session_summary": {},
|
||
},
|
||
entries=[],
|
||
)
|
||
|
||
review = step_state["business_first_review"]
|
||
self.assertTrue(review["direct_answer_first_ok"])
|
||
self.assertTrue(review["technical_garbage_present"])
|
||
self.assertIn("technical_garbage_in_answer", review["issue_codes"])
|
||
self.assertNotIn("business_direct_answer_missing", review["issue_codes"])
|
||
|
||
def test_truth_harness_promotes_business_review_issues_to_findings(self) -> None:
|
||
step_state = dcl.build_scenario_step_state(
|
||
scenario_id="business_surface_demo",
|
||
domain="business_overview",
|
||
step={
|
||
"step_id": "step_01",
|
||
"title": "Top year",
|
||
"depends_on": [],
|
||
"question_template": "какой у нас самый доходный год",
|
||
},
|
||
step_index=1,
|
||
question_resolved="какой у нас самый доходный год",
|
||
analysis_context={},
|
||
turn_artifact={
|
||
"assistant_message": {
|
||
"reply_type": "partial_coverage",
|
||
"text": "Коротко: Ограниченный бизнес-обзор по подтвержденным строкам 1С. " + ("лишний текст " * 220),
|
||
"message_id": "msg-1",
|
||
"trace_id": "trace-1",
|
||
},
|
||
"technical_debug_payload": {},
|
||
"session_summary": {},
|
||
},
|
||
entries=[],
|
||
)
|
||
reviewed = dth.evaluate_truth_step(
|
||
step={
|
||
"step_id": "step_01",
|
||
"question_template": "какой у нас самый доходный год",
|
||
"criticality": "critical",
|
||
"allowed_reply_types": [],
|
||
},
|
||
step_state=step_state,
|
||
step_results={},
|
||
bindings={},
|
||
runtime_bindings={},
|
||
)
|
||
|
||
codes = [item["code"] for item in reviewed["review_findings"]]
|
||
self.assertIn("business_review:business_direct_answer_missing", codes)
|
||
self.assertIn("business_review:answer_layering_noise", codes)
|
||
self.assertEqual(reviewed["review_status"], "fail")
|
||
|
||
|
||
if __name__ == "__main__":
|
||
unittest.main()
|