# Listing metadata: 261 lines, 12 KiB, Python.
from __future__ import annotations

import sys
import unittest
from pathlib import Path

# Put the directory containing this file at the front of sys.path before the
# project imports below run. NOTE(review): presumably domain_case_loop and
# domain_truth_harness live next to this file -- confirm against the repo layout.
sys.path.insert(0, str(Path(__file__).resolve().parent))

import domain_case_loop as dcl
import domain_truth_harness as dth


class DomainCaseLoopStepStateTests(unittest.TestCase):
    """Tests for step-state assembly and truth-harness review of a single step.

    Every test feeds hand-written fixtures into
    ``dcl.build_scenario_step_state`` and/or ``dth.evaluate_truth_step`` and
    asserts on keys of the returned mappings. The fixture scaffolding that is
    identical across tests lives in the private ``_``-prefixed helpers below so
    each test only spells out the values it actually varies.
    """

    # Question used by the planner/catalog alignment tests.
    _ALIGNMENT_QUESTION = "show planner alignment"

    # Question used by the business-overview tests.
    _TOP_YEAR_QUESTION = "какой у нас самый доходный год"

    # Long reply: a "Коротко:" lead-in followed by 220 repetitions of filler text.
    _VERBOSE_NON_ANSWER = (
        "Коротко: Ограниченный бизнес-обзор по подтвержденным строкам 1С. " + ("лишний текст " * 220)
    )

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _build_step_state(
        *,
        scenario_id: str,
        domain: str,
        title: str,
        question: str,
        reply_type: str,
        text: str,
        debug_payload: dict | None = None,
    ):
        """Call ``dcl.build_scenario_step_state`` for a single first step.

        The same ``question`` is used as both the step's ``question_template``
        and the ``question_resolved`` argument. ``debug_payload`` becomes the
        turn artifact's ``technical_debug_payload`` (an empty dict when omitted).
        Returns whatever ``dcl.build_scenario_step_state`` returns.
        """
        return dcl.build_scenario_step_state(
            scenario_id=scenario_id,
            domain=domain,
            step={
                "step_id": "step_01",
                "title": title,
                "depends_on": [],
                "question_template": question,
            },
            step_index=1,
            question_resolved=question,
            analysis_context={},
            turn_artifact={
                "assistant_message": {
                    "reply_type": reply_type,
                    "text": text,
                    "message_id": "msg-1",
                    "trace_id": "trace-1",
                },
                "technical_debug_payload": {} if debug_payload is None else debug_payload,
                "session_summary": {},
            },
            entries=[],
        )

    @classmethod
    def _business_step_state(cls, text: str, question: str | None = None):
        """Build the step state for the ``business_overview`` demo scenario.

        ``text`` is the assistant reply under test; ``question`` defaults to
        ``_TOP_YEAR_QUESTION``.
        """
        return cls._build_step_state(
            scenario_id="business_surface_demo",
            domain="business_overview",
            title="Top year",
            question=cls._TOP_YEAR_QUESTION if question is None else question,
            reply_type="partial_coverage",
            text=text,
        )

    @staticmethod
    def _truth_step(question: str, **expectations: object) -> dict[str, object]:
        """Return a critical truth-step definition for ``question``.

        Extra keyword arguments are appended after the base keys, in the order
        given, as additional step fields (e.g. ``expected_*`` entries).
        """
        return {
            "step_id": "step_01",
            "question_template": question,
            "criticality": "critical",
            "allowed_reply_types": [],
            **expectations,
        }

    @classmethod
    def _alignment_step_state(
        cls,
        *,
        status: str,
        top_match: str,
        selected_matches_top: bool,
    ) -> dict[str, object]:
        """Return a hand-built step state that varies only the catalog alignment fields."""
        return {
            "question_resolved": cls._ALIGNMENT_QUESTION,
            "reply_type": "factual",
            "assistant_text": "Confirmed answer",
            "actual_direct_answer": "Confirmed answer",
            "detected_intent": "counterparty_turnover",
            "selected_recipe": "counterparty_turnover_by_period",
            "capability_id": "confirmed_counterparty_turnover",
            "mcp_discovery_catalog_chain_alignment_status": status,
            "mcp_discovery_catalog_chain_top_match": top_match,
            "mcp_discovery_catalog_chain_selected_matches_top": selected_matches_top,
            "extracted_filters": {},
        }

    @staticmethod
    def _evaluate(step, step_state):
        """Run ``dth.evaluate_truth_step`` with empty step results and bindings."""
        return dth.evaluate_truth_step(
            step=step,
            step_state=step_state,
            step_results={},
            bindings={},
            runtime_bindings={},
        )

    # -------------------------------------------------- catalog alignment tests

    def test_preserves_mcp_catalog_alignment_debug_fields(self) -> None:
        """Alignment fields from the debug payload must appear in the step state."""
        step_state = self._build_step_state(
            scenario_id="planner_alignment_demo",
            domain="planner_autonomy",
            title="Alignment visibility",
            question=self._ALIGNMENT_QUESTION,
            reply_type="factual",
            text="Confirmed answer",
            debug_payload={
                "detected_mode": "address_query",
                "detected_intent": "counterparty_turnover",
                "selected_recipe": "counterparty_turnover_by_period",
                "capability_id": "confirmed_counterparty_turnover",
                "mcp_discovery_catalog_chain_alignment_status": "selected_matches_top",
                "mcp_discovery_catalog_chain_top_match": "value_flow",
                "mcp_discovery_catalog_chain_selected_matches_top": True,
            },
        )

        self.assertEqual(step_state["mcp_discovery_catalog_chain_alignment_status"], "selected_matches_top")
        self.assertEqual(step_state["mcp_discovery_catalog_chain_top_match"], "value_flow")
        self.assertTrue(step_state["mcp_discovery_catalog_chain_selected_matches_top"])

    def test_truth_harness_warns_on_catalog_alignment_divergence(self) -> None:
        """A selection outside the match set yields exactly one warning finding."""
        reviewed = self._evaluate(
            self._truth_step(self._ALIGNMENT_QUESTION),
            self._alignment_step_state(
                status="selected_outside_match_set",
                top_match="value_flow_comparison",
                selected_matches_top=False,
            ),
        )

        self.assertEqual(reviewed["review_status"], "warning")
        self.assertEqual(reviewed["warning_findings_count"], 1)
        self.assertEqual(reviewed["review_findings"][0]["code"], "catalog_alignment_divergence")
        self.assertEqual(reviewed["review_findings"][0]["severity"], "warning")

    def test_truth_harness_checks_expected_catalog_alignment_fields(self) -> None:
        """A top match differing from the step's expectation fails the review."""
        reviewed = self._evaluate(
            self._truth_step(
                self._ALIGNMENT_QUESTION,
                expected_catalog_alignment_status="selected_matches_top",
                # Deliberately differs from the actual top match ("value_flow") below.
                expected_catalog_chain_top_match="value_flow_comparison",
                expected_catalog_selected_matches_top=True,
            ),
            self._alignment_step_state(
                status="selected_matches_top",
                top_match="value_flow",
                selected_matches_top=True,
            ),
        )

        self.assertEqual(reviewed["review_status"], "fail")
        self.assertEqual(reviewed["critical_findings_count"], 1)
        self.assertEqual(reviewed["review_findings"][0]["code"], "wrong_catalog_chain_top_match")

    # ---------------------------------------------- business-first review tests

    def test_business_first_review_flags_dirty_direct_answer_surface(self) -> None:
        """The long filler reply must be flagged by the business-first review."""
        step_state = self._business_step_state(self._VERBOSE_NON_ANSWER)

        review = step_state["business_first_review"]
        self.assertFalse(review["direct_answer_first_ok"])
        self.assertFalse(review["business_usefulness_ok"])
        self.assertIn("business_direct_answer_missing", review["issue_codes"])
        self.assertIn("answer_layering_noise", review["issue_codes"])
        self.assertIn("business_answer_too_verbose", review["issue_codes"])
        self.assertIn("business_direct_answer_missing", step_state["violated_invariants"])

    def test_business_first_review_accepts_compact_direct_answer_surface(self) -> None:
        """A short reply that leads with the answer produces no issue codes."""
        step_state = self._business_step_state(
            "Коротко: самый доходный год в доступном денежном контуре 1С — 2015: 136 723 459,73 руб.\nМетод: считаю по подтвержденным входящим поступлениям."
        )

        review = step_state["business_first_review"]
        self.assertTrue(review["direct_answer_first_ok"])
        self.assertTrue(review["business_usefulness_ok"])
        self.assertEqual(review["issue_codes"], [])

    def test_business_first_review_separates_direct_answer_from_later_technical_leak(self) -> None:
        """A trailing ``service:`` line is flagged without voiding the direct answer."""
        question = "\u043a\u0430\u043a\u043e\u0439 \u0443 \u043d\u0430\u0441 \u0441\u0430\u043c\u044b\u0439 \u0434\u043e\u0445\u043e\u0434\u043d\u044b\u0439 \u0433\u043e\u0434"
        step_state = self._business_step_state(
            "2015 \u2014 \u0441\u0430\u043c\u044b\u0439 \u0434\u043e\u0445\u043e\u0434\u043d\u044b\u0439 \u0433\u043e\u0434 \u043f\u043e \u043f\u043e\u0434\u0442\u0432\u0435\u0440\u0436\u0434\u0435\u043d\u043d\u044b\u043c \u0432\u0445\u043e\u0434\u044f\u0449\u0438\u043c \u0434\u0435\u043d\u044c\u0433\u0430\u043c.\nservice: capability_id=business_overview_route_template_v1",
            question=question,
        )

        review = step_state["business_first_review"]
        self.assertTrue(review["direct_answer_first_ok"])
        self.assertTrue(review["technical_garbage_present"])
        self.assertIn("technical_garbage_in_answer", review["issue_codes"])
        self.assertNotIn("business_direct_answer_missing", review["issue_codes"])

    def test_truth_harness_promotes_business_review_issues_to_findings(self) -> None:
        """Business-review issue codes surface as ``business_review:``-prefixed findings."""
        step_state = self._business_step_state(self._VERBOSE_NON_ANSWER)
        reviewed = self._evaluate(self._truth_step(self._TOP_YEAR_QUESTION), step_state)

        codes = [item["code"] for item in reviewed["review_findings"]]
        self.assertIn("business_review:business_direct_answer_missing", codes)
        self.assertIn("business_review:answer_layering_noise", codes)
        self.assertEqual(reviewed["review_status"], "fail")


# Run the tests through unittest's command-line runner when this file is
# executed directly; importing the module does nothing here.
if __name__ == "__main__":
    unittest.main()