112 lines
3.2 KiB
JSON
112 lines
3.2 KiB
JSON
{
|
|
"run_id": "eval-YfhJafdzmT",
|
|
"timestamp": "2026-03-26T12:41:09.003Z",
|
|
"mode": "single-pass-strict",
|
|
"use_mock": true,
|
|
"prompt_version": "normalizer_v2_0_2",
|
|
"schema_version": "v2_0_2",
|
|
"dataset": {
|
|
"source": "inline_raw_questions",
|
|
"file": null,
|
|
"raw_questions_count": 2
|
|
},
|
|
"cases_total": 2,
|
|
"metrics": {
|
|
"schema_validation_pass_rate": 100,
|
|
"scope_detection_accuracy": null,
|
|
"scope_in_scope_rate": 100,
|
|
"multi_intent_detected_rate": 0,
|
|
"clarification_required_rate": 0,
|
|
"avg_fragments_per_message": 1,
|
|
"out_of_scope_fragment_rate": 0,
|
|
"routed_fragment_rate": 100,
|
|
"no_route_fragment_rate": 0,
|
|
"route_resolution_accuracy": null,
|
|
"no_route_precision": null,
|
|
"false_no_route_rate": null,
|
|
"execution_state_consistency_rate": 100,
|
|
"executable_with_soft_assumptions_rate": 100,
|
|
"soft_assumption_used_fragment_rate": 100,
|
|
"clarification_precision": null,
|
|
"clarification_recall": null,
|
|
"false_clarification_rate": null
|
|
},
|
|
"budget": {
|
|
"requests_total": 0,
|
|
"retries_used": 0
|
|
},
|
|
"clarification_eval": {
|
|
"labeled_cases": 0,
|
|
"true_positive": 0,
|
|
"false_positive": 0,
|
|
"false_negative": 0
|
|
},
|
|
"route_eval": {
|
|
"labeled_cases": 0,
|
|
"correct_cases": 0,
|
|
"expected_routed_cases": 0,
|
|
"no_route_true_positive": 0,
|
|
"no_route_false_positive": 0
|
|
},
|
|
"scope_eval": {
|
|
"labeled_cases": 0,
|
|
"correct_cases": 0
|
|
},
|
|
"execution_state_eval": {
|
|
"checks_total": 2,
|
|
"checks_passed": 2
|
|
},
|
|
"route_distribution": {
|
|
"store_feature_risk": 2
|
|
},
|
|
"fallback_distribution": {
|
|
"none": 2
|
|
},
|
|
"results": [
|
|
{
|
|
"case_id": "BQ-001",
|
|
"raw_question": "Проверь счет 60 за июнь 2020",
|
|
"validation_passed": true,
|
|
"message_in_scope": true,
|
|
"scope_confidence": "high",
|
|
"contains_multiple_tasks": false,
|
|
"fragments_total": 1,
|
|
"in_scope_fragments": 1,
|
|
"out_of_scope_fragments": 0,
|
|
"unclear_fragments": 0,
|
|
"fallback_type": "none",
|
|
"predicted_route_status": "routed",
|
|
"expected_route_status": null,
|
|
"predicted_no_route_reason": null,
|
|
"expected_no_route_reason": null,
|
|
"predicted_clarification_required": false,
|
|
"expected_clarification_required": null,
|
|
"executable_with_soft_assumptions_fragments": 1,
|
|
"trace_id": "1ueLJSRaJedoxg",
|
|
"request_count_for_case": 0
|
|
},
|
|
{
|
|
"case_id": "BQ-002",
|
|
"raw_question": "Покажи риски по счету 97",
|
|
"validation_passed": true,
|
|
"message_in_scope": true,
|
|
"scope_confidence": "high",
|
|
"contains_multiple_tasks": false,
|
|
"fragments_total": 1,
|
|
"in_scope_fragments": 1,
|
|
"out_of_scope_fragments": 0,
|
|
"unclear_fragments": 0,
|
|
"fallback_type": "none",
|
|
"predicted_route_status": "routed",
|
|
"expected_route_status": null,
|
|
"predicted_no_route_reason": null,
|
|
"expected_no_route_reason": null,
|
|
"predicted_clarification_required": false,
|
|
"expected_clarification_required": null,
|
|
"executable_with_soft_assumptions_fragments": 1,
|
|
"trace_id": "f_76vMb-8Q45n2",
|
|
"request_count_for_case": 0
|
|
}
|
|
]
|
|
}
|