136 lines
4.1 KiB
JSON
136 lines
4.1 KiB
JSON
{
|
|
"run_id": "eval-Vgie6SCY1Z",
|
|
"timestamp": "2026-03-24T10:18:46.934Z",
|
|
"mode": "single-pass-strict",
|
|
"use_mock": true,
|
|
"prompt_version": "normalizer_v2_0_2",
|
|
"schema_version": "v2_0_2",
|
|
"dataset": {
|
|
"source": "inline_raw_questions",
|
|
"file": null,
|
|
"raw_questions_count": 3
|
|
},
|
|
"cases_total": 3,
|
|
"metrics": {
|
|
"schema_validation_pass_rate": 100,
|
|
"scope_detection_accuracy": null,
|
|
"scope_in_scope_rate": 33.33,
|
|
"multi_intent_detected_rate": 0,
|
|
"clarification_required_rate": 0,
|
|
"avg_fragments_per_message": 1,
|
|
"out_of_scope_fragment_rate": 33.33,
|
|
"routed_fragment_rate": 33.33,
|
|
"no_route_fragment_rate": 66.67,
|
|
"route_resolution_accuracy": null,
|
|
"no_route_precision": null,
|
|
"false_no_route_rate": null,
|
|
"execution_state_consistency_rate": 100,
|
|
"executable_with_soft_assumptions_rate": 100,
|
|
"soft_assumption_used_fragment_rate": 100,
|
|
"clarification_precision": null,
|
|
"clarification_recall": null,
|
|
"false_clarification_rate": null
|
|
},
|
|
"budget": {
|
|
"requests_total": 0,
|
|
"retries_used": 0
|
|
},
|
|
"clarification_eval": {
|
|
"labeled_cases": 0,
|
|
"true_positive": 0,
|
|
"false_positive": 0,
|
|
"false_negative": 0
|
|
},
|
|
"route_eval": {
|
|
"labeled_cases": 0,
|
|
"correct_cases": 0,
|
|
"expected_routed_cases": 0,
|
|
"no_route_true_positive": 0,
|
|
"no_route_false_positive": 0
|
|
},
|
|
"scope_eval": {
|
|
"labeled_cases": 0,
|
|
"correct_cases": 0
|
|
},
|
|
"execution_state_eval": {
|
|
"checks_total": 3,
|
|
"checks_passed": 3
|
|
},
|
|
"route_distribution": {
|
|
"store_feature_risk": 1,
|
|
"no_route": 2
|
|
},
|
|
"fallback_distribution": {
|
|
"none": 1,
|
|
"out_of_scope": 2
|
|
},
|
|
"results": [
|
|
{
|
|
"case_id": "BQ-001",
|
|
"raw_question": "Проверь хвосты по поставщикам и разложи цепочку",
|
|
"validation_passed": true,
|
|
"message_in_scope": true,
|
|
"scope_confidence": "high",
|
|
"contains_multiple_tasks": false,
|
|
"fragments_total": 1,
|
|
"in_scope_fragments": 1,
|
|
"out_of_scope_fragments": 0,
|
|
"unclear_fragments": 0,
|
|
"fallback_type": "none",
|
|
"predicted_route_status": "routed",
|
|
"expected_route_status": null,
|
|
"predicted_no_route_reason": null,
|
|
"expected_no_route_reason": null,
|
|
"predicted_clarification_required": false,
|
|
"expected_clarification_required": null,
|
|
"executable_with_soft_assumptions_fragments": 1,
|
|
"trace_id": "2Iv06y6I4tdoa_",
|
|
"request_count_for_case": 0
|
|
},
|
|
{
|
|
"case_id": "BQ-002",
|
|
"raw_question": "Как вообще по ФСБУ",
|
|
"validation_passed": true,
|
|
"message_in_scope": false,
|
|
"scope_confidence": "low",
|
|
"contains_multiple_tasks": false,
|
|
"fragments_total": 1,
|
|
"in_scope_fragments": 0,
|
|
"out_of_scope_fragments": 1,
|
|
"unclear_fragments": 0,
|
|
"fallback_type": "out_of_scope",
|
|
"predicted_route_status": "no_route",
|
|
"expected_route_status": null,
|
|
"predicted_no_route_reason": "out_of_scope",
|
|
"expected_no_route_reason": null,
|
|
"predicted_clarification_required": false,
|
|
"expected_clarification_required": null,
|
|
"executable_with_soft_assumptions_fragments": 0,
|
|
"trace_id": "Mx14pWPKBmvEDx",
|
|
"request_count_for_case": 0
|
|
},
|
|
{
|
|
"case_id": "BQ-003",
|
|
"raw_question": "Покажи топ рисков за июнь 2020",
|
|
"validation_passed": true,
|
|
"message_in_scope": false,
|
|
"scope_confidence": "low",
|
|
"contains_multiple_tasks": false,
|
|
"fragments_total": 1,
|
|
"in_scope_fragments": 0,
|
|
"out_of_scope_fragments": 0,
|
|
"unclear_fragments": 1,
|
|
"fallback_type": "out_of_scope",
|
|
"predicted_route_status": "no_route",
|
|
"expected_route_status": null,
|
|
"predicted_no_route_reason": "insufficient_specificity",
|
|
"expected_no_route_reason": null,
|
|
"predicted_clarification_required": false,
|
|
"expected_clarification_required": null,
|
|
"executable_with_soft_assumptions_fragments": 0,
|
|
"trace_id": "mWZV1v9J6SfNlH",
|
|
"request_count_for_case": 0
|
|
}
|
|
]
|
|
}
|