NODEDC_1C/llm_normalizer/data/eval_cases/eval-KEtQ8SYPKI.report.json

137 lines
4.1 KiB
JSON

{
"run_id": "eval-KEtQ8SYPKI",
"timestamp": "2026-03-28T09:23:28.807Z",
"mode": "single-pass-strict",
"use_mock": true,
"prompt_version": "normalizer_v2_0_2",
"schema_version": "v2_0_2",
"dataset": {
"source": "inline_raw_questions",
"file": null,
"raw_questions_count": 3
},
"cases_total": 3,
"metrics": {
"schema_validation_pass_rate": 100,
"scope_detection_accuracy": null,
"scope_in_scope_rate": 33.33,
"multi_intent_detected_rate": 0,
"clarification_required_rate": 0,
"avg_fragments_per_message": 1,
"out_of_scope_fragment_rate": 33.33,
"routed_fragment_rate": 66.67,
"no_route_fragment_rate": 33.33,
"route_resolution_accuracy": null,
"no_route_precision": null,
"false_no_route_rate": null,
"execution_state_consistency_rate": 66.67,
"executable_with_soft_assumptions_rate": 100,
"soft_assumption_used_fragment_rate": 100,
"clarification_precision": null,
"clarification_recall": null,
"false_clarification_rate": null
},
"budget": {
"requests_total": 0,
"retries_used": 0
},
"clarification_eval": {
"labeled_cases": 0,
"true_positive": 0,
"false_positive": 0,
"false_negative": 0
},
"route_eval": {
"labeled_cases": 0,
"correct_cases": 0,
"expected_routed_cases": 0,
"no_route_true_positive": 0,
"no_route_false_positive": 0
},
"scope_eval": {
"labeled_cases": 0,
"correct_cases": 0
},
"execution_state_eval": {
"checks_total": 3,
"checks_passed": 2
},
"route_distribution": {
"hybrid_store_plus_live": 1,
"no_route": 1,
"batch_refresh_then_store": 1
},
"fallback_distribution": {
"none": 1,
"out_of_scope": 1,
"clarification": 1
},
"results": [
{
"case_id": "BQ-001",
"raw_question": "Проверь хвосты по поставщикам и разложи цепочку",
"validation_passed": true,
"message_in_scope": true,
"scope_confidence": "high",
"contains_multiple_tasks": false,
"fragments_total": 1,
"in_scope_fragments": 1,
"out_of_scope_fragments": 0,
"unclear_fragments": 0,
"fallback_type": "none",
"predicted_route_status": "routed",
"expected_route_status": null,
"predicted_no_route_reason": null,
"expected_no_route_reason": null,
"predicted_clarification_required": false,
"expected_clarification_required": null,
"executable_with_soft_assumptions_fragments": 1,
"trace_id": "wKL8CXbPIJDi5V",
"request_count_for_case": 0
},
{
"case_id": "BQ-002",
"raw_question": "Как вообще по ФСБУ",
"validation_passed": true,
"message_in_scope": false,
"scope_confidence": "low",
"contains_multiple_tasks": false,
"fragments_total": 1,
"in_scope_fragments": 0,
"out_of_scope_fragments": 1,
"unclear_fragments": 0,
"fallback_type": "out_of_scope",
"predicted_route_status": "no_route",
"expected_route_status": null,
"predicted_no_route_reason": "out_of_scope",
"expected_no_route_reason": null,
"predicted_clarification_required": false,
"expected_clarification_required": null,
"executable_with_soft_assumptions_fragments": 0,
"trace_id": "VGu1HWqb9Ka5QF",
"request_count_for_case": 0
},
{
"case_id": "BQ-003",
"raw_question": "Покажи топ рисков за июнь 2020",
"validation_passed": true,
"message_in_scope": false,
"scope_confidence": "low",
"contains_multiple_tasks": false,
"fragments_total": 1,
"in_scope_fragments": 0,
"out_of_scope_fragments": 0,
"unclear_fragments": 1,
"fallback_type": "clarification",
"predicted_route_status": "routed",
"expected_route_status": null,
"predicted_no_route_reason": null,
"expected_no_route_reason": null,
"predicted_clarification_required": false,
"expected_clarification_required": null,
"executable_with_soft_assumptions_fragments": 0,
"trace_id": "rvXo7PioBUelzY",
"request_count_for_case": 0
}
]
}