205 lines
6.7 KiB
JSON
205 lines
6.7 KiB
JSON
{
|
||
"run_id": "eval-Z9wlxFh4Wa",
|
||
"timestamp": "2026-03-23T15:49:49.563Z",
|
||
"mode": "single-pass-strict",
|
||
"use_mock": true,
|
||
"prompt_version": "normalizer_v1_1_2",
|
||
"dataset": {
|
||
"source": "file",
|
||
"file": "normalizer_eval_v1_1_30cases.json"
|
||
},
|
||
"cases_total": 5,
|
||
"metrics": {
|
||
"schema_validation_pass_rate": 100,
|
||
"intent_class_accuracy": 100,
|
||
"route_hint_accuracy": 100,
|
||
"causal_flag_accuracy": 100,
|
||
"high_confidence_error_rate": 0
|
||
},
|
||
"baseline_metrics": {
|
||
"schema_validation_pass_rate": 100,
|
||
"intent_class_accuracy": 72.73,
|
||
"route_hint_accuracy": 90.91,
|
||
"causal_flag_accuracy": 81.82,
|
||
"high_confidence_error_rate": 9.09
|
||
},
|
||
"baseline_delta": {
|
||
"schema_validation_pass_rate": 0,
|
||
"intent_class_accuracy": 27.27,
|
||
"route_hint_accuracy": 9.09,
|
||
"causal_flag_accuracy": 18.18,
|
||
"high_confidence_error_rate": -9.09
|
||
},
|
||
"class_accuracy": {
|
||
"heavy_analytical": {
|
||
"total": 3,
|
||
"passed": 3,
|
||
"accuracy_percent": 100
|
||
},
|
||
"period_close_risk": {
|
||
"total": 2,
|
||
"passed": 2,
|
||
"accuracy_percent": 100
|
||
}
|
||
},
|
||
"budget": {
|
||
"requests_total": 0,
|
||
"retries_used": 0,
|
||
"guidance": {
|
||
"forensic_calls_max": 10,
|
||
"final_eval_calls_max": 30,
|
||
"target_total_calls_max": 40,
|
||
"hard_cap_calls_max": 45
|
||
}
|
||
},
|
||
"mismatches": [],
|
||
"bad_confidence_cases": [],
|
||
"results": [
|
||
{
|
||
"case_id": "NQ-002",
|
||
"raw_question": "Сделай рейтинг самых рисковых хвостов перед закрытием периода за июнь.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "heavy_analytical",
|
||
"actual_intent_class": "heavy_analytical",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": true,
|
||
"needs_anomaly_summary": true,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "VsAOz81dbRjpxp",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "NQ-007",
|
||
"raw_question": "Что у нас выглядит самым проблемным перед закрытием июня, если смотреть на компанию в целом?",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "heavy_analytical",
|
||
"actual_intent_class": "heavy_analytical",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": true,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "i6vYaeziX0JPGh",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-HA-004",
|
||
"raw_question": "Дай обзорный риск-срез перед сдачей отчетности: где максимальная концентрация ошибок.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "heavy_analytical",
|
||
"actual_intent_class": "heavy_analytical",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": true,
|
||
"needs_anomaly_summary": true,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "JU-GLRW5-TVJYu",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-OT-003",
|
||
"raw_question": "Перед закрытием периода что у нас может взорваться в последний день?",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "period_close_risk",
|
||
"actual_intent_class": "period_close_risk",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "EJimLLNg_2cB-D",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-OT-005",
|
||
"raw_question": "Че-то все криво на предзакрытии, где самые опасные места?",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "period_close_risk",
|
||
"actual_intent_class": "period_close_risk",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "2d9EQnCBaYkcsB",
|
||
"request_count_for_case": 0
|
||
}
|
||
]
|
||
}
|