{ "run_id": "eval-Z9wlxFh4Wa", "timestamp": "2026-03-23T15:49:49.563Z", "mode": "single-pass-strict", "use_mock": true, "prompt_version": "normalizer_v1_1_2", "dataset": { "source": "file", "file": "normalizer_eval_v1_1_30cases.json" }, "cases_total": 5, "metrics": { "schema_validation_pass_rate": 100, "intent_class_accuracy": 100, "route_hint_accuracy": 100, "causal_flag_accuracy": 100, "high_confidence_error_rate": 0 }, "baseline_metrics": { "schema_validation_pass_rate": 100, "intent_class_accuracy": 72.73, "route_hint_accuracy": 90.91, "causal_flag_accuracy": 81.82, "high_confidence_error_rate": 9.09 }, "baseline_delta": { "schema_validation_pass_rate": 0, "intent_class_accuracy": 27.27, "route_hint_accuracy": 9.09, "causal_flag_accuracy": 18.18, "high_confidence_error_rate": -9.09 }, "class_accuracy": { "heavy_analytical": { "total": 3, "passed": 3, "accuracy_percent": 100 }, "period_close_risk": { "total": 2, "passed": 2, "accuracy_percent": 100 } }, "budget": { "requests_total": 0, "retries_used": 0, "guidance": { "forensic_calls_max": 10, "final_eval_calls_max": 30, "target_total_calls_max": 40, "hard_cap_calls_max": 45 } }, "mismatches": [], "bad_confidence_cases": [], "results": [ { "case_id": "NQ-002", "raw_question": "Сделай рейтинг самых рисковых хвостов перед закрытием периода за июнь.", "validation_passed": true, "intent_match": true, "route_match": true, "causal_flags_match": true, "expected_intent_class": "heavy_analytical", "actual_intent_class": "heavy_analytical", "expected_route_hint": "batch_refresh_then_store", "actual_route_hint": "batch_refresh_then_store", "expected_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false }, "actual_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false, "needs_exact_object_trace": false, "needs_ranking": true, "needs_anomaly_summary": true, "needs_runtime_truth": false, "needs_period_cut": true, "needs_evidence": false }, "confidence_overall": "medium", "trace_id": "VsAOz81dbRjpxp", "request_count_for_case": 0 }, { "case_id": "NQ-007", "raw_question": "Что у нас выглядит самым проблемным перед закрытием июня, если смотреть на компанию в целом?", "validation_passed": true, "intent_match": true, "route_match": true, "causal_flags_match": true, "expected_intent_class": "heavy_analytical", "actual_intent_class": "heavy_analytical", "expected_route_hint": "batch_refresh_then_store", "actual_route_hint": "batch_refresh_then_store", "expected_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false }, "actual_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false, "needs_exact_object_trace": false, "needs_ranking": true, "needs_anomaly_summary": false, "needs_runtime_truth": false, "needs_period_cut": true, "needs_evidence": false }, "confidence_overall": "medium", "trace_id": "i6vYaeziX0JPGh", "request_count_for_case": 0 }, { "case_id": "V11-HA-004", "raw_question": "Дай обзорный риск-срез перед сдачей отчетности: где максимальная концентрация ошибок.", "validation_passed": true, "intent_match": true, "route_match": true, "causal_flags_match": true, "expected_intent_class": "heavy_analytical", "actual_intent_class": "heavy_analytical", "expected_route_hint": "batch_refresh_then_store", "actual_route_hint": "batch_refresh_then_store", "expected_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false }, "actual_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false, "needs_exact_object_trace": false, "needs_ranking": true, "needs_anomaly_summary": true, "needs_runtime_truth": false, "needs_period_cut": false, "needs_evidence": false }, "confidence_overall": "low", "trace_id": "JU-GLRW5-TVJYu", "request_count_for_case": 0 }, { "case_id": "V11-OT-003", "raw_question": "Перед закрытием периода что у нас может взорваться в последний день?", "validation_passed": true, "intent_match": true, "route_match": true, "causal_flags_match": true, "expected_intent_class": "period_close_risk", "actual_intent_class": "period_close_risk", "expected_route_hint": "batch_refresh_then_store", "actual_route_hint": "batch_refresh_then_store", "expected_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false }, "actual_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false, "needs_exact_object_trace": false, "needs_ranking": false, "needs_anomaly_summary": false, "needs_runtime_truth": false, "needs_period_cut": true, "needs_evidence": false }, "confidence_overall": "medium", "trace_id": "EJimLLNg_2cB-D", "request_count_for_case": 0 }, { "case_id": "V11-OT-005", "raw_question": "Че-то все криво на предзакрытии, где самые опасные места?", "validation_passed": true, "intent_match": true, "route_match": true, "causal_flags_match": true, "expected_intent_class": "period_close_risk", "actual_intent_class": "period_close_risk", "expected_route_hint": "batch_refresh_then_store", "actual_route_hint": "batch_refresh_then_store", "expected_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false }, "actual_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false, "needs_exact_object_trace": false, "needs_ranking": false, "needs_anomaly_summary": false, "needs_runtime_truth": false, "needs_period_cut": false, "needs_evidence": false }, "confidence_overall": "low", "trace_id": "2d9EQnCBaYkcsB", "request_count_for_case": 0 } ] }