{ "run_id": "eval-LyUcB42oK9", "timestamp": "2026-03-23T15:21:58.788Z", "mode": "single-pass-strict", "use_mock": true, "prompt_version": "normalizer_v1_1_1", "dataset": { "source": "file", "file": "normalizer_eval_v1_1_30cases.json" }, "cases_total": 5, "metrics": { "schema_validation_pass_rate": 100, "intent_class_accuracy": 40, "route_hint_accuracy": 100, "causal_flag_accuracy": 100, "high_confidence_error_rate": 0 }, "baseline_metrics": { "schema_validation_pass_rate": 100, "intent_class_accuracy": 72.73, "route_hint_accuracy": 90.91, "causal_flag_accuracy": 81.82, "high_confidence_error_rate": 9.09 }, "baseline_delta": { "schema_validation_pass_rate": 0, "intent_class_accuracy": -32.73, "route_hint_accuracy": 9.09, "causal_flag_accuracy": 18.18, "high_confidence_error_rate": -9.09 }, "class_accuracy": { "drilldown_explain": { "total": 2, "passed": 2, "accuracy_percent": 100 }, "period_close_risk": { "total": 2, "passed": 0, "accuracy_percent": 0 }, "anomaly_probe": { "total": 1, "passed": 0, "accuracy_percent": 0 } }, "budget": { "requests_total": 0, "retries_used": 0, "guidance": { "forensic_calls_max": 10, "final_eval_calls_max": 30, "target_total_calls_max": 40, "hard_cap_calls_max": 45 } }, "mismatches": [ { "case_id": "V11-OT-003", "expected_intent_class": "period_close_risk", "actual_intent_class": "heavy_analytical", "expected_route_hint": "batch_refresh_then_store", "actual_route_hint": "batch_refresh_then_store", "expected_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false }, "actual_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false, "needs_exact_object_trace": false, "needs_ranking": true, "needs_anomaly_summary": false, "needs_runtime_truth": false, "needs_period_cut": true, "needs_evidence": false }, "comment": "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket.", "trace_id": "LyziNYqDRnmIav" }, { "case_id": "V11-OT-004", "expected_intent_class": "anomaly_probe", "actual_intent_class": "rule_based_account_control", "expected_route_hint": "store_feature_risk", "actual_route_hint": "store_feature_risk", "expected_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false }, "actual_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false, "needs_exact_object_trace": false, "needs_ranking": false, "needs_anomaly_summary": true, "needs_runtime_truth": false, "needs_period_cut": true, "needs_evidence": false }, "comment": "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket.", "trace_id": "mo09Q4j0hOVWgw" }, { "case_id": "V11-OT-005", "expected_intent_class": "period_close_risk", "actual_intent_class": "heavy_analytical", "expected_route_hint": "batch_refresh_then_store", "actual_route_hint": "batch_refresh_then_store", "expected_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false }, "actual_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false, "needs_exact_object_trace": false, "needs_ranking": true, "needs_anomaly_summary": false, "needs_runtime_truth": false, "needs_period_cut": false, "needs_evidence": false }, "comment": "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket.", "trace_id": "zLXsDrsg2vR5Qs" } ], "bad_confidence_cases": [], "results": [ { "case_id": "NQ-008", "raw_question": "Покажи по банку документ №TRX-88 и связанную проводку по 51.", "validation_passed": true, "intent_match": true, "route_match": true, "causal_flags_match": true, "expected_intent_class": "drilldown_explain", "actual_intent_class": "drilldown_explain", "expected_route_hint": "live_mcp_drilldown", "actual_route_hint": "live_mcp_drilldown", "expected_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false }, "actual_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false, "needs_exact_object_trace": true, "needs_ranking": false, "needs_anomaly_summary": false, "needs_runtime_truth": true, "needs_period_cut": false, "needs_evidence": true }, "confidence_overall": "low", "trace_id": "rnUogTOSYvvR0g", "request_count_for_case": 0 }, { "case_id": "V11-DD-005", "raw_question": "Покажи карточку конкретной операции DOC-7781 и связанную проводку.", "validation_passed": true, "intent_match": true, "route_match": true, "causal_flags_match": true, "expected_intent_class": "drilldown_explain", "actual_intent_class": "drilldown_explain", "expected_route_hint": "live_mcp_drilldown", "actual_route_hint": "live_mcp_drilldown", "expected_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false }, "actual_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false, "needs_exact_object_trace": true, "needs_ranking": false, "needs_anomaly_summary": false, "needs_runtime_truth": true, "needs_period_cut": false, "needs_evidence": true }, "confidence_overall": "low", "trace_id": "LSKGq9-z1wUR10", "request_count_for_case": 0 }, { "case_id": "V11-OT-003", "raw_question": "Перед закрытием периода что у нас может взорваться в последний день?", "validation_passed": true, "intent_match": false, "route_match": true, "causal_flags_match": true, "expected_intent_class": "period_close_risk", "actual_intent_class": "heavy_analytical", "expected_route_hint": "batch_refresh_then_store", "actual_route_hint": "batch_refresh_then_store", "expected_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false }, "actual_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false, "needs_exact_object_trace": false, "needs_ranking": true, "needs_anomaly_summary": false, "needs_runtime_truth": false, "needs_period_cut": true, "needs_evidence": false }, "confidence_overall": "medium", "trace_id": "LyziNYqDRnmIav", "request_count_for_case": 0 }, { "case_id": "V11-OT-004", "raw_question": "Где по июню выглядит подозрительно, но без точечного документа, просто дай зоны риска.", "validation_passed": true, "intent_match": false, "route_match": true, "causal_flags_match": true, "expected_intent_class": "anomaly_probe", "actual_intent_class": "rule_based_account_control", "expected_route_hint": "store_feature_risk", "actual_route_hint": "store_feature_risk", "expected_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false }, "actual_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false, "needs_exact_object_trace": false, "needs_ranking": false, "needs_anomaly_summary": true, "needs_runtime_truth": false, "needs_period_cut": true, "needs_evidence": false }, "confidence_overall": "medium", "trace_id": "mo09Q4j0hOVWgw", "request_count_for_case": 0 }, { "case_id": "V11-OT-005", "raw_question": "Че-то все криво на предзакрытии, где самые опасные места?", "validation_passed": true, "intent_match": false, "route_match": true, "causal_flags_match": true, "expected_intent_class": "period_close_risk", "actual_intent_class": "heavy_analytical", "expected_route_hint": "batch_refresh_then_store", "actual_route_hint": "batch_refresh_then_store", "expected_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false }, "actual_requires": { "needs_cross_entity_join": false, "needs_causal_chain": false, "needs_exact_object_trace": false, "needs_ranking": true, "needs_anomaly_summary": false, "needs_runtime_truth": false, "needs_period_cut": false, "needs_evidence": false }, "confidence_overall": "low", "trace_id": "zLXsDrsg2vR5Qs", "request_count_for_case": 0 } ] }