NODEDC_1C/llm_normalizer/data/eval_cases/eval-LyUcB42oK9.report.json

279 lines
9.2 KiB
JSON
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"run_id": "eval-LyUcB42oK9",
"timestamp": "2026-03-23T15:21:58.788Z",
"mode": "single-pass-strict",
"use_mock": true,
"prompt_version": "normalizer_v1_1_1",
"dataset": {
"source": "file",
"file": "normalizer_eval_v1_1_30cases.json"
},
"cases_total": 5,
"metrics": {
"schema_validation_pass_rate": 100,
"intent_class_accuracy": 40,
"route_hint_accuracy": 100,
"causal_flag_accuracy": 100,
"high_confidence_error_rate": 0
},
"baseline_metrics": {
"schema_validation_pass_rate": 100,
"intent_class_accuracy": 72.73,
"route_hint_accuracy": 90.91,
"causal_flag_accuracy": 81.82,
"high_confidence_error_rate": 9.09
},
"baseline_delta": {
"schema_validation_pass_rate": 0,
"intent_class_accuracy": -32.73,
"route_hint_accuracy": 9.09,
"causal_flag_accuracy": 18.18,
"high_confidence_error_rate": -9.09
},
"class_accuracy": {
"drilldown_explain": {
"total": 2,
"passed": 2,
"accuracy_percent": 100
},
"period_close_risk": {
"total": 2,
"passed": 0,
"accuracy_percent": 0
},
"anomaly_probe": {
"total": 1,
"passed": 0,
"accuracy_percent": 0
}
},
"budget": {
"requests_total": 0,
"retries_used": 0,
"guidance": {
"forensic_calls_max": 10,
"final_eval_calls_max": 30,
"target_total_calls_max": 40,
"hard_cap_calls_max": 45
}
},
"mismatches": [
{
"case_id": "V11-OT-003",
"expected_intent_class": "period_close_risk",
"actual_intent_class": "heavy_analytical",
"expected_route_hint": "batch_refresh_then_store",
"actual_route_hint": "batch_refresh_then_store",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": false,
"needs_ranking": true,
"needs_anomaly_summary": false,
"needs_runtime_truth": false,
"needs_period_cut": true,
"needs_evidence": false
},
"comment": "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket.",
"trace_id": "LyziNYqDRnmIav"
},
{
"case_id": "V11-OT-004",
"expected_intent_class": "anomaly_probe",
"actual_intent_class": "rule_based_account_control",
"expected_route_hint": "store_feature_risk",
"actual_route_hint": "store_feature_risk",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": false,
"needs_ranking": false,
"needs_anomaly_summary": true,
"needs_runtime_truth": false,
"needs_period_cut": true,
"needs_evidence": false
},
"comment": "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket.",
"trace_id": "mo09Q4j0hOVWgw"
},
{
"case_id": "V11-OT-005",
"expected_intent_class": "period_close_risk",
"actual_intent_class": "heavy_analytical",
"expected_route_hint": "batch_refresh_then_store",
"actual_route_hint": "batch_refresh_then_store",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": false,
"needs_ranking": true,
"needs_anomaly_summary": false,
"needs_runtime_truth": false,
"needs_period_cut": false,
"needs_evidence": false
},
"comment": "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket.",
"trace_id": "zLXsDrsg2vR5Qs"
}
],
"bad_confidence_cases": [],
"results": [
{
"case_id": "NQ-008",
"raw_question": "Покажи по банку документ №TRX-88 и связанную проводку по 51.",
"validation_passed": true,
"intent_match": true,
"route_match": true,
"causal_flags_match": true,
"expected_intent_class": "drilldown_explain",
"actual_intent_class": "drilldown_explain",
"expected_route_hint": "live_mcp_drilldown",
"actual_route_hint": "live_mcp_drilldown",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": true,
"needs_ranking": false,
"needs_anomaly_summary": false,
"needs_runtime_truth": true,
"needs_period_cut": false,
"needs_evidence": true
},
"confidence_overall": "low",
"trace_id": "rnUogTOSYvvR0g",
"request_count_for_case": 0
},
{
"case_id": "V11-DD-005",
"raw_question": "Покажи карточку конкретной операции DOC-7781 и связанную проводку.",
"validation_passed": true,
"intent_match": true,
"route_match": true,
"causal_flags_match": true,
"expected_intent_class": "drilldown_explain",
"actual_intent_class": "drilldown_explain",
"expected_route_hint": "live_mcp_drilldown",
"actual_route_hint": "live_mcp_drilldown",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": true,
"needs_ranking": false,
"needs_anomaly_summary": false,
"needs_runtime_truth": true,
"needs_period_cut": false,
"needs_evidence": true
},
"confidence_overall": "low",
"trace_id": "LSKGq9-z1wUR10",
"request_count_for_case": 0
},
{
"case_id": "V11-OT-003",
"raw_question": "Перед закрытием периода что у нас может взорваться в последний день?",
"validation_passed": true,
"intent_match": false,
"route_match": true,
"causal_flags_match": true,
"expected_intent_class": "period_close_risk",
"actual_intent_class": "heavy_analytical",
"expected_route_hint": "batch_refresh_then_store",
"actual_route_hint": "batch_refresh_then_store",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": false,
"needs_ranking": true,
"needs_anomaly_summary": false,
"needs_runtime_truth": false,
"needs_period_cut": true,
"needs_evidence": false
},
"confidence_overall": "medium",
"trace_id": "LyziNYqDRnmIav",
"request_count_for_case": 0
},
{
"case_id": "V11-OT-004",
"raw_question": "Где по июню выглядит подозрительно, но без точечного документа, просто дай зоны риска.",
"validation_passed": true,
"intent_match": false,
"route_match": true,
"causal_flags_match": true,
"expected_intent_class": "anomaly_probe",
"actual_intent_class": "rule_based_account_control",
"expected_route_hint": "store_feature_risk",
"actual_route_hint": "store_feature_risk",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": false,
"needs_ranking": false,
"needs_anomaly_summary": true,
"needs_runtime_truth": false,
"needs_period_cut": true,
"needs_evidence": false
},
"confidence_overall": "medium",
"trace_id": "mo09Q4j0hOVWgw",
"request_count_for_case": 0
},
{
"case_id": "V11-OT-005",
"raw_question": "Че-то все криво на предзакрытии, где самые опасные места?",
"validation_passed": true,
"intent_match": false,
"route_match": true,
"causal_flags_match": true,
"expected_intent_class": "period_close_risk",
"actual_intent_class": "heavy_analytical",
"expected_route_hint": "batch_refresh_then_store",
"actual_route_hint": "batch_refresh_then_store",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": false,
"needs_ranking": true,
"needs_anomaly_summary": false,
"needs_runtime_truth": false,
"needs_period_cut": false,
"needs_evidence": false
},
"confidence_overall": "low",
"trace_id": "zLXsDrsg2vR5Qs",
"request_count_for_case": 0
}
]
}