NODEDC_1C/llm_normalizer/data/eval_cases/eval-rcY2eNdWNz.report.json

209 lines
6.7 KiB
JSON
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"run_id": "eval-rcY2eNdWNz",
"timestamp": "2026-03-23T15:23:51.992Z",
"mode": "single-pass-strict",
"use_mock": true,
"prompt_version": "normalizer_v1_1_1",
"dataset": {
"source": "file",
"file": "normalizer_eval_v1_1_30cases.json"
},
"cases_total": 5,
"metrics": {
"schema_validation_pass_rate": 100,
"intent_class_accuracy": 100,
"route_hint_accuracy": 100,
"causal_flag_accuracy": 100,
"high_confidence_error_rate": 0
},
"baseline_metrics": {
"schema_validation_pass_rate": 100,
"intent_class_accuracy": 72.73,
"route_hint_accuracy": 90.91,
"causal_flag_accuracy": 81.82,
"high_confidence_error_rate": 9.09
},
"baseline_delta": {
"schema_validation_pass_rate": 0,
"intent_class_accuracy": 27.27,
"route_hint_accuracy": 9.09,
"causal_flag_accuracy": 18.18,
"high_confidence_error_rate": -9.09
},
"class_accuracy": {
"drilldown_explain": {
"total": 2,
"passed": 2,
"accuracy_percent": 100
},
"period_close_risk": {
"total": 2,
"passed": 2,
"accuracy_percent": 100
},
"anomaly_probe": {
"total": 1,
"passed": 1,
"accuracy_percent": 100
}
},
"budget": {
"requests_total": 0,
"retries_used": 0,
"guidance": {
"forensic_calls_max": 10,
"final_eval_calls_max": 30,
"target_total_calls_max": 40,
"hard_cap_calls_max": 45
}
},
"mismatches": [],
"bad_confidence_cases": [],
"results": [
{
"case_id": "NQ-008",
"raw_question": "Покажи по банку документ №TRX-88 и связанную проводку по 51.",
"validation_passed": true,
"intent_match": true,
"route_match": true,
"causal_flags_match": true,
"expected_intent_class": "drilldown_explain",
"actual_intent_class": "drilldown_explain",
"expected_route_hint": "live_mcp_drilldown",
"actual_route_hint": "live_mcp_drilldown",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": true,
"needs_ranking": false,
"needs_anomaly_summary": false,
"needs_runtime_truth": true,
"needs_period_cut": false,
"needs_evidence": true
},
"confidence_overall": "low",
"trace_id": "fjIuLbOgy1GlvC",
"request_count_for_case": 0
},
{
"case_id": "V11-DD-005",
"raw_question": "Покажи карточку конкретной операции DOC-7781 и связанную проводку.",
"validation_passed": true,
"intent_match": true,
"route_match": true,
"causal_flags_match": true,
"expected_intent_class": "drilldown_explain",
"actual_intent_class": "drilldown_explain",
"expected_route_hint": "live_mcp_drilldown",
"actual_route_hint": "live_mcp_drilldown",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": true,
"needs_ranking": false,
"needs_anomaly_summary": false,
"needs_runtime_truth": true,
"needs_period_cut": false,
"needs_evidence": true
},
"confidence_overall": "low",
"trace_id": "hYPahELBOHx8Gp",
"request_count_for_case": 0
},
{
"case_id": "V11-OT-003",
"raw_question": "Перед закрытием периода что у нас может взорваться в последний день?",
"validation_passed": true,
"intent_match": true,
"route_match": true,
"causal_flags_match": true,
"expected_intent_class": "period_close_risk",
"actual_intent_class": "period_close_risk",
"expected_route_hint": "batch_refresh_then_store",
"actual_route_hint": "batch_refresh_then_store",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": false,
"needs_ranking": false,
"needs_anomaly_summary": false,
"needs_runtime_truth": false,
"needs_period_cut": true,
"needs_evidence": false
},
"confidence_overall": "medium",
"trace_id": "sMardbub40y2av",
"request_count_for_case": 0
},
{
"case_id": "V11-OT-004",
"raw_question": "Где по июню выглядит подозрительно, но без точечного документа, просто дай зоны риска.",
"validation_passed": true,
"intent_match": true,
"route_match": true,
"causal_flags_match": true,
"expected_intent_class": "anomaly_probe",
"actual_intent_class": "anomaly_probe",
"expected_route_hint": "store_feature_risk",
"actual_route_hint": "store_feature_risk",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": false,
"needs_ranking": false,
"needs_anomaly_summary": true,
"needs_runtime_truth": false,
"needs_period_cut": true,
"needs_evidence": false
},
"confidence_overall": "medium",
"trace_id": "Vj-YEh8keRj_lA",
"request_count_for_case": 0
},
{
"case_id": "V11-OT-005",
"raw_question": "Че-то все криво на предзакрытии, где самые опасные места?",
"validation_passed": true,
"intent_match": true,
"route_match": true,
"causal_flags_match": true,
"expected_intent_class": "period_close_risk",
"actual_intent_class": "period_close_risk",
"expected_route_hint": "batch_refresh_then_store",
"actual_route_hint": "batch_refresh_then_store",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": false,
"needs_ranking": false,
"needs_anomaly_summary": false,
"needs_runtime_truth": false,
"needs_period_cut": false,
"needs_evidence": false
},
"confidence_overall": "low",
"trace_id": "ziCohC1zzdxZuq",
"request_count_for_case": 0
}
]
}