NODEDC_1C/llm_normalizer/data/eval_cases/eval-Z9wlxFh4Wa.report.json

204 lines
6.7 KiB
JSON
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"run_id": "eval-Z9wlxFh4Wa",
"timestamp": "2026-03-23T15:49:49.563Z",
"mode": "single-pass-strict",
"use_mock": true,
"prompt_version": "normalizer_v1_1_2",
"dataset": {
"source": "file",
"file": "normalizer_eval_v1_1_30cases.json"
},
"cases_total": 5,
"metrics": {
"schema_validation_pass_rate": 100,
"intent_class_accuracy": 100,
"route_hint_accuracy": 100,
"causal_flag_accuracy": 100,
"high_confidence_error_rate": 0
},
"baseline_metrics": {
"schema_validation_pass_rate": 100,
"intent_class_accuracy": 72.73,
"route_hint_accuracy": 90.91,
"causal_flag_accuracy": 81.82,
"high_confidence_error_rate": 9.09
},
"baseline_delta": {
"schema_validation_pass_rate": 0,
"intent_class_accuracy": 27.27,
"route_hint_accuracy": 9.09,
"causal_flag_accuracy": 18.18,
"high_confidence_error_rate": -9.09
},
"class_accuracy": {
"heavy_analytical": {
"total": 3,
"passed": 3,
"accuracy_percent": 100
},
"period_close_risk": {
"total": 2,
"passed": 2,
"accuracy_percent": 100
}
},
"budget": {
"requests_total": 0,
"retries_used": 0,
"guidance": {
"forensic_calls_max": 10,
"final_eval_calls_max": 30,
"target_total_calls_max": 40,
"hard_cap_calls_max": 45
}
},
"mismatches": [],
"bad_confidence_cases": [],
"results": [
{
"case_id": "NQ-002",
"raw_question": "Сделай рейтинг самых рисковых хвостов перед закрытием периода за июнь.",
"validation_passed": true,
"intent_match": true,
"route_match": true,
"causal_flags_match": true,
"expected_intent_class": "heavy_analytical",
"actual_intent_class": "heavy_analytical",
"expected_route_hint": "batch_refresh_then_store",
"actual_route_hint": "batch_refresh_then_store",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": false,
"needs_ranking": true,
"needs_anomaly_summary": true,
"needs_runtime_truth": false,
"needs_period_cut": true,
"needs_evidence": false
},
"confidence_overall": "medium",
"trace_id": "VsAOz81dbRjpxp",
"request_count_for_case": 0
},
{
"case_id": "NQ-007",
"raw_question": "Что у нас выглядит самым проблемным перед закрытием июня, если смотреть на компанию в целом?",
"validation_passed": true,
"intent_match": true,
"route_match": true,
"causal_flags_match": true,
"expected_intent_class": "heavy_analytical",
"actual_intent_class": "heavy_analytical",
"expected_route_hint": "batch_refresh_then_store",
"actual_route_hint": "batch_refresh_then_store",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": false,
"needs_ranking": true,
"needs_anomaly_summary": false,
"needs_runtime_truth": false,
"needs_period_cut": true,
"needs_evidence": false
},
"confidence_overall": "medium",
"trace_id": "i6vYaeziX0JPGh",
"request_count_for_case": 0
},
{
"case_id": "V11-HA-004",
"raw_question": "Дай обзорный риск-срез перед сдачей отчетности: где максимальная концентрация ошибок.",
"validation_passed": true,
"intent_match": true,
"route_match": true,
"causal_flags_match": true,
"expected_intent_class": "heavy_analytical",
"actual_intent_class": "heavy_analytical",
"expected_route_hint": "batch_refresh_then_store",
"actual_route_hint": "batch_refresh_then_store",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": false,
"needs_ranking": true,
"needs_anomaly_summary": true,
"needs_runtime_truth": false,
"needs_period_cut": false,
"needs_evidence": false
},
"confidence_overall": "low",
"trace_id": "JU-GLRW5-TVJYu",
"request_count_for_case": 0
},
{
"case_id": "V11-OT-003",
"raw_question": "Перед закрытием периода что у нас может взорваться в последний день?",
"validation_passed": true,
"intent_match": true,
"route_match": true,
"causal_flags_match": true,
"expected_intent_class": "period_close_risk",
"actual_intent_class": "period_close_risk",
"expected_route_hint": "batch_refresh_then_store",
"actual_route_hint": "batch_refresh_then_store",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": false,
"needs_ranking": false,
"needs_anomaly_summary": false,
"needs_runtime_truth": false,
"needs_period_cut": true,
"needs_evidence": false
},
"confidence_overall": "medium",
"trace_id": "EJimLLNg_2cB-D",
"request_count_for_case": 0
},
{
"case_id": "V11-OT-005",
"raw_question": "Че-то все криво на предзакрытии, где самые опасные места?",
"validation_passed": true,
"intent_match": true,
"route_match": true,
"causal_flags_match": true,
"expected_intent_class": "period_close_risk",
"actual_intent_class": "period_close_risk",
"expected_route_hint": "batch_refresh_then_store",
"actual_route_hint": "batch_refresh_then_store",
"expected_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false
},
"actual_requires": {
"needs_cross_entity_join": false,
"needs_causal_chain": false,
"needs_exact_object_trace": false,
"needs_ranking": false,
"needs_anomaly_summary": false,
"needs_runtime_truth": false,
"needs_period_cut": false,
"needs_evidence": false
},
"confidence_overall": "low",
"trace_id": "2d9EQnCBaYkcsB",
"request_count_for_case": 0
}
]
}