1070 lines
38 KiB
JSON
1070 lines
38 KiB
JSON
{
|
||
"run_id": "eval-xk1NE5ndVV",
|
||
"timestamp": "2026-03-23T14:35:48.176Z",
|
||
"mode": "single-pass-strict",
|
||
"use_mock": true,
|
||
"prompt_version": "normalizer_v1_1",
|
||
"dataset": {
|
||
"source": "file",
|
||
"file": "normalizer_eval_v1_1_30cases.json"
|
||
},
|
||
"cases_total": 30,
|
||
"metrics": {
|
||
"schema_validation_pass_rate": 100,
|
||
"intent_class_accuracy": 83.33,
|
||
"route_hint_accuracy": 100,
|
||
"causal_flag_accuracy": 100,
|
||
"high_confidence_error_rate": 0
|
||
},
|
||
"baseline_metrics": {
|
||
"schema_validation_pass_rate": 100,
|
||
"intent_class_accuracy": 72.73,
|
||
"route_hint_accuracy": 90.91,
|
||
"causal_flag_accuracy": 81.82,
|
||
"high_confidence_error_rate": 9.09
|
||
},
|
||
"baseline_delta": {
|
||
"schema_validation_pass_rate": 0,
|
||
"intent_class_accuracy": 10.6,
|
||
"route_hint_accuracy": 9.09,
|
||
"causal_flag_accuracy": 18.18,
|
||
"high_confidence_error_rate": -9.09
|
||
},
|
||
"class_accuracy": {
|
||
"cross_entity": {
|
||
"total": 10,
|
||
"passed": 10,
|
||
"accuracy_percent": 100
|
||
},
|
||
"heavy_analytical": {
|
||
"total": 5,
|
||
"passed": 5,
|
||
"accuracy_percent": 100
|
||
},
|
||
"drilldown_explain": {
|
||
"total": 5,
|
||
"passed": 5,
|
||
"accuracy_percent": 100
|
||
},
|
||
"rule_based_account_control": {
|
||
"total": 5,
|
||
"passed": 5,
|
||
"accuracy_percent": 100
|
||
},
|
||
"anomaly_probe": {
|
||
"total": 2,
|
||
"passed": 0,
|
||
"accuracy_percent": 0
|
||
},
|
||
"ambiguous_human_query": {
|
||
"total": 1,
|
||
"passed": 0,
|
||
"accuracy_percent": 0
|
||
},
|
||
"period_close_risk": {
|
||
"total": 2,
|
||
"passed": 0,
|
||
"accuracy_percent": 0
|
||
}
|
||
},
|
||
"budget": {
|
||
"requests_total": 0,
|
||
"retries_used": 0,
|
||
"guidance": {
|
||
"forensic_calls_max": 10,
|
||
"final_eval_calls_max": 30,
|
||
"target_total_calls_max": 40,
|
||
"hard_cap_calls_max": 45
|
||
}
|
||
},
|
||
"mismatches": [
|
||
{
|
||
"case_id": "NQ-005",
|
||
"expected_intent_class": "anomaly_probe",
|
||
"actual_intent_class": "rule_based_account_control",
|
||
"expected_route_hint": "store_feature_risk",
|
||
"actual_route_hint": "store_feature_risk",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": true,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": false
|
||
},
|
||
"comment": "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket.",
|
||
"trace_id": "lO6b7E4iQGlEAf"
|
||
},
|
||
{
|
||
"case_id": "NQ-009",
|
||
"expected_intent_class": "ambiguous_human_query",
|
||
"actual_intent_class": "heavy_analytical",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": true,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": false
|
||
},
|
||
"comment": "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket.",
|
||
"trace_id": "Gk7UiRhZg-AapE"
|
||
},
|
||
{
|
||
"case_id": "V11-OT-003",
|
||
"expected_intent_class": "period_close_risk",
|
||
"actual_intent_class": "heavy_analytical",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": true,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": false
|
||
},
|
||
"comment": "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket.",
|
||
"trace_id": "i4o4wkvxfHKUpe"
|
||
},
|
||
{
|
||
"case_id": "V11-OT-004",
|
||
"expected_intent_class": "anomaly_probe",
|
||
"actual_intent_class": "rule_based_account_control",
|
||
"expected_route_hint": "store_feature_risk",
|
||
"actual_route_hint": "store_feature_risk",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": true,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": false
|
||
},
|
||
"comment": "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket.",
|
||
"trace_id": "2riratZ9G8TMN_"
|
||
},
|
||
{
|
||
"case_id": "V11-OT-005",
|
||
"expected_intent_class": "period_close_risk",
|
||
"actual_intent_class": "heavy_analytical",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": true,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": false
|
||
},
|
||
"comment": "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket.",
|
||
"trace_id": "aw6boDZZ59P2sF"
|
||
}
|
||
],
|
||
"bad_confidence_cases": [],
|
||
"results": [
|
||
{
|
||
"case_id": "NQ-001",
|
||
"raw_question": "По каким поставщикам на конец июня не бьются взаиморасчеты, покажи документы, оплаты и хвосты.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "cross_entity",
|
||
"actual_intent_class": "cross_entity",
|
||
"expected_route_hint": "hybrid_store_plus_live",
|
||
"actual_route_hint": "hybrid_store_plus_live",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": true
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "M0qLQbv0uSFcP_",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "NQ-006",
|
||
"raw_question": "По каким реализациям 90/62 хвосты не закрылись оплатой, разложи по цепочке документов.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "cross_entity",
|
||
"actual_intent_class": "cross_entity",
|
||
"expected_route_hint": "hybrid_store_plus_live",
|
||
"actual_route_hint": "hybrid_store_plus_live",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": true
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "GRYZNvT3wt8Z2G",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-CE-003",
|
||
"raw_question": "Где в июне не сходится 60/51: разложи по документу, оплате и закрывающему, чем подтверждается каждый шаг.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "cross_entity",
|
||
"actual_intent_class": "cross_entity",
|
||
"expected_route_hint": "hybrid_store_plus_live",
|
||
"actual_route_hint": "hybrid_store_plus_live",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": true
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "l-k8uL2wR12Yu7",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-CE-004",
|
||
"raw_question": "Разложи по контрагентам цепочку: отгрузка -> оплата -> закрывающий, чтобы понять где рвется подтверждение.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "cross_entity",
|
||
"actual_intent_class": "cross_entity",
|
||
"expected_route_hint": "hybrid_store_plus_live",
|
||
"actual_route_hint": "hybrid_store_plus_live",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": true
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "Dq25fAd3L1t5EM",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-CE-005",
|
||
"raw_question": "По поставщикам где повисло в цепочке поступление-оплата-закрытие по 60, покажи проблемные связки.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "cross_entity",
|
||
"actual_intent_class": "cross_entity",
|
||
"expected_route_hint": "hybrid_store_plus_live",
|
||
"actual_route_hint": "hybrid_store_plus_live",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": true
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "nC1bMGxs-fCBso",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-CE-006",
|
||
"raw_question": "Найди где по 62 не собралось: нужен разбор по документам, оплатам и проводкам с причинно-следственной цепочкой.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "cross_entity",
|
||
"actual_intent_class": "cross_entity",
|
||
"expected_route_hint": "hybrid_store_plus_live",
|
||
"actual_route_hint": "hybrid_store_plus_live",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": true
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "9jmHTGw-seHVRN",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-CE-007",
|
||
"raw_question": "Покажи по июню все случаи когда реализация без оплаты и где в цепочке ошибка подтверждения.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "cross_entity",
|
||
"actual_intent_class": "cross_entity",
|
||
"expected_route_hint": "hybrid_store_plus_live",
|
||
"actual_route_hint": "hybrid_store_plus_live",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": true
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "Shno6vSxK1KaU_",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-CE-008",
|
||
"raw_question": "Сделай причинный разбор хвостов по 60: документ, оплата, проводка, закрывающий, где пошло криво.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "cross_entity",
|
||
"actual_intent_class": "cross_entity",
|
||
"expected_route_hint": "hybrid_store_plus_live",
|
||
"actual_route_hint": "hybrid_store_plus_live",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": true
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "x6VKGs-iLjiEQZ",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-CE-009",
|
||
"raw_question": "Почему у части покупателей не видно закрытия, разложи цепочку документов и оплат по июню.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "cross_entity",
|
||
"actual_intent_class": "cross_entity",
|
||
"expected_route_hint": "hybrid_store_plus_live",
|
||
"actual_route_hint": "hybrid_store_plus_live",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": true
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "x8dzpFWkQkXAaJ",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-CE-010",
|
||
"raw_question": "У кого из контрагентов в июне хвосты между 60 и банком, разложи по документам/оплатам/закрывающим.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "cross_entity",
|
||
"actual_intent_class": "cross_entity",
|
||
"expected_route_hint": "hybrid_store_plus_live",
|
||
"actual_route_hint": "hybrid_store_plus_live",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": true,
|
||
"needs_causal_chain": true,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": true
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "NdmcqZDJKUyktD",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "NQ-002",
|
||
"raw_question": "Сделай рейтинг самых рисковых хвостов перед закрытием периода за июнь.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "heavy_analytical",
|
||
"actual_intent_class": "heavy_analytical",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": true,
|
||
"needs_anomaly_summary": true,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "sGbMyIfaek6Urk",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "NQ-007",
|
||
"raw_question": "Что у нас выглядит самым проблемным перед закрытием июня, если смотреть на компанию в целом?",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "heavy_analytical",
|
||
"actual_intent_class": "heavy_analytical",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": true,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "d8EWv8U6yyHhsr",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-HA-003",
|
||
"raw_question": "Собери топ-10 риск-зон учета по июню и приоритизируй, куда лезть сначала.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "heavy_analytical",
|
||
"actual_intent_class": "heavy_analytical",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": true,
|
||
"needs_anomaly_summary": true,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "2Z_F539WXmzOKY",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-HA-004",
|
||
"raw_question": "Дай обзорный риск-срез перед сдачей отчетности: где максимальная концентрация ошибок.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "heavy_analytical",
|
||
"actual_intent_class": "heavy_analytical",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": true,
|
||
"needs_anomaly_summary": true,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "WOLy5XIs2Gp3TH",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-HA-005",
|
||
"raw_question": "Сделай приоритизированный обзор ручных проверок по компании за июнь.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "heavy_analytical",
|
||
"actual_intent_class": "heavy_analytical",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": true,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "D7GXkCRK4LYQsd",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "NQ-003",
|
||
"raw_question": "Покажи документ по номеру 000123 и строку проводки, нужен точный source-of-record.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "drilldown_explain",
|
||
"actual_intent_class": "drilldown_explain",
|
||
"expected_route_hint": "live_mcp_drilldown",
|
||
"actual_route_hint": "live_mcp_drilldown",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": true,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": true,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": true
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "LabIQZzDA0B3f9",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "NQ-008",
|
||
"raw_question": "Покажи по банку документ №TRX-88 и связанную проводку по 51.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "drilldown_explain",
|
||
"actual_intent_class": "drilldown_explain",
|
||
"expected_route_hint": "live_mcp_drilldown",
|
||
"actual_route_hint": "live_mcp_drilldown",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": true,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": true,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": true
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "9zwPrDybqODSUE",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-DD-003",
|
||
"raw_question": "Покажи проводку по документу INV-2020-0615, нужна конкретная строка и источник.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "drilldown_explain",
|
||
"actual_intent_class": "drilldown_explain",
|
||
"expected_route_hint": "live_mcp_drilldown",
|
||
"actual_route_hint": "live_mcp_drilldown",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": true,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": true,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": true
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "J36zx_j3ooTYEv",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-DD-004",
|
||
"raw_question": "Дай точечный drilldown по документу №PAY-441 и его проводке по 51.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "drilldown_explain",
|
||
"actual_intent_class": "drilldown_explain",
|
||
"expected_route_hint": "live_mcp_drilldown",
|
||
"actual_route_hint": "live_mcp_drilldown",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": true,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": true,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": true
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "jG7D_g3E-D_sMq",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-DD-005",
|
||
"raw_question": "Покажи карточку конкретной операции DOC-7781 и связанную проводку.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "drilldown_explain",
|
||
"actual_intent_class": "drilldown_explain",
|
||
"expected_route_hint": "live_mcp_drilldown",
|
||
"actual_route_hint": "live_mcp_drilldown",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": true,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": true,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": true
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "fE_w5vzkNkzUzb",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "NQ-004",
|
||
"raw_question": "По 97 счету проверь, где возможна ошибка дат начала и окончания списания.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "rule_based_account_control",
|
||
"actual_intent_class": "rule_based_account_control",
|
||
"expected_route_hint": "store_feature_risk",
|
||
"actual_route_hint": "store_feature_risk",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "3OILIBmF5h-YN-",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-RB-002",
|
||
"raw_question": "Проверь контрольные правила по ОС: где ошибки в сроках амортизации и учетной группе.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "rule_based_account_control",
|
||
"actual_intent_class": "rule_based_account_control",
|
||
"expected_route_hint": "store_feature_risk",
|
||
"actual_route_hint": "store_feature_risk",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "EshIW8vaOGxjhH",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-RB-003",
|
||
"raw_question": "По 10 счету проверь где нарушены правила оценки остатков.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "rule_based_account_control",
|
||
"actual_intent_class": "rule_based_account_control",
|
||
"expected_route_hint": "store_feature_risk",
|
||
"actual_route_hint": "store_feature_risk",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "llnpzqJoB6RxGY",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-RB-004",
|
||
"raw_question": "По НДС на 68.02 найди нарушения контрольных правил расчета за июнь.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "rule_based_account_control",
|
||
"actual_intent_class": "rule_based_account_control",
|
||
"expected_route_hint": "store_feature_risk",
|
||
"actual_route_hint": "store_feature_risk",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "NJs3r150DkVdqZ",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-RB-005",
|
||
"raw_question": "Проверь учетные настройки списания на 97 и покажи где высокий риск ручной ошибки.",
|
||
"validation_passed": true,
|
||
"intent_match": true,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "rule_based_account_control",
|
||
"actual_intent_class": "rule_based_account_control",
|
||
"expected_route_hint": "store_feature_risk",
|
||
"actual_route_hint": "store_feature_risk",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": true,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "RhidN0FhNQqEk4",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "NQ-005",
|
||
"raw_question": "Есть ли аномальные материалы на счете 10, которые зависли и выглядят нелогично?",
|
||
"validation_passed": true,
|
||
"intent_match": false,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "anomaly_probe",
|
||
"actual_intent_class": "rule_based_account_control",
|
||
"expected_route_hint": "store_feature_risk",
|
||
"actual_route_hint": "store_feature_risk",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": true,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "lO6b7E4iQGlEAf",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "NQ-009",
|
||
"raw_question": "Где у нас пахнет ручной ошибкой по июню?",
|
||
"validation_passed": true,
|
||
"intent_match": false,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "ambiguous_human_query",
|
||
"actual_intent_class": "heavy_analytical",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": true,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "Gk7UiRhZg-AapE",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-OT-003",
|
||
"raw_question": "Перед закрытием периода что у нас может взорваться в последний день?",
|
||
"validation_passed": true,
|
||
"intent_match": false,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "period_close_risk",
|
||
"actual_intent_class": "heavy_analytical",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": true,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "i4o4wkvxfHKUpe",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-OT-004",
|
||
"raw_question": "Где по июню выглядит подозрительно, но без точечного документа, просто дай зоны риска.",
|
||
"validation_passed": true,
|
||
"intent_match": false,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "anomaly_probe",
|
||
"actual_intent_class": "rule_based_account_control",
|
||
"expected_route_hint": "store_feature_risk",
|
||
"actual_route_hint": "store_feature_risk",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": false,
|
||
"needs_anomaly_summary": true,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": true,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "medium",
|
||
"trace_id": "2riratZ9G8TMN_",
|
||
"request_count_for_case": 0
|
||
},
|
||
{
|
||
"case_id": "V11-OT-005",
|
||
"raw_question": "Че-то все криво на предзакрытии, где самые опасные места?",
|
||
"validation_passed": true,
|
||
"intent_match": false,
|
||
"route_match": true,
|
||
"causal_flags_match": true,
|
||
"expected_intent_class": "period_close_risk",
|
||
"actual_intent_class": "heavy_analytical",
|
||
"expected_route_hint": "batch_refresh_then_store",
|
||
"actual_route_hint": "batch_refresh_then_store",
|
||
"expected_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false
|
||
},
|
||
"actual_requires": {
|
||
"needs_cross_entity_join": false,
|
||
"needs_causal_chain": false,
|
||
"needs_exact_object_trace": false,
|
||
"needs_ranking": true,
|
||
"needs_anomaly_summary": false,
|
||
"needs_runtime_truth": false,
|
||
"needs_period_cut": false,
|
||
"needs_evidence": false
|
||
},
|
||
"confidence_overall": "low",
|
||
"trace_id": "aw6boDZZ59P2sF",
|
||
"request_count_for_case": 0
|
||
}
|
||
]
|
||
} |