NODEDC_1C/llm_normalizer/reports/assistant-compare-b78uzAhM....

83 lines
2.6 KiB
JSON

{
"schema_version": "assistant_stage1_eval_comparison_v0_1",
"comparison_id": "assistant-compare-b78uzAhM",
"run_timestamp": "2026-03-25T20:41:15.635Z",
"baseline_run_id": "assistant-stage1-vAXHvm-KsF",
"current_run_id": "assistant-stage1-wiKJXmxb22",
"eval_target": "assistant_stage1",
"suite_id": "assistant_stage1_canonical",
"suite_version": "0.1.0",
"baseline_report_file": "X:\\1C\\NDC_1C\\llm_normalizer\\reports\\assistant-stage1-vAXHvm-KsF.json",
"current_report_file": "X:\\1C\\NDC_1C\\llm_normalizer\\reports\\assistant-stage1-wiKJXmxb22.json",
"metric_deltas": {
"retrieval_differentiation_rate": {
"baseline": 0.67,
"current": 0.67,
"delta": 0,
"trend": "unchanged"
},
"generic_explanation_rate": {
"baseline": 0.78,
"current": 0.78,
"delta": 0,
"trend": "unchanged"
},
"accountant_actionability_score": {
"baseline": 0.78,
"current": 2.67,
"delta": 1.89,
"trend": "improved"
},
"false_confidence_rate": {
"baseline": 0.33,
"current": 0.22,
"delta": -0.11,
"trend": "improved"
},
"broad_answer_rate": {
"baseline": 0.25,
"current": 0.25,
"delta": 0,
"trend": "unchanged"
},
"mechanism_specificity_score": {
"baseline": 0,
"current": 0,
"delta": 0,
"trend": "unchanged"
},
"followup_context_retention_score": {
"baseline": 3,
"current": 3,
"delta": 0,
"trend": "unchanged"
}
},
"scenario_notes_summary": {
"improved": 8,
"unchanged": 1,
"weakened": 0
},
"scenario_notes": {
"improved": [
"S1-51-WRONG-CLOSE-TYPE: usefulness 1.5 -> 2.25 (delta 0.75)",
"S1-60-SUPPLIER-TAILS: usefulness 0.75 -> 1.5 (delta 0.75)",
"S1-97-LIFECYCLE-ANOMALY: usefulness 0.75 -> 1.5 (delta 0.75)",
"S1-OS-CARD-VS-CHARGES: usefulness 0.75 -> 1.5 (delta 0.75)",
"S1-VAT-CROSS-DOMAIN-CONTRADICTION: usefulness 0 -> 1.5 (delta 1.5)",
"S1-PERIOD-CLOSE-IMPACT: usefulness 0 -> 0.5 (delta 0.5)",
"S1-MULTI-INTENT: usefulness 0.75 -> 1.5 (delta 0.75)",
"S1-TRANSLIT-QUERY: usefulness 0.75 -> 1.5 (delta 0.75)"
],
"unchanged": [
"S1-FOLLOWUP-INVESTIGATION: usefulness 1.25 -> 1.5 (delta 0.25)"
],
"weakened": []
},
"known_limitations": [
"Snapshot-only retrieval contour remains (no live verification core in Stage 1).",
"Metric mapping for genericness/false confidence is heuristic by design.",
"Stage 1 eval excludes Stage 2+ metrics (problem-unit/lifecycle/graph/investigation engine)."
],
"report_title": "Assistant Stage 1 Baseline vs Current"
}