{ "schema_version": "assistant_stage1_eval_comparison_v0_1", "comparison_id": "assistant-compare-u32A9CuI", "run_timestamp": "2026-03-25T20:07:36.336Z", "baseline_run_id": "assistant-stage1-TnhIS4Qc6e", "current_run_id": "assistant-stage1-ZQ5JDEUUkW", "eval_target": "assistant_stage1", "suite_id": "assistant_stage1_canonical", "suite_version": "0.1.0", "baseline_report_file": "X:\\1C\\NDC_1C\\llm_normalizer\\reports\\assistant-stage1-TnhIS4Qc6e.json", "current_report_file": "X:\\1C\\NDC_1C\\llm_normalizer\\reports\\assistant-stage1-ZQ5JDEUUkW.json", "metric_deltas": { "retrieval_differentiation_rate": { "baseline": 0.67, "current": 0.67, "delta": 0, "trend": "unchanged" }, "generic_explanation_rate": { "baseline": 0.89, "current": 0.78, "delta": -0.11, "trend": "improved" }, "accountant_actionability_score": { "baseline": 0.67, "current": 2.33, "delta": 1.66, "trend": "improved" }, "false_confidence_rate": { "baseline": 0.33, "current": 0.33, "delta": 0, "trend": "unchanged" }, "broad_answer_rate": { "baseline": 0.25, "current": 0.25, "delta": 0, "trend": "unchanged" }, "mechanism_specificity_score": { "baseline": 0, "current": 0, "delta": 0, "trend": "unchanged" }, "followup_context_retention_score": { "baseline": 3, "current": 3, "delta": 0, "trend": "unchanged" } }, "scenario_notes_summary": { "improved": 8, "unchanged": 1, "weakened": 0 }, "scenario_notes": { "improved": [ "S1-51-WRONG-CLOSE-TYPE: usefulness 1.5 -> 2.25 (delta 0.75)", "S1-60-SUPPLIER-TAILS: usefulness 0.75 -> 1.5 (delta 0.75)", "S1-97-LIFECYCLE-ANOMALY: usefulness 0.75 -> 1.5 (delta 0.75)", "S1-OS-CARD-VS-CHARGES: usefulness 0.75 -> 1.5 (delta 0.75)", "S1-VAT-CROSS-DOMAIN-CONTRADICTION: usefulness 0 -> 1.5 (delta 1.5)", "S1-PERIOD-CLOSE-IMPACT: usefulness 0 -> 0.5 (delta 0.5)", "S1-MULTI-INTENT: usefulness 0.75 -> 1.5 (delta 0.75)", "S1-FOLLOWUP-INVESTIGATION: usefulness 0.75 -> 1.5 (delta 0.75)" ], "unchanged": [ "S1-TRANSLIT-QUERY: usefulness 0 -> 0 (delta 0)" ], "weakened": [] }, "known_limitations": [ "Snapshot-only retrieval contour remains (no live verification core in Stage 1).", "Metric mapping for genericness/false confidence is heuristic by design.", "Stage 1 eval excludes Stage 2+ metrics (problem-unit/lifecycle/graph/investigation engine)." ], "report_title": "Assistant Stage 1 Baseline vs Current" }