83 lines
2.6 KiB
JSON
83 lines
2.6 KiB
JSON
{
|
|
"schema_version": "assistant_stage1_eval_comparison_v0_1",
|
|
"comparison_id": "assistant-compare-u32A9CuI",
|
|
"run_timestamp": "2026-03-25T20:07:36.336Z",
|
|
"baseline_run_id": "assistant-stage1-TnhIS4Qc6e",
|
|
"current_run_id": "assistant-stage1-ZQ5JDEUUkW",
|
|
"eval_target": "assistant_stage1",
|
|
"suite_id": "assistant_stage1_canonical",
|
|
"suite_version": "0.1.0",
|
|
"baseline_report_file": "X:\\1C\\NDC_1C\\llm_normalizer\\reports\\assistant-stage1-TnhIS4Qc6e.json",
|
|
"current_report_file": "X:\\1C\\NDC_1C\\llm_normalizer\\reports\\assistant-stage1-ZQ5JDEUUkW.json",
|
|
"metric_deltas": {
|
|
"retrieval_differentiation_rate": {
|
|
"baseline": 0.67,
|
|
"current": 0.67,
|
|
"delta": 0,
|
|
"trend": "unchanged"
|
|
},
|
|
"generic_explanation_rate": {
|
|
"baseline": 0.89,
|
|
"current": 0.78,
|
|
"delta": -0.11,
|
|
"trend": "improved"
|
|
},
|
|
"accountant_actionability_score": {
|
|
"baseline": 0.67,
|
|
"current": 2.33,
|
|
"delta": 1.66,
|
|
"trend": "improved"
|
|
},
|
|
"false_confidence_rate": {
|
|
"baseline": 0.33,
|
|
"current": 0.33,
|
|
"delta": 0,
|
|
"trend": "unchanged"
|
|
},
|
|
"broad_answer_rate": {
|
|
"baseline": 0.25,
|
|
"current": 0.25,
|
|
"delta": 0,
|
|
"trend": "unchanged"
|
|
},
|
|
"mechanism_specificity_score": {
|
|
"baseline": 0,
|
|
"current": 0,
|
|
"delta": 0,
|
|
"trend": "unchanged"
|
|
},
|
|
"followup_context_retention_score": {
|
|
"baseline": 3,
|
|
"current": 3,
|
|
"delta": 0,
|
|
"trend": "unchanged"
|
|
}
|
|
},
|
|
"scenario_notes_summary": {
|
|
"improved": 8,
|
|
"unchanged": 1,
|
|
"weakened": 0
|
|
},
|
|
"scenario_notes": {
|
|
"improved": [
|
|
"S1-51-WRONG-CLOSE-TYPE: usefulness 1.5 -> 2.25 (delta 0.75)",
|
|
"S1-60-SUPPLIER-TAILS: usefulness 0.75 -> 1.5 (delta 0.75)",
|
|
"S1-97-LIFECYCLE-ANOMALY: usefulness 0.75 -> 1.5 (delta 0.75)",
|
|
"S1-OS-CARD-VS-CHARGES: usefulness 0.75 -> 1.5 (delta 0.75)",
|
|
"S1-VAT-CROSS-DOMAIN-CONTRADICTION: usefulness 0 -> 1.5 (delta 1.5)",
|
|
"S1-PERIOD-CLOSE-IMPACT: usefulness 0 -> 0.5 (delta 0.5)",
|
|
"S1-MULTI-INTENT: usefulness 0.75 -> 1.5 (delta 0.75)",
|
|
"S1-FOLLOWUP-INVESTIGATION: usefulness 0.75 -> 1.5 (delta 0.75)"
|
|
],
|
|
"unchanged": [
|
|
"S1-TRANSLIT-QUERY: usefulness 0 -> 0 (delta 0)"
|
|
],
|
|
"weakened": []
|
|
},
|
|
"known_limitations": [
|
|
"Snapshot-only retrieval contour remains (no live verification core in Stage 1).",
|
|
"Metric mapping for genericness/false confidence is heuristic by design.",
|
|
"Stage 1 eval excludes Stage 2+ metrics (problem-unit/lifecycle/graph/investigation engine)."
|
|
],
|
|
"report_title": "Assistant Stage 1 Baseline vs Current"
|
|
} |