ГЛОБАЛЬНЫЙ РЕФАКТОРИНГ АРХИТЕКТУРЫ - Stage 4.6: добавить метрику stage4_contract_compliance_rate в Stage1 eval и отчеты
This commit is contained in:
parent
963f0aa372
commit
ce1ebae8ec
|
|
@ -2709,7 +2709,26 @@ Implemented in current pass (Stage 4.5 contract observability in debug/log, 2026
|
|||
- focused assembler pack passed: `2 files / 5 tests`;
|
||||
- `npm --prefix llm_normalizer/backend run build` passed.
|
||||
|
||||
Status: In progress (Stage 4.1-4.5 completed; continue with focused wave/manual-comment quality backlog)
|
||||
Implemented in current pass (Stage 4.6 eval metric for answer-contract compliance, 2026-04-12):
|
||||
1. Added Stage 4 compliance metric to Stage 1 eval:
|
||||
- `stage4_contract_compliance_rate` in `AssistantEvalMetricVector` + rubric (`ACCOUNTANT_SCORING_RUBRIC_V01`).
|
||||
2. Integrated metric computation in Stage 1 eval runtime:
|
||||
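- source: `debug.answer_contract_stage4_v1.is_stage4_shape` (cases where this flag is not a boolean, including backend-error cases, are recorded as `null` and treated as unaudited);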
|
||||
- aggregate in `computeAssistantMetrics(...)`;
|
||||
- per-case metric projection in `metric_subscores/accountant_metrics`;
|
||||
- added denominator `stage4_contract_audited_cases_total` (number of cases with a boolean Stage 4 shape flag; the rate is `null` when this count is 0).
|
||||
3. Integrated metric into reporting/comparison and autorun score index:
|
||||
- Stage 1 comparison deltas now include `stage4_contract_compliance_rate`;
|
||||
- auto-run score index for `assistant_stage1` now includes this compliance signal.
|
||||
4. Regression updates:
|
||||
- `assistantContracts.test.ts`
|
||||
- `assistantEvalHarness.test.ts`
|
||||
5. Validation snapshot:
|
||||
- `assistantContracts.test.ts`: `2 passed`;
|
||||
- `assistantEvalHarness.test.ts`: `6 passed` (run with extended timeout budget);
|
||||
- `npm --prefix llm_normalizer/backend run build` passed.
|
||||
|
||||
Status: In progress (Stage 4.1-4.6 completed; continue with focused wave/manual-comment quality backlog)
|
||||
|
||||
## Stage 5 (P3): Quality Loop Driven By GUI Markup
|
||||
|
||||
|
|
|
|||
|
|
@ -533,7 +533,8 @@ function computeScoreIndex(report, target) {
|
|||
rateToPercent(1 - (toNumberSafe(metrics.false_confidence_rate) ?? 1)),
|
||||
rateToPercent(1 - (toNumberSafe(metrics.broad_answer_rate) ?? 1)),
|
||||
scoreToPercent(toNumberSafe(metrics.mechanism_specificity_score)),
|
||||
scoreToPercent(toNumberSafe(metrics.followup_context_retention_score))
|
||||
scoreToPercent(toNumberSafe(metrics.followup_context_retention_score)),
|
||||
rateToPercent(toNumberSafe(metrics.stage4_contract_compliance_rate))
|
||||
]);
|
||||
}
|
||||
if (target === "assistant_stage2") {
|
||||
|
|
|
|||
|
|
@ -359,6 +359,13 @@ function rateToBandScore(metric, value) {
|
|||
return 3;
|
||||
return 0;
|
||||
}
|
||||
if (metric === "stage4_contract_compliance_rate") {
|
||||
if (value >= 0.95)
|
||||
return 5;
|
||||
if (value >= 0.8)
|
||||
return 3;
|
||||
return 0;
|
||||
}
|
||||
if (metric === "generic_explanation_rate" || metric === "false_confidence_rate" || metric === "broad_answer_rate") {
|
||||
if (value <= 0.25)
|
||||
return 5;
|
||||
|
|
@ -1254,6 +1261,8 @@ class EvalService {
|
|||
const broadCases = diagnostics.filter((item) => item.is_broad_answer !== null);
|
||||
const broadAnswerCases = broadCases.filter((item) => item.is_broad_answer === true).length;
|
||||
const followupCases = diagnostics.filter((item) => item.followup_retention_score !== null);
|
||||
const stage4AuditedCases = diagnostics.filter((item) => item.stage4_contract_shape_compliant !== null);
|
||||
const stage4CompliantCases = stage4AuditedCases.filter((item) => item.stage4_contract_shape_compliant === true).length;
|
||||
const avgActionability = diagnostics.length > 0
|
||||
? diagnostics.reduce((acc, item) => acc + item.accountant_actionability_score, 0) / diagnostics.length
|
||||
: null;
|
||||
|
|
@ -1268,7 +1277,8 @@ class EvalService {
|
|||
false_confidence_rate: round2(falseConfidenceCases / total),
|
||||
broad_answer_rate: broadCases.length > 0 ? round2(broadAnswerCases / broadCases.length) : null,
|
||||
mechanism_specificity_score: avgMechanism === null ? null : round2(avgMechanism),
|
||||
followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup)
|
||||
followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup),
|
||||
stage4_contract_compliance_rate: stage4AuditedCases.length > 0 ? round2(stage4CompliantCases / stage4AuditedCases.length) : null
|
||||
};
|
||||
const rubric_bands = {
|
||||
retrieval_differentiation_rate: rubricBandForMetric("retrieval_differentiation_rate", raw.retrieval_differentiation_rate),
|
||||
|
|
@ -1277,7 +1287,8 @@ class EvalService {
|
|||
false_confidence_rate: rubricBandForMetric("false_confidence_rate", raw.false_confidence_rate),
|
||||
broad_answer_rate: rubricBandForMetric("broad_answer_rate", raw.broad_answer_rate),
|
||||
mechanism_specificity_score: rubricBandForMetric("mechanism_specificity_score", raw.mechanism_specificity_score),
|
||||
followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score)
|
||||
followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score),
|
||||
stage4_contract_compliance_rate: rubricBandForMetric("stage4_contract_compliance_rate", raw.stage4_contract_compliance_rate)
|
||||
};
|
||||
return {
|
||||
raw,
|
||||
|
|
@ -1285,7 +1296,8 @@ class EvalService {
|
|||
denominators: {
|
||||
cases_total: diagnostics.length,
|
||||
broad_cases_total: broadCases.length,
|
||||
followup_cases_total: followupCases.length
|
||||
followup_cases_total: followupCases.length,
|
||||
stage4_contract_audited_cases_total: stage4AuditedCases.length
|
||||
},
|
||||
signature_counts: signatureCounter
|
||||
};
|
||||
|
|
@ -1364,7 +1376,8 @@ class EvalService {
|
|||
"false_confidence_rate",
|
||||
"broad_answer_rate",
|
||||
"mechanism_specificity_score",
|
||||
"followup_context_retention_score"
|
||||
"followup_context_retention_score",
|
||||
"stage4_contract_compliance_rate"
|
||||
];
|
||||
const lowerIsBetter = new Set(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]);
|
||||
const baselineRaw = (baselineReport.metrics ?? {}).raw ?? {};
|
||||
|
|
@ -1634,6 +1647,7 @@ class EvalService {
|
|||
final_reply_type: "backend_error",
|
||||
turn_count: turnResponses.length,
|
||||
narrowing_result: "failed",
|
||||
stage4_contract_shape_compliant: null,
|
||||
signature: `backend_error|${suiteCase.scenario_tag}`,
|
||||
is_generic: true,
|
||||
is_false_confident: false,
|
||||
|
|
@ -1671,6 +1685,9 @@ class EvalService {
|
|||
const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes);
|
||||
const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations);
|
||||
const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? "");
|
||||
const stage4ContractShapeCompliant = typeof finalResponse.debug?.answer_contract_stage4_v1?.is_stage4_shape === "boolean"
|
||||
? finalResponse.debug.answer_contract_stage4_v1.is_stage4_shape
|
||||
: null;
|
||||
const hasAnchors = hasDomainAnchors([directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" "));
|
||||
let genericnessScore = 0;
|
||||
if (!hasAnchors)
|
||||
|
|
@ -1779,6 +1796,7 @@ class EvalService {
|
|||
final_reply_type: finalResponse.reply_type,
|
||||
turn_count: suiteCase.turns.length,
|
||||
narrowing_result: narrowingResult,
|
||||
stage4_contract_shape_compliant: stage4ContractShapeCompliant,
|
||||
signature: [
|
||||
finalResponse.reply_type,
|
||||
signals.routes.sort().join(","),
|
||||
|
|
@ -1810,7 +1828,8 @@ class EvalService {
|
|||
false_confidence_rate: item.is_false_confident ? 1 : 0,
|
||||
broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0,
|
||||
mechanism_specificity_score: round2(item.mechanism_specificity_score),
|
||||
followup_context_retention_score: item.followup_retention_score === null ? null : round2(item.followup_retention_score)
|
||||
followup_context_retention_score: item.followup_retention_score === null ? null : round2(item.followup_retention_score),
|
||||
stage4_contract_compliance_rate: item.stage4_contract_shape_compliant === null ? null : item.stage4_contract_shape_compliant ? 1 : 0
|
||||
};
|
||||
return {
|
||||
schema_version: stage1Contracts_1.ASSISTANT_EVAL_RECORD_SCHEMA_VERSION,
|
||||
|
|
@ -1839,7 +1858,8 @@ class EvalService {
|
|||
mechanism_status: item.signals.mechanism_status,
|
||||
source_refs: item.signals.source_refs,
|
||||
routes: item.signals.routes,
|
||||
followup_state_applied: item.signals.followup_state_applied
|
||||
followup_state_applied: item.signals.followup_state_applied,
|
||||
stage4_contract_shape_compliant: item.stage4_contract_shape_compliant
|
||||
},
|
||||
metric_subscores: caseMetricVector,
|
||||
limitations: item.limitations,
|
||||
|
|
|
|||
|
|
@ -44,5 +44,10 @@ exports.ACCOUNTANT_SCORING_RUBRIC_V01 = {
|
|||
{ score: 0, label: "Context Lost", description: "Follow-up теряет фокус текущего разбора." },
|
||||
{ score: 3, label: "Context Partial", description: "Фокус удерживается частично, с дрейфом." },
|
||||
{ score: 5, label: "Context Retained", description: "Follow-up устойчиво держит предмет и ограничения." }
|
||||
],
|
||||
stage4_contract_compliance_rate: [
|
||||
{ score: 0, label: "Non-Compliant", description: "Stage 4 block contract is mostly missing or polluted by legacy sections." },
|
||||
{ score: 3, label: "Partially Compliant", description: "Stage 4 answer shape is present only in part of audited responses." },
|
||||
{ score: 5, label: "Compliant", description: "Stage 4 block contract is consistently present without legacy leakage." }
|
||||
]
|
||||
};
|
||||
|
|
|
|||
|
|
@ -722,7 +722,8 @@ function computeScoreIndex(report: Record<string, unknown>, target: AutoRunTarge
|
|||
rateToPercent(1 - (toNumberSafe(metrics.false_confidence_rate) ?? 1)),
|
||||
rateToPercent(1 - (toNumberSafe(metrics.broad_answer_rate) ?? 1)),
|
||||
scoreToPercent(toNumberSafe(metrics.mechanism_specificity_score)),
|
||||
scoreToPercent(toNumberSafe(metrics.followup_context_retention_score))
|
||||
scoreToPercent(toNumberSafe(metrics.followup_context_retention_score)),
|
||||
rateToPercent(toNumberSafe(metrics.stage4_contract_compliance_rate))
|
||||
]);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -473,6 +473,7 @@ interface AssistantCaseDiagnostics {
|
|||
final_reply_type: string;
|
||||
turn_count: number;
|
||||
narrowing_result: AssistantEvalNarrowingResult;
|
||||
stage4_contract_shape_compliant: boolean | null;
|
||||
signature: string;
|
||||
is_generic: boolean;
|
||||
is_false_confident: boolean;
|
||||
|
|
@ -554,6 +555,11 @@ function rateToBandScore(metric: AssistantMetricKey, value: number): 0 | 3 | 5 {
|
|||
if (value >= 0.45) return 3;
|
||||
return 0;
|
||||
}
|
||||
if (metric === "stage4_contract_compliance_rate") {
|
||||
if (value >= 0.95) return 5;
|
||||
if (value >= 0.8) return 3;
|
||||
return 0;
|
||||
}
|
||||
if (metric === "generic_explanation_rate" || metric === "false_confidence_rate" || metric === "broad_answer_rate") {
|
||||
if (value <= 0.25) return 5;
|
||||
if (value <= 0.45) return 3;
|
||||
|
|
@ -1539,6 +1545,8 @@ export class EvalService {
|
|||
const broadCases = diagnostics.filter((item) => item.is_broad_answer !== null);
|
||||
const broadAnswerCases = broadCases.filter((item) => item.is_broad_answer === true).length;
|
||||
const followupCases = diagnostics.filter((item) => item.followup_retention_score !== null);
|
||||
const stage4AuditedCases = diagnostics.filter((item) => item.stage4_contract_shape_compliant !== null);
|
||||
const stage4CompliantCases = stage4AuditedCases.filter((item) => item.stage4_contract_shape_compliant === true).length;
|
||||
|
||||
const avgActionability =
|
||||
diagnostics.length > 0
|
||||
|
|
@ -1558,7 +1566,9 @@ export class EvalService {
|
|||
false_confidence_rate: round2(falseConfidenceCases / total),
|
||||
broad_answer_rate: broadCases.length > 0 ? round2(broadAnswerCases / broadCases.length) : null,
|
||||
mechanism_specificity_score: avgMechanism === null ? null : round2(avgMechanism),
|
||||
followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup)
|
||||
followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup),
|
||||
stage4_contract_compliance_rate:
|
||||
stage4AuditedCases.length > 0 ? round2(stage4CompliantCases / stage4AuditedCases.length) : null
|
||||
};
|
||||
|
||||
const rubric_bands: Record<AssistantMetricKey, AccountantMetricRubricBand | null> = {
|
||||
|
|
@ -1568,7 +1578,8 @@ export class EvalService {
|
|||
false_confidence_rate: rubricBandForMetric("false_confidence_rate", raw.false_confidence_rate),
|
||||
broad_answer_rate: rubricBandForMetric("broad_answer_rate", raw.broad_answer_rate),
|
||||
mechanism_specificity_score: rubricBandForMetric("mechanism_specificity_score", raw.mechanism_specificity_score),
|
||||
followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score)
|
||||
followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score),
|
||||
stage4_contract_compliance_rate: rubricBandForMetric("stage4_contract_compliance_rate", raw.stage4_contract_compliance_rate)
|
||||
};
|
||||
|
||||
return {
|
||||
|
|
@ -1577,7 +1588,8 @@ export class EvalService {
|
|||
denominators: {
|
||||
cases_total: diagnostics.length,
|
||||
broad_cases_total: broadCases.length,
|
||||
followup_cases_total: followupCases.length
|
||||
followup_cases_total: followupCases.length,
|
||||
stage4_contract_audited_cases_total: stage4AuditedCases.length
|
||||
},
|
||||
signature_counts: signatureCounter
|
||||
};
|
||||
|
|
@ -1672,7 +1684,8 @@ export class EvalService {
|
|||
"false_confidence_rate",
|
||||
"broad_answer_rate",
|
||||
"mechanism_specificity_score",
|
||||
"followup_context_retention_score"
|
||||
"followup_context_retention_score",
|
||||
"stage4_contract_compliance_rate"
|
||||
];
|
||||
const lowerIsBetter = new Set<AssistantMetricKey>(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]);
|
||||
|
||||
|
|
@ -1976,6 +1989,7 @@ export class EvalService {
|
|||
final_reply_type: "backend_error",
|
||||
turn_count: turnResponses.length,
|
||||
narrowing_result: "failed",
|
||||
stage4_contract_shape_compliant: null,
|
||||
signature: `backend_error|${suiteCase.scenario_tag}`,
|
||||
is_generic: true,
|
||||
is_false_confident: false,
|
||||
|
|
@ -2014,6 +2028,10 @@ export class EvalService {
|
|||
const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes);
|
||||
const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations);
|
||||
const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? "");
|
||||
const stage4ContractShapeCompliant =
|
||||
typeof finalResponse.debug?.answer_contract_stage4_v1?.is_stage4_shape === "boolean"
|
||||
? finalResponse.debug.answer_contract_stage4_v1.is_stage4_shape
|
||||
: null;
|
||||
|
||||
const hasAnchors = hasDomainAnchors(
|
||||
[directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" ")
|
||||
|
|
@ -2113,6 +2131,7 @@ export class EvalService {
|
|||
final_reply_type: finalResponse.reply_type,
|
||||
turn_count: suiteCase.turns.length,
|
||||
narrowing_result: narrowingResult,
|
||||
stage4_contract_shape_compliant: stage4ContractShapeCompliant,
|
||||
signature: [
|
||||
finalResponse.reply_type,
|
||||
signals.routes.sort().join(","),
|
||||
|
|
@ -2146,7 +2165,9 @@ export class EvalService {
|
|||
broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0,
|
||||
mechanism_specificity_score: round2(item.mechanism_specificity_score),
|
||||
followup_context_retention_score:
|
||||
item.followup_retention_score === null ? null : round2(item.followup_retention_score)
|
||||
item.followup_retention_score === null ? null : round2(item.followup_retention_score),
|
||||
stage4_contract_compliance_rate:
|
||||
item.stage4_contract_shape_compliant === null ? null : item.stage4_contract_shape_compliant ? 1 : 0
|
||||
};
|
||||
return {
|
||||
schema_version: ASSISTANT_EVAL_RECORD_SCHEMA_VERSION,
|
||||
|
|
@ -2175,7 +2196,8 @@ export class EvalService {
|
|||
mechanism_status: item.signals.mechanism_status,
|
||||
source_refs: item.signals.source_refs,
|
||||
routes: item.signals.routes,
|
||||
followup_state_applied: item.signals.followup_state_applied
|
||||
followup_state_applied: item.signals.followup_state_applied,
|
||||
stage4_contract_shape_compliant: item.stage4_contract_shape_compliant
|
||||
},
|
||||
metric_subscores: caseMetricVector,
|
||||
limitations: item.limitations,
|
||||
|
|
|
|||
|
|
@ -161,6 +161,7 @@ export interface AssistantEvalMetricVector {
|
|||
broad_answer_rate: number | null;
|
||||
mechanism_specificity_score: number | null;
|
||||
followup_context_retention_score: number | null;
|
||||
stage4_contract_compliance_rate: number | null;
|
||||
}
|
||||
|
||||
export interface AssistantEvalRecord {
|
||||
|
|
@ -227,5 +228,10 @@ export const ACCOUNTANT_SCORING_RUBRIC_V01: Record<AccountantMetricName, Account
|
|||
{ score: 0, label: "Context Lost", description: "Follow-up теряет фокус текущего разбора." },
|
||||
{ score: 3, label: "Context Partial", description: "Фокус удерживается частично, с дрейфом." },
|
||||
{ score: 5, label: "Context Retained", description: "Follow-up устойчиво держит предмет и ограничения." }
|
||||
],
|
||||
stage4_contract_compliance_rate: [
|
||||
{ score: 0, label: "Non-Compliant", description: "Stage 4 block contract is mostly missing or polluted by legacy sections." },
|
||||
{ score: 3, label: "Partially Compliant", description: "Stage 4 answer shape is present only in part of audited responses." },
|
||||
{ score: 5, label: "Compliant", description: "Stage 4 block contract is consistently present without legacy leakage." }
|
||||
]
|
||||
};
|
||||
|
|
|
|||
|
|
@ -109,7 +109,8 @@ describe("stage1 contract scaffolding", () => {
|
|||
"false_confidence_rate",
|
||||
"broad_answer_rate",
|
||||
"mechanism_specificity_score",
|
||||
"followup_context_retention_score"
|
||||
"followup_context_retention_score",
|
||||
"stage4_contract_compliance_rate"
|
||||
]);
|
||||
for (const metric of metricNames) {
|
||||
const bands = ACCOUNTANT_SCORING_RUBRIC_V01[metric as keyof typeof ACCOUNTANT_SCORING_RUBRIC_V01];
|
||||
|
|
|
|||
|
|
@ -84,7 +84,8 @@ describe.sequential("assistant Stage 1 eval harness", () => {
|
|||
"false_confidence_rate",
|
||||
"broad_answer_rate",
|
||||
"mechanism_specificity_score",
|
||||
"followup_context_retention_score"
|
||||
"followup_context_retention_score",
|
||||
"stage4_contract_compliance_rate"
|
||||
]);
|
||||
expect(response.body.report?.rubric_bands?.generic_explanation_rate).toBeTruthy();
|
||||
expect(response.body.report?.feature_profile_snapshot).toBeTruthy();
|
||||
|
|
|
|||
Loading…
Reference in New Issue