ГЛОБАЛЬНЫЙ РЕФАКТОРИНГ АРХИТЕКТУРЫ - Stage 4.6: добавить метрику stage4_contract_compliance_rate в Stage1 eval и отчеты

This commit is contained in:
dctouch 2026-04-12 09:25:15 +03:00
parent 963f0aa372
commit ce1ebae8ec
9 changed files with 93 additions and 17 deletions

View File

@ -2709,7 +2709,26 @@ Implemented in current pass (Stage 4.5 contract observability in debug/log, 2026
- focused assembler pack passed: `2 files / 5 tests`;
- `npm --prefix llm_normalizer/backend run build` passed.
Status: In progress (Stage 4.1-4.5 completed; continue with focused wave/manual-comment quality backlog)
Implemented in current pass (Stage 4.6 eval metric for answer-contract compliance, 2026-04-12):
1. Added Stage 4 compliance metric to Stage 1 eval:
- `stage4_contract_compliance_rate` in `AssistantEvalMetricVector` + rubric (`ACCOUNTANT_SCORING_RUBRIC_V01`).
2. Integrated metric computation in Stage 1 eval runtime:
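- source: `debug.answer_contract_stage4_v1.is_stage4_shape` of the final response (a missing or non-boolean flag, as well as a backend-error case, is recorded as `null`, i.e. not audited);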
- aggregate in `computeAssistantMetrics(...)`;
- per-case metric projection in `metric_subscores/accountant_metrics`;
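- added denominator `stage4_contract_audited_cases_total` (number of cases with a non-null compliance flag; the rate is reported as `null` when there are none).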
3. Integrated metric into reporting/comparison and autorun score index:
- Stage 1 comparison deltas now include `stage4_contract_compliance_rate`;
- auto-run score index for `assistant_stage1` now includes this compliance signal.
4. Regression updates:
- `assistantContracts.test.ts`
- `assistantEvalHarness.test.ts`
5. Validation snapshot:
- `assistantContracts.test.ts`: `2 passed`;
- `assistantEvalHarness.test.ts`: `6 passed` (run with extended timeout budget);
- `npm --prefix llm_normalizer/backend run build` passed.
Status: In progress (Stage 4.1-4.6 completed; continue with focused wave/manual-comment quality backlog)
## Stage 5 (P3): Quality Loop Driven By GUI Markup

View File

@ -533,7 +533,8 @@ function computeScoreIndex(report, target) {
rateToPercent(1 - (toNumberSafe(metrics.false_confidence_rate) ?? 1)),
rateToPercent(1 - (toNumberSafe(metrics.broad_answer_rate) ?? 1)),
scoreToPercent(toNumberSafe(metrics.mechanism_specificity_score)),
scoreToPercent(toNumberSafe(metrics.followup_context_retention_score))
scoreToPercent(toNumberSafe(metrics.followup_context_retention_score)),
rateToPercent(toNumberSafe(metrics.stage4_contract_compliance_rate))
]);
}
if (target === "assistant_stage2") {

View File

@ -359,6 +359,13 @@ function rateToBandScore(metric, value) {
return 3;
return 0;
}
if (metric === "stage4_contract_compliance_rate") {
if (value >= 0.95)
return 5;
if (value >= 0.8)
return 3;
return 0;
}
if (metric === "generic_explanation_rate" || metric === "false_confidence_rate" || metric === "broad_answer_rate") {
if (value <= 0.25)
return 5;
@ -1254,6 +1261,8 @@ class EvalService {
const broadCases = diagnostics.filter((item) => item.is_broad_answer !== null);
const broadAnswerCases = broadCases.filter((item) => item.is_broad_answer === true).length;
const followupCases = diagnostics.filter((item) => item.followup_retention_score !== null);
const stage4AuditedCases = diagnostics.filter((item) => item.stage4_contract_shape_compliant !== null);
const stage4CompliantCases = stage4AuditedCases.filter((item) => item.stage4_contract_shape_compliant === true).length;
const avgActionability = diagnostics.length > 0
? diagnostics.reduce((acc, item) => acc + item.accountant_actionability_score, 0) / diagnostics.length
: null;
@ -1268,7 +1277,8 @@ class EvalService {
false_confidence_rate: round2(falseConfidenceCases / total),
broad_answer_rate: broadCases.length > 0 ? round2(broadAnswerCases / broadCases.length) : null,
mechanism_specificity_score: avgMechanism === null ? null : round2(avgMechanism),
followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup)
followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup),
stage4_contract_compliance_rate: stage4AuditedCases.length > 0 ? round2(stage4CompliantCases / stage4AuditedCases.length) : null
};
const rubric_bands = {
retrieval_differentiation_rate: rubricBandForMetric("retrieval_differentiation_rate", raw.retrieval_differentiation_rate),
@ -1277,7 +1287,8 @@ class EvalService {
false_confidence_rate: rubricBandForMetric("false_confidence_rate", raw.false_confidence_rate),
broad_answer_rate: rubricBandForMetric("broad_answer_rate", raw.broad_answer_rate),
mechanism_specificity_score: rubricBandForMetric("mechanism_specificity_score", raw.mechanism_specificity_score),
followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score)
followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score),
stage4_contract_compliance_rate: rubricBandForMetric("stage4_contract_compliance_rate", raw.stage4_contract_compliance_rate)
};
return {
raw,
@ -1285,7 +1296,8 @@ class EvalService {
denominators: {
cases_total: diagnostics.length,
broad_cases_total: broadCases.length,
followup_cases_total: followupCases.length
followup_cases_total: followupCases.length,
stage4_contract_audited_cases_total: stage4AuditedCases.length
},
signature_counts: signatureCounter
};
@ -1364,7 +1376,8 @@ class EvalService {
"false_confidence_rate",
"broad_answer_rate",
"mechanism_specificity_score",
"followup_context_retention_score"
"followup_context_retention_score",
"stage4_contract_compliance_rate"
];
const lowerIsBetter = new Set(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]);
const baselineRaw = (baselineReport.metrics ?? {}).raw ?? {};
@ -1634,6 +1647,7 @@ class EvalService {
final_reply_type: "backend_error",
turn_count: turnResponses.length,
narrowing_result: "failed",
stage4_contract_shape_compliant: null,
signature: `backend_error|${suiteCase.scenario_tag}`,
is_generic: true,
is_false_confident: false,
@ -1671,6 +1685,9 @@ class EvalService {
const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes);
const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations);
const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? "");
const stage4ContractShapeCompliant = typeof finalResponse.debug?.answer_contract_stage4_v1?.is_stage4_shape === "boolean"
? finalResponse.debug.answer_contract_stage4_v1.is_stage4_shape
: null;
const hasAnchors = hasDomainAnchors([directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" "));
let genericnessScore = 0;
if (!hasAnchors)
@ -1779,6 +1796,7 @@ class EvalService {
final_reply_type: finalResponse.reply_type,
turn_count: suiteCase.turns.length,
narrowing_result: narrowingResult,
stage4_contract_shape_compliant: stage4ContractShapeCompliant,
signature: [
finalResponse.reply_type,
signals.routes.sort().join(","),
@ -1810,7 +1828,8 @@ class EvalService {
false_confidence_rate: item.is_false_confident ? 1 : 0,
broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0,
mechanism_specificity_score: round2(item.mechanism_specificity_score),
followup_context_retention_score: item.followup_retention_score === null ? null : round2(item.followup_retention_score)
followup_context_retention_score: item.followup_retention_score === null ? null : round2(item.followup_retention_score),
stage4_contract_compliance_rate: item.stage4_contract_shape_compliant === null ? null : item.stage4_contract_shape_compliant ? 1 : 0
};
return {
schema_version: stage1Contracts_1.ASSISTANT_EVAL_RECORD_SCHEMA_VERSION,
@ -1839,7 +1858,8 @@ class EvalService {
mechanism_status: item.signals.mechanism_status,
source_refs: item.signals.source_refs,
routes: item.signals.routes,
followup_state_applied: item.signals.followup_state_applied
followup_state_applied: item.signals.followup_state_applied,
stage4_contract_shape_compliant: item.stage4_contract_shape_compliant
},
metric_subscores: caseMetricVector,
limitations: item.limitations,

View File

@ -44,5 +44,10 @@ exports.ACCOUNTANT_SCORING_RUBRIC_V01 = {
{ score: 0, label: "Context Lost", description: "Follow-up теряет фокус текущего разбора." },
{ score: 3, label: "Context Partial", description: "Фокус удерживается частично, с дрейфом." },
{ score: 5, label: "Context Retained", description: "Follow-up устойчиво держит предмет и ограничения." }
],
stage4_contract_compliance_rate: [
{ score: 0, label: "Non-Compliant", description: "Stage 4 block contract is mostly missing or polluted by legacy sections." },
{ score: 3, label: "Partially Compliant", description: "Stage 4 answer shape is present only in part of audited responses." },
{ score: 5, label: "Compliant", description: "Stage 4 block contract is consistently present without legacy leakage." }
]
};

View File

@ -722,7 +722,8 @@ function computeScoreIndex(report: Record<string, unknown>, target: AutoRunTarge
rateToPercent(1 - (toNumberSafe(metrics.false_confidence_rate) ?? 1)),
rateToPercent(1 - (toNumberSafe(metrics.broad_answer_rate) ?? 1)),
scoreToPercent(toNumberSafe(metrics.mechanism_specificity_score)),
scoreToPercent(toNumberSafe(metrics.followup_context_retention_score))
scoreToPercent(toNumberSafe(metrics.followup_context_retention_score)),
rateToPercent(toNumberSafe(metrics.stage4_contract_compliance_rate))
]);
}

View File

@ -473,6 +473,7 @@ interface AssistantCaseDiagnostics {
final_reply_type: string;
turn_count: number;
narrowing_result: AssistantEvalNarrowingResult;
stage4_contract_shape_compliant: boolean | null;
signature: string;
is_generic: boolean;
is_false_confident: boolean;
@ -554,6 +555,11 @@ function rateToBandScore(metric: AssistantMetricKey, value: number): 0 | 3 | 5 {
if (value >= 0.45) return 3;
return 0;
}
if (metric === "stage4_contract_compliance_rate") {
if (value >= 0.95) return 5;
if (value >= 0.8) return 3;
return 0;
}
if (metric === "generic_explanation_rate" || metric === "false_confidence_rate" || metric === "broad_answer_rate") {
if (value <= 0.25) return 5;
if (value <= 0.45) return 3;
@ -1539,6 +1545,8 @@ export class EvalService {
const broadCases = diagnostics.filter((item) => item.is_broad_answer !== null);
const broadAnswerCases = broadCases.filter((item) => item.is_broad_answer === true).length;
const followupCases = diagnostics.filter((item) => item.followup_retention_score !== null);
const stage4AuditedCases = diagnostics.filter((item) => item.stage4_contract_shape_compliant !== null);
const stage4CompliantCases = stage4AuditedCases.filter((item) => item.stage4_contract_shape_compliant === true).length;
const avgActionability =
diagnostics.length > 0
@ -1558,7 +1566,9 @@ export class EvalService {
false_confidence_rate: round2(falseConfidenceCases / total),
broad_answer_rate: broadCases.length > 0 ? round2(broadAnswerCases / broadCases.length) : null,
mechanism_specificity_score: avgMechanism === null ? null : round2(avgMechanism),
followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup)
followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup),
stage4_contract_compliance_rate:
stage4AuditedCases.length > 0 ? round2(stage4CompliantCases / stage4AuditedCases.length) : null
};
const rubric_bands: Record<AssistantMetricKey, AccountantMetricRubricBand | null> = {
@ -1568,7 +1578,8 @@ export class EvalService {
false_confidence_rate: rubricBandForMetric("false_confidence_rate", raw.false_confidence_rate),
broad_answer_rate: rubricBandForMetric("broad_answer_rate", raw.broad_answer_rate),
mechanism_specificity_score: rubricBandForMetric("mechanism_specificity_score", raw.mechanism_specificity_score),
followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score)
followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score),
stage4_contract_compliance_rate: rubricBandForMetric("stage4_contract_compliance_rate", raw.stage4_contract_compliance_rate)
};
return {
@ -1577,7 +1588,8 @@ export class EvalService {
denominators: {
cases_total: diagnostics.length,
broad_cases_total: broadCases.length,
followup_cases_total: followupCases.length
followup_cases_total: followupCases.length,
stage4_contract_audited_cases_total: stage4AuditedCases.length
},
signature_counts: signatureCounter
};
@ -1672,7 +1684,8 @@ export class EvalService {
"false_confidence_rate",
"broad_answer_rate",
"mechanism_specificity_score",
"followup_context_retention_score"
"followup_context_retention_score",
"stage4_contract_compliance_rate"
];
const lowerIsBetter = new Set<AssistantMetricKey>(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]);
@ -1976,6 +1989,7 @@ export class EvalService {
final_reply_type: "backend_error",
turn_count: turnResponses.length,
narrowing_result: "failed",
stage4_contract_shape_compliant: null,
signature: `backend_error|${suiteCase.scenario_tag}`,
is_generic: true,
is_false_confident: false,
@ -2014,6 +2028,10 @@ export class EvalService {
const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes);
const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations);
const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? "");
const stage4ContractShapeCompliant =
typeof finalResponse.debug?.answer_contract_stage4_v1?.is_stage4_shape === "boolean"
? finalResponse.debug.answer_contract_stage4_v1.is_stage4_shape
: null;
const hasAnchors = hasDomainAnchors(
[directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" ")
@ -2113,6 +2131,7 @@ export class EvalService {
final_reply_type: finalResponse.reply_type,
turn_count: suiteCase.turns.length,
narrowing_result: narrowingResult,
stage4_contract_shape_compliant: stage4ContractShapeCompliant,
signature: [
finalResponse.reply_type,
signals.routes.sort().join(","),
@ -2146,7 +2165,9 @@ export class EvalService {
broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0,
mechanism_specificity_score: round2(item.mechanism_specificity_score),
followup_context_retention_score:
item.followup_retention_score === null ? null : round2(item.followup_retention_score)
item.followup_retention_score === null ? null : round2(item.followup_retention_score),
stage4_contract_compliance_rate:
item.stage4_contract_shape_compliant === null ? null : item.stage4_contract_shape_compliant ? 1 : 0
};
return {
schema_version: ASSISTANT_EVAL_RECORD_SCHEMA_VERSION,
@ -2175,7 +2196,8 @@ export class EvalService {
mechanism_status: item.signals.mechanism_status,
source_refs: item.signals.source_refs,
routes: item.signals.routes,
followup_state_applied: item.signals.followup_state_applied
followup_state_applied: item.signals.followup_state_applied,
stage4_contract_shape_compliant: item.stage4_contract_shape_compliant
},
metric_subscores: caseMetricVector,
limitations: item.limitations,

View File

@ -161,6 +161,7 @@ export interface AssistantEvalMetricVector {
broad_answer_rate: number | null;
mechanism_specificity_score: number | null;
followup_context_retention_score: number | null;
stage4_contract_compliance_rate: number | null;
}
export interface AssistantEvalRecord {
@ -227,5 +228,10 @@ export const ACCOUNTANT_SCORING_RUBRIC_V01: Record<AccountantMetricName, Account
{ score: 0, label: "Context Lost", description: "Follow-up теряет фокус текущего разбора." },
{ score: 3, label: "Context Partial", description: "Фокус удерживается частично, с дрейфом." },
{ score: 5, label: "Context Retained", description: "Follow-up устойчиво держит предмет и ограничения." }
],
stage4_contract_compliance_rate: [
{ score: 0, label: "Non-Compliant", description: "Stage 4 block contract is mostly missing or polluted by legacy sections." },
{ score: 3, label: "Partially Compliant", description: "Stage 4 answer shape is present only in part of audited responses." },
{ score: 5, label: "Compliant", description: "Stage 4 block contract is consistently present without legacy leakage." }
]
};

View File

@ -109,7 +109,8 @@ describe("stage1 contract scaffolding", () => {
"false_confidence_rate",
"broad_answer_rate",
"mechanism_specificity_score",
"followup_context_retention_score"
"followup_context_retention_score",
"stage4_contract_compliance_rate"
]);
for (const metric of metricNames) {
const bands = ACCOUNTANT_SCORING_RUBRIC_V01[metric as keyof typeof ACCOUNTANT_SCORING_RUBRIC_V01];

View File

@ -84,7 +84,8 @@ describe.sequential("assistant Stage 1 eval harness", () => {
"false_confidence_rate",
"broad_answer_rate",
"mechanism_specificity_score",
"followup_context_retention_score"
"followup_context_retention_score",
"stage4_contract_compliance_rate"
]);
expect(response.body.report?.rubric_bands?.generic_explanation_rate).toBeTruthy();
expect(response.body.report?.feature_profile_snapshot).toBeTruthy();