From ce1ebae8ecdc3b50919bee7efc719a1f9023c110 Mon Sep 17 00:00:00 2001 From: dctouch Date: Sun, 12 Apr 2026 09:25:15 +0300 Subject: [PATCH] =?UTF-8?q?=D0=93=D0=9B=D0=9E=D0=91=D0=90=D0=9B=D0=AC?= =?UTF-8?q?=D0=9D=D0=AB=D0=99=20=D0=A0=D0=95=D0=A4=D0=90=D0=9A=D0=A2=D0=9E?= =?UTF-8?q?=D0=A0=D0=98=D0=9D=D0=93=20=D0=90=D0=A0=D0=A5=D0=98=D0=A2=D0=95?= =?UTF-8?q?=D0=9A=D0=A2=D0=A3=D0=A0=D0=AB=20-=20Stage=204.6:=20=D0=B4?= =?UTF-8?q?=D0=BE=D0=B1=D0=B0=D0=B2=D0=B8=D1=82=D1=8C=20=D0=BC=D0=B5=D1=82?= =?UTF-8?q?=D1=80=D0=B8=D0=BA=D1=83=20stage4=5Fcontract=5Fcompliance=5Frat?= =?UTF-8?q?e=20=D0=B2=20Stage1=20eval=20=D0=B8=20=D0=BE=D1=82=D1=87=D0=B5?= =?UTF-8?q?=D1=82=D1=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/TECH/1CLLMARCH-FACT.md | 21 +++++++++++- .../backend/dist/routes/autoRuns.js | 3 +- .../backend/dist/services/evalService.js | 32 +++++++++++++---- .../backend/dist/types/stage1Contracts.js | 5 +++ llm_normalizer/backend/src/routes/autoRuns.ts | 3 +- .../backend/src/services/evalService.ts | 34 +++++++++++++++---- .../backend/src/types/stage1Contracts.ts | 6 ++++ .../backend/tests/assistantContracts.test.ts | 3 +- .../tests/assistantEvalHarness.test.ts | 3 +- 9 files changed, 93 insertions(+), 17 deletions(-) diff --git a/docs/TECH/1CLLMARCH-FACT.md b/docs/TECH/1CLLMARCH-FACT.md index c40456f..99e0bc7 100644 --- a/docs/TECH/1CLLMARCH-FACT.md +++ b/docs/TECH/1CLLMARCH-FACT.md @@ -2709,7 +2709,26 @@ Implemented in current pass (Stage 4.5 contract observability in debug/log, 2026 - focused assembler pack passed: `2 files / 5 tests`; - `npm --prefix llm_normalizer/backend run build` passed. -Status: In progress (Stage 4.1-4.5 completed; continue with focused wave/manual-comment quality backlog) +Implemented in current pass (Stage 4.6 eval metric for answer-contract compliance, 2026-04-12): +1. Added Stage 4 compliance metric to Stage 1 eval: + - `stage4_contract_compliance_rate` in `AssistantEvalMetricVector` + rubric (`ACCOUNTANT_SCORING_RUBRIC_V01`). +2. Integrated metric computation in Stage 1 eval runtime: + - source: `debug.answer_contract_stage4_v1.is_stage4_shape`; + - aggregate in `computeAssistantMetrics(...)`; + - per-case metric projection in `metric_subscores/accountant_metrics`; + - added denominator `stage4_contract_audited_cases_total`. +3. Integrated metric into reporting/comparison and autorun score index: + - Stage 1 comparison deltas now include `stage4_contract_compliance_rate`; + - auto-run score index for `assistant_stage1` now includes this compliance signal. +4. Regression updates: + - `assistantContracts.test.ts` + - `assistantEvalHarness.test.ts` +5. Validation snapshot: + - `assistantContracts.test.ts`: `2 passed`; + - `assistantEvalHarness.test.ts`: `6 passed` (run with extended timeout budget); + - `npm --prefix llm_normalizer/backend run build` passed. + +Status: In progress (Stage 4.1-4.6 completed; continue with focused wave/manual-comment quality backlog) ## Stage 5 (P3): Quality Loop Driven By GUI Markup diff --git a/llm_normalizer/backend/dist/routes/autoRuns.js b/llm_normalizer/backend/dist/routes/autoRuns.js index 73c3b44..897d50d 100644 --- a/llm_normalizer/backend/dist/routes/autoRuns.js +++ b/llm_normalizer/backend/dist/routes/autoRuns.js @@ -533,7 +533,8 @@ function computeScoreIndex(report, target) { rateToPercent(1 - (toNumberSafe(metrics.false_confidence_rate) ?? 1)), rateToPercent(1 - (toNumberSafe(metrics.broad_answer_rate) ?? 1)), scoreToPercent(toNumberSafe(metrics.mechanism_specificity_score)), - scoreToPercent(toNumberSafe(metrics.followup_context_retention_score)) + scoreToPercent(toNumberSafe(metrics.followup_context_retention_score)), + rateToPercent(toNumberSafe(metrics.stage4_contract_compliance_rate)) ]); } if (target === "assistant_stage2") { diff --git a/llm_normalizer/backend/dist/services/evalService.js b/llm_normalizer/backend/dist/services/evalService.js index 59a44f9..6a28142 100644 --- a/llm_normalizer/backend/dist/services/evalService.js +++ b/llm_normalizer/backend/dist/services/evalService.js @@ -359,6 +359,13 @@ function rateToBandScore(metric, value) { return 3; return 0; } + if (metric === "stage4_contract_compliance_rate") { + if (value >= 0.95) + return 5; + if (value >= 0.8) + return 3; + return 0; + } if (metric === "generic_explanation_rate" || metric === "false_confidence_rate" || metric === "broad_answer_rate") { if (value <= 0.25) return 5; @@ -1254,6 +1261,8 @@ class EvalService { const broadCases = diagnostics.filter((item) => item.is_broad_answer !== null); const broadAnswerCases = broadCases.filter((item) => item.is_broad_answer === true).length; const followupCases = diagnostics.filter((item) => item.followup_retention_score !== null); + const stage4AuditedCases = diagnostics.filter((item) => item.stage4_contract_shape_compliant !== null); + const stage4CompliantCases = stage4AuditedCases.filter((item) => item.stage4_contract_shape_compliant === true).length; const avgActionability = diagnostics.length > 0 ? diagnostics.reduce((acc, item) => acc + item.accountant_actionability_score, 0) / diagnostics.length : null; @@ -1268,7 +1277,8 @@ class EvalService { false_confidence_rate: round2(falseConfidenceCases / total), broad_answer_rate: broadCases.length > 0 ? round2(broadAnswerCases / broadCases.length) : null, mechanism_specificity_score: avgMechanism === null ? null : round2(avgMechanism), - followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup) + followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup), + stage4_contract_compliance_rate: stage4AuditedCases.length > 0 ? round2(stage4CompliantCases / stage4AuditedCases.length) : null }; const rubric_bands = { retrieval_differentiation_rate: rubricBandForMetric("retrieval_differentiation_rate", raw.retrieval_differentiation_rate), @@ -1277,7 +1287,8 @@ class EvalService { false_confidence_rate: rubricBandForMetric("false_confidence_rate", raw.false_confidence_rate), broad_answer_rate: rubricBandForMetric("broad_answer_rate", raw.broad_answer_rate), mechanism_specificity_score: rubricBandForMetric("mechanism_specificity_score", raw.mechanism_specificity_score), - followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score) + followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score), + stage4_contract_compliance_rate: rubricBandForMetric("stage4_contract_compliance_rate", raw.stage4_contract_compliance_rate) }; return { raw, @@ -1285,7 +1296,8 @@ class EvalService { denominators: { cases_total: diagnostics.length, broad_cases_total: broadCases.length, - followup_cases_total: followupCases.length + followup_cases_total: followupCases.length, + stage4_contract_audited_cases_total: stage4AuditedCases.length }, signature_counts: signatureCounter }; @@ -1364,7 +1376,8 @@ class EvalService { "false_confidence_rate", "broad_answer_rate", "mechanism_specificity_score", - "followup_context_retention_score" + "followup_context_retention_score", + "stage4_contract_compliance_rate" ]; const lowerIsBetter = new Set(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]); const baselineRaw = (baselineReport.metrics ?? {}).raw ?? {}; @@ -1634,6 +1647,7 @@ class EvalService { final_reply_type: "backend_error", turn_count: turnResponses.length, narrowing_result: "failed", + stage4_contract_shape_compliant: null, signature: `backend_error|${suiteCase.scenario_tag}`, is_generic: true, is_false_confident: false, @@ -1671,6 +1685,9 @@ class EvalService { const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes); const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations); const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? ""); + const stage4ContractShapeCompliant = typeof finalResponse.debug?.answer_contract_stage4_v1?.is_stage4_shape === "boolean" + ? finalResponse.debug.answer_contract_stage4_v1.is_stage4_shape + : null; const hasAnchors = hasDomainAnchors([directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" ")); let genericnessScore = 0; if (!hasAnchors) @@ -1779,6 +1796,7 @@ class EvalService { final_reply_type: finalResponse.reply_type, turn_count: suiteCase.turns.length, narrowing_result: narrowingResult, + stage4_contract_shape_compliant: stage4ContractShapeCompliant, signature: [ finalResponse.reply_type, signals.routes.sort().join(","), @@ -1810,7 +1828,8 @@ class EvalService { false_confidence_rate: item.is_false_confident ? 1 : 0, broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0, mechanism_specificity_score: round2(item.mechanism_specificity_score), - followup_context_retention_score: item.followup_retention_score === null ? null : round2(item.followup_retention_score) + followup_context_retention_score: item.followup_retention_score === null ? null : round2(item.followup_retention_score), + stage4_contract_compliance_rate: item.stage4_contract_shape_compliant === null ? null : item.stage4_contract_shape_compliant ? 1 : 0 }; return { schema_version: stage1Contracts_1.ASSISTANT_EVAL_RECORD_SCHEMA_VERSION, @@ -1839,7 +1858,8 @@ class EvalService { mechanism_status: item.signals.mechanism_status, source_refs: item.signals.source_refs, routes: item.signals.routes, - followup_state_applied: item.signals.followup_state_applied + followup_state_applied: item.signals.followup_state_applied, + stage4_contract_shape_compliant: item.stage4_contract_shape_compliant }, metric_subscores: caseMetricVector, limitations: item.limitations, diff --git a/llm_normalizer/backend/dist/types/stage1Contracts.js b/llm_normalizer/backend/dist/types/stage1Contracts.js index b9239be..34fb9f2 100644 --- a/llm_normalizer/backend/dist/types/stage1Contracts.js +++ b/llm_normalizer/backend/dist/types/stage1Contracts.js @@ -44,5 +44,10 @@ exports.ACCOUNTANT_SCORING_RUBRIC_V01 = { { score: 0, label: "Context Lost", description: "Follow-up теряет фокус текущего разбора." }, { score: 3, label: "Context Partial", description: "Фокус удерживается частично, с дрейфом." }, { score: 5, label: "Context Retained", description: "Follow-up устойчиво держит предмет и ограничения." } + ], + stage4_contract_compliance_rate: [ + { score: 0, label: "Non-Compliant", description: "Stage 4 block contract is mostly missing or polluted by legacy sections." }, + { score: 3, label: "Partially Compliant", description: "Stage 4 answer shape is present only in part of audited responses." }, + { score: 5, label: "Compliant", description: "Stage 4 block contract is consistently present without legacy leakage." } ] }; diff --git a/llm_normalizer/backend/src/routes/autoRuns.ts b/llm_normalizer/backend/src/routes/autoRuns.ts index e2e9d9e..630a0cf 100644 --- a/llm_normalizer/backend/src/routes/autoRuns.ts +++ b/llm_normalizer/backend/src/routes/autoRuns.ts @@ -722,7 +722,8 @@ function computeScoreIndex(report: Record, target: AutoRunTarge rateToPercent(1 - (toNumberSafe(metrics.false_confidence_rate) ?? 1)), rateToPercent(1 - (toNumberSafe(metrics.broad_answer_rate) ?? 1)), scoreToPercent(toNumberSafe(metrics.mechanism_specificity_score)), - scoreToPercent(toNumberSafe(metrics.followup_context_retention_score)) + scoreToPercent(toNumberSafe(metrics.followup_context_retention_score)), + rateToPercent(toNumberSafe(metrics.stage4_contract_compliance_rate)) ]); } diff --git a/llm_normalizer/backend/src/services/evalService.ts b/llm_normalizer/backend/src/services/evalService.ts index f237d7f..1310879 100644 --- a/llm_normalizer/backend/src/services/evalService.ts +++ b/llm_normalizer/backend/src/services/evalService.ts @@ -473,6 +473,7 @@ interface AssistantCaseDiagnostics { final_reply_type: string; turn_count: number; narrowing_result: AssistantEvalNarrowingResult; + stage4_contract_shape_compliant: boolean | null; signature: string; is_generic: boolean; is_false_confident: boolean; @@ -554,6 +555,11 @@ function rateToBandScore(metric: AssistantMetricKey, value: number): 0 | 3 | 5 { if (value >= 0.45) return 3; return 0; } + if (metric === "stage4_contract_compliance_rate") { + if (value >= 0.95) return 5; + if (value >= 0.8) return 3; + return 0; + } if (metric === "generic_explanation_rate" || metric === "false_confidence_rate" || metric === "broad_answer_rate") { if (value <= 0.25) return 5; if (value <= 0.45) return 3; @@ -1539,6 +1545,8 @@ export class EvalService { const broadCases = diagnostics.filter((item) => item.is_broad_answer !== null); const broadAnswerCases = broadCases.filter((item) => item.is_broad_answer === true).length; const followupCases = diagnostics.filter((item) => item.followup_retention_score !== null); + const stage4AuditedCases = diagnostics.filter((item) => item.stage4_contract_shape_compliant !== null); + const stage4CompliantCases = stage4AuditedCases.filter((item) => item.stage4_contract_shape_compliant === true).length; const avgActionability = diagnostics.length > 0 @@ -1558,7 +1566,9 @@ export class EvalService { false_confidence_rate: round2(falseConfidenceCases / total), broad_answer_rate: broadCases.length > 0 ? round2(broadAnswerCases / broadCases.length) : null, mechanism_specificity_score: avgMechanism === null ? null : round2(avgMechanism), - followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup) + followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup), + stage4_contract_compliance_rate: + stage4AuditedCases.length > 0 ? round2(stage4CompliantCases / stage4AuditedCases.length) : null }; const rubric_bands: Record = { @@ -1568,7 +1578,8 @@ export class EvalService { false_confidence_rate: rubricBandForMetric("false_confidence_rate", raw.false_confidence_rate), broad_answer_rate: rubricBandForMetric("broad_answer_rate", raw.broad_answer_rate), mechanism_specificity_score: rubricBandForMetric("mechanism_specificity_score", raw.mechanism_specificity_score), - followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score) + followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score), + stage4_contract_compliance_rate: rubricBandForMetric("stage4_contract_compliance_rate", raw.stage4_contract_compliance_rate) }; return { @@ -1577,7 +1588,8 @@ export class EvalService { denominators: { cases_total: diagnostics.length, broad_cases_total: broadCases.length, - followup_cases_total: followupCases.length + followup_cases_total: followupCases.length, + stage4_contract_audited_cases_total: stage4AuditedCases.length }, signature_counts: signatureCounter }; @@ -1672,7 +1684,8 @@ export class EvalService { "false_confidence_rate", "broad_answer_rate", "mechanism_specificity_score", - "followup_context_retention_score" + "followup_context_retention_score", + "stage4_contract_compliance_rate" ]; const lowerIsBetter = new Set(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]); @@ -1976,6 +1989,7 @@ export class EvalService { final_reply_type: "backend_error", turn_count: turnResponses.length, narrowing_result: "failed", + stage4_contract_shape_compliant: null, signature: `backend_error|${suiteCase.scenario_tag}`, is_generic: true, is_false_confident: false, @@ -2014,6 +2028,10 @@ export class EvalService { const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes); const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations); const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? ""); + const stage4ContractShapeCompliant = + typeof finalResponse.debug?.answer_contract_stage4_v1?.is_stage4_shape === "boolean" + ? finalResponse.debug.answer_contract_stage4_v1.is_stage4_shape + : null; const hasAnchors = hasDomainAnchors( [directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" ") @@ -2113,6 +2131,7 @@ export class EvalService { final_reply_type: finalResponse.reply_type, turn_count: suiteCase.turns.length, narrowing_result: narrowingResult, + stage4_contract_shape_compliant: stage4ContractShapeCompliant, signature: [ finalResponse.reply_type, signals.routes.sort().join(","), @@ -2146,7 +2165,9 @@ export class EvalService { broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0, mechanism_specificity_score: round2(item.mechanism_specificity_score), followup_context_retention_score: - item.followup_retention_score === null ? null : round2(item.followup_retention_score) + item.followup_retention_score === null ? null : round2(item.followup_retention_score), + stage4_contract_compliance_rate: + item.stage4_contract_shape_compliant === null ? null : item.stage4_contract_shape_compliant ? 1 : 0 }; return { schema_version: ASSISTANT_EVAL_RECORD_SCHEMA_VERSION, @@ -2175,7 +2196,8 @@ export class EvalService { mechanism_status: item.signals.mechanism_status, source_refs: item.signals.source_refs, routes: item.signals.routes, - followup_state_applied: item.signals.followup_state_applied + followup_state_applied: item.signals.followup_state_applied, + stage4_contract_shape_compliant: item.stage4_contract_shape_compliant }, metric_subscores: caseMetricVector, limitations: item.limitations, diff --git a/llm_normalizer/backend/src/types/stage1Contracts.ts b/llm_normalizer/backend/src/types/stage1Contracts.ts index 17558e2..9251bc5 100644 --- a/llm_normalizer/backend/src/types/stage1Contracts.ts +++ b/llm_normalizer/backend/src/types/stage1Contracts.ts @@ -161,6 +161,7 @@ export interface AssistantEvalMetricVector { broad_answer_rate: number | null; mechanism_specificity_score: number | null; followup_context_retention_score: number | null; + stage4_contract_compliance_rate: number | null; } export interface AssistantEvalRecord { @@ -227,5 +228,10 @@ export const ACCOUNTANT_SCORING_RUBRIC_V01: Record { "false_confidence_rate", "broad_answer_rate", "mechanism_specificity_score", - "followup_context_retention_score" + "followup_context_retention_score", + "stage4_contract_compliance_rate" ]); for (const metric of metricNames) { const bands = ACCOUNTANT_SCORING_RUBRIC_V01[metric as keyof typeof ACCOUNTANT_SCORING_RUBRIC_V01]; diff --git a/llm_normalizer/backend/tests/assistantEvalHarness.test.ts b/llm_normalizer/backend/tests/assistantEvalHarness.test.ts index 8189273..7ce5921 100644 --- a/llm_normalizer/backend/tests/assistantEvalHarness.test.ts +++ b/llm_normalizer/backend/tests/assistantEvalHarness.test.ts @@ -84,7 +84,8 @@ describe.sequential("assistant Stage 1 eval harness", () => { "false_confidence_rate", "broad_answer_rate", "mechanism_specificity_score", - "followup_context_retention_score" + "followup_context_retention_score", + "stage4_contract_compliance_rate" ]); expect(response.body.report?.rubric_bands?.generic_explanation_rate).toBeTruthy(); expect(response.body.report?.feature_profile_snapshot).toBeTruthy();