ГЛОБАЛЬНЫЙ РЕФАКТОРИНГ АРХИТЕКТУРЫ - Stage 4.7: внедрить stage4_contract_compliance_rate в P0 quality-gap и baseline gate

This commit is contained in:
dctouch 2026-04-12 09:32:27 +03:00
parent ce1ebae8ec
commit a49212608f
8 changed files with 75 additions and 7 deletions

View File

@ -2728,7 +2728,25 @@ Implemented in current pass (Stage 4.6 eval metric for answer-contract complianc
- `assistantEvalHarness.test.ts`: `6 passed` (run with extended timeout budget);
- `npm --prefix llm_normalizer/backend run build` passed.
Status: In progress (Stage 4.1-4.6 completed; continue with focused wave/manual-comment quality backlog)
Implemented in current pass (Stage 4.7 rollout of Stage 4 contract compliance into the P0 quality-gap loop, 2026-04-12):
1. Extended P0 quality-gap metric contract:
- added `stage4_contract_compliance_rate` to `P0QualityGapMetricVector`;
- added metric definition in `P0_QUALITY_GAP_METRIC_DEFINITIONS`.
2. Extended P0 quality-gap thresholds and baseline gate checks:
- added `stage4_contract_compliance_rate_min` to `P0QualityGapThresholds`;
- default threshold set to `0.9`;
- `buildQualityGapChecks(...)` now includes Stage4 compliance check.
3. Integrated Stage4 compliance metric computation in P0 eval runner:
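- source: `buildStage4AnswerContractAuditV1(assistant_reply).is_stage4_shape` (a missing `assistant_reply` is audited as an empty string);
- rate = audited cases with `is_stage4_shape === true` / audited cases, where audited cases are those whose audit reports `schema_version === "stage4_answer_contract_audit_v1"`; the rate is `0` when no cases were audited, so an empty run fails the gate;
- included in `quality_gap_metrics.raw`;
- added denominator `stage4_contract_audited_cases_total` to `quality_gap_metrics.denominators`.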
4. Regression updates:
- `assistantP0EvalHarness.test.ts` updated for new quality-gap metric key.
5. Validation snapshot:
- `assistantP0EvalHarness.test.ts`: `5 passed` (run with extended timeout budget);
- `npm --prefix llm_normalizer/backend run build` passed.
Status: In progress (Stage 4.1-4.7 completed; continue with focused wave/manual-comment quality backlog)
## Stage 5 (P3): Quality Loop Driven By GUI Markup

View File

@ -108,6 +108,13 @@ function buildQualityGapChecks(input) {
threshold: thresholds.followup_context_retention_score_min,
comparator: ">=",
passed: metrics.followup_context_retention_score >= thresholds.followup_context_retention_score_min
},
{
metric: "stage4_contract_compliance_rate",
value: metrics.stage4_contract_compliance_rate,
threshold: thresholds.stage4_contract_compliance_rate_min,
comparator: ">=",
passed: metrics.stage4_contract_compliance_rate >= thresholds.stage4_contract_compliance_rate_min
}
];
}

View File

@ -10,6 +10,7 @@ const nanoid_1 = require("nanoid");
const config_1 = require("../config");
const assistantService_1 = require("../services/assistantService");
const assistantSessionStore_1 = require("../services/assistantSessionStore");
const assistantStage4AnswerContractAudit_1 = require("../services/assistantStage4AnswerContractAudit");
const files_1 = require("../utils/files");
const p0_acceptance_gate_1 = require("./p0_acceptance_gate");
const p0_metric_definitions_1 = require("./p0_metric_definitions");
@ -1062,16 +1063,22 @@ function computeQualityGapMetrics(caseRecords) {
const mechanismSpecificityScores = caseRecords.map((item) => item.metric_subscores.mechanism_specificity_score);
const followupCases = caseRecords.filter((item) => item.query_class === "followup_investigation");
const followupRetainedCases = followupCases.filter((item) => item.checks.followup_context_retained).length;
const stage4ContractAuditedCases = caseRecords
.map((item) => (0, assistantStage4AnswerContractAudit_1.buildStage4AnswerContractAuditV1)(item.assistant_reply ?? ""))
.filter((item) => item.schema_version === "stage4_answer_contract_audit_v1");
const stage4ContractCompliantCases = stage4ContractAuditedCases.filter((item) => item.is_stage4_shape === true).length;
return {
raw: {
generic_explanation_rate: round2(genericCases / total),
false_confidence_rate: round2(falseConfidenceCases / total),
mechanism_specificity_score: average(mechanismSpecificityScores),
followup_context_retention_score: round2(followupCases.length === 0 ? 1 : followupRetainedCases / Math.max(1, followupCases.length))
followup_context_retention_score: round2(followupCases.length === 0 ? 1 : followupRetainedCases / Math.max(1, followupCases.length)),
stage4_contract_compliance_rate: round2(stage4ContractAuditedCases.length === 0 ? 0 : stage4ContractCompliantCases / Math.max(1, stage4ContractAuditedCases.length))
},
denominators: {
cases_total: caseRecords.length,
followup_cases_total: followupCases.length
followup_cases_total: followupCases.length,
stage4_contract_audited_cases_total: stage4ContractAuditedCases.length
}
};
}

View File

@ -65,6 +65,11 @@ exports.P0_QUALITY_GAP_METRIC_DEFINITIONS = {
description: "Rate of correct context retention in follow-up investigation cases.",
unit: "rate",
direction: "higher_is_better"
},
stage4_contract_compliance_rate: {
description: "Share of answers that satisfy the Stage 4 user-facing answer contract shape.",
unit: "rate",
direction: "higher_is_better"
}
};
exports.P0_DEFAULT_FORBIDDEN_LEAKAGE_TOKENS = [
@ -97,7 +102,8 @@ exports.P0_DEFAULT_QUALITY_GAP_THRESHOLDS = {
generic_explanation_rate_max: 0.2,
false_confidence_rate_max: 0.12,
mechanism_specificity_score_min: 3.0,
followup_context_retention_score_min: 0.75
followup_context_retention_score_min: 0.75,
stage4_contract_compliance_rate_min: 0.9
};
function isProblemUnitType(value) {
return (value === "document_conflict" ||

View File

@ -161,6 +161,13 @@ function buildQualityGapChecks(input: {
threshold: thresholds.followup_context_retention_score_min,
comparator: ">=",
passed: metrics.followup_context_retention_score >= thresholds.followup_context_retention_score_min
},
{
metric: "stage4_contract_compliance_rate",
value: metrics.stage4_contract_compliance_rate,
threshold: thresholds.stage4_contract_compliance_rate_min,
comparator: ">=",
passed: metrics.stage4_contract_compliance_rate >= thresholds.stage4_contract_compliance_rate_min
}
];
}

View File

@ -15,6 +15,7 @@ import {
} from "../config";
import { AssistantService } from "../services/assistantService";
import { AssistantSessionStore } from "../services/assistantSessionStore";
import { buildStage4AnswerContractAuditV1 } from "../services/assistantStage4AnswerContractAudit";
import { NormalizerService } from "../services/normalizerService";
import type { AssistantMessageResponsePayload } from "../types/assistant";
import type { EvalRunMode, NormalizeRequestPayload } from "../types/normalizer";
@ -1291,6 +1292,10 @@ function computeQualityGapMetrics(caseRecords: P0CaseRecord[]): {
const mechanismSpecificityScores = caseRecords.map((item) => item.metric_subscores.mechanism_specificity_score);
const followupCases = caseRecords.filter((item) => item.query_class === "followup_investigation");
const followupRetainedCases = followupCases.filter((item) => item.checks.followup_context_retained).length;
const stage4ContractAuditedCases = caseRecords
.map((item) => buildStage4AnswerContractAuditV1(item.assistant_reply ?? ""))
.filter((item) => item.schema_version === "stage4_answer_contract_audit_v1");
const stage4ContractCompliantCases = stage4ContractAuditedCases.filter((item) => item.is_stage4_shape === true).length;
return {
raw: {
@ -1299,11 +1304,15 @@ function computeQualityGapMetrics(caseRecords: P0CaseRecord[]): {
mechanism_specificity_score: average(mechanismSpecificityScores),
followup_context_retention_score: round2(
followupCases.length === 0 ? 1 : followupRetainedCases / Math.max(1, followupCases.length)
),
stage4_contract_compliance_rate: round2(
stage4ContractAuditedCases.length === 0 ? 0 : stage4ContractCompliantCases / Math.max(1, stage4ContractAuditedCases.length)
)
},
denominators: {
cases_total: caseRecords.length,
followup_cases_total: followupCases.length
followup_cases_total: followupCases.length,
stage4_contract_audited_cases_total: stage4ContractAuditedCases.length
}
};
}

View File

@ -68,6 +68,7 @@ export interface P0QualityGapMetricVector {
false_confidence_rate: number;
mechanism_specificity_score: number;
followup_context_retention_score: number;
stage4_contract_compliance_rate: number;
}
export type P0QualityGapMetricName = keyof P0QualityGapMetricVector;
@ -147,6 +148,11 @@ export const P0_QUALITY_GAP_METRIC_DEFINITIONS: Record<P0QualityGapMetricName, P
description: "Rate of correct context retention in follow-up investigation cases.",
unit: "rate",
direction: "higher_is_better"
},
stage4_contract_compliance_rate: {
description: "Share of answers that satisfy the Stage 4 user-facing answer contract shape.",
unit: "rate",
direction: "higher_is_better"
}
};
@ -183,6 +189,7 @@ export interface P0QualityGapThresholds {
false_confidence_rate_max: number;
mechanism_specificity_score_min: number;
followup_context_retention_score_min: number;
stage4_contract_compliance_rate_min: number;
}
export const P0_DEFAULT_ACCEPTANCE_THRESHOLDS: P0AcceptanceThresholds = {
@ -200,7 +207,8 @@ export const P0_DEFAULT_QUALITY_GAP_THRESHOLDS: P0QualityGapThresholds = {
generic_explanation_rate_max: 0.2,
false_confidence_rate_max: 0.12,
mechanism_specificity_score_min: 3.0,
followup_context_retention_score_min: 0.75
followup_context_retention_score_min: 0.75,
stage4_contract_compliance_rate_min: 0.9
};
function isProblemUnitType(value: unknown): value is ProblemUnitType {

View File

@ -88,8 +88,14 @@ describe.sequential("assistant P0 eval harness (Wave 7)", () => {
"generic_explanation_rate",
"false_confidence_rate",
"mechanism_specificity_score",
"followup_context_retention_score"
"followup_context_retention_score",
"stage4_contract_compliance_rate"
]);
expect(Number(response.body.report?.quality_gap_metrics?.denominators?.stage4_contract_audited_cases_total ?? 0)).toBeGreaterThan(0);
const qualityGapChecks = Array.isArray(response.body.report?.baseline_stability_gate?.quality_gap_checks)
? response.body.report.baseline_stability_gate.quality_gap_checks
: [];
expect(qualityGapChecks.some((item: { metric?: string }) => item.metric === "stage4_contract_compliance_rate")).toBe(true);
expect(["P0_ACCEPTED", "P0_ACCEPTED_WITH_LIMITATIONS", "P0_NOT_ACCEPTED"]).toContain(
String(response.body.report?.acceptance_gate?.verdict ?? "")
);