ГЛОБАЛЬНЫЙ РЕФАКТОРИНГ АРХИТЕКТУРЫ - Stage 4.7: внедрить stage4_contract_compliance_rate в P0 quality-gap и baseline gate
This commit is contained in:
parent
ce1ebae8ec
commit
a49212608f
|
|
@ -2728,7 +2728,25 @@ Implemented in current pass (Stage 4.6 eval metric for answer-contract complianc
|
|||
- `assistantEvalHarness.test.ts`: `6 passed` (run with extended timeout budget);
|
||||
- `npm --prefix llm_normalizer/backend run build` passed.
|
||||
|
||||
Status: In progress (Stage 4.1-4.6 completed; continue with focused wave/manual-comment quality backlog)
|
||||
Implemented in current pass (Stage 4.7 rollout of Stage 4 contract compliance into the P0 quality-gap loop, 2026-04-12):
|
||||
1. Extended P0 quality-gap metric contract:
|
||||
- added `stage4_contract_compliance_rate` to `P0QualityGapMetricVector`;
|
||||
- added metric definition in `P0_QUALITY_GAP_METRIC_DEFINITIONS`.
|
||||
2. Extended P0 quality-gap thresholds and baseline gate checks:
|
||||
- added `stage4_contract_compliance_rate_min` to `P0QualityGapThresholds`;
|
||||
- default threshold set to `0.9`;
|
||||
- `buildQualityGapChecks(...)` now includes Stage4 compliance check.
|
||||
3. Integrated Stage4 compliance metric computation in P0 eval runner:
|
||||
- source: `buildStage4AnswerContractAuditV1(assistant_reply).is_stage4_shape`;
|
||||
- included in `quality_gap_metrics.raw`;
|
||||
- added denominator `stage4_contract_audited_cases_total`.
|
||||
4. Regression updates:
|
||||
- `assistantP0EvalHarness.test.ts` updated for new quality-gap metric key.
|
||||
5. Validation snapshot:
|
||||
- `assistantP0EvalHarness.test.ts`: `5 passed` (run with extended timeout budget);
|
||||
- `npm --prefix llm_normalizer/backend run build` passed.
|
||||
|
||||
Status: In progress (Stage 4.1-4.7 completed; continue with focused wave/manual-comment quality backlog)
|
||||
|
||||
## Stage 5 (P3): Quality Loop Driven By GUI Markup
|
||||
|
||||
|
|
|
|||
|
|
@ -108,6 +108,13 @@ function buildQualityGapChecks(input) {
|
|||
threshold: thresholds.followup_context_retention_score_min,
|
||||
comparator: ">=",
|
||||
passed: metrics.followup_context_retention_score >= thresholds.followup_context_retention_score_min
|
||||
},
|
||||
{
|
||||
metric: "stage4_contract_compliance_rate",
|
||||
value: metrics.stage4_contract_compliance_rate,
|
||||
threshold: thresholds.stage4_contract_compliance_rate_min,
|
||||
comparator: ">=",
|
||||
passed: metrics.stage4_contract_compliance_rate >= thresholds.stage4_contract_compliance_rate_min
|
||||
}
|
||||
];
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ const nanoid_1 = require("nanoid");
|
|||
const config_1 = require("../config");
|
||||
const assistantService_1 = require("../services/assistantService");
|
||||
const assistantSessionStore_1 = require("../services/assistantSessionStore");
|
||||
const assistantStage4AnswerContractAudit_1 = require("../services/assistantStage4AnswerContractAudit");
|
||||
const files_1 = require("../utils/files");
|
||||
const p0_acceptance_gate_1 = require("./p0_acceptance_gate");
|
||||
const p0_metric_definitions_1 = require("./p0_metric_definitions");
|
||||
|
|
@ -1062,16 +1063,22 @@ function computeQualityGapMetrics(caseRecords) {
|
|||
const mechanismSpecificityScores = caseRecords.map((item) => item.metric_subscores.mechanism_specificity_score);
|
||||
const followupCases = caseRecords.filter((item) => item.query_class === "followup_investigation");
|
||||
const followupRetainedCases = followupCases.filter((item) => item.checks.followup_context_retained).length;
|
||||
const stage4ContractAuditedCases = caseRecords
|
||||
.map((item) => (0, assistantStage4AnswerContractAudit_1.buildStage4AnswerContractAuditV1)(item.assistant_reply ?? ""))
|
||||
.filter((item) => item.schema_version === "stage4_answer_contract_audit_v1");
|
||||
const stage4ContractCompliantCases = stage4ContractAuditedCases.filter((item) => item.is_stage4_shape === true).length;
|
||||
return {
|
||||
raw: {
|
||||
generic_explanation_rate: round2(genericCases / total),
|
||||
false_confidence_rate: round2(falseConfidenceCases / total),
|
||||
mechanism_specificity_score: average(mechanismSpecificityScores),
|
||||
followup_context_retention_score: round2(followupCases.length === 0 ? 1 : followupRetainedCases / Math.max(1, followupCases.length))
|
||||
followup_context_retention_score: round2(followupCases.length === 0 ? 1 : followupRetainedCases / Math.max(1, followupCases.length)),
|
||||
stage4_contract_compliance_rate: round2(stage4ContractAuditedCases.length === 0 ? 0 : stage4ContractCompliantCases / Math.max(1, stage4ContractAuditedCases.length))
|
||||
},
|
||||
denominators: {
|
||||
cases_total: caseRecords.length,
|
||||
followup_cases_total: followupCases.length
|
||||
followup_cases_total: followupCases.length,
|
||||
stage4_contract_audited_cases_total: stage4ContractAuditedCases.length
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -65,6 +65,11 @@ exports.P0_QUALITY_GAP_METRIC_DEFINITIONS = {
|
|||
description: "Rate of correct context retention in follow-up investigation cases.",
|
||||
unit: "rate",
|
||||
direction: "higher_is_better"
|
||||
},
|
||||
stage4_contract_compliance_rate: {
|
||||
description: "Share of answers that satisfy the Stage 4 user-facing answer contract shape.",
|
||||
unit: "rate",
|
||||
direction: "higher_is_better"
|
||||
}
|
||||
};
|
||||
exports.P0_DEFAULT_FORBIDDEN_LEAKAGE_TOKENS = [
|
||||
|
|
@ -97,7 +102,8 @@ exports.P0_DEFAULT_QUALITY_GAP_THRESHOLDS = {
|
|||
generic_explanation_rate_max: 0.2,
|
||||
false_confidence_rate_max: 0.12,
|
||||
mechanism_specificity_score_min: 3.0,
|
||||
followup_context_retention_score_min: 0.75
|
||||
followup_context_retention_score_min: 0.75,
|
||||
stage4_contract_compliance_rate_min: 0.9
|
||||
};
|
||||
function isProblemUnitType(value) {
|
||||
return (value === "document_conflict" ||
|
||||
|
|
|
|||
|
|
@ -161,6 +161,13 @@ function buildQualityGapChecks(input: {
|
|||
threshold: thresholds.followup_context_retention_score_min,
|
||||
comparator: ">=",
|
||||
passed: metrics.followup_context_retention_score >= thresholds.followup_context_retention_score_min
|
||||
},
|
||||
{
|
||||
metric: "stage4_contract_compliance_rate",
|
||||
value: metrics.stage4_contract_compliance_rate,
|
||||
threshold: thresholds.stage4_contract_compliance_rate_min,
|
||||
comparator: ">=",
|
||||
passed: metrics.stage4_contract_compliance_rate >= thresholds.stage4_contract_compliance_rate_min
|
||||
}
|
||||
];
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ import {
|
|||
} from "../config";
|
||||
import { AssistantService } from "../services/assistantService";
|
||||
import { AssistantSessionStore } from "../services/assistantSessionStore";
|
||||
import { buildStage4AnswerContractAuditV1 } from "../services/assistantStage4AnswerContractAudit";
|
||||
import { NormalizerService } from "../services/normalizerService";
|
||||
import type { AssistantMessageResponsePayload } from "../types/assistant";
|
||||
import type { EvalRunMode, NormalizeRequestPayload } from "../types/normalizer";
|
||||
|
|
@ -1291,6 +1292,10 @@ function computeQualityGapMetrics(caseRecords: P0CaseRecord[]): {
|
|||
const mechanismSpecificityScores = caseRecords.map((item) => item.metric_subscores.mechanism_specificity_score);
|
||||
const followupCases = caseRecords.filter((item) => item.query_class === "followup_investigation");
|
||||
const followupRetainedCases = followupCases.filter((item) => item.checks.followup_context_retained).length;
|
||||
const stage4ContractAuditedCases = caseRecords
|
||||
.map((item) => buildStage4AnswerContractAuditV1(item.assistant_reply ?? ""))
|
||||
.filter((item) => item.schema_version === "stage4_answer_contract_audit_v1");
|
||||
const stage4ContractCompliantCases = stage4ContractAuditedCases.filter((item) => item.is_stage4_shape === true).length;
|
||||
|
||||
return {
|
||||
raw: {
|
||||
|
|
@ -1299,11 +1304,15 @@ function computeQualityGapMetrics(caseRecords: P0CaseRecord[]): {
|
|||
mechanism_specificity_score: average(mechanismSpecificityScores),
|
||||
followup_context_retention_score: round2(
|
||||
followupCases.length === 0 ? 1 : followupRetainedCases / Math.max(1, followupCases.length)
|
||||
),
|
||||
stage4_contract_compliance_rate: round2(
|
||||
stage4ContractAuditedCases.length === 0 ? 0 : stage4ContractCompliantCases / Math.max(1, stage4ContractAuditedCases.length)
|
||||
)
|
||||
},
|
||||
denominators: {
|
||||
cases_total: caseRecords.length,
|
||||
followup_cases_total: followupCases.length
|
||||
followup_cases_total: followupCases.length,
|
||||
stage4_contract_audited_cases_total: stage4ContractAuditedCases.length
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -68,6 +68,7 @@ export interface P0QualityGapMetricVector {
|
|||
false_confidence_rate: number;
|
||||
mechanism_specificity_score: number;
|
||||
followup_context_retention_score: number;
|
||||
stage4_contract_compliance_rate: number;
|
||||
}
|
||||
|
||||
export type P0QualityGapMetricName = keyof P0QualityGapMetricVector;
|
||||
|
|
@ -147,6 +148,11 @@ export const P0_QUALITY_GAP_METRIC_DEFINITIONS: Record<P0QualityGapMetricName, P
|
|||
description: "Rate of correct context retention in follow-up investigation cases.",
|
||||
unit: "rate",
|
||||
direction: "higher_is_better"
|
||||
},
|
||||
stage4_contract_compliance_rate: {
|
||||
description: "Share of answers that satisfy the Stage 4 user-facing answer contract shape.",
|
||||
unit: "rate",
|
||||
direction: "higher_is_better"
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -183,6 +189,7 @@ export interface P0QualityGapThresholds {
|
|||
false_confidence_rate_max: number;
|
||||
mechanism_specificity_score_min: number;
|
||||
followup_context_retention_score_min: number;
|
||||
stage4_contract_compliance_rate_min: number;
|
||||
}
|
||||
|
||||
export const P0_DEFAULT_ACCEPTANCE_THRESHOLDS: P0AcceptanceThresholds = {
|
||||
|
|
@ -200,7 +207,8 @@ export const P0_DEFAULT_QUALITY_GAP_THRESHOLDS: P0QualityGapThresholds = {
|
|||
generic_explanation_rate_max: 0.2,
|
||||
false_confidence_rate_max: 0.12,
|
||||
mechanism_specificity_score_min: 3.0,
|
||||
followup_context_retention_score_min: 0.75
|
||||
followup_context_retention_score_min: 0.75,
|
||||
stage4_contract_compliance_rate_min: 0.9
|
||||
};
|
||||
|
||||
function isProblemUnitType(value: unknown): value is ProblemUnitType {
|
||||
|
|
|
|||
|
|
@ -88,8 +88,14 @@ describe.sequential("assistant P0 eval harness (Wave 7)", () => {
|
|||
"generic_explanation_rate",
|
||||
"false_confidence_rate",
|
||||
"mechanism_specificity_score",
|
||||
"followup_context_retention_score"
|
||||
"followup_context_retention_score",
|
||||
"stage4_contract_compliance_rate"
|
||||
]);
|
||||
expect(Number(response.body.report?.quality_gap_metrics?.denominators?.stage4_contract_audited_cases_total ?? 0)).toBeGreaterThan(0);
|
||||
const qualityGapChecks = Array.isArray(response.body.report?.baseline_stability_gate?.quality_gap_checks)
|
||||
? response.body.report.baseline_stability_gate.quality_gap_checks
|
||||
: [];
|
||||
expect(qualityGapChecks.some((item: { metric?: string }) => item.metric === "stage4_contract_compliance_rate")).toBe(true);
|
||||
expect(["P0_ACCEPTED", "P0_ACCEPTED_WITH_LIMITATIONS", "P0_NOT_ACCEPTED"]).toContain(
|
||||
String(response.body.report?.acceptance_gate?.verdict ?? "")
|
||||
);
|
||||
|
|
|
|||
Loading…
Reference in New Issue