From a49212608f8bd9ee3c5e522dd201ed8ba19f16d8 Mon Sep 17 00:00:00 2001 From: dctouch Date: Sun, 12 Apr 2026 09:32:27 +0300 Subject: [PATCH] =?UTF-8?q?=D0=93=D0=9B=D0=9E=D0=91=D0=90=D0=9B=D0=AC?= =?UTF-8?q?=D0=9D=D0=AB=D0=99=20=D0=A0=D0=95=D0=A4=D0=90=D0=9A=D0=A2=D0=9E?= =?UTF-8?q?=D0=A0=D0=98=D0=9D=D0=93=20=D0=90=D0=A0=D0=A5=D0=98=D0=A2=D0=95?= =?UTF-8?q?=D0=9A=D0=A2=D0=A3=D0=A0=D0=AB=20-=20Stage=204.7:=20=D0=B2?= =?UTF-8?q?=D0=BD=D0=B5=D0=B4=D1=80=D0=B8=D1=82=D1=8C=20stage4=5Fcontract?= =?UTF-8?q?=5Fcompliance=5Frate=20=D0=B2=20P0=20quality-gap=20=D0=B8=20bas?= =?UTF-8?q?eline=20gate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/TECH/1CLLMARCH-FACT.md | 20 ++++++++++++++++++- .../backend/dist/eval/p0_acceptance_gate.js | 7 +++++++ .../backend/dist/eval/p0_eval_runner.js | 11 ++++++++-- .../dist/eval/p0_metric_definitions.js | 8 +++++++- .../backend/src/eval/p0_acceptance_gate.ts | 7 +++++++ .../backend/src/eval/p0_eval_runner.ts | 11 +++++++++- .../backend/src/eval/p0_metric_definitions.ts | 10 +++++++++- .../tests/assistantP0EvalHarness.test.ts | 8 +++++++- 8 files changed, 75 insertions(+), 7 deletions(-) diff --git a/docs/TECH/1CLLMARCH-FACT.md b/docs/TECH/1CLLMARCH-FACT.md index 99e0bc7..41d0d02 100644 --- a/docs/TECH/1CLLMARCH-FACT.md +++ b/docs/TECH/1CLLMARCH-FACT.md @@ -2728,7 +2728,25 @@ Implemented in current pass (Stage 4.6 eval metric for answer-contract complianc - `assistantEvalHarness.test.ts`: `6 passed` (run with extended timeout budget); - `npm --prefix llm_normalizer/backend run build` passed. -Status: In progress (Stage 4.1-4.6 completed; continue with focused wave/manual-comment quality backlog) +Implemented in current pass (Stage 4.7 Stage4 compliance rollout into P0 quality-gap loop, 2026-04-12): +1. Extended P0 quality-gap metric contract: + - added `stage4_contract_compliance_rate` to `P0QualityGapMetricVector`; + - added metric definition in `P0_QUALITY_GAP_METRIC_DEFINITIONS`. +2. Extended P0 quality-gap thresholds and baseline gate checks: + - added `stage4_contract_compliance_rate_min` to `P0QualityGapThresholds`; + - default threshold set to `0.9`; + - `buildQualityGapChecks(...)` now includes Stage4 compliance check. +3. Integrated Stage4 compliance metric computation in P0 eval runner: + - source: `buildStage4AnswerContractAuditV1(assistant_reply).is_stage4_shape`; + - included in `quality_gap_metrics.raw`; + - added denominator `stage4_contract_audited_cases_total`. +4. Regression updates: + - `assistantP0EvalHarness.test.ts` updated for new quality-gap metric key. +5. Validation snapshot: + - `assistantP0EvalHarness.test.ts`: `5 passed` (extended timeout budget); + - `npm --prefix llm_normalizer/backend run build` passed. + +Status: In progress (Stage 4.1-4.7 completed; continue with focused wave/manual-comment quality backlog) ## Stage 5 (P3): Quality Loop Driven By GUI Markup diff --git a/llm_normalizer/backend/dist/eval/p0_acceptance_gate.js b/llm_normalizer/backend/dist/eval/p0_acceptance_gate.js index c7fe396..b423dc0 100644 --- a/llm_normalizer/backend/dist/eval/p0_acceptance_gate.js +++ b/llm_normalizer/backend/dist/eval/p0_acceptance_gate.js @@ -108,6 +108,13 @@ function buildQualityGapChecks(input) { threshold: thresholds.followup_context_retention_score_min, comparator: ">=", passed: metrics.followup_context_retention_score >= thresholds.followup_context_retention_score_min + }, + { + metric: "stage4_contract_compliance_rate", + value: metrics.stage4_contract_compliance_rate, + threshold: thresholds.stage4_contract_compliance_rate_min, + comparator: ">=", + passed: metrics.stage4_contract_compliance_rate >= thresholds.stage4_contract_compliance_rate_min } ]; } diff --git a/llm_normalizer/backend/dist/eval/p0_eval_runner.js b/llm_normalizer/backend/dist/eval/p0_eval_runner.js index 483a005..33d2abb 100644 --- a/llm_normalizer/backend/dist/eval/p0_eval_runner.js +++ b/llm_normalizer/backend/dist/eval/p0_eval_runner.js @@ -10,6 +10,7 @@ const nanoid_1 = require("nanoid"); const config_1 = require("../config"); const assistantService_1 = require("../services/assistantService"); const assistantSessionStore_1 = require("../services/assistantSessionStore"); +const assistantStage4AnswerContractAudit_1 = require("../services/assistantStage4AnswerContractAudit"); const files_1 = require("../utils/files"); const p0_acceptance_gate_1 = require("./p0_acceptance_gate"); const p0_metric_definitions_1 = require("./p0_metric_definitions"); @@ -1062,16 +1063,22 @@ function computeQualityGapMetrics(caseRecords) { const mechanismSpecificityScores = caseRecords.map((item) => item.metric_subscores.mechanism_specificity_score); const followupCases = caseRecords.filter((item) => item.query_class === "followup_investigation"); const followupRetainedCases = followupCases.filter((item) => item.checks.followup_context_retained).length; + const stage4ContractAuditedCases = caseRecords + .map((item) => (0, assistantStage4AnswerContractAudit_1.buildStage4AnswerContractAuditV1)(item.assistant_reply ?? "")) + .filter((item) => item.schema_version === "stage4_answer_contract_audit_v1"); + const stage4ContractCompliantCases = stage4ContractAuditedCases.filter((item) => item.is_stage4_shape === true).length; return { raw: { generic_explanation_rate: round2(genericCases / total), false_confidence_rate: round2(falseConfidenceCases / total), mechanism_specificity_score: average(mechanismSpecificityScores), - followup_context_retention_score: round2(followupCases.length === 0 ? 1 : followupRetainedCases / Math.max(1, followupCases.length)) + followup_context_retention_score: round2(followupCases.length === 0 ? 1 : followupRetainedCases / Math.max(1, followupCases.length)), + stage4_contract_compliance_rate: round2(stage4ContractAuditedCases.length === 0 ? 0 : stage4ContractCompliantCases / Math.max(1, stage4ContractAuditedCases.length)) }, denominators: { cases_total: caseRecords.length, - followup_cases_total: followupCases.length + followup_cases_total: followupCases.length, + stage4_contract_audited_cases_total: stage4ContractAuditedCases.length } }; } diff --git a/llm_normalizer/backend/dist/eval/p0_metric_definitions.js b/llm_normalizer/backend/dist/eval/p0_metric_definitions.js index 25037a0..fb270d3 100644 --- a/llm_normalizer/backend/dist/eval/p0_metric_definitions.js +++ b/llm_normalizer/backend/dist/eval/p0_metric_definitions.js @@ -65,6 +65,11 @@ exports.P0_QUALITY_GAP_METRIC_DEFINITIONS = { description: "Rate of correct context retention in follow-up investigation cases.", unit: "rate", direction: "higher_is_better" + }, + stage4_contract_compliance_rate: { + description: "Share of answers that satisfy the Stage 4 user-facing answer contract shape.", + unit: "rate", + direction: "higher_is_better" } }; exports.P0_DEFAULT_FORBIDDEN_LEAKAGE_TOKENS = [ @@ -97,7 +102,8 @@ exports.P0_DEFAULT_QUALITY_GAP_THRESHOLDS = { generic_explanation_rate_max: 0.2, false_confidence_rate_max: 0.12, mechanism_specificity_score_min: 3.0, - followup_context_retention_score_min: 0.75 + followup_context_retention_score_min: 0.75, + stage4_contract_compliance_rate_min: 0.9 }; function isProblemUnitType(value) { return (value === "document_conflict" || diff --git a/llm_normalizer/backend/src/eval/p0_acceptance_gate.ts b/llm_normalizer/backend/src/eval/p0_acceptance_gate.ts index 4d438bb..53dc3cf 100644 --- a/llm_normalizer/backend/src/eval/p0_acceptance_gate.ts +++ b/llm_normalizer/backend/src/eval/p0_acceptance_gate.ts @@ -161,6 +161,13 @@ function buildQualityGapChecks(input: { threshold: thresholds.followup_context_retention_score_min, comparator: ">=", passed: metrics.followup_context_retention_score >= thresholds.followup_context_retention_score_min + }, + { + metric: "stage4_contract_compliance_rate", + value: metrics.stage4_contract_compliance_rate, + threshold: thresholds.stage4_contract_compliance_rate_min, + comparator: ">=", + passed: metrics.stage4_contract_compliance_rate >= thresholds.stage4_contract_compliance_rate_min } ]; } diff --git a/llm_normalizer/backend/src/eval/p0_eval_runner.ts b/llm_normalizer/backend/src/eval/p0_eval_runner.ts index 09ee1da..ceaeeec 100644 --- a/llm_normalizer/backend/src/eval/p0_eval_runner.ts +++ b/llm_normalizer/backend/src/eval/p0_eval_runner.ts @@ -15,6 +15,7 @@ import { } from "../config"; import { AssistantService } from "../services/assistantService"; import { AssistantSessionStore } from "../services/assistantSessionStore"; +import { buildStage4AnswerContractAuditV1 } from "../services/assistantStage4AnswerContractAudit"; import { NormalizerService } from "../services/normalizerService"; import type { AssistantMessageResponsePayload } from "../types/assistant"; import type { EvalRunMode, NormalizeRequestPayload } from "../types/normalizer"; @@ -1291,6 +1292,10 @@ function computeQualityGapMetrics(caseRecords: P0CaseRecord[]): { const mechanismSpecificityScores = caseRecords.map((item) => item.metric_subscores.mechanism_specificity_score); const followupCases = caseRecords.filter((item) => item.query_class === "followup_investigation"); const followupRetainedCases = followupCases.filter((item) => item.checks.followup_context_retained).length; + const stage4ContractAuditedCases = caseRecords + .map((item) => buildStage4AnswerContractAuditV1(item.assistant_reply ?? "")) + .filter((item) => item.schema_version === "stage4_answer_contract_audit_v1"); + const stage4ContractCompliantCases = stage4ContractAuditedCases.filter((item) => item.is_stage4_shape === true).length; return { raw: { @@ -1299,11 +1304,15 @@ function computeQualityGapMetrics(caseRecords: P0CaseRecord[]): { mechanism_specificity_score: average(mechanismSpecificityScores), followup_context_retention_score: round2( followupCases.length === 0 ? 1 : followupRetainedCases / Math.max(1, followupCases.length) + ), + stage4_contract_compliance_rate: round2( + stage4ContractAuditedCases.length === 0 ? 0 : stage4ContractCompliantCases / Math.max(1, stage4ContractAuditedCases.length) ) }, denominators: { cases_total: caseRecords.length, - followup_cases_total: followupCases.length + followup_cases_total: followupCases.length, + stage4_contract_audited_cases_total: stage4ContractAuditedCases.length } }; } diff --git a/llm_normalizer/backend/src/eval/p0_metric_definitions.ts b/llm_normalizer/backend/src/eval/p0_metric_definitions.ts index a9b191a..ba850c6 100644 --- a/llm_normalizer/backend/src/eval/p0_metric_definitions.ts +++ b/llm_normalizer/backend/src/eval/p0_metric_definitions.ts @@ -68,6 +68,7 @@ export interface P0QualityGapMetricVector { false_confidence_rate: number; mechanism_specificity_score: number; followup_context_retention_score: number; + stage4_contract_compliance_rate: number; } export type P0QualityGapMetricName = keyof P0QualityGapMetricVector; @@ -147,6 +148,11 @@ export const P0_QUALITY_GAP_METRIC_DEFINITIONS: Record { "generic_explanation_rate", "false_confidence_rate", "mechanism_specificity_score", - "followup_context_retention_score" + "followup_context_retention_score", + "stage4_contract_compliance_rate" ]); + expect(Number(response.body.report?.quality_gap_metrics?.denominators?.stage4_contract_audited_cases_total ?? 0)).toBeGreaterThan(0); + const qualityGapChecks = Array.isArray(response.body.report?.baseline_stability_gate?.quality_gap_checks) + ? response.body.report.baseline_stability_gate.quality_gap_checks + : []; + expect(qualityGapChecks.some((item: { metric?: string }) => item.metric === "stage4_contract_compliance_rate")).toBe(true); expect(["P0_ACCEPTED", "P0_ACCEPTED_WITH_LIMITATIONS", "P0_NOT_ACCEPTED"]).toContain( String(response.body.report?.acceptance_gate?.verdict ?? "") );