ГЛОБАЛЬНЫЙ РЕФАКТОРИНГ АРХИТЕКТУРЫ - Stage 4.6: добавить метрику stage4_contract_compliance_rate в Stage1 eval и отчеты
This commit is contained in:
parent
963f0aa372
commit
ce1ebae8ec
|
|
@ -2709,7 +2709,26 @@ Implemented in current pass (Stage 4.5 contract observability in debug/log, 2026
|
||||||
- focused assembler pack passed: `2 files / 5 tests`;
|
- focused assembler pack passed: `2 files / 5 tests`;
|
||||||
- `npm --prefix llm_normalizer/backend run build` passed.
|
- `npm --prefix llm_normalizer/backend run build` passed.
|
||||||
|
|
||||||
Status: In progress (Stage 4.1-4.5 completed; continue with focused wave/manual-comment quality backlog)
|
Implemented in current pass (Stage 4.6 eval metric for answer-contract compliance, 2026-04-12):
|
||||||
|
1. Added Stage 4 compliance metric to Stage 1 eval:
|
||||||
|
- `stage4_contract_compliance_rate` in `AssistantEvalMetricVector` + rubric (`ACCOUNTANT_SCORING_RUBRIC_V01`).
|
||||||
|
2. Integrated metric computation in Stage 1 eval runtime:
|
||||||
|
- source: `debug.answer_contract_stage4_v1.is_stage4_shape` (a case counts as audited only when this flag is a boolean; otherwise it is recorded as `null`);
|
||||||
|
- aggregate in `computeAssistantMetrics(...)`;
|
||||||
|
- per-case metric projection in `metric_subscores/accountant_metrics`;
|
||||||
|
- added denominator `stage4_contract_audited_cases_total` (the rate is computed over audited cases only and is `null` when there are none).
|
||||||
|
3. Integrated metric into reporting/comparison and autorun score index:
|
||||||
|
- Stage 1 comparison deltas now include `stage4_contract_compliance_rate`;
|
||||||
|
- auto-run score index for `assistant_stage1` now includes this compliance signal.
|
||||||
|
4. Regression updates:
|
||||||
|
- `assistantContracts.test.ts`
|
||||||
|
- `assistantEvalHarness.test.ts`
|
||||||
|
5. Validation snapshot:
|
||||||
|
- `assistantContracts.test.ts`: `2 passed`;
|
||||||
|
- `assistantEvalHarness.test.ts`: `6 passed` (run with extended timeout budget);
|
||||||
|
- `npm --prefix llm_normalizer/backend run build` passed.
|
||||||
|
|
||||||
|
Status: In progress (Stage 4.1-4.6 completed; continue with focused wave/manual-comment quality backlog)
|
||||||
|
|
||||||
## Stage 5 (P3): Quality Loop Driven By GUI Markup
|
## Stage 5 (P3): Quality Loop Driven By GUI Markup
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -533,7 +533,8 @@ function computeScoreIndex(report, target) {
|
||||||
rateToPercent(1 - (toNumberSafe(metrics.false_confidence_rate) ?? 1)),
|
rateToPercent(1 - (toNumberSafe(metrics.false_confidence_rate) ?? 1)),
|
||||||
rateToPercent(1 - (toNumberSafe(metrics.broad_answer_rate) ?? 1)),
|
rateToPercent(1 - (toNumberSafe(metrics.broad_answer_rate) ?? 1)),
|
||||||
scoreToPercent(toNumberSafe(metrics.mechanism_specificity_score)),
|
scoreToPercent(toNumberSafe(metrics.mechanism_specificity_score)),
|
||||||
scoreToPercent(toNumberSafe(metrics.followup_context_retention_score))
|
scoreToPercent(toNumberSafe(metrics.followup_context_retention_score)),
|
||||||
|
rateToPercent(toNumberSafe(metrics.stage4_contract_compliance_rate))
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
if (target === "assistant_stage2") {
|
if (target === "assistant_stage2") {
|
||||||
|
|
|
||||||
|
|
@ -359,6 +359,13 @@ function rateToBandScore(metric, value) {
|
||||||
return 3;
|
return 3;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
if (metric === "stage4_contract_compliance_rate") {
|
||||||
|
if (value >= 0.95)
|
||||||
|
return 5;
|
||||||
|
if (value >= 0.8)
|
||||||
|
return 3;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
if (metric === "generic_explanation_rate" || metric === "false_confidence_rate" || metric === "broad_answer_rate") {
|
if (metric === "generic_explanation_rate" || metric === "false_confidence_rate" || metric === "broad_answer_rate") {
|
||||||
if (value <= 0.25)
|
if (value <= 0.25)
|
||||||
return 5;
|
return 5;
|
||||||
|
|
@ -1254,6 +1261,8 @@ class EvalService {
|
||||||
const broadCases = diagnostics.filter((item) => item.is_broad_answer !== null);
|
const broadCases = diagnostics.filter((item) => item.is_broad_answer !== null);
|
||||||
const broadAnswerCases = broadCases.filter((item) => item.is_broad_answer === true).length;
|
const broadAnswerCases = broadCases.filter((item) => item.is_broad_answer === true).length;
|
||||||
const followupCases = diagnostics.filter((item) => item.followup_retention_score !== null);
|
const followupCases = diagnostics.filter((item) => item.followup_retention_score !== null);
|
||||||
|
const stage4AuditedCases = diagnostics.filter((item) => item.stage4_contract_shape_compliant !== null);
|
||||||
|
const stage4CompliantCases = stage4AuditedCases.filter((item) => item.stage4_contract_shape_compliant === true).length;
|
||||||
const avgActionability = diagnostics.length > 0
|
const avgActionability = diagnostics.length > 0
|
||||||
? diagnostics.reduce((acc, item) => acc + item.accountant_actionability_score, 0) / diagnostics.length
|
? diagnostics.reduce((acc, item) => acc + item.accountant_actionability_score, 0) / diagnostics.length
|
||||||
: null;
|
: null;
|
||||||
|
|
@ -1268,7 +1277,8 @@ class EvalService {
|
||||||
false_confidence_rate: round2(falseConfidenceCases / total),
|
false_confidence_rate: round2(falseConfidenceCases / total),
|
||||||
broad_answer_rate: broadCases.length > 0 ? round2(broadAnswerCases / broadCases.length) : null,
|
broad_answer_rate: broadCases.length > 0 ? round2(broadAnswerCases / broadCases.length) : null,
|
||||||
mechanism_specificity_score: avgMechanism === null ? null : round2(avgMechanism),
|
mechanism_specificity_score: avgMechanism === null ? null : round2(avgMechanism),
|
||||||
followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup)
|
followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup),
|
||||||
|
stage4_contract_compliance_rate: stage4AuditedCases.length > 0 ? round2(stage4CompliantCases / stage4AuditedCases.length) : null
|
||||||
};
|
};
|
||||||
const rubric_bands = {
|
const rubric_bands = {
|
||||||
retrieval_differentiation_rate: rubricBandForMetric("retrieval_differentiation_rate", raw.retrieval_differentiation_rate),
|
retrieval_differentiation_rate: rubricBandForMetric("retrieval_differentiation_rate", raw.retrieval_differentiation_rate),
|
||||||
|
|
@ -1277,7 +1287,8 @@ class EvalService {
|
||||||
false_confidence_rate: rubricBandForMetric("false_confidence_rate", raw.false_confidence_rate),
|
false_confidence_rate: rubricBandForMetric("false_confidence_rate", raw.false_confidence_rate),
|
||||||
broad_answer_rate: rubricBandForMetric("broad_answer_rate", raw.broad_answer_rate),
|
broad_answer_rate: rubricBandForMetric("broad_answer_rate", raw.broad_answer_rate),
|
||||||
mechanism_specificity_score: rubricBandForMetric("mechanism_specificity_score", raw.mechanism_specificity_score),
|
mechanism_specificity_score: rubricBandForMetric("mechanism_specificity_score", raw.mechanism_specificity_score),
|
||||||
followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score)
|
followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score),
|
||||||
|
stage4_contract_compliance_rate: rubricBandForMetric("stage4_contract_compliance_rate", raw.stage4_contract_compliance_rate)
|
||||||
};
|
};
|
||||||
return {
|
return {
|
||||||
raw,
|
raw,
|
||||||
|
|
@ -1285,7 +1296,8 @@ class EvalService {
|
||||||
denominators: {
|
denominators: {
|
||||||
cases_total: diagnostics.length,
|
cases_total: diagnostics.length,
|
||||||
broad_cases_total: broadCases.length,
|
broad_cases_total: broadCases.length,
|
||||||
followup_cases_total: followupCases.length
|
followup_cases_total: followupCases.length,
|
||||||
|
stage4_contract_audited_cases_total: stage4AuditedCases.length
|
||||||
},
|
},
|
||||||
signature_counts: signatureCounter
|
signature_counts: signatureCounter
|
||||||
};
|
};
|
||||||
|
|
@ -1364,7 +1376,8 @@ class EvalService {
|
||||||
"false_confidence_rate",
|
"false_confidence_rate",
|
||||||
"broad_answer_rate",
|
"broad_answer_rate",
|
||||||
"mechanism_specificity_score",
|
"mechanism_specificity_score",
|
||||||
"followup_context_retention_score"
|
"followup_context_retention_score",
|
||||||
|
"stage4_contract_compliance_rate"
|
||||||
];
|
];
|
||||||
const lowerIsBetter = new Set(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]);
|
const lowerIsBetter = new Set(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]);
|
||||||
const baselineRaw = (baselineReport.metrics ?? {}).raw ?? {};
|
const baselineRaw = (baselineReport.metrics ?? {}).raw ?? {};
|
||||||
|
|
@ -1634,6 +1647,7 @@ class EvalService {
|
||||||
final_reply_type: "backend_error",
|
final_reply_type: "backend_error",
|
||||||
turn_count: turnResponses.length,
|
turn_count: turnResponses.length,
|
||||||
narrowing_result: "failed",
|
narrowing_result: "failed",
|
||||||
|
stage4_contract_shape_compliant: null,
|
||||||
signature: `backend_error|${suiteCase.scenario_tag}`,
|
signature: `backend_error|${suiteCase.scenario_tag}`,
|
||||||
is_generic: true,
|
is_generic: true,
|
||||||
is_false_confident: false,
|
is_false_confident: false,
|
||||||
|
|
@ -1671,6 +1685,9 @@ class EvalService {
|
||||||
const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes);
|
const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes);
|
||||||
const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations);
|
const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations);
|
||||||
const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? "");
|
const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? "");
|
||||||
|
const stage4ContractShapeCompliant = typeof finalResponse.debug?.answer_contract_stage4_v1?.is_stage4_shape === "boolean"
|
||||||
|
? finalResponse.debug.answer_contract_stage4_v1.is_stage4_shape
|
||||||
|
: null;
|
||||||
const hasAnchors = hasDomainAnchors([directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" "));
|
const hasAnchors = hasDomainAnchors([directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" "));
|
||||||
let genericnessScore = 0;
|
let genericnessScore = 0;
|
||||||
if (!hasAnchors)
|
if (!hasAnchors)
|
||||||
|
|
@ -1779,6 +1796,7 @@ class EvalService {
|
||||||
final_reply_type: finalResponse.reply_type,
|
final_reply_type: finalResponse.reply_type,
|
||||||
turn_count: suiteCase.turns.length,
|
turn_count: suiteCase.turns.length,
|
||||||
narrowing_result: narrowingResult,
|
narrowing_result: narrowingResult,
|
||||||
|
stage4_contract_shape_compliant: stage4ContractShapeCompliant,
|
||||||
signature: [
|
signature: [
|
||||||
finalResponse.reply_type,
|
finalResponse.reply_type,
|
||||||
signals.routes.sort().join(","),
|
signals.routes.sort().join(","),
|
||||||
|
|
@ -1810,7 +1828,8 @@ class EvalService {
|
||||||
false_confidence_rate: item.is_false_confident ? 1 : 0,
|
false_confidence_rate: item.is_false_confident ? 1 : 0,
|
||||||
broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0,
|
broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0,
|
||||||
mechanism_specificity_score: round2(item.mechanism_specificity_score),
|
mechanism_specificity_score: round2(item.mechanism_specificity_score),
|
||||||
followup_context_retention_score: item.followup_retention_score === null ? null : round2(item.followup_retention_score)
|
followup_context_retention_score: item.followup_retention_score === null ? null : round2(item.followup_retention_score),
|
||||||
|
stage4_contract_compliance_rate: item.stage4_contract_shape_compliant === null ? null : item.stage4_contract_shape_compliant ? 1 : 0
|
||||||
};
|
};
|
||||||
return {
|
return {
|
||||||
schema_version: stage1Contracts_1.ASSISTANT_EVAL_RECORD_SCHEMA_VERSION,
|
schema_version: stage1Contracts_1.ASSISTANT_EVAL_RECORD_SCHEMA_VERSION,
|
||||||
|
|
@ -1839,7 +1858,8 @@ class EvalService {
|
||||||
mechanism_status: item.signals.mechanism_status,
|
mechanism_status: item.signals.mechanism_status,
|
||||||
source_refs: item.signals.source_refs,
|
source_refs: item.signals.source_refs,
|
||||||
routes: item.signals.routes,
|
routes: item.signals.routes,
|
||||||
followup_state_applied: item.signals.followup_state_applied
|
followup_state_applied: item.signals.followup_state_applied,
|
||||||
|
stage4_contract_shape_compliant: item.stage4_contract_shape_compliant
|
||||||
},
|
},
|
||||||
metric_subscores: caseMetricVector,
|
metric_subscores: caseMetricVector,
|
||||||
limitations: item.limitations,
|
limitations: item.limitations,
|
||||||
|
|
|
||||||
|
|
@ -44,5 +44,10 @@ exports.ACCOUNTANT_SCORING_RUBRIC_V01 = {
|
||||||
{ score: 0, label: "Context Lost", description: "Follow-up теряет фокус текущего разбора." },
|
{ score: 0, label: "Context Lost", description: "Follow-up теряет фокус текущего разбора." },
|
||||||
{ score: 3, label: "Context Partial", description: "Фокус удерживается частично, с дрейфом." },
|
{ score: 3, label: "Context Partial", description: "Фокус удерживается частично, с дрейфом." },
|
||||||
{ score: 5, label: "Context Retained", description: "Follow-up устойчиво держит предмет и ограничения." }
|
{ score: 5, label: "Context Retained", description: "Follow-up устойчиво держит предмет и ограничения." }
|
||||||
|
],
|
||||||
|
stage4_contract_compliance_rate: [
|
||||||
|
{ score: 0, label: "Non-Compliant", description: "Stage 4 block contract is mostly missing or polluted by legacy sections." },
|
||||||
|
{ score: 3, label: "Partially Compliant", description: "Stage 4 answer shape is present only in part of audited responses." },
|
||||||
|
{ score: 5, label: "Compliant", description: "Stage 4 block contract is consistently present without legacy leakage." }
|
||||||
]
|
]
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -722,7 +722,8 @@ function computeScoreIndex(report: Record<string, unknown>, target: AutoRunTarge
|
||||||
rateToPercent(1 - (toNumberSafe(metrics.false_confidence_rate) ?? 1)),
|
rateToPercent(1 - (toNumberSafe(metrics.false_confidence_rate) ?? 1)),
|
||||||
rateToPercent(1 - (toNumberSafe(metrics.broad_answer_rate) ?? 1)),
|
rateToPercent(1 - (toNumberSafe(metrics.broad_answer_rate) ?? 1)),
|
||||||
scoreToPercent(toNumberSafe(metrics.mechanism_specificity_score)),
|
scoreToPercent(toNumberSafe(metrics.mechanism_specificity_score)),
|
||||||
scoreToPercent(toNumberSafe(metrics.followup_context_retention_score))
|
scoreToPercent(toNumberSafe(metrics.followup_context_retention_score)),
|
||||||
|
rateToPercent(toNumberSafe(metrics.stage4_contract_compliance_rate))
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -473,6 +473,7 @@ interface AssistantCaseDiagnostics {
|
||||||
final_reply_type: string;
|
final_reply_type: string;
|
||||||
turn_count: number;
|
turn_count: number;
|
||||||
narrowing_result: AssistantEvalNarrowingResult;
|
narrowing_result: AssistantEvalNarrowingResult;
|
||||||
|
stage4_contract_shape_compliant: boolean | null;
|
||||||
signature: string;
|
signature: string;
|
||||||
is_generic: boolean;
|
is_generic: boolean;
|
||||||
is_false_confident: boolean;
|
is_false_confident: boolean;
|
||||||
|
|
@ -554,6 +555,11 @@ function rateToBandScore(metric: AssistantMetricKey, value: number): 0 | 3 | 5 {
|
||||||
if (value >= 0.45) return 3;
|
if (value >= 0.45) return 3;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
if (metric === "stage4_contract_compliance_rate") {
|
||||||
|
if (value >= 0.95) return 5;
|
||||||
|
if (value >= 0.8) return 3;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
if (metric === "generic_explanation_rate" || metric === "false_confidence_rate" || metric === "broad_answer_rate") {
|
if (metric === "generic_explanation_rate" || metric === "false_confidence_rate" || metric === "broad_answer_rate") {
|
||||||
if (value <= 0.25) return 5;
|
if (value <= 0.25) return 5;
|
||||||
if (value <= 0.45) return 3;
|
if (value <= 0.45) return 3;
|
||||||
|
|
@ -1539,6 +1545,8 @@ export class EvalService {
|
||||||
const broadCases = diagnostics.filter((item) => item.is_broad_answer !== null);
|
const broadCases = diagnostics.filter((item) => item.is_broad_answer !== null);
|
||||||
const broadAnswerCases = broadCases.filter((item) => item.is_broad_answer === true).length;
|
const broadAnswerCases = broadCases.filter((item) => item.is_broad_answer === true).length;
|
||||||
const followupCases = diagnostics.filter((item) => item.followup_retention_score !== null);
|
const followupCases = diagnostics.filter((item) => item.followup_retention_score !== null);
|
||||||
|
const stage4AuditedCases = diagnostics.filter((item) => item.stage4_contract_shape_compliant !== null);
|
||||||
|
const stage4CompliantCases = stage4AuditedCases.filter((item) => item.stage4_contract_shape_compliant === true).length;
|
||||||
|
|
||||||
const avgActionability =
|
const avgActionability =
|
||||||
diagnostics.length > 0
|
diagnostics.length > 0
|
||||||
|
|
@ -1558,7 +1566,9 @@ export class EvalService {
|
||||||
false_confidence_rate: round2(falseConfidenceCases / total),
|
false_confidence_rate: round2(falseConfidenceCases / total),
|
||||||
broad_answer_rate: broadCases.length > 0 ? round2(broadAnswerCases / broadCases.length) : null,
|
broad_answer_rate: broadCases.length > 0 ? round2(broadAnswerCases / broadCases.length) : null,
|
||||||
mechanism_specificity_score: avgMechanism === null ? null : round2(avgMechanism),
|
mechanism_specificity_score: avgMechanism === null ? null : round2(avgMechanism),
|
||||||
followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup)
|
followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup),
|
||||||
|
stage4_contract_compliance_rate:
|
||||||
|
stage4AuditedCases.length > 0 ? round2(stage4CompliantCases / stage4AuditedCases.length) : null
|
||||||
};
|
};
|
||||||
|
|
||||||
const rubric_bands: Record<AssistantMetricKey, AccountantMetricRubricBand | null> = {
|
const rubric_bands: Record<AssistantMetricKey, AccountantMetricRubricBand | null> = {
|
||||||
|
|
@ -1568,7 +1578,8 @@ export class EvalService {
|
||||||
false_confidence_rate: rubricBandForMetric("false_confidence_rate", raw.false_confidence_rate),
|
false_confidence_rate: rubricBandForMetric("false_confidence_rate", raw.false_confidence_rate),
|
||||||
broad_answer_rate: rubricBandForMetric("broad_answer_rate", raw.broad_answer_rate),
|
broad_answer_rate: rubricBandForMetric("broad_answer_rate", raw.broad_answer_rate),
|
||||||
mechanism_specificity_score: rubricBandForMetric("mechanism_specificity_score", raw.mechanism_specificity_score),
|
mechanism_specificity_score: rubricBandForMetric("mechanism_specificity_score", raw.mechanism_specificity_score),
|
||||||
followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score)
|
followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score),
|
||||||
|
stage4_contract_compliance_rate: rubricBandForMetric("stage4_contract_compliance_rate", raw.stage4_contract_compliance_rate)
|
||||||
};
|
};
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -1577,7 +1588,8 @@ export class EvalService {
|
||||||
denominators: {
|
denominators: {
|
||||||
cases_total: diagnostics.length,
|
cases_total: diagnostics.length,
|
||||||
broad_cases_total: broadCases.length,
|
broad_cases_total: broadCases.length,
|
||||||
followup_cases_total: followupCases.length
|
followup_cases_total: followupCases.length,
|
||||||
|
stage4_contract_audited_cases_total: stage4AuditedCases.length
|
||||||
},
|
},
|
||||||
signature_counts: signatureCounter
|
signature_counts: signatureCounter
|
||||||
};
|
};
|
||||||
|
|
@ -1672,7 +1684,8 @@ export class EvalService {
|
||||||
"false_confidence_rate",
|
"false_confidence_rate",
|
||||||
"broad_answer_rate",
|
"broad_answer_rate",
|
||||||
"mechanism_specificity_score",
|
"mechanism_specificity_score",
|
||||||
"followup_context_retention_score"
|
"followup_context_retention_score",
|
||||||
|
"stage4_contract_compliance_rate"
|
||||||
];
|
];
|
||||||
const lowerIsBetter = new Set<AssistantMetricKey>(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]);
|
const lowerIsBetter = new Set<AssistantMetricKey>(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]);
|
||||||
|
|
||||||
|
|
@ -1976,6 +1989,7 @@ export class EvalService {
|
||||||
final_reply_type: "backend_error",
|
final_reply_type: "backend_error",
|
||||||
turn_count: turnResponses.length,
|
turn_count: turnResponses.length,
|
||||||
narrowing_result: "failed",
|
narrowing_result: "failed",
|
||||||
|
stage4_contract_shape_compliant: null,
|
||||||
signature: `backend_error|${suiteCase.scenario_tag}`,
|
signature: `backend_error|${suiteCase.scenario_tag}`,
|
||||||
is_generic: true,
|
is_generic: true,
|
||||||
is_false_confident: false,
|
is_false_confident: false,
|
||||||
|
|
@ -2014,6 +2028,10 @@ export class EvalService {
|
||||||
const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes);
|
const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes);
|
||||||
const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations);
|
const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations);
|
||||||
const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? "");
|
const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? "");
|
||||||
|
const stage4ContractShapeCompliant =
|
||||||
|
typeof finalResponse.debug?.answer_contract_stage4_v1?.is_stage4_shape === "boolean"
|
||||||
|
? finalResponse.debug.answer_contract_stage4_v1.is_stage4_shape
|
||||||
|
: null;
|
||||||
|
|
||||||
const hasAnchors = hasDomainAnchors(
|
const hasAnchors = hasDomainAnchors(
|
||||||
[directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" ")
|
[directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" ")
|
||||||
|
|
@ -2113,6 +2131,7 @@ export class EvalService {
|
||||||
final_reply_type: finalResponse.reply_type,
|
final_reply_type: finalResponse.reply_type,
|
||||||
turn_count: suiteCase.turns.length,
|
turn_count: suiteCase.turns.length,
|
||||||
narrowing_result: narrowingResult,
|
narrowing_result: narrowingResult,
|
||||||
|
stage4_contract_shape_compliant: stage4ContractShapeCompliant,
|
||||||
signature: [
|
signature: [
|
||||||
finalResponse.reply_type,
|
finalResponse.reply_type,
|
||||||
signals.routes.sort().join(","),
|
signals.routes.sort().join(","),
|
||||||
|
|
@ -2146,7 +2165,9 @@ export class EvalService {
|
||||||
broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0,
|
broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0,
|
||||||
mechanism_specificity_score: round2(item.mechanism_specificity_score),
|
mechanism_specificity_score: round2(item.mechanism_specificity_score),
|
||||||
followup_context_retention_score:
|
followup_context_retention_score:
|
||||||
item.followup_retention_score === null ? null : round2(item.followup_retention_score)
|
item.followup_retention_score === null ? null : round2(item.followup_retention_score),
|
||||||
|
stage4_contract_compliance_rate:
|
||||||
|
item.stage4_contract_shape_compliant === null ? null : item.stage4_contract_shape_compliant ? 1 : 0
|
||||||
};
|
};
|
||||||
return {
|
return {
|
||||||
schema_version: ASSISTANT_EVAL_RECORD_SCHEMA_VERSION,
|
schema_version: ASSISTANT_EVAL_RECORD_SCHEMA_VERSION,
|
||||||
|
|
@ -2175,7 +2196,8 @@ export class EvalService {
|
||||||
mechanism_status: item.signals.mechanism_status,
|
mechanism_status: item.signals.mechanism_status,
|
||||||
source_refs: item.signals.source_refs,
|
source_refs: item.signals.source_refs,
|
||||||
routes: item.signals.routes,
|
routes: item.signals.routes,
|
||||||
followup_state_applied: item.signals.followup_state_applied
|
followup_state_applied: item.signals.followup_state_applied,
|
||||||
|
stage4_contract_shape_compliant: item.stage4_contract_shape_compliant
|
||||||
},
|
},
|
||||||
metric_subscores: caseMetricVector,
|
metric_subscores: caseMetricVector,
|
||||||
limitations: item.limitations,
|
limitations: item.limitations,
|
||||||
|
|
|
||||||
|
|
@ -161,6 +161,7 @@ export interface AssistantEvalMetricVector {
|
||||||
broad_answer_rate: number | null;
|
broad_answer_rate: number | null;
|
||||||
mechanism_specificity_score: number | null;
|
mechanism_specificity_score: number | null;
|
||||||
followup_context_retention_score: number | null;
|
followup_context_retention_score: number | null;
|
||||||
|
stage4_contract_compliance_rate: number | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface AssistantEvalRecord {
|
export interface AssistantEvalRecord {
|
||||||
|
|
@ -227,5 +228,10 @@ export const ACCOUNTANT_SCORING_RUBRIC_V01: Record<AccountantMetricName, Account
|
||||||
{ score: 0, label: "Context Lost", description: "Follow-up теряет фокус текущего разбора." },
|
{ score: 0, label: "Context Lost", description: "Follow-up теряет фокус текущего разбора." },
|
||||||
{ score: 3, label: "Context Partial", description: "Фокус удерживается частично, с дрейфом." },
|
{ score: 3, label: "Context Partial", description: "Фокус удерживается частично, с дрейфом." },
|
||||||
{ score: 5, label: "Context Retained", description: "Follow-up устойчиво держит предмет и ограничения." }
|
{ score: 5, label: "Context Retained", description: "Follow-up устойчиво держит предмет и ограничения." }
|
||||||
|
],
|
||||||
|
stage4_contract_compliance_rate: [
|
||||||
|
{ score: 0, label: "Non-Compliant", description: "Stage 4 block contract is mostly missing or polluted by legacy sections." },
|
||||||
|
{ score: 3, label: "Partially Compliant", description: "Stage 4 answer shape is present only in part of audited responses." },
|
||||||
|
{ score: 5, label: "Compliant", description: "Stage 4 block contract is consistently present without legacy leakage." }
|
||||||
]
|
]
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -109,7 +109,8 @@ describe("stage1 contract scaffolding", () => {
|
||||||
"false_confidence_rate",
|
"false_confidence_rate",
|
||||||
"broad_answer_rate",
|
"broad_answer_rate",
|
||||||
"mechanism_specificity_score",
|
"mechanism_specificity_score",
|
||||||
"followup_context_retention_score"
|
"followup_context_retention_score",
|
||||||
|
"stage4_contract_compliance_rate"
|
||||||
]);
|
]);
|
||||||
for (const metric of metricNames) {
|
for (const metric of metricNames) {
|
||||||
const bands = ACCOUNTANT_SCORING_RUBRIC_V01[metric as keyof typeof ACCOUNTANT_SCORING_RUBRIC_V01];
|
const bands = ACCOUNTANT_SCORING_RUBRIC_V01[metric as keyof typeof ACCOUNTANT_SCORING_RUBRIC_V01];
|
||||||
|
|
|
||||||
|
|
@ -84,7 +84,8 @@ describe.sequential("assistant Stage 1 eval harness", () => {
|
||||||
"false_confidence_rate",
|
"false_confidence_rate",
|
||||||
"broad_answer_rate",
|
"broad_answer_rate",
|
||||||
"mechanism_specificity_score",
|
"mechanism_specificity_score",
|
||||||
"followup_context_retention_score"
|
"followup_context_retention_score",
|
||||||
|
"stage4_contract_compliance_rate"
|
||||||
]);
|
]);
|
||||||
expect(response.body.report?.rubric_bands?.generic_explanation_rate).toBeTruthy();
|
expect(response.body.report?.rubric_bands?.generic_explanation_rate).toBeTruthy();
|
||||||
expect(response.body.report?.feature_profile_snapshot).toBeTruthy();
|
expect(response.body.report?.feature_profile_snapshot).toBeTruthy();
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue