/*
 * NODEDC_1C/llm_normalizer/backend/src/services/evalService.ts
 *
 * 1836 lines, 74 KiB, TypeScript.
 *
 * Repository viewer note: this file contains Unicode characters that might be
 * confused with other characters (ambiguous Unicode characters). If this is
 * intentional, the warning can be safely ignored.
 */

import fs from "fs";
import path from "path";
import { nanoid } from "nanoid";
import {
EVAL_CASES_DIR,
EVAL_DATASETS_DIR,
FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1,
FEATURE_ASSISTANT_ANSWER_POLICY_V11,
REPORTS_DIR
} from "../config";
import type { AssistantMessageResponsePayload } from "../types/assistant";
import type { EvalTarget, AssistantStage1SuiteCase, AssistantStage1SuiteFile } from "../types/assistantEval";
import type {
EvalRunMode,
NoRouteReason,
NormalizeRequestPayload,
NormalizedQueryV1,
NormalizedQueryV2,
NormalizedQueryV2_0_1,
NormalizedQueryV2_0_2
} from "../types/normalizer";
import {
ACCOUNTANT_SCORING_RUBRIC_V01,
ASSISTANT_EVAL_RECORD_SCHEMA_VERSION,
type AccountantMetricName,
type AccountantMetricRubricBand,
type AssistantEvalMetricVector,
type AssistantEvalNarrowingResult,
type AssistantEvalRecord
} from "../types/stage1Contracts";
import { ApiError } from "../utils/http";
import { AssistantService } from "./assistantService";
import { AssistantSessionStore } from "./assistantSessionStore";
import { NormalizerService } from "./normalizerService";
import { ensureDir, writeJsonFile } from "../utils/files";
/**
 * One labeled eval case as it is read from a JSON case/dataset file
 * (see `parseCaseSetFile` and `EvalService.listCases`).
 * Every label under `expected` is optional.
 */
interface EvalCaseFile {
// Identifier of the case; `EvalService.listCases` sorts by it.
case_id: string;
// The question text sent to the normalizer as `userQuestion`.
raw_question: string;
// Ground-truth labels the run is compared against.
expected: {
intent_class?: string;
route_hint?: string;
// Explicit scope label; when absent, `expectedScopeInScope` derives one from the other labels.
expected_scope_in_scope?: boolean;
clarification_required?: boolean;
expected_route_status?: "routed" | "no_route";
expected_no_route_reason?: NoRouteReason;
expected_execution_readiness?: "executable" | "executable_with_soft_assumptions" | "needs_clarification" | "no_route";
requires?: {
needs_cross_entity_join?: boolean;
needs_causal_chain?: boolean;
};
accounts_mentioned?: string[];
expected_output_shape?: string;
};
}
/**
 * A case as consumed by an eval run. Unlike `EvalCaseFile`, `expected` may be
 * `null`, i.e. the case carries no labels at all.
 */
interface EvalInputCase {
case_id: string;
raw_question: string;
expected: EvalCaseFile["expected"] | null;
}
/** Set of numeric metrics recorded for the reference (baseline) run. */
interface BaselineMetrics {
schema_validation_pass_rate: number;
intent_class_accuracy: number;
route_hint_accuracy: number;
causal_flag_accuracy: number;
high_confidence_error_rate: number;
}
/**
 * Hard-coded baseline metric values.
 * NOTE(review): the values look like percentages (0-100) — confirm against the
 * code that computes the baseline delta, which is outside this view.
 */
const BASELINE_METRICS: BaselineMetrics = {
schema_validation_pass_rate: 100,
intent_class_accuracy: 72.73,
route_hint_accuracy: 90.91,
causal_flag_accuracy: 81.82,
high_confidence_error_rate: 9.09
};
// Fixed case-id sets for the "v1.1.1" and "v1.1.2" micro suites.
// NOTE(review): presumably compared against a requested id list via `isSameCaseSet`; the caller is outside this view — confirm.
const V111_MICRO_CASE_IDS = ["NQ-008", "V11-DD-005", "V11-OT-003", "V11-OT-004", "V11-OT-005"];
const V112_MICRO_CASE_IDS = ["NQ-002", "NQ-007", "V11-HA-004", "V11-OT-003", "V11-OT-005"];
/**
 * Tells whether `input` contains exactly the ids listed in `target`,
 * regardless of order. Neither array is mutated.
 *
 * @param input Candidate id list; a missing list never matches.
 * @param target Reference id list.
 * @returns `true` when both lists have the same length and, once sorted, the same elements.
 */
function isSameCaseSet(input: string[] | undefined, target: string[]): boolean {
  if (!input) {
    return false;
  }
  if (input.length !== target.length) {
    return false;
  }
  // Sort copies so the callers' arrays keep their order.
  const sortedInput = Array.from(input).sort();
  const sortedTarget = Array.from(target).sort();
  for (let position = 0; position < sortedInput.length; position += 1) {
    if (sortedInput[position] !== sortedTarget[position]) {
      return false;
    }
  }
  return true;
}
/**
 * Formats a number with two decimals followed by a percent sign.
 * The value is not scaled: `72.729` becomes `"72.73%"`.
 */
function formatPercent(value: number): string {
  const fixed = value.toFixed(2);
  return fixed + "%";
}
/**
 * Produces a one-sentence explanation of why a case counts as a mismatch.
 *
 * Priority: schema validation failure first, then intent/route disagreements,
 * then causal-flag disagreement; `"No mismatch."` when everything matches.
 */
function shortMismatchComment(input: { intentMatch: boolean; routeMatch: boolean; causalMatch: boolean; validationPassed: boolean }): string {
  const { intentMatch, routeMatch, causalMatch, validationPassed } = input;
  if (!validationPassed) {
    return "Schema validation failed for this case.";
  }
  if (intentMatch && routeMatch) {
    // Intent and route are both right, so only the causal flags can still differ.
    return causalMatch ? "No mismatch." : "Causal flags are inconsistent with expected relationship depth.";
  }
  if (routeMatch) {
    return "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket.";
  }
  if (intentMatch) {
    return "Intent understood, but route_hint selected a weaker execution route.";
  }
  return "Both intent and route misclassified; likely lexical ambiguity in causal vs risk wording.";
}
/**
 * Renders a normalizer eval report object as a Markdown document.
 *
 * Sections: run header, metrics vs baseline table, per-class accuracy table,
 * budget, mismatches and bad-confidence cases. Missing report fields fall
 * back to empty strings, zeros or an "n/a" table row.
 *
 * @param report Loosely typed report object; only the fields read below are used.
 * @returns Markdown text ending with a newline.
 */
function buildMarkdownReport(report: Record<string, unknown>): string {
  const metrics = (report.metrics ?? {}) as Record<string, number>;
  const baseline = (report.baseline_metrics ?? {}) as Record<string, number>;
  const delta = (report.baseline_delta ?? {}) as Record<string, number>;
  const classAccuracy = (report.class_accuracy ?? {}) as Record<string, { total: number; passed: number; accuracy_percent: number }>;
  const budget = (report.budget ?? {}) as Record<string, unknown>;
  const mismatches: unknown[] = Array.isArray(report.mismatches) ? report.mismatches : [];
  const badConfidenceCases: unknown[] = Array.isArray(report.bad_confidence_cases) ? report.bad_confidence_cases : [];

  // One table row per metric: current value, baseline value and signed delta.
  const metricLines: string[] = [];
  for (const name of Object.keys(metrics)) {
    const currentValue = Number(metrics[name] ?? 0);
    const baselineValue = Number(baseline[name] ?? 0);
    const deltaValue = Number(delta[name] ?? 0);
    const prefix = deltaValue > 0 ? "+" : "";
    metricLines.push(
      `| ${name} | ${formatPercent(currentValue)} | ${formatPercent(baselineValue)} | ${prefix}${deltaValue.toFixed(2)} |`
    );
  }
  const metricRows = metricLines.join("\n");

  const classRows = Object.entries(classAccuracy)
    .map(([name, stats]) => `| ${name} | ${stats.passed}/${stats.total} | ${formatPercent(stats.accuracy_percent)} |`)
    .join("\n");

  let mismatchRows = "No mismatches.";
  if (mismatches.length > 0) {
    mismatchRows = mismatches
      .map((entry) => {
        const row = entry as Record<string, unknown>;
        return `- ${row.case_id}: expected(${row.expected_intent_class} / ${row.expected_route_hint}) -> actual(${row.actual_intent_class} / ${row.actual_route_hint}). ${row.comment}`;
      })
      .join("\n");
  }

  let badConfidenceRows = "No bad-confidence cases.";
  if (badConfidenceCases.length > 0) {
    badConfidenceRows = badConfidenceCases
      .map((entry) => {
        const row = entry as Record<string, unknown>;
        return `- ${row.case_id}: confidence=${row.confidence_overall}, intent_match=${row.intent_match}, route_match=${row.route_match}`;
      })
      .join("\n");
  }

  const header = [
    `# ${String(report.report_title ?? "LLM Normalizer Eval Run")}`,
    "",
    `- run_id: ${String(report.run_id ?? "")}`,
    `- timestamp: ${String(report.timestamp ?? "")}`,
    `- mode: ${String(report.mode ?? "")}`,
    `- use_mock: ${String(report.use_mock ?? false)}`,
    `- cases_total: ${String(report.cases_total ?? 0)}`,
    `- prompt_version: ${String(report.prompt_version ?? "")}`,
    ""
  ];
  const metricsSection = [
    "## Metrics vs Baseline",
    "",
    "| Metric | Current | Baseline | Delta |",
    "|---|---:|---:|---:|",
    metricRows || "| n/a | n/a | n/a | n/a |",
    ""
  ];
  const classSection = [
    "## Class Accuracy",
    "",
    "| Intent class | Passed/Total | Accuracy |",
    "|---|---:|---:|",
    classRows || "| n/a | n/a | n/a |",
    ""
  ];
  const budgetSection = [
    "## Budget",
    "",
    `- requests_total: ${String(budget.requests_total ?? 0)}`,
    `- retries_used: ${String(budget.retries_used ?? 0)}`,
    ""
  ];
  const tailSections = ["## Mismatches", "", mismatchRows, "", "## Bad Confidence Cases", "", badConfidenceRows, ""];

  return [...header, ...metricsSection, ...classSection, ...budgetSection, ...tailSections].join("\n");
}
/**
 * Loads an eval dataset file and returns its cases.
 *
 * Relative paths are resolved against `EVAL_DATASETS_DIR`. A leading UTF-8 BOM
 * is stripped before parsing. Two layouts are accepted: a top-level JSON array
 * of cases, or an object with a `cases` array.
 *
 * @throws Error when the parsed JSON matches neither layout (file read and JSON errors propagate as-is).
 */
function parseCaseSetFile(inputPath: string): EvalCaseFile[] {
  let filePath = inputPath;
  if (!path.isAbsolute(inputPath)) {
    filePath = path.resolve(EVAL_DATASETS_DIR, inputPath);
  }
  const content = fs.readFileSync(filePath, "utf-8");
  const parsed: unknown = JSON.parse(content.replace(/^\uFEFF/, ""));
  if (Array.isArray(parsed)) {
    return parsed as EvalCaseFile[];
  }
  if (parsed && typeof parsed === "object") {
    const { cases } = parsed as { cases?: unknown };
    if (Array.isArray(cases)) {
      return cases as EvalCaseFile[];
    }
  }
  throw new Error(`Unsupported eval dataset format: ${filePath}`);
}
/**
 * Builds a case id of the form `<prefix>-NNN` from a zero-based index.
 * The ordinal is `index + 1`, left-padded with zeros to at least three digits.
 */
function formatCaseId(prefix: string, index: number): string {
  const ordinal = String(index + 1).padStart(3, "0");
  return `${prefix}-${ordinal}`;
}
/**
 * Splits a free-form text blob into individual questions.
 *
 * After normalizing CRLF to LF and trimming, the first separator that yields
 * more than one non-empty piece wins, tried in this order: semicolon, blank
 * line, single newline. Pieces are trimmed and empty pieces are dropped.
 *
 * @returns The questions, or an empty array for blank input.
 */
function parseRawQuestions(rawQuestions: string): string[] {
  const text = rawQuestions.replace(/\r\n/g, "\n").trim();
  if (text === "") {
    return [];
  }
  const splitClean = (separator: string | RegExp): string[] =>
    text
      .split(separator)
      .map((piece) => piece.trim())
      .filter(Boolean);

  // Coarser separators first: they only apply when they actually split the text.
  const preferredSeparators: Array<string | RegExp> = [";", /\n\s*\n+/];
  for (const separator of preferredSeparators) {
    const pieces = splitClean(separator);
    if (pieces.length > 1) {
      return pieces;
    }
  }
  const lines = splitClean("\n");
  if (lines.length > 0) {
    return lines;
  }
  return [text];
}
/** A single fragment from any of the v2-family normalized query schemas (v2, v2.0.1, v2.0.2). */
type V2FamilyFragment =
| NormalizedQueryV2["fragments"][number]
| NormalizedQueryV2_0_1["fragments"][number]
| NormalizedQueryV2_0_2["fragments"][number];
/**
 * Reads `execution_readiness` from a fragment.
 * Fragments that do not carry the field at all are treated as `"executable"`.
 */
function executionReadinessOf(fragment: V2FamilyFragment): string {
  if ("execution_readiness" in fragment) {
    return fragment.execution_readiness;
  }
  return "executable";
}
/**
 * Reads `soft_assumption_used` from a fragment.
 * Fragments that do not carry the field yield an empty list.
 */
function softAssumptionsOf(fragment: V2FamilyFragment): string[] {
  if ("soft_assumption_used" in fragment) {
    return fragment.soft_assumption_used;
  }
  return [];
}
/**
 * Reads `route_status` from a fragment.
 * Fragments that do not carry the field yield `null`.
 */
function routeStatusOf(fragment: V2FamilyFragment): "routed" | "no_route" | null {
  if ("route_status" in fragment) {
    return fragment.route_status;
  }
  return null;
}
/**
 * Reads `no_route_reason` from a fragment.
 * Fragments that do not carry the field yield `null`.
 */
function noRouteReasonOf(fragment: V2FamilyFragment): NoRouteReason | null {
  if ("no_route_reason" in fragment) {
    return fragment.no_route_reason;
  }
  return null;
}
/**
 * Derives the expected "message is in scope" label for a case.
 *
 * An explicit `expected_scope_in_scope` wins. Otherwise the label is inferred:
 * an expected `out_of_scope` no-route reason means out of scope, while an
 * expected `routed` status or any boolean `clarification_required` label means
 * in scope.
 *
 * @returns The expected scope, or `null` when the labels say nothing about it.
 */
function expectedScopeInScope(expected: EvalCaseFile["expected"] | null): boolean | null {
  if (!expected) {
    return null;
  }
  const {
    expected_scope_in_scope: explicitScope,
    expected_no_route_reason: noRouteReason,
    expected_route_status: routeStatus,
    clarification_required: clarification
  } = expected;
  if (typeof explicitScope === "boolean") {
    return explicitScope;
  }
  if (noRouteReason === "out_of_scope") {
    return false;
  }
  const impliesInScope = routeStatus === "routed" || typeof clarification === "boolean";
  return impliesInScope ? true : null;
}
/**
 * Checks that a routing decision's fields do not contradict each other.
 *
 * - A `no_route` decision must carry a no-route reason and must not claim an
 *   executable readiness.
 * - Any other route must carry no no-route reason and must not have a
 *   `needs_clarification` / `no_route` readiness.
 */
function isDecisionStateConsistent(decision: {
  route: string;
  execution_readiness?: string | null;
  no_route_reason?: NoRouteReason | null;
}): boolean {
  const readiness = String(decision.execution_readiness ?? "");
  const hasNoRouteReason = Boolean(decision.no_route_reason ?? null);
  const isNoRoute = decision.route === "no_route";

  if (isNoRoute) {
    const claimsExecutable = ["executable", "executable_with_soft_assumptions"].includes(readiness);
    return hasNoRouteReason && !claimsExecutable;
  }
  const claimsBlocked = ["needs_clarification", "no_route"].includes(readiness);
  return !hasNoRouteReason && !claimsBlocked;
}
// Suite file loaded by `parseAssistantSuiteFile` when no path is given.
const DEFAULT_ASSISTANT_STAGE1_SUITE_FILE = "assistant_stage1_canonical_v0_1.json";
// Schema version tags for assistant stage-1 run and comparison reports.
// NOTE(review): the code that stamps them onto reports is outside this view — confirm usage.
const ASSISTANT_STAGE1_RUN_SCHEMA_VERSION = "assistant_stage1_eval_run_v0_1";
const ASSISTANT_STAGE1_COMPARISON_SCHEMA_VERSION = "assistant_stage1_eval_comparison_v0_1";
/** Name of any metric in the assistant eval metric vector. */
type AssistantMetricKey = keyof AssistantEvalMetricVector;
/**
 * Signals extracted from an assistant response for one eval case
 * (this is the return type of `EvalService.collectAssistantSignals`).
 */
interface AssistantCaseSignalSummary {
broad_query_detected: boolean;
broad_result_flag: boolean;
narrowing_strength: "weak" | "medium" | "strong" | null;
minimum_evidence_failed: boolean;
degraded_to: "partial" | "clarification" | null;
evidence_confidence: "high" | "medium" | "low" | null;
limitation_reason_codes: string[];
mechanism_status: "grounded" | "limited" | "unresolved" | null;
source_refs: string[];
routes: string[];
followup_state_applied: boolean;
uncertainty_limitations_count: number;
}
/**
 * Per-case diagnostics record for an assistant stage-1 eval run: the suite
 * case, session/trace identifiers, derived flags and scores, and the
 * extracted signals.
 * NOTE(review): the code that fills this record is outside this view; score
 * ranges and the exact meaning of each flag should be confirmed there.
 */
interface AssistantCaseDiagnostics {
suite_case: AssistantStage1SuiteCase;
session_id: string;
trace_id: string | null;
final_reply_type: string;
turn_count: number;
narrowing_result: AssistantEvalNarrowingResult;
signature: string;
is_generic: boolean;
is_false_confident: boolean;
is_broad_answer: boolean | null;
followup_retention_score: number | null;
evidence_quality_score: number;
mechanism_specificity_score: number;
genericness_score: number;
accountant_actionability_score: number;
accountant_usefulness_score: number;
signals: AssistantCaseSignalSummary;
limitations: string[];
notes: string[];
}
/** Rounds a number to two decimal places (via `toFixed(2)`). */
function round2(value: number): number {
  const fixed = value.toFixed(2);
  return Number(fixed);
}
/**
 * Clamps `value` into the inclusive range `[min, max]` (defaults `0..5`).
 * `NaN` is mapped to `min`.
 */
function clampScore(value: number, min = 0, max = 5): number {
  if (Number.isNaN(value) || value < min) {
    return min;
  }
  return value > max ? max : value;
}
/**
 * Maps a raw metric value onto a rubric band score of 0, 3 or 5.
 *
 * - `retrieval_differentiation_rate`: higher is better (>= 0.75 -> 5, >= 0.45 -> 3).
 * - generic / false-confidence / broad-answer rates: lower is better (<= 0.25 -> 5, <= 0.45 -> 3).
 * - actionability / mechanism specificity / follow-up retention scores: higher is better (>= 4 -> 5, >= 2.5 -> 3).
 *
 * Any other metric always scores 0.
 */
function rateToBandScore(metric: AssistantMetricKey, value: number): 0 | 3 | 5 {
  const band = (reachesTop: boolean, reachesMiddle: boolean): 0 | 3 | 5 => {
    if (reachesTop) return 5;
    return reachesMiddle ? 3 : 0;
  };
  switch (metric) {
    case "retrieval_differentiation_rate":
      return band(value >= 0.75, value >= 0.45);
    case "generic_explanation_rate":
    case "false_confidence_rate":
    case "broad_answer_rate":
      return band(value <= 0.25, value <= 0.45);
    case "accountant_actionability_score":
    case "mechanism_specificity_score":
    case "followup_context_retention_score":
      return band(value >= 4, value >= 2.5);
    default:
      return 0;
  }
}
/**
 * Looks up the rubric band that corresponds to a metric's raw value.
 *
 * @returns The first band in `ACCOUNTANT_SCORING_RUBRIC_V01[metric]` whose
 * `score` equals the band score computed by `rateToBandScore`, or `null`
 * when the value is `null` or no band matches.
 */
function rubricBandForMetric(metric: AssistantMetricKey, value: number | null): AccountantMetricRubricBand | null {
  if (value === null) {
    return null;
  }
  const wantedScore = rateToBandScore(metric, value);
  for (const band of ACCOUNTANT_SCORING_RUBRIC_V01[metric]) {
    if (band.score === wantedScore) {
      return band;
    }
  }
  return null;
}
/**
 * Captures the feature-flag profile the current process runs with.
 *
 * `FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1` comes from config. The answer-policy
 * flag prefers the environment variable and falls back to the stringified
 * config value. The remaining flags are read from the environment only and
 * are `null` when unset.
 */
function buildFeatureProfileSnapshot(): Record<string, unknown> {
  const env = process.env;
  const snapshot: Record<string, unknown> = {};
  snapshot.FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1 = FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1;
  snapshot.FEATURE_ASSISTANT_ANSWER_POLICY_V11 =
    env.FEATURE_ASSISTANT_ANSWER_POLICY_V11 ?? String(FEATURE_ASSISTANT_ANSWER_POLICY_V11);

  // Flags without a config-level fallback; key order is kept stable for serialized output.
  const envOnlyFlags = [
    "FEATURE_ASSISTANT_BROAD_GUARD_V1",
    "FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1",
    "FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1",
    "FEATURE_ASSISTANT_INVESTIGATION_STATE_V1",
    "FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1"
  ];
  for (const flagName of envOnlyFlags) {
    snapshot[flagName] = env[flagName] ?? null;
  }
  return snapshot;
}
/**
 * Describes which code version produced a run, based on environment variables.
 *
 * - `git_commit`: first set variable among `GIT_COMMIT`, `CI_COMMIT_SHA`,
 *   `VERCEL_GIT_COMMIT_SHA`, `GITHUB_SHA`.
 * - `build_marker`: first set variable among `BUILD_MARKER`, `BUILD_ID`,
 *   `npm_package_version`.
 *
 * Either field is `null` when none of its variables is set.
 */
function buildCodeVersionMarker(): Record<string, string | null> {
  const firstSet = (names: string[]): string | null => {
    for (const name of names) {
      const value = process.env[name];
      if (value != null) {
        return value;
      }
    }
    return null;
  };
  return {
    git_commit: firstSet(["GIT_COMMIT", "CI_COMMIT_SHA", "VERCEL_GIT_COMMIT_SHA", "GITHUB_SHA"]),
    build_marker: firstSet(["BUILD_MARKER", "BUILD_ID", "npm_package_version"])
  };
}
/**
 * Resolves a possibly relative path to a file location.
 *
 * Absolute paths are returned unchanged. Relative paths are tried against
 * `REPORTS_DIR`, `EVAL_DATASETS_DIR`, `EVAL_CASES_DIR` and the current working
 * directory, in that order; the first candidate that exists wins. When none
 * exists, the `REPORTS_DIR` candidate is returned.
 */
function resolveReadablePath(inputPath: string): string {
  if (path.isAbsolute(inputPath)) {
    return inputPath;
  }
  const searchRoots = [REPORTS_DIR, EVAL_DATASETS_DIR, EVAL_CASES_DIR];
  const candidates = searchRoots.map((root) => path.resolve(root, inputPath));
  candidates.push(path.resolve(inputPath));
  const existing = candidates.find((candidate) => fs.existsSync(candidate));
  return existing ?? candidates[0];
}
/**
 * Loads and validates an assistant stage-1 suite file.
 *
 * The path is resolved with `resolveReadablePath` (default suite file when
 * omitted) and a leading UTF-8 BOM is stripped. Validation requires:
 * `cases[]`, `case_ids[]`, non-blank `suite_id` and `suite_version`,
 * `scenario_count` equal to the number of cases, `case_ids` matching the
 * cases' ids as a multiset, and at least one turn per case.
 *
 * @throws Error describing the first violated rule.
 */
function parseAssistantSuiteFile(inputPath?: string): AssistantStage1SuiteFile {
  const filePath = resolveReadablePath(inputPath ?? DEFAULT_ASSISTANT_STAGE1_SUITE_FILE);
  const content = fs.readFileSync(filePath, "utf-8").replace(/^\uFEFF/, "");
  const suite = JSON.parse(content) as AssistantStage1SuiteFile;
  const isBlank = (value: unknown): boolean => typeof value !== "string" || value.trim() === "";

  if (!suite || typeof suite !== "object") {
    throw new Error(`Invalid assistant suite format: ${filePath}`);
  }
  if (!Array.isArray(suite.cases)) {
    throw new Error(`Assistant suite cases[] is required: ${filePath}`);
  }
  if (!Array.isArray(suite.case_ids)) {
    throw new Error(`Assistant suite case_ids[] is required: ${filePath}`);
  }
  if (isBlank(suite.suite_id)) {
    throw new Error(`Assistant suite suite_id is required: ${filePath}`);
  }
  if (isBlank(suite.suite_version)) {
    throw new Error(`Assistant suite suite_version is required: ${filePath}`);
  }
  if (suite.scenario_count !== suite.cases.length) {
    throw new Error(`Assistant suite scenario_count mismatch: ${filePath}`);
  }

  // Compare declared ids with the ids actually present, ignoring order.
  const declaredIds = Array.from(suite.case_ids).sort();
  const actualIds = suite.cases.map((suiteCase) => suiteCase.case_id).sort();
  const sameLength = declaredIds.length === actualIds.length;
  const hasDifference = declaredIds.some((id, position) => id !== actualIds[position]);
  if (!sameLength || hasDifference) {
    throw new Error(`Assistant suite case_ids do not match cases[]: ${filePath}`);
  }

  for (const suiteCase of suite.cases) {
    const hasTurns = Array.isArray(suiteCase.turns) && suiteCase.turns.length > 0;
    if (!hasTurns) {
      throw new Error(`Assistant suite case ${suiteCase.case_id} must include at least one turn.`);
    }
  }
  return suite;
}
/**
 * Heuristically decides whether a text is anchored in the accounting domain.
 *
 * Three independent anchor kinds are checked and at least two must be present:
 * 1. a period: a year 20xx, optionally followed by `-`, `.` or `/` and a month 01-12;
 * 2. an accounting object keyword (Russian or English stems such as
 *    "счет", "контрагент", "invoice", "account");
 * 3. a standalone two-digit account code from the fixed list below.
 *
 * The abbreviation "ос" only counts when it stands alone (not surrounded by
 * letters or digits). Matched as a bare substring it occurs inside ordinary
 * Russian words ("вопрос", "поставщик"), which made almost any Russian text
 * look like it mentions an accounting object.
 *
 * @param text Text to inspect; nullish input is treated as empty.
 * @returns `true` when at least two of the three anchor kinds are found.
 */
function hasDomainAnchors(text: string): boolean {
  const source = String(text ?? "");
  if (!source.trim()) {
    return false;
  }
  const hasPeriod = /\b20\d{2}(?:[-./](?:0[1-9]|1[0-2]))?\b/.test(source);
  // Longer stems are intentionally matched as substrings so inflected forms still hit.
  const hasObjectStem = /(счет|контрагент|документ|ндс|period|account|supplier|invoice|guid|объект)/i.test(source);
  // `\b` is ASCII-only in JavaScript, so Unicode-aware lookarounds delimit the Cyrillic abbreviation.
  const hasStandaloneFixedAssets = /(?<![\p{L}\p{N}])ос(?![\p{L}\p{N}])/iu.test(source);
  const hasAccountingObject = hasObjectStem || hasStandaloneFixedAssets;
  const hasAccountCode = /\b(?:01|02|03|04|08|10|19|20|25|26|41|43|44|50|51|52|57|60|62|68|69|70|71|73|76|90|91|94|97)\b/.test(source);
  const hits = [hasPeriod, hasAccountingObject, hasAccountCode].filter(Boolean).length;
  return hits >= 2;
}
/**
 * Extracts the non-empty strings from an arbitrary value.
 *
 * @returns Trimmed string items of `value` when it is an array (non-string and
 * blank items are dropped); an empty array for anything else.
 */
function extractTextList(value: unknown): string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  const texts: string[] = [];
  for (const entry of value) {
    if (typeof entry !== "string") {
      continue;
    }
    const trimmed = entry.trim();
    if (trimmed) {
      texts.push(trimmed);
    }
  }
  return texts;
}
/** Narrows an arbitrary value to a narrowing-strength label, or `null` when it is not one. */
function toNarrowingStrength(value: unknown): "weak" | "medium" | "strong" | null {
  switch (value) {
    case "weak":
      return "weak";
    case "medium":
      return "medium";
    case "strong":
      return "strong";
    default:
      return null;
  }
}
/** Narrows an arbitrary value to a degradation target (`"partial"` / `"clarification"`), or `null`. */
function toDegradedTo(value: unknown): "partial" | "clarification" | null {
  switch (value) {
    case "partial":
      return "partial";
    case "clarification":
      return "clarification";
    default:
      return null;
  }
}
/**
 * Renders an assistant stage-1 eval run report as Markdown.
 *
 * Sections: run header, raw metrics with their rubric bands, subset sizes,
 * scenario summary and improvement hints. Missing values are printed as empty
 * strings, `0` or `n/a`, depending on the field.
 *
 * @param report Loosely typed run report; only the fields read below are used.
 * @returns Markdown text ending with a newline.
 */
function buildAssistantEvalMarkdownReport(report: Record<string, unknown>): string {
  const rawMetrics = ((report.metrics ?? {}) as { raw?: Record<string, number | null> }).raw ?? {};
  const bands = (report.rubric_bands ?? {}) as Record<string, { score?: number; label?: string } | null>;
  const subsets = (report.subsets ?? {}) as Record<string, unknown>;
  const scenarioSummary = (report.scenario_summary ?? {}) as Record<string, unknown>;
  const improvementHints = (report.improvement_hints ?? {}) as Record<string, unknown>;

  // "- label: value" bullet with a fallback for nullish values.
  const bullet = (label: string, value: unknown, fallback: unknown): string => `- ${label}: ${String(value ?? fallback)}`;

  const tableLines: string[] = [];
  for (const [metricName, rawValue] of Object.entries(rawMetrics)) {
    const band = bands[metricName];
    const rawCell = rawValue == null ? "n/a" : String(rawValue);
    const bandCell = band ? `${String(band.score)} (${String(band.label)})` : "n/a";
    tableLines.push(`| ${metricName} | ${rawCell} | ${bandCell} |`);
  }
  const rows = tableLines.join("\n");

  const lines: string[] = [];
  lines.push(`# ${String(report.report_title ?? "Assistant Stage 1 Eval Run")}`, "");
  lines.push(
    bullet("run_id", report.run_id, ""),
    bullet("eval_target", report.eval_target, ""),
    bullet("run_timestamp", report.run_timestamp, ""),
    bullet("suite_id", report.suite_id, ""),
    bullet("suite_version", report.suite_version, ""),
    bullet("cases_total", report.cases_total, 0),
    ""
  );
  lines.push(
    "## Raw Metrics and Rubric Bands",
    "",
    "| Metric | Raw | Rubric band |",
    "|---|---:|---|",
    rows || "| n/a | n/a | n/a |",
    ""
  );
  lines.push(
    "## Subsets",
    "",
    bullet("broad_cases_total", subsets.broad_cases_total, 0),
    bullet("followup_cases_total", subsets.followup_cases_total, 0),
    ""
  );
  lines.push(
    "## Scenario Summary",
    "",
    bullet("improved_or_strong", scenarioSummary.improved_or_strong, 0),
    bullet("unchanged_or_mixed", scenarioSummary.unchanged_or_mixed, 0),
    bullet("weak_or_regressed", scenarioSummary.weak_or_regressed, 0),
    ""
  );
  lines.push(
    "## Improvement Hints",
    "",
    bullet("strongest_signals", improvementHints.strongest_signals, "n/a"),
    bullet("weakest_signals", improvementHints.weakest_signals, "n/a"),
    ""
  );
  return lines.join("\n");
}
/**
 * Renders a baseline-vs-current assistant eval comparison as Markdown.
 *
 * Sections: comparison header, per-metric delta table and scenario notes
 * summary. Nullish table cells are printed as `n/a`; nullish counters as `0`.
 *
 * @param report Loosely typed comparison report; only the fields read below are used.
 * @returns Markdown text ending with a newline.
 */
function buildAssistantComparisonMarkdownReport(report: Record<string, unknown>): string {
  const metricDeltas = (report.metric_deltas ?? {}) as Record<
    string,
    { baseline?: number | null; current?: number | null; delta?: number | null; trend?: string }
  >;
  const notesSummary = (report.scenario_notes_summary ?? {}) as Record<string, unknown>;

  const cell = (value: unknown): string => String(value ?? "n/a");
  const bullet = (label: string, value: unknown, fallback: unknown): string => `- ${label}: ${String(value ?? fallback)}`;

  const tableLines: string[] = [];
  for (const [metricName, entry] of Object.entries(metricDeltas)) {
    tableLines.push(
      `| ${metricName} | ${cell(entry.baseline)} | ${cell(entry.current)} | ${cell(entry.delta)} | ${cell(entry.trend)} |`
    );
  }
  const rows = tableLines.join("\n");

  const lines: string[] = [];
  lines.push(`# ${String(report.report_title ?? "Assistant Stage 1 Baseline vs Current")}`, "");
  lines.push(
    bullet("comparison_id", report.comparison_id, ""),
    bullet("baseline_run_id", report.baseline_run_id, ""),
    bullet("current_run_id", report.current_run_id, ""),
    bullet("suite_version", report.suite_version, ""),
    ""
  );
  lines.push(
    "## Metric Deltas",
    "",
    "| Metric | Baseline | Current | Delta | Trend |",
    "|---|---:|---:|---:|---|",
    rows || "| n/a | n/a | n/a | n/a | n/a |",
    ""
  );
  lines.push(
    "## Scenario Notes Summary",
    "",
    bullet("improved", notesSummary.improved, 0),
    bullet("unchanged", notesSummary.unchanged, 0),
    bullet("weakened", notesSummary.weakened, 0),
    ""
  );
  return lines.join("\n");
}
export class EvalService {
/** @param normalizerService Normalizer whose `normalize` method `runV2` calls once per eval case. */
constructor(private readonly normalizerService: NormalizerService) {}
/**
 * Lists the eval cases stored as individual JSON files in `EVAL_CASES_DIR`.
 *
 * The directory is created when missing. Run reports (`*.report.json`) that
 * live in the same directory are skipped. A leading UTF-8 BOM is stripped
 * before parsing, consistent with the other JSON loaders in this file, so
 * case files saved with a BOM do not make `JSON.parse` throw.
 *
 * @returns All cases sorted by `case_id`.
 */
public listCases(): EvalCaseFile[] {
  ensureDir(EVAL_CASES_DIR);
  const caseFileNames = fs
    .readdirSync(EVAL_CASES_DIR)
    .filter((name) => name.endsWith(".json") && !name.endsWith(".report.json"));
  return caseFileNames
    .map((name) => {
      const raw = fs.readFileSync(path.resolve(EVAL_CASES_DIR, name), "utf-8").replace(/^\uFEFF/, "");
      return JSON.parse(raw) as EvalCaseFile;
    })
    .sort((a, b) => a.case_id.localeCompare(b.case_id));
}
/**
 * Runs the v2-family normalizer eval over `payload.cases` and persists the report.
 *
 * Cases are normalized one at a time. For every case the method records schema
 * validation, scope, multi-intent, clarification and routing signals, compares
 * them with the case's optional `expected` labels and appends a per-case row to
 * `results`. Aggregated metrics are percentages rounded to two decimals; metrics
 * that depend on labels are `null` when no labeled case contributed to them.
 *
 * `false_no_route_rate` counts only cases that were expected to be routed but
 * were predicted as `no_route`. Expected-`no_route` cases with a mismatched
 * reason still count as false positives for `no_route_precision`, but they no
 * longer inflate `false_no_route_rate` (which previously could exceed 100%).
 *
 * The report is written to `<EVAL_CASES_DIR>/<run_id>.report.json`.
 *
 * @param payload Normalizer config, run mode, dataset provenance and the cases to run.
 * @returns The report object that was written to disk.
 */
private async runV2(payload: {
  normalizeConfig: Omit<NormalizeRequestPayload, "userQuestion" | "context">;
  useMock?: boolean;
  mode: EvalRunMode;
  caseSetFile?: string;
  rawQuestions?: string;
  cases: EvalInputCase[];
}): Promise<Record<string, unknown>> {
  const runId = `eval-${nanoid(10)}`;
  const results: Array<Record<string, unknown>> = [];
  const routeCounter: Record<string, number> = {};
  const fallbackCounter: Record<string, number> = {};

  // Message-level and fragment-level counters.
  let schemaPass = 0;
  let inScopeMessages = 0;
  let multiIntentMessages = 0;
  let clarificationMessages = 0;
  let totalFragments = 0;
  let inScopeFragments = 0;
  let outOfScopeFragments = 0;
  let executableWithSoftAssumptionsFragments = 0;
  let softAssumptionFragments = 0;
  let routedFragments = 0;
  let noRouteFragments = 0;

  // Request budget.
  let requestsTotal = 0;
  let retriesUsed = 0;

  // Label-based counters.
  let clarificationLabeledCases = 0;
  let clarificationTruePositive = 0;
  let clarificationFalsePositive = 0;
  let clarificationFalseNegative = 0;
  let scopeLabeledCases = 0;
  let scopeCorrectCases = 0;
  let routeLabeledCases = 0;
  let routeCorrectCases = 0;
  let expectedRoutedCases = 0;
  let noRouteTruePositive = 0;
  let noRouteFalsePositive = 0;
  // Subset of noRouteFalsePositive: predicted no_route although the case was expected to be routed.
  let falseNoRouteOnRoutedCases = 0;
  let stateConsistencyChecks = 0;
  let stateConsistencyPass = 0;

  for (const item of payload.cases) {
    const response = await this.normalizerService.normalize({
      ...payload.normalizeConfig,
      userQuestion: item.raw_question,
      context: {
        eval_label: runId,
        case_id: item.case_id,
        eval_mode: payload.mode
      },
      retryPolicy: payload.mode === "single-pass-strict" ? "single-pass-strict" : "default",
      useMock: payload.useMock
    });
    if (response.validation.passed) {
      schemaPass += 1;
    }

    // A case that needed more than one request counts as one retry.
    const requestCount = Number(response.request_count_for_case ?? 0);
    requestsTotal += requestCount;
    if (requestCount > 1) {
      retriesUsed += 1;
    }

    // Only v2-family payloads are inspected; anything else is treated as "no normalized output".
    const normalized =
      response.normalized &&
      ["normalized_query_v2", "normalized_query_v2_0_1", "normalized_query_v2_0_2"].includes(
        String((response.normalized as { schema_version?: string }).schema_version ?? "")
      )
        ? (response.normalized as NormalizedQueryV2 | NormalizedQueryV2_0_1 | NormalizedQueryV2_0_2)
        : null;
    // Route decisions are only read from the deterministic v2 summary.
    const routeSummary =
      response.route_hint_summary &&
      (response.route_hint_summary as { mode?: string }).mode === "deterministic_v2"
        ? (response.route_hint_summary as {
            decisions: Array<{
              route: string;
              domain_relevance?: string;
              execution_readiness?: string | null;
              route_status?: "routed" | "no_route" | null;
              no_route_reason?: NoRouteReason | null;
            }>;
            fallback?: { type?: string };
          })
        : null;

    // Per-case fragment breakdown, computed once and reused for totals and the result row.
    const fragments: V2FamilyFragment[] = normalized ? (normalized.fragments as V2FamilyFragment[]) : [];
    const inScopeList = fragments.filter((fragment) => fragment.domain_relevance === "in_scope");
    const outOfScopeCount = fragments.filter((fragment) => fragment.domain_relevance === "out_of_scope").length;
    const unclearCount = fragments.filter((fragment) => fragment.domain_relevance === "unclear").length;
    const softExecutableCount = fragments.filter(
      (fragment) => executionReadinessOf(fragment) === "executable_with_soft_assumptions"
    ).length;

    if (normalized) {
      if (normalized.message_in_scope) {
        inScopeMessages += 1;
      }
      if (normalized.contains_multiple_tasks) {
        multiIntentMessages += 1;
      }
      if (normalized.global_notes.needs_clarification) {
        clarificationMessages += 1;
      }
      totalFragments += fragments.length;
      inScopeFragments += inScopeList.length;
      outOfScopeFragments += outOfScopeCount;
      // Soft-assumption aggregates consider in-scope fragments only.
      for (const fragment of inScopeList) {
        if (executionReadinessOf(fragment) === "executable_with_soft_assumptions") {
          executableWithSoftAssumptionsFragments += 1;
        }
        if (softAssumptionsOf(fragment).length > 0) {
          softAssumptionFragments += 1;
        }
      }
    }

    // Clarification: confusion counts over cases that carry a clarification label.
    const predictedClarification = Boolean(normalized?.global_notes?.needs_clarification);
    const expectedClarification =
      typeof item.expected?.clarification_required === "boolean" ? item.expected.clarification_required : null;
    if (expectedClarification !== null) {
      clarificationLabeledCases += 1;
      if (predictedClarification && expectedClarification) clarificationTruePositive += 1;
      if (predictedClarification && !expectedClarification) clarificationFalsePositive += 1;
      if (!predictedClarification && expectedClarification) clarificationFalseNegative += 1;
    }

    // Scope: only scored when both a prediction and an (explicit or derived) label exist.
    const predictedScope = normalized ? normalized.message_in_scope : null;
    const expectedScope = expectedScopeInScope(item.expected);
    if (expectedScope !== null && predictedScope !== null) {
      scopeLabeledCases += 1;
      if (predictedScope === expectedScope) {
        scopeCorrectCases += 1;
      }
    }

    // Routing: a case is "routed" when at least one decision picked a real route.
    const predictedRouteStatus: "routed" | "no_route" | null = routeSummary
      ? routeSummary.decisions.some((decision) => decision.route !== "no_route")
        ? "routed"
        : "no_route"
      : null;
    // The no-route reason is taken from the first decision, and only when every decision is no_route.
    const predictedNoRouteReason: NoRouteReason | null =
      routeSummary &&
      routeSummary.decisions.length > 0 &&
      routeSummary.decisions.every((decision) => decision.route === "no_route")
        ? (routeSummary.decisions[0]?.no_route_reason ?? null)
        : null;
    const expectedRouteStatus = item.expected?.expected_route_status ?? null;
    const expectedNoRouteReason = item.expected?.expected_no_route_reason ?? null;
    if (expectedRouteStatus) {
      routeLabeledCases += 1;
      if (predictedRouteStatus === expectedRouteStatus) {
        routeCorrectCases += 1;
      }
      if (expectedRouteStatus === "routed") {
        expectedRoutedCases += 1;
      }
    }
    if (predictedRouteStatus === "no_route") {
      if (expectedRouteStatus === "no_route") {
        // A labeled reason must match; an unlabeled reason is accepted as-is.
        if (!expectedNoRouteReason || expectedNoRouteReason === predictedNoRouteReason) {
          noRouteTruePositive += 1;
        } else {
          noRouteFalsePositive += 1;
        }
      } else if (expectedRouteStatus === "routed") {
        noRouteFalsePositive += 1;
        falseNoRouteOnRoutedCases += 1;
      }
    }

    if (routeSummary) {
      for (const decision of routeSummary.decisions) {
        stateConsistencyChecks += 1;
        if (isDecisionStateConsistent(decision)) {
          stateConsistencyPass += 1;
        }
        routeCounter[decision.route] = (routeCounter[decision.route] ?? 0) + 1;
        if (decision.route === "no_route") {
          noRouteFragments += 1;
        } else {
          routedFragments += 1;
        }
      }
      const fallbackType = String(routeSummary.fallback?.type ?? "none");
      fallbackCounter[fallbackType] = (fallbackCounter[fallbackType] ?? 0) + 1;
    } else {
      fallbackCounter.none = (fallbackCounter.none ?? 0) + 1;
    }

    results.push({
      case_id: item.case_id,
      raw_question: item.raw_question,
      validation_passed: response.validation.passed,
      message_in_scope: normalized?.message_in_scope ?? null,
      scope_confidence: normalized?.scope_confidence ?? null,
      contains_multiple_tasks: normalized?.contains_multiple_tasks ?? null,
      fragments_total: fragments.length,
      in_scope_fragments: inScopeList.length,
      out_of_scope_fragments: outOfScopeCount,
      unclear_fragments: unclearCount,
      fallback_type: routeSummary?.fallback?.type ?? "none",
      predicted_route_status: predictedRouteStatus,
      expected_route_status: expectedRouteStatus,
      predicted_no_route_reason: predictedNoRouteReason,
      expected_no_route_reason: expectedNoRouteReason,
      predicted_clarification_required: predictedClarification,
      expected_clarification_required: expectedClarification,
      executable_with_soft_assumptions_fragments: softExecutableCount,
      trace_id: response.trace_id,
      request_count_for_case: requestCount
    });
  }

  // Percentage with two decimals; callers guarantee a non-zero denominator.
  const percent = (numerator: number, denominator: number): number =>
    Number(((numerator / denominator) * 100).toFixed(2));
  // Same, but `null` when there is nothing to divide by (no labeled cases).
  const percentOrNull = (numerator: number, denominator: number): number | null =>
    denominator > 0 ? percent(numerator, denominator) : null;

  // Unlabeled rates use a floor of 1 so empty runs report 0 instead of NaN.
  const total = Math.max(1, payload.cases.length);
  const totalFragmentsSafe = Math.max(1, totalFragments);
  const totalRoutedDecisions = Math.max(1, routedFragments + noRouteFragments);
  const inScopeFragmentsSafe = Math.max(1, inScopeFragments);
  const precisionDenominator = clarificationTruePositive + clarificationFalsePositive;
  const recallDenominator = clarificationTruePositive + clarificationFalseNegative;
  const noRoutePrecisionDenominator = noRouteTruePositive + noRouteFalsePositive;

  const metrics = {
    schema_validation_pass_rate: percent(schemaPass, total),
    scope_detection_accuracy: percentOrNull(scopeCorrectCases, scopeLabeledCases),
    scope_in_scope_rate: percent(inScopeMessages, total),
    multi_intent_detected_rate: percent(multiIntentMessages, total),
    clarification_required_rate: percent(clarificationMessages, total),
    avg_fragments_per_message: Number((totalFragments / total).toFixed(2)),
    out_of_scope_fragment_rate: percent(outOfScopeFragments, totalFragmentsSafe),
    routed_fragment_rate: percent(routedFragments, totalRoutedDecisions),
    no_route_fragment_rate: percent(noRouteFragments, totalRoutedDecisions),
    route_resolution_accuracy: percentOrNull(routeCorrectCases, routeLabeledCases),
    no_route_precision: percentOrNull(noRouteTruePositive, noRoutePrecisionDenominator),
    false_no_route_rate: percentOrNull(falseNoRouteOnRoutedCases, expectedRoutedCases),
    execution_state_consistency_rate: percentOrNull(stateConsistencyPass, stateConsistencyChecks),
    executable_with_soft_assumptions_rate: percent(executableWithSoftAssumptionsFragments, inScopeFragmentsSafe),
    soft_assumption_used_fragment_rate: percent(softAssumptionFragments, inScopeFragmentsSafe),
    clarification_precision: percentOrNull(clarificationTruePositive, precisionDenominator),
    clarification_recall: percentOrNull(clarificationTruePositive, recallDenominator),
    false_clarification_rate: percentOrNull(clarificationFalsePositive, clarificationLabeledCases)
  };

  // The schema family is inferred from the configured schema (or prompt) version string.
  const versionSource = String(
    payload.normalizeConfig.schemaVersion ?? payload.normalizeConfig.promptVersion ?? ""
  ).toLowerCase();
  const schemaVersion = versionSource.includes("v2_0_2") ? "v2_0_2" : versionSource.includes("v2_0_1") ? "v2_0_1" : "v2";

  const report = {
    run_id: runId,
    timestamp: new Date().toISOString(),
    mode: payload.mode,
    use_mock: Boolean(payload.useMock),
    prompt_version: payload.normalizeConfig.promptVersion ?? null,
    schema_version: schemaVersion,
    dataset: {
      source: payload.rawQuestions ? "inline_raw_questions" : payload.caseSetFile ? "file" : "data/eval_cases/*.json",
      file: payload.caseSetFile ?? null,
      raw_questions_count: payload.rawQuestions ? parseRawQuestions(payload.rawQuestions).length : null
    },
    cases_total: payload.cases.length,
    metrics,
    budget: {
      requests_total: requestsTotal,
      retries_used: retriesUsed
    },
    clarification_eval: {
      labeled_cases: clarificationLabeledCases,
      true_positive: clarificationTruePositive,
      false_positive: clarificationFalsePositive,
      false_negative: clarificationFalseNegative
    },
    route_eval: {
      labeled_cases: routeLabeledCases,
      correct_cases: routeCorrectCases,
      expected_routed_cases: expectedRoutedCases,
      no_route_true_positive: noRouteTruePositive,
      no_route_false_positive: noRouteFalsePositive
    },
    scope_eval: {
      labeled_cases: scopeLabeledCases,
      correct_cases: scopeCorrectCases
    },
    execution_state_eval: {
      checks_total: stateConsistencyChecks,
      checks_passed: stateConsistencyPass
    },
    route_distribution: routeCounter,
    fallback_distribution: fallbackCounter,
    results
  };
  ensureDir(EVAL_CASES_DIR);
  writeJsonFile(path.resolve(EVAL_CASES_DIR, `${runId}.report.json`), report);
  return report;
}
/**
 * Reduces the debug payload of an assistant exchange to the flat signal summary used for scoring.
 *
 * Retrieval-derived signals are read from `finalResponse.debug` only; the follow-up flag is
 * evaluated over every entry of `turnResponses`.
 *
 * @param finalResponse - Response whose debug payload is inspected for retrieval and answer-structure signals.
 * @param turnResponses - Responses checked for `followup_state_usage.applied`.
 * @returns The aggregated signal summary for one case.
 */
private collectAssistantSignals(finalResponse: AssistantMessageResponsePayload, turnResponses: AssistantMessageResponsePayload[]): AssistantCaseSignalSummary {
  // Numeric weight of a confidence label; any other value contributes nothing to the average.
  const confidenceWeight = (level: unknown): number | null => {
    switch (level) {
      case "high":
        return 3;
      case "medium":
        return 2;
      case "low":
        return 1;
      default:
        return null;
    }
  };
  const strengthRank: Record<"weak" | "medium" | "strong", number> = { weak: 0, medium: 1, strong: 2 };

  const debugInfo = finalResponse.debug;
  const retrievals = Array.isArray(debugInfo?.retrieval_results) ? debugInfo.retrieval_results : [];

  const canonicalRefs = new Set<string>();
  const reasonCodes = new Set<string>();
  const routes = new Set<string>();
  let confidenceSum = 0;
  let confidenceCount = 0;
  let sawBroadQuery = false;
  let sawBroadResult = false;
  let evidenceFloorMissed = false;
  let degradation: "partial" | "clarification" | null = null;
  let weakestNarrowing: "weak" | "medium" | "strong" | null = null;

  for (const retrieval of retrievals) {
    routes.add(String(retrieval.route ?? "unknown"));

    const summary = retrieval.summary ?? {};
    if (summary.broad_query_detected === true) sawBroadQuery = true;
    if (summary.broad_result_flag === true) sawBroadResult = true;
    if (summary.minimum_evidence_failed === true) evidenceFloorMissed = true;

    // "clarification" always wins; "partial" is only recorded while nothing has been recorded yet.
    switch (toDegradedTo(summary.degraded_to)) {
      case "clarification":
        degradation = "clarification";
        break;
      case "partial":
        if (!degradation) degradation = "partial";
        break;
      default:
        break;
    }

    // Keep the lowest-ranked narrowing strength seen across all retrieval results.
    const strength = toNarrowingStrength(summary.narrowing_strength);
    if (strength) {
      const isWeaker = !weakestNarrowing || strengthRank[strength] < strengthRank[weakestNarrowing];
      if (isWeaker) {
        weakestNarrowing = strength;
      }
    }

    const retrievalWeight = confidenceWeight(retrieval.confidence);
    if (retrievalWeight !== null) {
      confidenceSum += retrievalWeight;
      confidenceCount += 1;
    }

    const evidenceItems = Array.isArray(retrieval.evidence) ? retrieval.evidence : [];
    for (const item of evidenceItems) {
      const ref = String(item.source_ref?.canonical_ref ?? "").trim();
      if (ref) {
        canonicalRefs.add(ref);
      }
      const code = String(item.limitation?.reason_code ?? "").trim();
      if (code) {
        reasonCodes.add(code);
      }
      const itemWeight = confidenceWeight(item.confidence);
      if (itemWeight !== null) {
        confidenceSum += itemWeight;
        confidenceCount += 1;
      }
    }
  }

  // Map the mean weight (retrieval-level and evidence-level labels pooled together) back to a label.
  let evidenceConfidence: "high" | "medium" | "low" | null = null;
  if (confidenceCount > 0) {
    const meanWeight = confidenceSum / confidenceCount;
    if (meanWeight >= 2.6) {
      evidenceConfidence = "high";
    } else if (meanWeight >= 1.8) {
      evidenceConfidence = "medium";
    } else {
      evidenceConfidence = "low";
    }
  }

  // Only the three known mechanism statuses are passed through; anything else becomes null.
  const statusCandidate = debugInfo?.answer_structure_v11?.mechanism_block?.status;
  const mechanismStatus =
    statusCandidate === "grounded" || statusCandidate === "limited" || statusCandidate === "unresolved"
      ? statusCandidate
      : null;

  const followupApplied = turnResponses.some((turn) => turn.debug?.followup_state_usage?.applied === true);
  const limitationsListed = debugInfo?.answer_structure_v11?.uncertainty_block?.limitations?.length ?? 0;

  return {
    broad_query_detected: sawBroadQuery,
    broad_result_flag: sawBroadResult,
    narrowing_strength: weakestNarrowing,
    minimum_evidence_failed: evidenceFloorMissed,
    degraded_to: degradation,
    evidence_confidence: evidenceConfidence,
    limitation_reason_codes: Array.from(reasonCodes),
    mechanism_status: mechanismStatus,
    source_refs: Array.from(canonicalRefs),
    routes: Array.from(routes),
    followup_state_applied: followupApplied,
    uncertainty_limitations_count: limitationsListed
  };
}
/**
 * Rolls per-case diagnostics up into run-level metric values, their rubric bands,
 * the denominators behind them and the per-signature case counts.
 *
 * Rate metrics divide by at least 1, so an empty run yields 0 rather than NaN.
 * Averages and the broad-answer rate are `null` when their own subset is empty.
 *
 * @param input - Wrapper holding the diagnostics of every evaluated case.
 * @returns Raw metric vector, rubric bands, denominators and signature counts.
 */
private computeAssistantMetrics(input: {
  diagnostics: AssistantCaseDiagnostics[];
}): {
  raw: AssistantEvalMetricVector;
  rubric_bands: Record<AssistantMetricKey, AccountantMetricRubricBand | null>;
  denominators: Record<string, number>;
  signature_counts: Record<string, number>;
} {
  const { diagnostics } = input;
  const caseCount = diagnostics.length;
  const rateDenominator = Math.max(1, caseCount);

  const signatureCounts: Record<string, number> = {};
  let genericCount = 0;
  let falseConfidentCount = 0;
  let broadCaseCount = 0;
  let broadAnswerCount = 0;
  let followupCaseCount = 0;
  let followupSum = 0;
  let actionabilitySum = 0;
  let mechanismSum = 0;

  // Single pass over the cases; every tally below is independent of the others.
  for (const entry of diagnostics) {
    signatureCounts[entry.signature] = (signatureCounts[entry.signature] ?? 0) + 1;
    if (entry.is_generic) genericCount += 1;
    if (entry.is_false_confident) falseConfidentCount += 1;
    // A null flag means the case is not part of the broad subset at all.
    if (entry.is_broad_answer !== null) {
      broadCaseCount += 1;
      if (entry.is_broad_answer === true) broadAnswerCount += 1;
    }
    // Likewise, a null retention score excludes the case from the follow-up subset.
    if (entry.followup_retention_score !== null) {
      followupCaseCount += 1;
      followupSum += Number(entry.followup_retention_score ?? 0);
    }
    actionabilitySum += entry.accountant_actionability_score;
    mechanismSum += entry.mechanism_specificity_score;
  }

  const distinctSignatures = Object.keys(signatureCounts).length;
  const meanActionability = caseCount > 0 ? actionabilitySum / caseCount : null;
  const meanMechanism = caseCount > 0 ? mechanismSum / caseCount : null;
  const meanFollowup = followupCaseCount > 0 ? followupSum / followupCaseCount : null;

  const raw: AssistantEvalMetricVector = {
    retrieval_differentiation_rate: round2(distinctSignatures / rateDenominator),
    generic_explanation_rate: round2(genericCount / rateDenominator),
    accountant_actionability_score: meanActionability === null ? null : round2(meanActionability),
    false_confidence_rate: round2(falseConfidentCount / rateDenominator),
    broad_answer_rate: broadCaseCount > 0 ? round2(broadAnswerCount / broadCaseCount) : null,
    mechanism_specificity_score: meanMechanism === null ? null : round2(meanMechanism),
    followup_context_retention_score: meanFollowup === null ? null : round2(meanFollowup)
  };

  const rubric_bands: Record<AssistantMetricKey, AccountantMetricRubricBand | null> = {
    retrieval_differentiation_rate: rubricBandForMetric("retrieval_differentiation_rate", raw.retrieval_differentiation_rate),
    generic_explanation_rate: rubricBandForMetric("generic_explanation_rate", raw.generic_explanation_rate),
    accountant_actionability_score: rubricBandForMetric("accountant_actionability_score", raw.accountant_actionability_score),
    false_confidence_rate: rubricBandForMetric("false_confidence_rate", raw.false_confidence_rate),
    broad_answer_rate: rubricBandForMetric("broad_answer_rate", raw.broad_answer_rate),
    mechanism_specificity_score: rubricBandForMetric("mechanism_specificity_score", raw.mechanism_specificity_score),
    followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score)
  };

  const denominators: Record<string, number> = {
    cases_total: caseCount,
    broad_cases_total: broadCaseCount,
    followup_cases_total: followupCaseCount
  };

  return {
    raw,
    rubric_bands,
    denominators,
    signature_counts: signatureCounts
  };
}
/**
 * Compares the current Assistant Stage 1 run report with a previously saved baseline report
 * and writes the comparison as JSON and Markdown into `REPORTS_DIR`.
 *
 * Per metric, the trend is "improved"/"weakened" only when the values differ by more than 0.01
 * in the favourable/unfavourable direction (lower is better for the generic, false-confidence
 * and broad-answer rates); otherwise "unchanged", or "n/a" when either side has no numeric value.
 * Per case, usefulness scores are compared for case ids present in both reports, with a
 * +/-0.25 band counted as unchanged.
 *
 * @param input.currentReport - Report object of the run that has just finished.
 * @param input.baselineReportFile - Path of the baseline report, resolved via `resolveReadablePath`.
 * @returns The comparison report plus an `artifacts` entry with the written file paths.
 * @throws ApiError (400) when the baseline file is not valid JSON or its root is not a JSON object.
 */
private buildAssistantComparisonReport(input: {
  currentReport: Record<string, unknown>;
  baselineReportFile: string;
}): Record<string, unknown> {
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);
  const numberOrNull = (value: unknown): number | null => (typeof value === "number" ? value : null);
  // Reads `report.metrics.raw`, falling back to an empty map when either level is missing or malformed.
  const rawMetricsOf = (report: Record<string, unknown>): Record<string, unknown> => {
    const metrics = report.metrics;
    return isRecord(metrics) && isRecord(metrics.raw) ? metrics.raw : {};
  };
  // Keeps only object rows so that a stray null/primitive entry cannot crash the comparison.
  const resultRowsOf = (report: Record<string, unknown>): Record<string, unknown>[] =>
    Array.isArray(report.results) ? report.results.filter(isRecord) : [];

  const baselinePath = resolveReadablePath(input.baselineReportFile);
  const baselineText = fs.readFileSync(baselinePath, "utf-8");
  // JSON.parse may legally return null, an array or a primitive, so the root is validated
  // before any property access instead of being cast blindly.
  let parsedBaseline: unknown;
  try {
    parsedBaseline = JSON.parse(baselineText);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new ApiError(
      "ASSISTANT_STAGE1_BASELINE_INVALID",
      `Baseline report is not valid JSON (${baselinePath}): ${reason}`,
      400
    );
  }
  if (!isRecord(parsedBaseline)) {
    throw new ApiError(
      "ASSISTANT_STAGE1_BASELINE_INVALID",
      `Baseline report must be a JSON object (${baselinePath}).`,
      400
    );
  }
  const baselineReport = parsedBaseline;
  const currentReport = input.currentReport;

  const metricKeys: AssistantMetricKey[] = [
    "retrieval_differentiation_rate",
    "generic_explanation_rate",
    "accountant_actionability_score",
    "false_confidence_rate",
    "broad_answer_rate",
    "mechanism_specificity_score",
    "followup_context_retention_score"
  ];
  const lowerIsBetter = new Set<AssistantMetricKey>(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]);
  const baselineRaw = rawMetricsOf(baselineReport);
  const currentRaw = rawMetricsOf(currentReport);
  const deltas: Record<string, { baseline: number | null; current: number | null; delta: number | null; trend: string }> = {};
  for (const metric of metricKeys) {
    const baseline = numberOrNull(baselineRaw[metric]);
    const current = numberOrNull(currentRaw[metric]);
    const delta = baseline !== null && current !== null ? round2(current - baseline) : null;
    let trend = "n/a";
    if (baseline !== null && current !== null) {
      const improved = lowerIsBetter.has(metric) ? current < baseline - 0.01 : current > baseline + 0.01;
      const weakened = lowerIsBetter.has(metric) ? current > baseline + 0.01 : current < baseline - 0.01;
      trend = improved ? "improved" : weakened ? "weakened" : "unchanged";
    }
    deltas[metric] = { baseline, current, delta, trend };
  }

  const baselineResults = resultRowsOf(baselineReport);
  const currentResults = resultRowsOf(currentReport);
  const baselineByCase = new Map<string, Record<string, unknown>>();
  for (const row of baselineResults) {
    baselineByCase.set(String(row.case_id ?? ""), row);
  }
  const improvedNotes: string[] = [];
  const unchangedNotes: string[] = [];
  const weakenedNotes: string[] = [];
  for (const row of currentResults) {
    const caseId = String(row.case_id ?? "");
    const currentUsefulness = numberOrNull(row.accountant_usefulness_score);
    const baselineRow = baselineByCase.get(caseId);
    const baselineUsefulness = baselineRow ? numberOrNull(baselineRow.accountant_usefulness_score) : null;
    // Cases missing from either report, or lacking a numeric score, are left out of the notes.
    if (baselineUsefulness === null || currentUsefulness === null) {
      continue;
    }
    const delta = round2(currentUsefulness - baselineUsefulness);
    const note = `${caseId}: usefulness ${baselineUsefulness} -> ${currentUsefulness} (delta ${delta})`;
    if (delta > 0.25) {
      improvedNotes.push(note);
    } else if (delta < -0.25) {
      weakenedNotes.push(note);
    } else {
      unchangedNotes.push(note);
    }
  }

  const comparisonId = `assistant-compare-${nanoid(8)}`;
  const comparisonReport = {
    schema_version: ASSISTANT_STAGE1_COMPARISON_SCHEMA_VERSION,
    comparison_id: comparisonId,
    run_timestamp: new Date().toISOString(),
    baseline_run_id: baselineReport.run_id ?? null,
    current_run_id: currentReport.run_id ?? null,
    eval_target: "assistant_stage1",
    suite_id: currentReport.suite_id ?? baselineReport.suite_id ?? null,
    suite_version: currentReport.suite_version ?? baselineReport.suite_version ?? null,
    baseline_report_file: baselinePath,
    current_report_file: currentReport.artifacts && typeof currentReport.artifacts === "object"
      ? (currentReport.artifacts as { run_report_json_path?: string }).run_report_json_path ?? null
      : null,
    metric_deltas: deltas,
    scenario_notes_summary: {
      improved: improvedNotes.length,
      unchanged: unchangedNotes.length,
      weakened: weakenedNotes.length
    },
    scenario_notes: {
      improved: improvedNotes,
      unchanged: unchangedNotes,
      weakened: weakenedNotes
    },
    known_limitations: currentReport.known_limitations ?? [
      "Comparison is run-to-run and depends on stable mock/runtime flags.",
      "Metrics remain Stage 1 heuristic bands, not full product scorecards."
    ],
    report_title: "Assistant Stage 1 Baseline vs Current"
  };

  ensureDir(REPORTS_DIR);
  const jsonPath = path.resolve(REPORTS_DIR, `${comparisonId}.json`);
  const mdPath = path.resolve(REPORTS_DIR, `${comparisonId}.md`);
  writeJsonFile(jsonPath, comparisonReport);
  fs.writeFileSync(mdPath, buildAssistantComparisonMarkdownReport(comparisonReport), "utf-8");
  // The artifact paths are added to the returned object only; the files above are written without them.
  return {
    ...comparisonReport,
    artifacts: {
      comparison_report_json_path: jsonPath,
      comparison_report_md_path: mdPath
    }
  };
}
/**
 * Runs the Assistant Stage 1 evaluation suite and produces the run report.
 *
 * Each selected suite case is executed turn by turn against a fresh `AssistantService`
 * (one service instance per run, one session id per case). The final response of a case is
 * scored heuristically (genericness, actionability, evidence quality, mechanism specificity,
 * usefulness, false confidence, broad answer, follow-up retention). A case whose execution
 * throws, or that has no turns at all, is recorded as a `backend_error` diagnostic with zero
 * scores and the run continues with the next case.
 *
 * The report is written as JSON and Markdown into `REPORTS_DIR`; the `artifacts` field and the
 * optional `comparison` field are attached to the returned object after those files are written.
 *
 * @param payload.normalizeConfig - Model/prompt settings forwarded to every assistant call.
 * @param payload.caseIds - Optional allow-list of case ids; when omitted, all suite cases run.
 * @param payload.useMock - Forwarded to the assistant service and echoed in the report.
 * @param payload.mode - Eval run mode, echoed in the report.
 * @param payload.caseSetFile - Suite file passed to `parseAssistantSuiteFile`.
 * @param payload.compareWithReportFile - Optional baseline report to compare this run against.
 * @returns The run report, including artifact paths and, if requested, the comparison report.
 * @throws ApiError (409) when `FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1` is disabled.
 */
private async runAssistantStage1(payload: {
  normalizeConfig: Omit<NormalizeRequestPayload, "userQuestion" | "context">;
  caseIds?: string[];
  useMock?: boolean;
  mode: EvalRunMode;
  caseSetFile?: string;
  compareWithReportFile?: string;
}): Promise<Record<string, unknown>> {
  if (!FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1) {
    throw new ApiError(
      "ASSISTANT_STAGE1_EVAL_DISABLED",
      "Assistant Stage 1 eval target is disabled by FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1.",
      409
    );
  }
  const suite = parseAssistantSuiteFile(payload.caseSetFile);
  const suiteCases = suite.cases.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id));
  const runId = `assistant-stage1-${nanoid(10)}`;
  const assistantService = new AssistantService(this.normalizerService, new AssistantSessionStore());
  const diagnostics: AssistantCaseDiagnostics[] = [];
  let requestsTotal = 0;
  for (const suiteCase of suiteCases) {
    const sessionId = `${runId}-${suiteCase.case_id}`;
    const turnResponses: AssistantMessageResponsePayload[] = [];
    const notes: string[] = [];
    const limitations: string[] = [];
    try {
      // Turns run sequentially within one session so later turns can build on earlier ones.
      for (const turn of suiteCase.turns) {
        const response = await assistantService.handleMessage({
          session_id: sessionId,
          user_message: turn.user_message,
          message: turn.user_message,
          mode: "assistant",
          apiKey: payload.normalizeConfig.apiKey,
          model: payload.normalizeConfig.model,
          baseUrl: payload.normalizeConfig.baseUrl,
          temperature: payload.normalizeConfig.temperature,
          maxOutputTokens: payload.normalizeConfig.maxOutputTokens,
          promptVersion: payload.normalizeConfig.promptVersion,
          systemPrompt: payload.normalizeConfig.systemPrompt,
          developerPrompt: payload.normalizeConfig.developerPrompt,
          domainPrompt: payload.normalizeConfig.domainPrompt,
          fewShotExamples: payload.normalizeConfig.fewShotExamples,
          useMock: payload.useMock
        });
        turnResponses.push(response);
        requestsTotal += 1;
      }
      // A case without turns yields no final response to score. Raising here routes it through
      // the failure diagnostic below instead of crashing the whole run on an undefined response.
      if (turnResponses.length === 0) {
        throw new Error(`Suite case ${suiteCase.case_id} has no turns to execute.`);
      }
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error);
      diagnostics.push({
        suite_case: suiteCase,
        session_id: sessionId,
        trace_id: null,
        final_reply_type: "backend_error",
        // Number of turns that completed before the failure.
        turn_count: turnResponses.length,
        narrowing_result: "failed",
        signature: `backend_error|${suiteCase.scenario_tag}`,
        is_generic: true,
        is_false_confident: false,
        is_broad_answer: suiteCase.broadness_level === "low" ? null : false,
        followup_retention_score: suiteCase.question_type === "followup" || suiteCase.turns.length > 1 ? 0 : null,
        evidence_quality_score: 0,
        mechanism_specificity_score: 0,
        genericness_score: 5,
        accountant_actionability_score: 0,
        accountant_usefulness_score: 0,
        signals: {
          broad_query_detected: suiteCase.broadness_level !== "low",
          broad_result_flag: false,
          narrowing_strength: null,
          minimum_evidence_failed: true,
          degraded_to: "clarification",
          evidence_confidence: "low",
          limitation_reason_codes: [],
          mechanism_status: null,
          source_refs: [],
          routes: [],
          followup_state_applied: false,
          uncertainty_limitations_count: 0
        },
        limitations: [errorMessage],
        notes: [`Case execution failed: ${errorMessage}`]
      });
      continue;
    }

    const finalResponse = turnResponses[turnResponses.length - 1];
    const signals = this.collectAssistantSignals(finalResponse, turnResponses);
    const structure = finalResponse.debug?.answer_structure_v11 ?? null;
    const recommendedActions = extractTextList(structure?.next_step_block?.recommended_actions);
    const clarificationQuestions = extractTextList(structure?.next_step_block?.clarification_questions);
    const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes);
    const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations);
    const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? "");
    const hasAnchors = hasDomainAnchors(
      [directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" ")
    );

    // Genericness: higher is worse. Penalties for missing anchors, mechanism notes, sources and actions.
    let genericnessScore = 0;
    if (!hasAnchors) genericnessScore += 2;
    if (mechanismNotes.length === 0) genericnessScore += 1;
    if (signals.source_refs.length === 0) genericnessScore += 1;
    if (recommendedActions.length === 0) genericnessScore += 1;
    genericnessScore = clampScore(genericnessScore);

    // Actionability: rewards recommended actions, anchored actions, justified clarification and sources.
    let actionabilityScore = 0;
    if (recommendedActions.length > 0) actionabilityScore += 2;
    if (recommendedActions.some((item) => hasDomainAnchors(item))) actionabilityScore += 2;
    if (clarificationQuestions.length > 0 && (finalResponse.reply_type === "clarification_required" || signals.degraded_to === "clarification")) {
      actionabilityScore += 1;
    }
    if (signals.source_refs.length > 0 && actionabilityScore < 5) {
      actionabilityScore += 1;
    }
    actionabilityScore = clampScore(actionabilityScore);

    // Evidence quality: source count and confidence add points; evidence failures and limitation codes subtract.
    let evidenceQualityScore = 0;
    if (signals.source_refs.length >= 3) evidenceQualityScore += 2;
    else if (signals.source_refs.length > 0) evidenceQualityScore += 1;
    if (signals.evidence_confidence === "high") evidenceQualityScore += 2;
    if (signals.evidence_confidence === "medium") evidenceQualityScore += 1;
    if (signals.minimum_evidence_failed) evidenceQualityScore -= 2;
    if (signals.limitation_reason_codes.includes("insufficient_detail")) evidenceQualityScore -= 1;
    if (signals.limitation_reason_codes.includes("missing_mechanism")) evidenceQualityScore -= 1;
    evidenceQualityScore = clampScore(evidenceQualityScore);

    // Mechanism specificity: a fixed score per mechanism status, and 0 without any mechanism notes.
    let mechanismSpecificityScore = 0;
    if (signals.mechanism_status === "grounded" && mechanismNotes.length > 0 && !signals.limitation_reason_codes.includes("missing_mechanism")) {
      mechanismSpecificityScore = 5;
    } else if (signals.mechanism_status === "limited" && mechanismNotes.length > 0) {
      mechanismSpecificityScore = 3;
    } else if (mechanismNotes.length > 0) {
      mechanismSpecificityScore = 2;
    } else {
      mechanismSpecificityScore = 0;
    }

    // Usefulness is the mean of the four sub-scores, with genericness inverted so higher is better.
    const usefulnessScore = clampScore((actionabilityScore + (5 - genericnessScore) + evidenceQualityScore + mechanismSpecificityScore) / 4);
    const isGeneric = genericnessScore >= 3;
    const factualReply = finalResponse.reply_type === "factual" || finalResponse.reply_type === "factual_with_explanation";
    // A factual reply counts as falsely confident when the evidence signals contradict it.
    const isFalseConfident =
      factualReply &&
      (signals.minimum_evidence_failed ||
        signals.degraded_to !== null ||
        signals.evidence_confidence === "low" ||
        (signals.limitation_reason_codes.length > 0 && signals.uncertainty_limitations_count === 0));
    const isBroadCase = suiteCase.broadness_level !== "low" || signals.broad_query_detected;
    // null marks a case that is outside the broad subset and is excluded from the broad-answer rate.
    const isBroadAnswer = isBroadCase
      ? factualReply && signals.degraded_to === null && !signals.minimum_evidence_failed
      : null;

    const isFollowupCase = suiteCase.question_type === "followup" || suiteCase.turns.length > 1;
    let followupRetentionScore: number | null = null;
    if (isFollowupCase) {
      const finalTurnIndex = Number(finalResponse.debug?.investigation_state_snapshot?.turn_index ?? 0);
      if (signals.followup_state_applied && finalTurnIndex >= suiteCase.turns.length) {
        followupRetentionScore = 5;
      } else if (finalTurnIndex >= suiteCase.turns.length) {
        followupRetentionScore = 3;
      } else {
        followupRetentionScore = 0;
      }
    }

    let narrowingResult: AssistantEvalNarrowingResult = "not_required";
    if (signals.degraded_to === "clarification" || finalResponse.reply_type === "clarification_required") {
      narrowingResult = "clarification_requested";
    } else if (signals.broad_query_detected || signals.broad_result_flag) {
      narrowingResult = signals.minimum_evidence_failed ? "failed" : "applied";
    }

    if (signals.minimum_evidence_failed) {
      limitations.push("minimum_evidence_failed");
    }
    limitations.push(...signals.limitation_reason_codes.map((item) => `limitation_reason:${item}`));
    if (signals.mechanism_status === "unresolved") {
      limitations.push("mechanism_unresolved");
    }
    limitations.push(...uncertaintyLimitations);
    if (isGeneric) notes.push("genericness_high");
    if (isFalseConfident) notes.push("false_confidence_risk");
    if (isBroadCase && isBroadAnswer) notes.push("broad_answer_without_degradation");
    if (followupRetentionScore !== null && followupRetentionScore < 3) notes.push("followup_context_retention_weak");

    diagnostics.push({
      suite_case: suiteCase,
      session_id: sessionId,
      trace_id: finalResponse.debug?.trace_id ?? null,
      final_reply_type: finalResponse.reply_type,
      turn_count: suiteCase.turns.length,
      narrowing_result: narrowingResult,
      // Cases sharing a signature are treated as undifferentiated by the retrieval differentiation metric.
      signature: [
        finalResponse.reply_type,
        // sort() works in place, so the `routes` list stored in `signals` below is reported sorted as well.
        signals.routes.sort().join(","),
        signals.degraded_to ?? "none",
        signals.mechanism_status ?? "unknown",
        signals.source_refs.slice(0, 2).join(",")
      ].join("|"),
      is_generic: isGeneric,
      is_false_confident: isFalseConfident,
      is_broad_answer: isBroadAnswer,
      followup_retention_score: followupRetentionScore,
      evidence_quality_score: evidenceQualityScore,
      mechanism_specificity_score: mechanismSpecificityScore,
      genericness_score: genericnessScore,
      accountant_actionability_score: actionabilityScore,
      accountant_usefulness_score: round2(usefulnessScore),
      signals,
      limitations: Array.from(new Set(limitations)),
      notes
    });
  }

  const metrics = this.computeAssistantMetrics({ diagnostics });
  const caseRecords: AssistantEvalRecord[] = diagnostics.map((item) => {
    const signatureHits = metrics.signature_counts[item.signature] ?? 1;
    // Per-case view of the run metrics: rates collapse to 0/1 flags, scores are passed through.
    const caseMetricVector: AssistantEvalMetricVector = {
      retrieval_differentiation_rate: signatureHits === 1 ? 1 : 0,
      generic_explanation_rate: item.is_generic ? 1 : 0,
      accountant_actionability_score: round2(item.accountant_actionability_score),
      false_confidence_rate: item.is_false_confident ? 1 : 0,
      broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0,
      mechanism_specificity_score: round2(item.mechanism_specificity_score),
      followup_context_retention_score:
        item.followup_retention_score === null ? null : round2(item.followup_retention_score)
    };
    return {
      schema_version: ASSISTANT_EVAL_RECORD_SCHEMA_VERSION,
      created_at: new Date().toISOString(),
      case_id: item.suite_case.case_id,
      scenario_tag: item.suite_case.scenario_tag,
      session_id: item.session_id,
      trace_id: item.trace_id,
      question_type: item.suite_case.question_type,
      broadness_level: item.suite_case.broadness_level,
      narrowing_result: item.narrowing_result,
      evidence_quality_score: round2(item.evidence_quality_score),
      genericness_score: round2(item.genericness_score),
      accountant_usefulness_score: round2(item.accountant_usefulness_score),
      accountant_metrics: caseMetricVector,
      raw_signals: {
        final_reply_type: item.final_reply_type,
        turn_count: item.turn_count,
        broad_query_detected: item.signals.broad_query_detected,
        broad_result_flag: item.signals.broad_result_flag,
        narrowing_strength: item.signals.narrowing_strength,
        minimum_evidence_failed: item.signals.minimum_evidence_failed,
        degraded_to: item.signals.degraded_to,
        evidence_confidence: item.signals.evidence_confidence,
        limitation_reason_codes: item.signals.limitation_reason_codes,
        mechanism_status: item.signals.mechanism_status,
        source_refs: item.signals.source_refs,
        routes: item.signals.routes,
        followup_state_applied: item.signals.followup_state_applied
      },
      metric_subscores: caseMetricVector,
      limitations: item.limitations,
      notes: item.notes
    };
  });

  // Metrics whose rubric band scored 5 / 0 are surfaced as the strongest / weakest signals.
  const strongestSignals = (Object.entries(metrics.rubric_bands) as Array<[AccountantMetricName, AccountantMetricRubricBand | null]>)
    .filter(([, band]) => band?.score === 5)
    .map(([name]) => name);
  const weakestSignals = (Object.entries(metrics.rubric_bands) as Array<[AccountantMetricName, AccountantMetricRubricBand | null]>)
    .filter(([, band]) => band?.score === 0)
    .map(([name]) => name);

  const runTimestamp = new Date().toISOString();
  const report: Record<string, unknown> = {
    schema_version: ASSISTANT_STAGE1_RUN_SCHEMA_VERSION,
    run_id: runId,
    run_timestamp: runTimestamp,
    eval_target: "assistant_stage1",
    mode: payload.mode,
    use_mock: Boolean(payload.useMock),
    prompt_version: payload.normalizeConfig.promptVersion ?? null,
    suite_id: suite.suite_id,
    suite_version: suite.suite_version,
    suite_schema_version: suite.schema_version ?? null,
    scenario_count: suite.scenario_count,
    case_ids: suiteCases.map((item) => item.case_id),
    cases_total: caseRecords.length,
    feature_profile_snapshot: buildFeatureProfileSnapshot(),
    code_version: buildCodeVersionMarker(),
    metrics: {
      raw: metrics.raw,
      denominators: metrics.denominators
    },
    rubric_bands: metrics.rubric_bands,
    subsets: {
      broad_cases_total: metrics.denominators.broad_cases_total,
      followup_cases_total: metrics.denominators.followup_cases_total
    },
    budget: {
      requests_total: requestsTotal
    },
    results: caseRecords,
    // Buckets by usefulness score: >= 4, [2.5, 4), < 2.5.
    scenario_summary: {
      improved_or_strong: caseRecords.filter((item) => Number(item.accountant_usefulness_score ?? 0) >= 4).length,
      unchanged_or_mixed: caseRecords.filter((item) => {
        const value = Number(item.accountant_usefulness_score ?? 0);
        return value >= 2.5 && value < 4;
      }).length,
      weak_or_regressed: caseRecords.filter((item) => Number(item.accountant_usefulness_score ?? 0) < 2.5).length
    },
    improvement_hints: {
      strongest_signals: strongestSignals.length > 0 ? strongestSignals.join(", ") : "none",
      weakest_signals: weakestSignals.length > 0 ? weakestSignals.join(", ") : "none"
    },
    known_limitations: [
      "Snapshot-only retrieval contour remains (no live verification core in Stage 1).",
      "Metric mapping for genericness/false confidence is heuristic by design.",
      "Stage 1 eval excludes Stage 2+ metrics (problem-unit/lifecycle/graph/investigation engine)."
    ],
    report_title: "Assistant Stage 1 Eval Run"
  };

  ensureDir(REPORTS_DIR);
  const runJsonPath = path.resolve(REPORTS_DIR, `${runId}.json`);
  const runMdPath = path.resolve(REPORTS_DIR, `${runId}.md`);
  writeJsonFile(runJsonPath, report);
  fs.writeFileSync(runMdPath, buildAssistantEvalMarkdownReport(report), "utf-8");
  // Attached after the files are written; the comparison below reads the JSON path from here.
  report.artifacts = {
    run_report_json_path: runJsonPath,
    run_report_md_path: runMdPath
  };
  if (payload.compareWithReportFile) {
    report.comparison = this.buildAssistantComparisonReport({
      currentReport: report,
      baselineReportFile: payload.compareWithReportFile
    });
  }
  return report;
}
public async run(payload: {
normalizeConfig: Omit<NormalizeRequestPayload, "userQuestion" | "context">;
caseIds?: string[];
useMock?: boolean;
mode?: EvalRunMode;
caseSetFile?: string;
rawQuestions?: string;
evalTarget?: EvalTarget;
compareWithReportFile?: string;
}): Promise<Record<string, unknown>> {
const mode = payload.mode ?? "standard";
const evalTarget = payload.evalTarget ?? "normalizer";
if (evalTarget === "assistant_stage1") {
return this.runAssistantStage1({
normalizeConfig: payload.normalizeConfig,
caseIds: payload.caseIds,
useMock: payload.useMock,
mode,
caseSetFile: payload.caseSetFile,
compareWithReportFile: payload.compareWithReportFile
});
}
const promptVersion = String(payload.normalizeConfig.promptVersion ?? "").toLowerCase();
const schemaVersion = String(payload.normalizeConfig.schemaVersion ?? "").toLowerCase();
const isV2 =
promptVersion.startsWith("normalizer_v2") || schemaVersion === "v2" || schemaVersion === "v2_0_1" || schemaVersion === "v2_0_2";
const inlineQuestions = payload.rawQuestions ? parseRawQuestions(payload.rawQuestions) : [];
const inlineCases: EvalInputCase[] = inlineQuestions.map((question, index) => ({
case_id: formatCaseId("BQ", index),
raw_question: question,
expected: null
}));
if (isV2) {
const sourceCases =
inlineCases.length > 0
? inlineCases
: payload.caseSetFile
? parseCaseSetFile(payload.caseSetFile).map((item) => ({
case_id: item.case_id,
raw_question: item.raw_question,
expected: item.expected
}))
: this.listCases().map((item) => ({
case_id: item.case_id,
raw_question: item.raw_question,
expected: item.expected
}));
const filtered = sourceCases.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id));
return this.runV2({
...payload,
mode,
cases: filtered
});
}
if (inlineCases.length > 0) {
throw new Error("rawQuestions batch is supported for normalizer_v2 only.");
}
const casesSource = payload.caseSetFile ? parseCaseSetFile(payload.caseSetFile) : this.listCases();
const filteredCases = casesSource.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id));
const runId = `eval-${nanoid(10)}`;
const results: Array<Record<string, unknown>> = [];
const mismatches: Array<Record<string, unknown>> = [];
const badConfidenceCases: Array<Record<string, unknown>> = [];
const classCounter: Record<string, { total: number; passed: number }> = {};
let schemaPass = 0;
let intentPass = 0;
let routePass = 0;
let causalPass = 0;
let highConfidenceErrors = 0;
let requestsTotal = 0;
let retriesUsed = 0;
for (const item of filteredCases) {
const response = await this.normalizerService.normalize({
...payload.normalizeConfig,
userQuestion: item.raw_question,
context: {
expected_route: item.expected.route_hint as NormalizeRequestPayload["context"] extends infer C
? C extends { expected_route?: infer R }
? R
: never
: never,
eval_label: runId,
case_id: item.case_id,
eval_mode: mode
},
retryPolicy: mode === "single-pass-strict" ? "single-pass-strict" : "default",
useMock: payload.useMock
});
const normalized =
response.normalized && (response.normalized as { schema_version?: string }).schema_version === "normalized_query_v1"
? (response.normalized as NormalizedQueryV1)
: null;
const intentMatch = Boolean(normalized && item.expected.intent_class === normalized.intent_class);
const routeMatch = Boolean(normalized && item.expected.route_hint === normalized.route_hint);
const causalMatch = Boolean(
normalized &&
item.expected.requires &&
item.expected.requires.needs_cross_entity_join === normalized.requires.needs_cross_entity_join &&
item.expected.requires.needs_causal_chain === normalized.requires.needs_causal_chain
);
if (response.validation.passed) schemaPass += 1;
if (intentMatch) intentPass += 1;
if (routeMatch) routePass += 1;
if (causalMatch || !item.expected.requires) causalPass += 1;
const requestCount = Number(response.request_count_for_case ?? 0);
requestsTotal += requestCount;
if (requestCount > 1) {
retriesUsed += 1;
}
const classKey = String(item.expected.intent_class ?? "unknown");
if (!classCounter[classKey]) {
classCounter[classKey] = { total: 0, passed: 0 };
}
classCounter[classKey].total += 1;
if (intentMatch) {
classCounter[classKey].passed += 1;
}
const confidenceOverall = normalized?.confidence.overall ?? null;
const hasMismatch = !intentMatch || !routeMatch || (!causalMatch && Boolean(item.expected.requires));
if (confidenceOverall === "high" && hasMismatch) {
highConfidenceErrors += 1;
badConfidenceCases.push({
case_id: item.case_id,
confidence_overall: confidenceOverall,
intent_match: intentMatch,
route_match: routeMatch,
causal_match: causalMatch || !item.expected.requires,
trace_id: response.trace_id
});
}
if (hasMismatch || !response.validation.passed) {
mismatches.push({
case_id: item.case_id,
expected_intent_class: item.expected.intent_class ?? null,
actual_intent_class: normalized?.intent_class ?? null,
expected_route_hint: item.expected.route_hint ?? null,
actual_route_hint: normalized?.route_hint ?? null,
expected_requires: item.expected.requires ?? null,
actual_requires: normalized?.requires ?? null,
comment: shortMismatchComment({
intentMatch,
routeMatch,
causalMatch: causalMatch || !item.expected.requires,
validationPassed: response.validation.passed
}),
trace_id: response.trace_id
});
}
results.push({
case_id: item.case_id,
raw_question: item.raw_question,
validation_passed: response.validation.passed,
intent_match: intentMatch,
route_match: routeMatch,
causal_flags_match: causalMatch || !item.expected.requires,
expected_intent_class: item.expected.intent_class ?? null,
actual_intent_class: normalized?.intent_class ?? null,
expected_route_hint: item.expected.route_hint ?? null,
actual_route_hint: normalized?.route_hint ?? null,
expected_requires: item.expected.requires ?? null,
actual_requires: normalized?.requires ?? null,
confidence_overall: confidenceOverall,
trace_id: response.trace_id,
request_count_for_case: requestCount
});
}
const total = Math.max(1, filteredCases.length);
const metrics: BaselineMetrics = {
schema_validation_pass_rate: Number(((schemaPass / total) * 100).toFixed(2)),
intent_class_accuracy: Number(((intentPass / total) * 100).toFixed(2)),
route_hint_accuracy: Number(((routePass / total) * 100).toFixed(2)),
causal_flag_accuracy: Number(((causalPass / total) * 100).toFixed(2)),
high_confidence_error_rate: Number(((highConfidenceErrors / total) * 100).toFixed(2))
};
const classAccuracy = Object.fromEntries(
Object.entries(classCounter).map(([key, value]) => [
key,
{
total: value.total,
passed: value.passed,
accuracy_percent: Number(((value.passed / Math.max(1, value.total)) * 100).toFixed(2))
}
])
);
const baselineAsMap = BASELINE_METRICS as unknown as Record<string, number>;
const baselineDelta = Object.fromEntries(
Object.entries(metrics).map(([key, value]) => [key, Number((value - baselineAsMap[key]).toFixed(2))])
);
const report = {
run_id: runId,
timestamp: new Date().toISOString(),
mode,
use_mock: Boolean(payload.useMock),
prompt_version: payload.normalizeConfig.promptVersion ?? null,
dataset: {
source: payload.caseSetFile ? "file" : "data/eval_cases/*.json",
file: payload.caseSetFile ?? null
},
cases_total: filteredCases.length,
metrics,
baseline_metrics: BASELINE_METRICS,
baseline_delta: baselineDelta,
class_accuracy: classAccuracy,
budget: {
requests_total: requestsTotal,
retries_used: retriesUsed,
guidance: {
forensic_calls_max: 10,
final_eval_calls_max: 30,
target_total_calls_max: 40,
hard_cap_calls_max: 45
}
},
mismatches,
bad_confidence_cases: badConfidenceCases,
results
};
ensureDir(EVAL_CASES_DIR);
writeJsonFile(path.resolve(EVAL_CASES_DIR, `${runId}.report.json`), report);
const shouldWriteV11Artifacts =
mode === "single-pass-strict" &&
Boolean(payload.caseSetFile) &&
path.basename(String(payload.caseSetFile)).toLowerCase() === "normalizer_eval_v1_1_30cases.json";
if (shouldWriteV11Artifacts) {
ensureDir(REPORTS_DIR);
writeJsonFile(path.resolve(REPORTS_DIR, "normalizer_eval_v1_1_run.json"), report);
fs.writeFileSync(
path.resolve(REPORTS_DIR, "normalizer_eval_v1_1_run.md"),
buildMarkdownReport({
...report,
report_title: "LLM Normalizer v1.1 Eval Run"
}),
"utf-8"
);
}
const shouldWriteV1121EvalArtifacts =
mode === "single-pass-strict" &&
String(payload.normalizeConfig.promptVersion ?? "") === "normalizer_v1_1_2_1" &&
Boolean(payload.caseSetFile) &&
path.basename(String(payload.caseSetFile)).toLowerCase() === "normalizer_eval_v1_1_2_1_30cases.json";
if (shouldWriteV1121EvalArtifacts) {
ensureDir(REPORTS_DIR);
writeJsonFile(path.resolve(REPORTS_DIR, "normalizer_v1_1_2_1_eval.json"), report);
fs.writeFileSync(
path.resolve(REPORTS_DIR, "normalizer_v1_1_2_1_eval.md"),
buildMarkdownReport({
...report,
report_title: "LLM Normalizer v1.1.2.1 Eval Run"
}),
"utf-8"
);
}
const shouldWriteV111MicroArtifacts =
mode === "single-pass-strict" &&
String(payload.normalizeConfig.promptVersion ?? "") === "normalizer_v1_1_1" &&
isSameCaseSet(payload.caseIds, V111_MICRO_CASE_IDS);
if (shouldWriteV111MicroArtifacts) {
ensureDir(REPORTS_DIR);
writeJsonFile(path.resolve(REPORTS_DIR, "normalizer_v1_1_1_micro_eval.json"), report);
fs.writeFileSync(
path.resolve(REPORTS_DIR, "normalizer_v1_1_1_micro_eval.md"),
buildMarkdownReport({
...report,
report_title: "LLM Normalizer v1.1.1 Micro Eval"
}),
"utf-8"
);
}
const shouldWriteV112MicroArtifacts =
mode === "single-pass-strict" &&
String(payload.normalizeConfig.promptVersion ?? "") === "normalizer_v1_1_2" &&
isSameCaseSet(payload.caseIds, V112_MICRO_CASE_IDS);
if (shouldWriteV112MicroArtifacts) {
ensureDir(REPORTS_DIR);
writeJsonFile(path.resolve(REPORTS_DIR, "normalizer_v1_1_2_micro_eval.json"), report);
fs.writeFileSync(
path.resolve(REPORTS_DIR, "normalizer_v1_1_2_micro_eval.md"),
buildMarkdownReport({
...report,
report_title: "LLM Normalizer v1.1.2 Micro Eval"
}),
"utf-8"
);
}
return report;
}
}