NODEDC_1C/llm_normalizer/backend/dist/services/evalService.js

2508 lines
123 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.EvalService = void 0;
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
const nanoid_1 = require("nanoid");
const config_1 = require("../config");
const p0_eval_runner_1 = require("../eval/p0_eval_runner");
const stage1Contracts_1 = require("../types/stage1Contracts");
const stage2EvalContracts_1 = require("../types/stage2EvalContracts");
const http_1 = require("../utils/http");
const assistantService_1 = require("./assistantService");
const assistantSessionStore_1 = require("./assistantSessionStore");
const files_1 = require("../utils/files");
// Reference metric values keyed by metric name.
// NOTE(review): presumably percentages used as the comparison baseline for eval
// reports (buildMarkdownReport renders a "Baseline" column) — confirm at the call site.
const BASELINE_METRICS = {
schema_validation_pass_rate: 100,
intent_class_accuracy: 72.73,
route_hint_accuracy: 90.91,
causal_flag_accuracy: 81.82,
high_confidence_error_rate: 9.09
};
// Fixed lists of case ids for two "micro" case sets.
// NOTE(review): presumably compared against a requested case list via
// isSameCaseSet — the usage is outside this view; confirm.
const V111_MICRO_CASE_IDS = ["NQ-008", "V11-DD-005", "V11-OT-003", "V11-OT-004", "V11-OT-005"];
const V112_MICRO_CASE_IDS = ["NQ-002", "NQ-007", "V11-HA-004", "V11-OT-003", "V11-OT-005"];
/**
 * Order-insensitive comparison of two lists of case ids.
 *
 * @param {string[] | null | undefined} input Candidate list; a falsy value never matches.
 * @param {string[]} target Reference list.
 * @returns {boolean} True when both lists have the same length and the same
 *   elements once each is sorted with the default comparator.
 */
function isSameCaseSet(input, target) {
    const sizesAgree = Boolean(input) && input.length === target.length;
    if (!sizesAgree) {
        return false;
    }
    // Sort copies so neither caller-owned array is reordered.
    const sortedInput = [...input].sort();
    const sortedTarget = [...target].sort();
    for (let position = 0; position < sortedInput.length; position += 1) {
        if (sortedInput[position] !== sortedTarget[position]) {
            return false;
        }
    }
    return true;
}
/**
 * Renders a number with two decimals followed by a percent sign.
 *
 * @param {number} value Value to print; it is not rescaled.
 * @returns {string} For example `"12.50%"`.
 */
function formatPercent(value) {
    const twoDecimals = value.toFixed(2);
    return "".concat(twoDecimals, "%");
}
/**
 * Picks a one-line explanation for an eval case based on which checks failed.
 *
 * Precedence: schema validation failure, then intent/route mismatches, then
 * causal-flag mismatch; otherwise "No mismatch.".
 *
 * @param {{validationPassed: unknown, intentMatch: unknown, routeMatch: unknown, causalMatch: unknown}} input
 *   Flags for one case; each is read for truthiness only.
 * @returns {string} Human-readable comment.
 */
function shortMismatchComment(input) {
    if (!input.validationPassed) {
        return "Schema validation failed for this case.";
    }
    const intentOk = Boolean(input.intentMatch);
    const routeOk = Boolean(input.routeMatch);
    if (!intentOk) {
        return routeOk
            ? "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket."
            : "Both intent and route misclassified; likely lexical ambiguity in causal vs risk wording.";
    }
    if (!routeOk) {
        return "Intent understood, but route_hint selected a weaker execution route.";
    }
    return input.causalMatch
        ? "No mismatch."
        : "Causal flags are inconsistent with expected relationship depth.";
}
/**
 * Renders an eval run report object as a Markdown document.
 *
 * Sections, in order: header fields, metrics vs baseline table, class accuracy
 * table, budget, mismatches, bad-confidence cases. Missing fields fall back to
 * empty strings / zeros / "n/a" placeholder rows.
 *
 * @param {object} report Report object; every field read here is optional.
 * @returns {string} Markdown text (lines joined with "\n", ending with a newline).
 */
function buildMarkdownReport(report) {
    const show = (value, fallback) => String(value ?? fallback);
    const listOrEmpty = (value) => (Array.isArray(value) ? value : []);

    const metrics = report.metrics ?? {};
    const baseline = report.baseline_metrics ?? {};
    const delta = report.baseline_delta ?? {};
    const classAccuracy = report.class_accuracy ?? {};
    const budget = report.budget ?? {};
    const mismatches = listOrEmpty(report.mismatches);
    const badConfidenceCases = listOrEmpty(report.bad_confidence_cases);

    // One table row per metric; only positive deltas get an explicit "+" sign.
    const metricLines = [];
    for (const name of Object.keys(metrics)) {
        const currentCell = formatPercent(Number(metrics[name] ?? 0));
        const baselineCell = formatPercent(Number(baseline[name] ?? 0));
        const change = Number(delta[name] ?? 0);
        const deltaCell = (change > 0 ? "+" : "") + change.toFixed(2);
        metricLines.push(`| ${name} | ${currentCell} | ${baselineCell} | ${deltaCell} |`);
    }

    const classLines = Object.entries(classAccuracy).map(([name, stats]) => `| ${name} | ${stats.passed}/${stats.total} | ${formatPercent(stats.accuracy_percent)} |`);

    const mismatchSection = mismatches.length > 0
        ? mismatches
            .map((entry) => `- ${entry.case_id}: expected(${entry.expected_intent_class} / ${entry.expected_route_hint}) -> actual(${entry.actual_intent_class} / ${entry.actual_route_hint}). ${entry.comment}`)
            .join("\n")
        : "No mismatches.";

    const badConfidenceSection = badConfidenceCases.length > 0
        ? badConfidenceCases
            .map((entry) => `- ${entry.case_id}: confidence=${entry.confidence_overall}, intent_match=${entry.intent_match}, route_match=${entry.route_match}`)
            .join("\n")
        : "No bad-confidence cases.";

    const lines = [];
    lines.push(`# ${show(report.report_title, "LLM Normalizer Eval Run")}`, "");
    lines.push(`- run_id: ${show(report.run_id, "")}`);
    lines.push(`- timestamp: ${show(report.timestamp, "")}`);
    lines.push(`- mode: ${show(report.mode, "")}`);
    lines.push(`- use_mock: ${show(report.use_mock, false)}`);
    lines.push(`- cases_total: ${show(report.cases_total, 0)}`);
    lines.push(`- prompt_version: ${show(report.prompt_version, "")}`, "");
    lines.push("## Metrics vs Baseline", "");
    lines.push("| Metric | Current | Baseline | Delta |", "|---|---:|---:|---:|");
    lines.push(metricLines.join("\n") || "| n/a | n/a | n/a | n/a |", "");
    lines.push("## Class Accuracy", "");
    lines.push("| Intent class | Passed/Total | Accuracy |", "|---|---:|---:|");
    lines.push(classLines.join("\n") || "| n/a | n/a | n/a |", "");
    lines.push("## Budget", "");
    lines.push(`- requests_total: ${show(budget.requests_total, 0)}`);
    lines.push(`- retries_used: ${show(budget.retries_used, 0)}`, "");
    lines.push("## Mismatches", "", mismatchSection, "");
    lines.push("## Bad Confidence Cases", "", badConfidenceSection, "");
    return lines.join("\n");
}
/**
 * Loads an eval dataset from a JSON file.
 *
 * Relative paths are resolved against `config_1.EVAL_DATASETS_DIR`. A leading
 * UTF-8 BOM is stripped before parsing. Two layouts are accepted: a top-level
 * array, or an object with a `cases` array.
 *
 * @param {string} inputPath Absolute path or path relative to the datasets dir.
 * @returns {Array} The list of cases from the file.
 * @throws {Error} When the parsed JSON matches neither accepted layout
 *   (read / JSON syntax errors propagate as thrown by fs / JSON.parse).
 */
function parseCaseSetFile(inputPath) {
    let filePath = inputPath;
    if (!path_1.default.isAbsolute(inputPath)) {
        filePath = path_1.default.resolve(config_1.EVAL_DATASETS_DIR, inputPath);
    }
    const fileText = fs_1.default.readFileSync(filePath, "utf-8");
    const parsed = JSON.parse(fileText.replace(/^\uFEFF/, ""));
    if (Array.isArray(parsed)) {
        return parsed;
    }
    const isObjectLike = Boolean(parsed) && typeof parsed === "object";
    if (isObjectLike && Array.isArray(parsed.cases)) {
        return parsed.cases;
    }
    throw new Error(`Unsupported eval dataset format: ${filePath}`);
}
/**
 * Builds a case id from a prefix and a zero-based index.
 *
 * @param {string} prefix Text placed before the dash.
 * @param {number} index Zero-based position; printed one-based, left-padded with zeros to 3 digits.
 * @returns {string} For example `formatCaseId("NQ", 0)` gives `"NQ-001"`.
 */
function formatCaseId(prefix, index) {
    const ordinal = String(index + 1).padStart(3, "0");
    return "".concat(prefix, "-", ordinal);
}
function parseRawQuestions(rawQuestions) {
const text = rawQuestions.replace(/\r\n/g, "\n").trim();
if (!text) {
return [];
}
const bySemicolon = text
.split(";")
.map((item) => item.trim())
.filter(Boolean);
if (bySemicolon.length > 1) {
return bySemicolon;
}
const byBlankLine = text
.split(/\n\s*\n+/)
.map((item) => item.trim())
.filter(Boolean);
if (byBlankLine.length > 1) {
return byBlankLine;
}
const byLine = text
.split("\n")
.map((item) => item.trim())
.filter(Boolean);
return byLine.length > 0 ? byLine : [text];
}
/**
 * Validates a `YYYY-MM-DD` date string.
 *
 * @param {unknown} value Candidate value; surrounding whitespace is ignored.
 * @returns {string | null} The trimmed `YYYY-MM-DD` text when it names a real
 *   calendar date, otherwise `null` (non-strings, wrong shape, impossible dates).
 */
function normalizeAnalysisDate(value) {
    if (typeof value !== "string") {
        return null;
    }
    const parts = /^(\d{4})-(\d{2})-(\d{2})$/.exec(value.trim());
    if (parts === null) {
        return null;
    }
    const [, yearText, monthText, dayText] = parts;
    const year = Number(yearText);
    const month = Number(monthText);
    const day = Number(dayText);
    if (![year, month, day].every((part) => Number.isFinite(part))) {
        return null;
    }
    // Round-trip through a UTC date: out-of-range parts roll over and stop matching.
    const probe = new Date(Date.UTC(year, month - 1, day));
    const roundTrips = probe.getUTCFullYear() === year &&
        probe.getUTCMonth() + 1 === month &&
        probe.getUTCDate() === day;
    return roundTrips ? `${yearText}-${monthText}-${dayText}` : null;
}
/**
 * Reads `execution_readiness` from a fragment.
 *
 * @param {object} fragment Fragment object.
 * @returns {*} The property value when the key is present, otherwise `"executable"`.
 */
function executionReadinessOf(fragment) {
    if ("execution_readiness" in fragment) {
        return fragment.execution_readiness;
    }
    return "executable";
}
/**
 * Reads `soft_assumption_used` from a fragment.
 *
 * @param {object} fragment Fragment object.
 * @returns {*} The property value when the key is present, otherwise a new empty array.
 */
function softAssumptionsOf(fragment) {
    if ("soft_assumption_used" in fragment) {
        return fragment.soft_assumption_used;
    }
    return [];
}
/**
 * Reads `route_status` from a fragment.
 *
 * @param {object} fragment Fragment object.
 * @returns {*} The property value when the key is present, otherwise `null`.
 */
function routeStatusOf(fragment) {
    if ("route_status" in fragment) {
        return fragment.route_status;
    }
    return null;
}
/**
 * Reads `no_route_reason` from a fragment.
 *
 * @param {object} fragment Fragment object.
 * @returns {*} The property value when the key is present, otherwise `null`.
 */
function noRouteReasonOf(fragment) {
    if ("no_route_reason" in fragment) {
        return fragment.no_route_reason;
    }
    return null;
}
/**
 * Derives the expected "message is in scope" label from a case's expectations.
 *
 * Resolution order: an explicit boolean `expected_scope_in_scope`; then
 * `expected_no_route_reason === "out_of_scope"` gives `false`; then a routed
 * expectation or any boolean `clarification_required` gives `true`.
 *
 * @param {object | null | undefined} expected Expectation block of an eval case.
 * @returns {boolean | null} The derived label, or `null` when nothing applies.
 */
function expectedScopeInScope(expected) {
    if (!expected) {
        return null;
    }
    const explicit = expected.expected_scope_in_scope;
    if (typeof explicit === "boolean") {
        return explicit;
    }
    if (expected.expected_no_route_reason === "out_of_scope") {
        return false;
    }
    const expectsRouted = expected.expected_route_status === "routed";
    const hasClarificationLabel = typeof expected.clarification_required === "boolean";
    return expectsRouted || hasClarificationLabel ? true : null;
}
/**
 * Checks that a route decision's fields agree with each other.
 *
 * A `no_route` decision must carry a `no_route_reason` and must not be marked
 * executable (with or without soft assumptions). Any other route must have no
 * `no_route_reason` and must not be marked `needs_clarification` / `no_route`.
 *
 * @param {{route: string, execution_readiness?: unknown, no_route_reason?: unknown}} decision
 * @returns {boolean} True when the combination is consistent.
 */
function isDecisionStateConsistent(decision) {
    const readiness = String(decision.execution_readiness ?? "");
    const hasNoRouteReason = Boolean(decision.no_route_reason ?? null);
    const isNoRoute = decision.route === "no_route";
    // A reason must be present exactly when the decision is no_route.
    if (isNoRoute !== hasNoRouteReason) {
        return false;
    }
    const forbiddenReadiness = isNoRoute
        ? ["executable", "executable_with_soft_assumptions"]
        : ["needs_clarification", "no_route"];
    return !forbiddenReadiness.includes(readiness);
}
// Stage 1 assistant eval: default suite file (used by parseAssistantSuiteFile when
// no path is given) and schema-version tags.
// NOTE(review): the *_SCHEMA_VERSION tags are presumably stamped into run /
// comparison reports — their usage is outside this view; confirm.
const DEFAULT_ASSISTANT_STAGE1_SUITE_FILE = "assistant_stage1_canonical_v0_1.json";
const ASSISTANT_STAGE1_RUN_SCHEMA_VERSION = "assistant_stage1_eval_run_v0_1";
const ASSISTANT_STAGE1_COMPARISON_SCHEMA_VERSION = "assistant_stage1_eval_comparison_v0_1";
// Stage 2 counterparts; the default file is used by parseAssistantStage2SuiteFile.
const DEFAULT_ASSISTANT_STAGE2_SUITE_FILE = "assistant_stage2_canonical_v0_1.json";
const ASSISTANT_STAGE2_RUN_SCHEMA_VERSION = "assistant_stage2_eval_run_v0_1";
const ASSISTANT_STAGE2_COMPARISON_SCHEMA_VERSION = "assistant_stage2_eval_comparison_v0_1";
// In-memory report store: refs starting with this prefix are looked up in the map
// by readEvalReportByRef instead of being read from disk.
// NOTE(review): nothing in this view ever deletes entries from the map, so it
// grows for the lifetime of the process — confirm whether eviction exists elsewhere.
const INMEM_EVAL_REPORT_PREFIX = "inmem_eval_report:";
const INMEM_EVAL_REPORTS = new Map();
/**
 * Tells whether an error carries the `ENOSPC` code.
 *
 * @param {unknown} error Caught value; may be null/undefined.
 * @returns {boolean} True only when `error.code === "ENOSPC"`.
 */
function isNoSpaceError(error) {
    return error?.code === "ENOSPC";
}
/**
 * Writes a value as JSON via `files_1.writeJsonFile`, tolerating a full disk.
 *
 * @param {string} pathname Destination path.
 * @param {*} value Value to serialise.
 * @returns {boolean} True when the write succeeded, false when it failed with ENOSPC.
 * @throws Any error other than ENOSPC is rethrown unchanged.
 */
function tryWriteJsonFile(pathname, value) {
    let written = false;
    try {
        (0, files_1.writeJsonFile)(pathname, value);
        written = true;
    }
    catch (error) {
        // Only "no space left on device" is downgraded to a false return.
        if (!isNoSpaceError(error)) {
            throw error;
        }
    }
    return written;
}
/**
 * Writes UTF-8 text to a file synchronously, tolerating a full disk.
 *
 * @param {string} pathname Destination path.
 * @param {string} value Text to write.
 * @returns {boolean} True when the write succeeded, false when it failed with ENOSPC.
 * @throws Any error other than ENOSPC is rethrown unchanged.
 */
function tryWriteTextFile(pathname, value) {
    let written = false;
    try {
        fs_1.default.writeFileSync(pathname, value, "utf-8");
        written = true;
    }
    catch (error) {
        // Only "no space left on device" is downgraded to a false return.
        if (!isNoSpaceError(error)) {
            throw error;
        }
    }
    return written;
}
/**
 * Stores a report in the module-level in-memory map under a fresh key.
 *
 * @param {*} report Report to keep.
 * @returns {string} The generated ref: `INMEM_EVAL_REPORT_PREFIX` followed by a
 *   12-character nanoid. Pass it to readEvalReportByRef to get the report back.
 */
function putInMemoryEvalReport(report) {
    const suffix = (0, nanoid_1.nanoid)(12);
    const ref = "".concat(INMEM_EVAL_REPORT_PREFIX, suffix);
    INMEM_EVAL_REPORTS.set(ref, report);
    return ref;
}
/**
 * Loads an eval report either from the in-memory store or from a JSON file.
 *
 * @param {string} ref In-memory ref (starts with `INMEM_EVAL_REPORT_PREFIX`) or
 *   a file path handed to resolveReadablePath.
 * @returns {{report: *, resolved_path: string}} The report plus the ref itself
 *   (in-memory case) or the resolved file path (file case).
 * @throws {Error} When an in-memory ref has no stored report; file read and
 *   JSON syntax errors propagate as thrown.
 */
function readEvalReportByRef(ref) {
    if (!ref.startsWith(INMEM_EVAL_REPORT_PREFIX)) {
        const resolvedPath = resolveReadablePath(ref);
        const fileText = fs_1.default.readFileSync(resolvedPath, "utf-8");
        return {
            report: JSON.parse(fileText),
            resolved_path: resolvedPath
        };
    }
    const stored = INMEM_EVAL_REPORTS.get(ref);
    if (!stored) {
        throw new Error(`In-memory eval report not found: ${ref}`);
    }
    return {
        report: stored,
        resolved_path: ref
    };
}
/**
 * Returns a copy of a stage 1 report whose `results` keep only a few fields.
 *
 * Each result is reduced to `case_id`, `scenario_tag`,
 * `accountant_usefulness_score` and `accountant_metrics` (kept only when it is a
 * non-null object); missing values become `null`. All other top-level report
 * fields are copied as-is.
 *
 * @param {object} report Full report; a non-array `results` is treated as empty.
 * @returns {object} Shallow copy of `report` with compacted `results`.
 */
function compactAssistantStage1Report(report) {
    const toCompactResult = (item) => {
        const metrics = item.accountant_metrics;
        const keepMetrics = metrics !== null && typeof metrics === "object";
        return {
            case_id: item.case_id ?? null,
            scenario_tag: item.scenario_tag ?? null,
            accountant_usefulness_score: item.accountant_usefulness_score ?? null,
            accountant_metrics: keepMetrics ? metrics : null
        };
    };
    const sourceResults = Array.isArray(report.results) ? report.results : [];
    return {
        ...report,
        results: sourceResults.map((item) => toCompactResult(item))
    };
}
/**
 * Returns a copy of a stage 2 report whose `results` keep only the case id and
 * four metric subscores.
 *
 * Kept subscores: `problem_clarity_score`, `mechanism_coherence_score`,
 * `problem_first_answer_rate`, `entity_leakage_rate`; missing values become
 * `null`. All other top-level report fields are copied as-is.
 *
 * @param {object} report Full report; a non-array `results` is treated as empty.
 * @returns {object} Shallow copy of `report` with compacted `results`.
 */
function compactAssistantStage2Report(report) {
    const sourceResults = Array.isArray(report.results) ? report.results : [];
    const compactResults = sourceResults.map((item) => {
        const subscores = item.metric_subscores ?? {};
        const pick = (name) => subscores[name] ?? null;
        return {
            case_id: item.case_id ?? null,
            metric_subscores: {
                problem_clarity_score: pick("problem_clarity_score"),
                mechanism_coherence_score: pick("mechanism_coherence_score"),
                problem_first_answer_rate: pick("problem_first_answer_rate"),
                entity_leakage_rate: pick("entity_leakage_rate")
            }
        };
    });
    return {
        ...report,
        results: compactResults
    };
}
// Accepted problem-unit type names; anything else is rejected by toProblemUnitType.
const KNOWN_PROBLEM_UNIT_TYPES = [
    "document_conflict",
    "broken_chain_segment",
    "lifecycle_anomaly_node",
    "unresolved_settlement_cluster",
    "period_risk_cluster",
    "cross_branch_inconsistency_cluster"
];
/**
 * Narrows an arbitrary value to a known problem-unit type.
 *
 * @param {unknown} value Candidate value.
 * @returns {string | null} `value` itself when it is listed in
 *   KNOWN_PROBLEM_UNIT_TYPES, otherwise `null`.
 */
function toProblemUnitType(value) {
    const isKnown = KNOWN_PROBLEM_UNIT_TYPES.some((known) => known === value);
    if (isKnown) {
        return value;
    }
    return null;
}
/**
 * Rounds a number to two decimal places using `toFixed(2)`.
 *
 * @param {number} value Number to round.
 * @returns {number} The rounded value as a number (not a string).
 */
function round2(value) {
    const fixedText = value.toFixed(2);
    return +fixedText;
}
/**
 * Limits a score to the inclusive range `[min, max]`.
 *
 * @param {number} value Score to clamp; NaN is mapped to `min`.
 * @param {number} [min=0] Lower bound.
 * @param {number} [max=5] Upper bound.
 * @returns {number} The clamped score.
 */
function clampScore(value, min = 0, max = 5) {
    if (Number.isNaN(value)) {
        return min;
    }
    return value < min ? min : value > max ? max : value;
}
/**
 * Maps a raw stage 1 metric value to a band score of 5, 3 or 0.
 *
 * Thresholds by metric family:
 * - `retrieval_differentiation_rate`: >= 0.75 gives 5, >= 0.45 gives 3.
 * - `generic_explanation_rate`, `false_confidence_rate`, `broad_answer_rate`
 *   (lower is better): <= 0.25 gives 5, <= 0.45 gives 3.
 * - `accountant_actionability_score`, `mechanism_specificity_score`,
 *   `followup_context_retention_score`: >= 4 gives 5, >= 2.5 gives 3.
 * Unknown metrics and values below every threshold give 0.
 *
 * @param {string} metric Metric name.
 * @param {number} value Raw metric value.
 * @returns {0 | 3 | 5} Band score.
 */
function rateToBandScore(metric, value) {
    const band = (reachesTop, reachesMiddle) => (reachesTop ? 5 : reachesMiddle ? 3 : 0);
    switch (metric) {
        case "retrieval_differentiation_rate":
            return band(value >= 0.75, value >= 0.45);
        case "generic_explanation_rate":
        case "false_confidence_rate":
        case "broad_answer_rate":
            return band(value <= 0.25, value <= 0.45);
        case "accountant_actionability_score":
        case "mechanism_specificity_score":
        case "followup_context_retention_score":
            return band(value >= 4, value >= 2.5);
        default:
            return 0;
    }
}
/**
 * Looks up the stage 1 rubric band that corresponds to a raw metric value.
 *
 * @param {string} metric Metric name; must be a key of
 *   `stage1Contracts_1.ACCOUNTANT_SCORING_RUBRIC_V01`.
 * @param {number | null} value Raw metric value; `null` means "not measured".
 * @returns {object | null} The rubric entry whose `score` equals the band score
 *   from rateToBandScore, or `null` when `value` is null or no entry matches.
 */
function rubricBandForMetric(metric, value) {
    if (value === null) {
        return null;
    }
    const targetScore = rateToBandScore(metric, value);
    const metricBands = stage1Contracts_1.ACCOUNTANT_SCORING_RUBRIC_V01[metric];
    const matched = metricBands.find((entry) => entry.score === targetScore);
    return matched ?? null;
}
/**
 * Maps a raw stage 2 metric value to a band score of 5, 3 or 0.
 *
 * Thresholds by metric family:
 * - `problem_unit_precision`, `problem_unit_recall_proxy`,
 *   `problem_first_answer_rate`: >= 0.75 gives 5, >= 0.45 gives 3.
 * - `duplicate_collapse_rate`: >= 0.2 gives 5, >= 0.08 gives 3.
 * - `entity_leakage_rate` (lower is better): <= 0.2 gives 5, <= 0.4 gives 3.
 * - `mechanism_coherence_score`, `problem_clarity_score`: >= 4 gives 5, >= 2.5 gives 3.
 * Unknown metrics and values below every threshold give 0.
 *
 * @param {string} metric Metric name.
 * @param {number} value Raw metric value.
 * @returns {0 | 3 | 5} Band score.
 */
function rateToBandScoreStage2(metric, value) {
    const band = (reachesTop, reachesMiddle) => (reachesTop ? 5 : reachesMiddle ? 3 : 0);
    switch (metric) {
        case "problem_unit_precision":
        case "problem_unit_recall_proxy":
        case "problem_first_answer_rate":
            return band(value >= 0.75, value >= 0.45);
        case "duplicate_collapse_rate":
            return band(value >= 0.2, value >= 0.08);
        case "entity_leakage_rate":
            return band(value <= 0.2, value <= 0.4);
        case "mechanism_coherence_score":
        case "problem_clarity_score":
            return band(value >= 4, value >= 2.5);
        default:
            return 0;
    }
}
/**
 * Looks up the stage 2 rubric band that corresponds to a raw metric value.
 *
 * @param {string} metric Metric name; must be a key of
 *   `stage2EvalContracts_1.ASSISTANT_STAGE2_SCORING_RUBRIC_V01`.
 * @param {number | null} value Raw metric value; `null` means "not measured".
 * @returns {object | null} The rubric entry whose `score` equals the band score
 *   from rateToBandScoreStage2, or `null` when `value` is null or no entry matches.
 */
function rubricBandForMetricStage2(metric, value) {
    if (value === null) {
        return null;
    }
    const targetScore = rateToBandScoreStage2(metric, value);
    const metricBands = stage2EvalContracts_1.ASSISTANT_STAGE2_SCORING_RUBRIC_V01[metric];
    const matched = metricBands.find((entry) => entry.score === targetScore);
    return matched ?? null;
}
/**
 * Captures the current values of the assistant feature flags.
 *
 * `FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1` is taken from config as-is. Flags with
 * a config counterpart use the environment variable when set, otherwise the
 * stringified config value. Flags without one use the environment variable or `null`.
 *
 * @returns {object} Flag name to value map, in a fixed key order.
 */
function buildFeatureProfileSnapshot() {
    const env = process.env;
    const envOrNull = (flag) => env[flag] ?? null;
    // The config export has the same name as the environment variable for these flags.
    const envOrConfig = (flag) => env[flag] ?? String(config_1[flag]);
    return {
        FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1: config_1.FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1,
        FEATURE_ASSISTANT_ANSWER_POLICY_V11: envOrConfig("FEATURE_ASSISTANT_ANSWER_POLICY_V11"),
        FEATURE_ASSISTANT_BROAD_GUARD_V1: envOrNull("FEATURE_ASSISTANT_BROAD_GUARD_V1"),
        FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1: envOrNull("FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1"),
        FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1: envOrNull("FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1"),
        FEATURE_ASSISTANT_INVESTIGATION_STATE_V1: envOrNull("FEATURE_ASSISTANT_INVESTIGATION_STATE_V1"),
        FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1: envOrNull("FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1"),
        FEATURE_ASSISTANT_PROBLEM_UNITS_V1: envOrConfig("FEATURE_ASSISTANT_PROBLEM_UNITS_V1"),
        FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1: envOrConfig("FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1"),
        FEATURE_ASSISTANT_PROBLEM_UNIT_CONTINUITY_V1: envOrConfig("FEATURE_ASSISTANT_PROBLEM_UNIT_CONTINUITY_V1"),
        FEATURE_ASSISTANT_STAGE2_EVAL_V1: envOrConfig("FEATURE_ASSISTANT_STAGE2_EVAL_V1")
    };
}
/**
 * Reads commit / build identifiers from environment variables.
 *
 * `git_commit` is the first defined of GIT_COMMIT, CI_COMMIT_SHA,
 * VERCEL_GIT_COMMIT_SHA, GITHUB_SHA. `build_marker` is the first defined of
 * BUILD_MARKER, BUILD_ID, npm_package_version. A variable set to an empty
 * string still counts as defined.
 *
 * @returns {{git_commit: string | null, build_marker: string | null}}
 */
function buildCodeVersionMarker() {
    const firstDefinedEnv = (names) => {
        for (const name of names) {
            const candidate = process.env[name];
            if (candidate !== undefined && candidate !== null) {
                return candidate;
            }
        }
        return null;
    };
    return {
        git_commit: firstDefinedEnv(["GIT_COMMIT", "CI_COMMIT_SHA", "VERCEL_GIT_COMMIT_SHA", "GITHUB_SHA"]),
        build_marker: firstDefinedEnv(["BUILD_MARKER", "BUILD_ID", "npm_package_version"])
    };
}
/**
 * Resolves a possibly-relative path to a file location.
 *
 * Absolute paths are returned unchanged. Relative paths are tried, in order,
 * under `config_1.REPORTS_DIR`, `config_1.EVAL_DATASETS_DIR`,
 * `config_1.EVAL_CASES_DIR` and the current working directory; the first
 * candidate that exists wins.
 *
 * @param {string} inputPath Absolute or relative path.
 * @returns {string} The first existing candidate, or the REPORTS_DIR candidate
 *   when none exists (so the caller's read fails against that path).
 */
function resolveReadablePath(inputPath) {
    if (path_1.default.isAbsolute(inputPath)) {
        return inputPath;
    }
    const searchRoots = [config_1.REPORTS_DIR, config_1.EVAL_DATASETS_DIR, config_1.EVAL_CASES_DIR];
    const candidates = searchRoots.map((root) => path_1.default.resolve(root, inputPath));
    candidates.push(path_1.default.resolve(inputPath));
    const firstExisting = candidates.find((candidate) => fs_1.default.existsSync(candidate));
    return firstExisting ?? candidates[0];
}
/**
 * Loads and validates a stage 1 assistant eval suite file.
 *
 * The path goes through resolveReadablePath (default:
 * `DEFAULT_ASSISTANT_STAGE1_SUITE_FILE`); a leading BOM is stripped. Checks run
 * in this order: object shape, `cases[]`, `case_ids[]`, non-blank `suite_id`,
 * non-blank `suite_version`, `scenario_count === cases.length`, `case_ids`
 * equal to the case ids in `cases[]` (order-insensitive), and at least one
 * turn per case.
 *
 * @param {string | null | undefined} inputPath Suite file path, or nullish for the default.
 * @returns {object} The parsed suite, unmodified.
 * @throws {Error} On the first failed check; read / JSON syntax errors propagate.
 */
function parseAssistantSuiteFile(inputPath) {
    const filePath = resolveReadablePath(inputPath ?? DEFAULT_ASSISTANT_STAGE1_SUITE_FILE);
    const fileText = fs_1.default.readFileSync(filePath, "utf-8");
    const suite = JSON.parse(fileText.replace(/^\uFEFF/, ""));
    const isBlankOrMissing = (candidate) => typeof candidate !== "string" || candidate.trim() === "";
    if (suite === null || typeof suite !== "object") {
        throw new Error(`Invalid assistant suite format: ${filePath}`);
    }
    if (!Array.isArray(suite.cases)) {
        throw new Error(`Assistant suite cases[] is required: ${filePath}`);
    }
    if (!Array.isArray(suite.case_ids)) {
        throw new Error(`Assistant suite case_ids[] is required: ${filePath}`);
    }
    if (isBlankOrMissing(suite.suite_id)) {
        throw new Error(`Assistant suite suite_id is required: ${filePath}`);
    }
    if (isBlankOrMissing(suite.suite_version)) {
        throw new Error(`Assistant suite suite_version is required: ${filePath}`);
    }
    if (suite.scenario_count !== suite.cases.length) {
        throw new Error(`Assistant suite scenario_count mismatch: ${filePath}`);
    }
    // Compare sorted copies so the declared order does not matter.
    const declaredIds = [...suite.case_ids].sort();
    const actualIds = suite.cases.map((entry) => entry.case_id).sort();
    const hasIdMismatch = declaredIds.length !== actualIds.length ||
        declaredIds.some((id, position) => id !== actualIds[position]);
    if (hasIdMismatch) {
        throw new Error(`Assistant suite case_ids do not match cases[]: ${filePath}`);
    }
    const caseWithoutTurns = suite.cases.find((entry) => !Array.isArray(entry.turns) || entry.turns.length === 0);
    if (caseWithoutTurns !== undefined) {
        throw new Error(`Assistant suite case ${caseWithoutTurns.case_id} must include at least one turn.`);
    }
    return suite;
}
/**
 * Loads and validates a stage 2 assistant eval suite file.
 *
 * The path goes through resolveReadablePath (default:
 * `DEFAULT_ASSISTANT_STAGE2_SUITE_FILE`); a leading BOM is stripped. Checks run
 * in this order: object shape, `cases[]`, `case_ids[]`, non-blank `suite_id`,
 * non-blank `suite_version`, `scenario_count === cases.length`, `case_ids`
 * equal to the case ids in `cases[]` (order-insensitive), and at least one
 * turn per case.
 *
 * @param {string | null | undefined} inputPath Suite file path, or nullish for the default.
 * @returns {object} The parsed suite, unmodified.
 * @throws {Error} On the first failed check; read / JSON syntax errors propagate.
 */
function parseAssistantStage2SuiteFile(inputPath) {
    const filePath = resolveReadablePath(inputPath ?? DEFAULT_ASSISTANT_STAGE2_SUITE_FILE);
    const fileText = fs_1.default.readFileSync(filePath, "utf-8");
    const suite = JSON.parse(fileText.replace(/^\uFEFF/, ""));
    const isBlankOrMissing = (candidate) => typeof candidate !== "string" || candidate.trim() === "";
    if (suite === null || typeof suite !== "object") {
        throw new Error(`Invalid assistant stage2 suite format: ${filePath}`);
    }
    if (!Array.isArray(suite.cases)) {
        throw new Error(`Assistant stage2 suite cases[] is required: ${filePath}`);
    }
    if (!Array.isArray(suite.case_ids)) {
        throw new Error(`Assistant stage2 suite case_ids[] is required: ${filePath}`);
    }
    if (isBlankOrMissing(suite.suite_id)) {
        throw new Error(`Assistant stage2 suite_id is required: ${filePath}`);
    }
    if (isBlankOrMissing(suite.suite_version)) {
        throw new Error(`Assistant stage2 suite_version is required: ${filePath}`);
    }
    if (suite.scenario_count !== suite.cases.length) {
        throw new Error(`Assistant stage2 scenario_count mismatch: ${filePath}`);
    }
    // Compare sorted copies so the declared order does not matter.
    const declaredIds = [...suite.case_ids].sort();
    const actualIds = suite.cases.map((entry) => entry.case_id).sort();
    const hasIdMismatch = declaredIds.length !== actualIds.length ||
        declaredIds.some((id, position) => id !== actualIds[position]);
    if (hasIdMismatch) {
        throw new Error(`Assistant stage2 case_ids do not match cases[]: ${filePath}`);
    }
    const caseWithoutTurns = suite.cases.find((entry) => !Array.isArray(entry.turns) || entry.turns.length === 0);
    if (caseWithoutTurns !== undefined) {
        throw new Error(`Assistant stage2 case ${caseWithoutTurns.case_id} must include at least one turn.`);
    }
    return suite;
}
/**
 * Heuristic: does the text contain at least two kinds of domain anchors?
 *
 * Three anchor kinds are checked:
 * - a period: a year 20xx, optionally followed by `-`, `.` or `/` and a month 01-12;
 * - an accounting object: one of a fixed list of Russian/English word stems
 *   (matched as substrings so inflected forms count), or the abbreviation "ос"
 *   as a standalone token;
 * - a standalone two-digit account code from a fixed list.
 *
 * @param {unknown} text Text to inspect; null/undefined are treated as empty.
 * @returns {boolean} True when at least two of the three anchor kinds are present.
 */
function hasDomainAnchors(text) {
    const source = String(text ?? "");
    if (!source.trim()) {
        return false;
    }
    const hasPeriod = /\b20\d{2}(?:[-./](?:0[1-9]|1[0-2]))?\b/.test(source);
    const hasAccountingStem = /(счет|контрагент|документ|ндс|period|account|supplier|invoice|guid|объект)/i.test(source);
    // "ос" must stand alone: as a bare substring it also matched everyday words
    // such as "вопрос" or "просто", making almost any Russian text look anchored.
    // `\b` is ASCII-only in JS, hence the Unicode-aware lookarounds.
    const hasFixedAssetAbbreviation = /(?<![\p{L}\p{N}_])ос(?![\p{L}\p{N}_])/iu.test(source);
    const hasAccountingObject = hasAccountingStem || hasFixedAssetAbbreviation;
    const hasAccountCode = /\b(?:01|02|03|04|08|10|19|20|25|26|41|43|44|50|51|52|57|60|62|68|69|70|71|73|76|90|91|94|97)\b/.test(source);
    const hits = [hasPeriod, hasAccountingObject, hasAccountCode].filter(Boolean).length;
    return hits >= 2;
}
/**
 * Heuristic: does the text expose raw technical identifiers?
 *
 * Flags the text when it contains any UUID-shaped token, any run of 24+ hex
 * characters, or more than one occurrence of the keywords guid / uuid /
 * entity_id / source_ref / canonical_ref / fragment_id.
 *
 * @param {unknown} text Text to inspect; null/undefined are treated as empty.
 * @returns {boolean} True when leakage is detected.
 */
function detectEntityLeakage(text) {
    const source = String(text ?? "");
    if (source.trim() === "") {
        return false;
    }
    const countMatches = (pattern) => (source.match(pattern) ?? []).length;
    if (countMatches(/\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b/gi) > 0) {
        return true;
    }
    // A single keyword mention is tolerated; two or more are treated as leakage.
    if (countMatches(/\b(?:guid|uuid|entity_id|source_ref|canonical_ref|fragment_id)\b/gi) > 1) {
        return true;
    }
    return countMatches(/\b[0-9a-f]{24,}\b/gi) > 0;
}
/**
 * Extracts the non-empty strings from an arbitrary value.
 *
 * @param {unknown} value Expected to be an array; anything else yields `[]`.
 * @returns {string[]} Trimmed string items in original order; non-strings and
 *   blank strings are dropped.
 */
function extractTextList(value) {
    if (!Array.isArray(value)) {
        return [];
    }
    const texts = [];
    for (const item of value) {
        if (typeof item !== "string") {
            continue;
        }
        const trimmed = item.trim();
        if (trimmed) {
            texts.push(trimmed);
        }
    }
    return texts;
}
/**
 * Narrows an arbitrary value to a narrowing-strength label.
 *
 * @param {unknown} value Candidate value.
 * @returns {"weak" | "medium" | "strong" | null} `value` when it is one of the
 *   three labels, otherwise `null`.
 */
function toNarrowingStrength(value) {
    switch (value) {
        case "weak":
        case "medium":
        case "strong":
            return value;
        default:
            return null;
    }
}
/**
 * Narrows an arbitrary value to a "degraded to" label.
 *
 * @param {unknown} value Candidate value.
 * @returns {"partial" | "clarification" | null} `value` when it is one of the
 *   two labels, otherwise `null`.
 */
function toDegradedTo(value) {
    switch (value) {
        case "partial":
        case "clarification":
            return value;
        default:
            return null;
    }
}
/**
 * Renders a stage 1 assistant eval run report as Markdown.
 *
 * Sections, in order: header fields, raw metrics with rubric bands, subsets,
 * scenario summary, improvement hints. Missing values fall back to empty
 * strings, zeros or "n/a".
 *
 * @param {object} report Run report; raw metrics are read from `report.metrics.raw`
 *   and bands from `report.rubric_bands`, keyed by metric name.
 * @returns {string} Markdown text (lines joined with "\n", ending with a newline).
 */
function buildAssistantEvalMarkdownReport(report) {
    const show = (value, fallback) => String(value ?? fallback);
    const rawMetrics = (report.metrics ?? {}).raw ?? {};
    const bands = report.rubric_bands ?? {};
    const subsets = report.subsets ?? {};
    const scenarioSummary = report.scenario_summary ?? {};
    const improvementHints = report.improvement_hints ?? {};

    const tableRows = [];
    for (const [name, rawValue] of Object.entries(rawMetrics)) {
        const band = bands[name];
        const bandCell = band ? `${String(band.score)} (${String(band.label)})` : "n/a";
        tableRows.push(`| ${name} | ${show(rawValue, "n/a")} | ${bandCell} |`);
    }

    const lines = [];
    lines.push(`# ${show(report.report_title, "Assistant Stage 1 Eval Run")}`, "");
    lines.push(`- run_id: ${show(report.run_id, "")}`);
    lines.push(`- eval_target: ${show(report.eval_target, "")}`);
    lines.push(`- run_timestamp: ${show(report.run_timestamp, "")}`);
    lines.push(`- suite_id: ${show(report.suite_id, "")}`);
    lines.push(`- suite_version: ${show(report.suite_version, "")}`);
    lines.push(`- cases_total: ${show(report.cases_total, 0)}`, "");
    lines.push("## Raw Metrics and Rubric Bands", "");
    lines.push("| Metric | Raw | Rubric band |", "|---|---:|---|");
    lines.push(tableRows.join("\n") || "| n/a | n/a | n/a |", "");
    lines.push("## Subsets", "");
    lines.push(`- broad_cases_total: ${show(subsets.broad_cases_total, 0)}`);
    lines.push(`- followup_cases_total: ${show(subsets.followup_cases_total, 0)}`, "");
    lines.push("## Scenario Summary", "");
    lines.push(`- improved_or_strong: ${show(scenarioSummary.improved_or_strong, 0)}`);
    lines.push(`- unchanged_or_mixed: ${show(scenarioSummary.unchanged_or_mixed, 0)}`);
    lines.push(`- weak_or_regressed: ${show(scenarioSummary.weak_or_regressed, 0)}`, "");
    lines.push("## Improvement Hints", "");
    lines.push(`- strongest_signals: ${show(improvementHints.strongest_signals, "n/a")}`);
    lines.push(`- weakest_signals: ${show(improvementHints.weakest_signals, "n/a")}`, "");
    return lines.join("\n");
}
/**
 * Renders a stage 2 assistant eval run report as Markdown.
 *
 * Sections, in order: header fields, raw metrics with rubric bands, subsets,
 * scenario summary. Missing values fall back to empty strings, zeros or "n/a".
 *
 * @param {object} report Run report; raw metrics are read from `report.metrics.raw`
 *   and bands from `report.rubric_bands`, keyed by metric name.
 * @returns {string} Markdown text (lines joined with "\n", ending with a newline).
 */
function buildAssistantStage2EvalMarkdownReport(report) {
    const show = (value, fallback) => String(value ?? fallback);
    const rawMetrics = (report.metrics ?? {}).raw ?? {};
    const bands = report.rubric_bands ?? {};
    const subsets = report.subsets ?? {};
    const scenarioSummary = report.scenario_summary ?? {};

    const tableRows = Object.entries(rawMetrics).map(([name, rawValue]) => {
        const band = bands[name];
        const bandCell = band ? `${String(band.score)} (${String(band.label)})` : "n/a";
        return `| ${name} | ${show(rawValue, "n/a")} | ${bandCell} |`;
    });

    const header = [
        `# ${show(report.report_title, "Assistant Stage 2 Eval Run")}`,
        "",
        `- run_id: ${show(report.run_id, "")}`,
        `- eval_target: ${show(report.eval_target, "")}`,
        `- run_timestamp: ${show(report.run_timestamp, "")}`,
        `- suite_id: ${show(report.suite_id, "")}`,
        `- suite_version: ${show(report.suite_version, "")}`,
        `- cases_total: ${show(report.cases_total, 0)}`,
        ""
    ];
    const metricsSection = [
        "## Raw Metrics and Rubric Bands",
        "",
        "| Metric | Raw | Rubric band |",
        "|---|---:|---|",
        tableRows.join("\n") || "| n/a | n/a | n/a |",
        ""
    ];
    const subsetsSection = [
        "## Subsets",
        "",
        `- expected_problem_cases_total: ${show(subsets.expected_problem_cases_total, 0)}`,
        `- followup_cases_total: ${show(subsets.followup_cases_total, 0)}`,
        `- candidate_cases_total: ${show(subsets.candidate_cases_total, 0)}`,
        ""
    ];
    const summarySection = [
        "## Scenario Summary",
        "",
        `- improved_or_strong: ${show(scenarioSummary.improved_or_strong, 0)}`,
        `- unchanged_or_mixed: ${show(scenarioSummary.unchanged_or_mixed, 0)}`,
        `- weak_or_regressed: ${show(scenarioSummary.weak_or_regressed, 0)}`,
        ""
    ];
    return [...header, ...metricsSection, ...subsetsSection, ...summarySection].join("\n");
}
/**
 * Renders a stage 1 baseline-vs-current comparison report as Markdown.
 *
 * Sections, in order: header fields, metric deltas table, scenario notes
 * summary. Missing values fall back to empty strings, zeros or "n/a".
 *
 * @param {object} report Comparison report; `report.metric_deltas` maps a metric
 *   name to `{baseline, current, delta, trend}`.
 * @returns {string} Markdown text (lines joined with "\n", ending with a newline).
 */
function buildAssistantComparisonMarkdownReport(report) {
    const show = (value, fallback) => String(value ?? fallback);
    const metricDeltas = report.metric_deltas ?? {};
    const notesSummary = report.scenario_notes_summary ?? {};

    const tableRows = [];
    for (const [name, entry] of Object.entries(metricDeltas)) {
        const cells = [
            name,
            show(entry.baseline, "n/a"),
            show(entry.current, "n/a"),
            show(entry.delta, "n/a"),
            show(entry.trend, "n/a")
        ];
        tableRows.push(`| ${cells.join(" | ")} |`);
    }

    const lines = [];
    lines.push(`# ${show(report.report_title, "Assistant Stage 1 Baseline vs Current")}`, "");
    lines.push(`- comparison_id: ${show(report.comparison_id, "")}`);
    lines.push(`- baseline_run_id: ${show(report.baseline_run_id, "")}`);
    lines.push(`- current_run_id: ${show(report.current_run_id, "")}`);
    lines.push(`- suite_version: ${show(report.suite_version, "")}`, "");
    lines.push("## Metric Deltas", "");
    lines.push("| Metric | Baseline | Current | Delta | Trend |", "|---|---:|---:|---:|---|");
    lines.push(tableRows.join("\n") || "| n/a | n/a | n/a | n/a | n/a |", "");
    lines.push("## Scenario Notes Summary", "");
    lines.push(`- improved: ${show(notesSummary.improved, 0)}`);
    lines.push(`- unchanged: ${show(notesSummary.unchanged, 0)}`);
    lines.push(`- weakened: ${show(notesSummary.weakened, 0)}`, "");
    return lines.join("\n");
}
/**
 * Renders a stage 2 baseline-vs-current comparison report as Markdown.
 *
 * Sections, in order: header fields, metric deltas table, scenario notes
 * summary. Missing values fall back to empty strings, zeros or "n/a".
 *
 * @param {object} report Comparison report; `report.metric_deltas` maps a metric
 *   name to `{baseline, current, delta, trend}`.
 * @returns {string} Markdown text (lines joined with "\n", ending with a newline).
 */
function buildAssistantStage2ComparisonMarkdownReport(report) {
    const show = (value, fallback) => String(value ?? fallback);
    const metricDeltas = report.metric_deltas ?? {};
    const notesSummary = report.scenario_notes_summary ?? {};

    const tableRows = Object.entries(metricDeltas).map(([name, entry]) => {
        const baselineCell = show(entry.baseline, "n/a");
        const currentCell = show(entry.current, "n/a");
        const deltaCell = show(entry.delta, "n/a");
        const trendCell = show(entry.trend, "n/a");
        return `| ${name} | ${baselineCell} | ${currentCell} | ${deltaCell} | ${trendCell} |`;
    });

    const header = [
        `# ${show(report.report_title, "Assistant Stage 2 Baseline vs Current")}`,
        "",
        `- comparison_id: ${show(report.comparison_id, "")}`,
        `- baseline_run_id: ${show(report.baseline_run_id, "")}`,
        `- current_run_id: ${show(report.current_run_id, "")}`,
        `- suite_version: ${show(report.suite_version, "")}`,
        ""
    ];
    const deltasSection = [
        "## Metric Deltas",
        "",
        "| Metric | Baseline | Current | Delta | Trend |",
        "|---|---:|---:|---:|---|",
        tableRows.join("\n") || "| n/a | n/a | n/a | n/a | n/a |",
        ""
    ];
    const notesSection = [
        "## Scenario Notes Summary",
        "",
        `- improved: ${show(notesSummary.improved, 0)}`,
        `- unchanged: ${show(notesSummary.unchanged, 0)}`,
        `- weakened: ${show(notesSummary.weakened, 0)}`,
        ""
    ];
    return [...header, ...deltasSection, ...notesSection].join("\n");
}
class EvalService {
// Injected normalizer dependency; runV2 awaits its `normalize(...)` once per eval case.
normalizerService;
/**
 * @param normalizerService Object exposing an async `normalize(request)` method;
 *   stored as-is on the instance.
 */
constructor(normalizerService) {
this.normalizerService = normalizerService;
}
listCases() {
(0, files_1.ensureDir)(config_1.EVAL_CASES_DIR);
const files = fs_1.default
.readdirSync(config_1.EVAL_CASES_DIR)
.filter((item) => item.endsWith(".json") && !item.endsWith(".report.json"));
return files
.map((name) => {
const raw = fs_1.default.readFileSync(path_1.default.resolve(config_1.EVAL_CASES_DIR, name), "utf-8");
return JSON.parse(raw);
})
.sort((a, b) => a.case_id.localeCompare(b.case_id));
}
async runV2(payload) {
const runId = `eval-${(0, nanoid_1.nanoid)(10)}`;
const results = [];
const routeCounter = {};
const fallbackCounter = {};
let schemaPass = 0;
let inScopeMessages = 0;
let multiIntentMessages = 0;
let clarificationMessages = 0;
let totalFragments = 0;
let inScopeFragments = 0;
let outOfScopeFragments = 0;
let unclearFragments = 0;
let executableWithSoftAssumptionsFragments = 0;
let softAssumptionFragments = 0;
let routedFragments = 0;
let noRouteFragments = 0;
let requestsTotal = 0;
let retriesUsed = 0;
let clarificationLabeledCases = 0;
let clarificationTruePositive = 0;
let clarificationFalsePositive = 0;
let clarificationFalseNegative = 0;
let scopeLabeledCases = 0;
let scopeCorrectCases = 0;
let routeLabeledCases = 0;
let routeCorrectCases = 0;
let expectedRoutedCases = 0;
let noRouteTruePositive = 0;
let noRouteFalsePositive = 0;
let stateConsistencyChecks = 0;
let stateConsistencyPass = 0;
for (const item of payload.cases) {
const response = await this.normalizerService.normalize({
...payload.normalizeConfig,
userQuestion: item.raw_question,
context: {
period_hint: payload.analysisDate ?? undefined,
analysis_context: payload.analysisDate
? {
as_of_date: payload.analysisDate,
source: "eval_analysis_date"
}
: undefined,
eval_label: runId,
case_id: item.case_id,
eval_mode: payload.mode
},
retryPolicy: payload.mode === "single-pass-strict" ? "single-pass-strict" : "default",
useMock: payload.useMock
});
if (response.validation.passed) {
schemaPass += 1;
}
const requestCount = Number(response.request_count_for_case ?? 0);
requestsTotal += requestCount;
if (requestCount > 1) {
retriesUsed += 1;
}
const normalized = response.normalized &&
["normalized_query_v2", "normalized_query_v2_0_1", "normalized_query_v2_0_2"].includes(String(response.normalized.schema_version ?? ""))
? response.normalized
: null;
const routeSummary = response.route_hint_summary &&
response.route_hint_summary.mode === "deterministic_v2"
? response.route_hint_summary
: null;
if (normalized) {
if (normalized.message_in_scope) {
inScopeMessages += 1;
}
if (normalized.contains_multiple_tasks) {
multiIntentMessages += 1;
}
if (normalized.global_notes.needs_clarification) {
clarificationMessages += 1;
}
totalFragments += normalized.fragments.length;
const inScopeList = normalized.fragments.filter((fragment) => fragment.domain_relevance === "in_scope");
inScopeFragments += inScopeList.length;
outOfScopeFragments += normalized.fragments.filter((fragment) => fragment.domain_relevance === "out_of_scope").length;
unclearFragments += normalized.fragments.filter((fragment) => fragment.domain_relevance === "unclear").length;
for (const fragment of inScopeList) {
const readiness = executionReadinessOf(fragment);
if (readiness === "executable_with_soft_assumptions") {
executableWithSoftAssumptionsFragments += 1;
}
if (softAssumptionsOf(fragment).length > 0) {
softAssumptionFragments += 1;
}
}
}
const predictedClarification = Boolean(normalized?.global_notes?.needs_clarification);
const expectedClarification = typeof item.expected?.clarification_required === "boolean" ? item.expected.clarification_required : null;
if (expectedClarification !== null) {
clarificationLabeledCases += 1;
if (predictedClarification && expectedClarification)
clarificationTruePositive += 1;
if (predictedClarification && !expectedClarification)
clarificationFalsePositive += 1;
if (!predictedClarification && expectedClarification)
clarificationFalseNegative += 1;
}
const predictedScope = normalized ? normalized.message_in_scope : null;
const expectedScope = expectedScopeInScope(item.expected);
if (expectedScope !== null && predictedScope !== null) {
scopeLabeledCases += 1;
if (predictedScope === expectedScope) {
scopeCorrectCases += 1;
}
}
const predictedRouteStatus = routeSummary
? routeSummary.decisions.some((decision) => decision.route !== "no_route")
? "routed"
: "no_route"
: null;
const predictedNoRouteReason = routeSummary &&
routeSummary.decisions.length > 0 &&
routeSummary.decisions.every((decision) => decision.route === "no_route")
? (routeSummary.decisions[0]?.no_route_reason ?? null)
: null;
const expectedRouteStatus = item.expected?.expected_route_status ?? null;
const expectedNoRouteReason = item.expected?.expected_no_route_reason ?? null;
if (expectedRouteStatus) {
routeLabeledCases += 1;
if (predictedRouteStatus === expectedRouteStatus) {
routeCorrectCases += 1;
}
if (expectedRouteStatus === "routed") {
expectedRoutedCases += 1;
}
}
if (predictedRouteStatus === "no_route") {
if (expectedRouteStatus === "no_route") {
if (!expectedNoRouteReason || expectedNoRouteReason === predictedNoRouteReason) {
noRouteTruePositive += 1;
}
else {
noRouteFalsePositive += 1;
}
}
else if (expectedRouteStatus === "routed") {
noRouteFalsePositive += 1;
}
}
if (routeSummary) {
for (const decision of routeSummary.decisions) {
stateConsistencyChecks += 1;
if (isDecisionStateConsistent(decision)) {
stateConsistencyPass += 1;
}
routeCounter[decision.route] = (routeCounter[decision.route] ?? 0) + 1;
if (decision.route === "no_route") {
noRouteFragments += 1;
}
else {
routedFragments += 1;
}
}
const fallbackType = String(routeSummary.fallback?.type ?? "none");
fallbackCounter[fallbackType] = (fallbackCounter[fallbackType] ?? 0) + 1;
}
else {
fallbackCounter.none = (fallbackCounter.none ?? 0) + 1;
}
results.push({
case_id: item.case_id,
raw_question: item.raw_question,
validation_passed: response.validation.passed,
message_in_scope: normalized?.message_in_scope ?? null,
scope_confidence: normalized?.scope_confidence ?? null,
contains_multiple_tasks: normalized?.contains_multiple_tasks ?? null,
fragments_total: normalized?.fragments.length ?? 0,
in_scope_fragments: normalized ? normalized.fragments.filter((fragment) => fragment.domain_relevance === "in_scope").length : 0,
out_of_scope_fragments: normalized
? normalized.fragments.filter((fragment) => fragment.domain_relevance === "out_of_scope").length
: 0,
unclear_fragments: normalized ? normalized.fragments.filter((fragment) => fragment.domain_relevance === "unclear").length : 0,
fallback_type: routeSummary?.fallback?.type ?? "none",
predicted_route_status: predictedRouteStatus,
expected_route_status: expectedRouteStatus,
predicted_no_route_reason: predictedNoRouteReason,
expected_no_route_reason: expectedNoRouteReason,
predicted_clarification_required: predictedClarification,
expected_clarification_required: expectedClarification,
executable_with_soft_assumptions_fragments: normalized
? normalized.fragments.filter((fragment) => executionReadinessOf(fragment) === "executable_with_soft_assumptions")
.length
: 0,
trace_id: response.trace_id,
request_count_for_case: requestCount
});
}
const total = Math.max(1, payload.cases.length);
const totalFragmentsSafe = Math.max(1, totalFragments);
const totalRoutedDecisions = Math.max(1, routedFragments + noRouteFragments);
const precisionDenominator = clarificationTruePositive + clarificationFalsePositive;
const recallDenominator = clarificationTruePositive + clarificationFalseNegative;
const noRoutePrecisionDenominator = noRouteTruePositive + noRouteFalsePositive;
const metrics = {
schema_validation_pass_rate: Number(((schemaPass / total) * 100).toFixed(2)),
scope_detection_accuracy: scopeLabeledCases > 0 ? Number(((scopeCorrectCases / scopeLabeledCases) * 100).toFixed(2)) : null,
scope_in_scope_rate: Number(((inScopeMessages / total) * 100).toFixed(2)),
multi_intent_detected_rate: Number(((multiIntentMessages / total) * 100).toFixed(2)),
clarification_required_rate: Number(((clarificationMessages / total) * 100).toFixed(2)),
avg_fragments_per_message: Number((totalFragments / total).toFixed(2)),
out_of_scope_fragment_rate: Number(((outOfScopeFragments / totalFragmentsSafe) * 100).toFixed(2)),
routed_fragment_rate: Number(((routedFragments / totalRoutedDecisions) * 100).toFixed(2)),
no_route_fragment_rate: Number(((noRouteFragments / totalRoutedDecisions) * 100).toFixed(2)),
route_resolution_accuracy: routeLabeledCases > 0 ? Number(((routeCorrectCases / routeLabeledCases) * 100).toFixed(2)) : null,
no_route_precision: noRoutePrecisionDenominator > 0 ? Number(((noRouteTruePositive / noRoutePrecisionDenominator) * 100).toFixed(2)) : null,
false_no_route_rate: expectedRoutedCases > 0 ? Number(((noRouteFalsePositive / expectedRoutedCases) * 100).toFixed(2)) : null,
execution_state_consistency_rate: stateConsistencyChecks > 0 ? Number(((stateConsistencyPass / stateConsistencyChecks) * 100).toFixed(2)) : null,
executable_with_soft_assumptions_rate: Number(((executableWithSoftAssumptionsFragments / Math.max(1, inScopeFragments)) * 100).toFixed(2)),
soft_assumption_used_fragment_rate: Number(((softAssumptionFragments / Math.max(1, inScopeFragments)) * 100).toFixed(2)),
clarification_precision: precisionDenominator > 0 ? Number(((clarificationTruePositive / precisionDenominator) * 100).toFixed(2)) : null,
clarification_recall: recallDenominator > 0 ? Number(((clarificationTruePositive / recallDenominator) * 100).toFixed(2)) : null,
false_clarification_rate: clarificationLabeledCases > 0 ? Number(((clarificationFalsePositive / clarificationLabeledCases) * 100).toFixed(2)) : null
};
const report = {
run_id: runId,
timestamp: new Date().toISOString(),
mode: payload.mode,
use_mock: Boolean(payload.useMock),
prompt_version: payload.normalizeConfig.promptVersion ?? null,
schema_version: String(payload.normalizeConfig.schemaVersion ?? payload.normalizeConfig.promptVersion ?? "")
.toLowerCase()
.includes("v2_0_2")
? "v2_0_2"
: String(payload.normalizeConfig.schemaVersion ?? payload.normalizeConfig.promptVersion ?? "")
.toLowerCase()
.includes("v2_0_1")
? "v2_0_1"
: "v2",
dataset: {
source: payload.rawQuestions ? "inline_raw_questions" : payload.caseSetFile ? "file" : "data/eval_cases/*.json",
file: payload.caseSetFile ?? null,
raw_questions_count: payload.rawQuestions ? parseRawQuestions(payload.rawQuestions).length : null
},
cases_total: payload.cases.length,
metrics,
budget: {
requests_total: requestsTotal,
retries_used: retriesUsed
},
clarification_eval: {
labeled_cases: clarificationLabeledCases,
true_positive: clarificationTruePositive,
false_positive: clarificationFalsePositive,
false_negative: clarificationFalseNegative
},
route_eval: {
labeled_cases: routeLabeledCases,
correct_cases: routeCorrectCases,
expected_routed_cases: expectedRoutedCases,
no_route_true_positive: noRouteTruePositive,
no_route_false_positive: noRouteFalsePositive
},
scope_eval: {
labeled_cases: scopeLabeledCases,
correct_cases: scopeCorrectCases
},
execution_state_eval: {
checks_total: stateConsistencyChecks,
checks_passed: stateConsistencyPass
},
route_distribution: routeCounter,
fallback_distribution: fallbackCounter,
results
};
(0, files_1.ensureDir)(config_1.EVAL_CASES_DIR);
tryWriteJsonFile(path_1.default.resolve(config_1.EVAL_CASES_DIR, `${runId}.report.json`), report);
return report;
}
collectAssistantSignals(finalResponse, turnResponses) {
const debug = finalResponse.debug;
const retrievalResults = Array.isArray(debug?.retrieval_results) ? debug.retrieval_results : [];
const sourceRefSet = new Set();
const limitationCodeSet = new Set();
const routeSet = new Set();
const confidenceScores = [];
const narrowingOrder = { weak: 0, medium: 1, strong: 2 };
let broadQueryDetected = false;
let broadResultFlag = false;
let minimumEvidenceFailed = false;
let degradedTo = null;
let narrowingStrength = null;
for (const result of retrievalResults) {
routeSet.add(String(result.route ?? "unknown"));
const summary = result.summary ?? {};
if (summary.broad_query_detected === true)
broadQueryDetected = true;
if (summary.broad_result_flag === true)
broadResultFlag = true;
if (summary.minimum_evidence_failed === true)
minimumEvidenceFailed = true;
const degraded = toDegradedTo(summary.degraded_to);
if (degraded === "clarification") {
degradedTo = "clarification";
}
else if (!degradedTo && degraded === "partial") {
degradedTo = "partial";
}
const narrowed = toNarrowingStrength(summary.narrowing_strength);
if (narrowed && (!narrowingStrength || narrowingOrder[narrowed] < narrowingOrder[narrowingStrength])) {
narrowingStrength = narrowed;
}
if (result.confidence === "high")
confidenceScores.push(3);
if (result.confidence === "medium")
confidenceScores.push(2);
if (result.confidence === "low")
confidenceScores.push(1);
for (const evidence of Array.isArray(result.evidence) ? result.evidence : []) {
const canonicalRef = String(evidence.source_ref?.canonical_ref ?? "").trim();
if (canonicalRef) {
sourceRefSet.add(canonicalRef);
}
const reasonCode = String(evidence.limitation?.reason_code ?? "").trim();
if (reasonCode) {
limitationCodeSet.add(reasonCode);
}
if (evidence.confidence === "high")
confidenceScores.push(3);
if (evidence.confidence === "medium")
confidenceScores.push(2);
if (evidence.confidence === "low")
confidenceScores.push(1);
}
}
const averageConfidence = confidenceScores.length > 0 ? confidenceScores.reduce((acc, item) => acc + item, 0) / confidenceScores.length : null;
const evidenceConfidence = averageConfidence === null ? null : averageConfidence >= 2.6 ? "high" : averageConfidence >= 1.8 ? "medium" : "low";
const mechanismStatus = debug?.answer_structure_v11?.mechanism_block?.status === "grounded" ||
debug?.answer_structure_v11?.mechanism_block?.status === "limited" ||
debug?.answer_structure_v11?.mechanism_block?.status === "unresolved"
? debug.answer_structure_v11.mechanism_block.status
: null;
const followupStateApplied = turnResponses.some((item) => item.debug?.followup_state_usage?.applied === true);
const uncertaintyLimitationsCount = debug?.answer_structure_v11?.uncertainty_block?.limitations?.length ?? 0;
return {
broad_query_detected: broadQueryDetected,
broad_result_flag: broadResultFlag,
narrowing_strength: narrowingStrength,
minimum_evidence_failed: minimumEvidenceFailed,
degraded_to: degradedTo,
evidence_confidence: evidenceConfidence,
limitation_reason_codes: [...limitationCodeSet],
mechanism_status: mechanismStatus,
source_refs: [...sourceRefSet],
routes: [...routeSet],
followup_state_applied: followupStateApplied,
uncertainty_limitations_count: uncertaintyLimitationsCount
};
}
collectAssistantStage2Signals(finalResponse, turnResponses) {
const base = this.collectAssistantSignals(finalResponse, turnResponses);
const debug = finalResponse.debug;
const retrievalResults = Array.isArray(debug?.retrieval_results) ? debug.retrieval_results : [];
const typeSet = new Set();
const mechanismSummaries = new Set();
let candidateEvidenceTotal = 0;
let problemUnitsTotal = 0;
let duplicateCollapsesTotal = 0;
for (const result of retrievalResults) {
const candidates = Array.isArray(result.candidate_evidence) ? result.candidate_evidence : [];
candidateEvidenceTotal += candidates.length;
const problemUnits = Array.isArray(result.problem_units) ? result.problem_units : [];
problemUnitsTotal += problemUnits.length;
for (const unit of problemUnits) {
const unitType = toProblemUnitType(unit.problem_unit_type);
if (unitType) {
typeSet.add(unitType);
}
const mechanismSummary = String(unit.mechanism_summary ?? "").trim();
if (mechanismSummary) {
mechanismSummaries.add(mechanismSummary);
}
}
if (result.problem_unit_summary && typeof result.problem_unit_summary.duplicate_collapses === "number") {
duplicateCollapsesTotal += Number(result.problem_unit_summary.duplicate_collapses);
}
}
const answerMode = typeof debug?.problem_answer_mode === "string" ? debug.problem_answer_mode : null;
const unitsUsedCount = Number(debug?.problem_units_used_count ?? 0);
const unitIdsUsed = Array.isArray(debug?.problem_unit_ids_used)
? debug.problem_unit_ids_used
.map((item) => String(item ?? "").trim())
.filter(Boolean)
: [];
const problemCentricApplied = debug?.problem_centric_answer_applied === true || answerMode === "stage2_problem_centric_v1";
return {
...base,
candidate_evidence_total: candidateEvidenceTotal,
problem_units_total: problemUnitsTotal,
problem_unit_types: [...typeSet],
problem_mechanism_summaries: [...mechanismSummaries],
duplicate_collapses_total: duplicateCollapsesTotal,
problem_centric_answer_applied: problemCentricApplied,
problem_units_used_count: unitsUsedCount,
problem_answer_mode: answerMode,
problem_unit_ids_used: unitIdsUsed,
entity_leakage_detected: detectEntityLeakage(String(finalResponse.assistant_reply ?? ""))
};
}
getExpectedProblemUnitTypes(suiteCase) {
const expected = Array.isArray(suiteCase.expected_hints?.expected_problem_unit_types)
? suiteCase.expected_hints?.expected_problem_unit_types
: [];
const output = new Set();
for (const value of expected ?? []) {
const mapped = toProblemUnitType(value);
if (mapped) {
output.add(mapped);
}
}
return [...output];
}
computeProblemUnitPrecision(expectedTypes, detectedTypes) {
const uniqueExpected = [...new Set(expectedTypes)];
const uniqueDetected = [...new Set(detectedTypes)];
if (uniqueDetected.length === 0) {
return uniqueExpected.length === 0 ? 1 : 0;
}
if (uniqueExpected.length === 0) {
return 0;
}
const matchedDetected = uniqueDetected.filter((item) => uniqueExpected.includes(item)).length;
return round2(matchedDetected / uniqueDetected.length);
}
computeProblemUnitRecallProxy(expectedTypes, detectedTypes) {
const uniqueExpected = [...new Set(expectedTypes)];
const uniqueDetected = [...new Set(detectedTypes)];
if (uniqueExpected.length === 0) {
return null;
}
if (uniqueDetected.length === 0) {
return 0;
}
const matchedExpected = uniqueExpected.filter((item) => uniqueDetected.includes(item)).length;
return round2(matchedExpected / uniqueExpected.length);
}
computeDuplicateCollapseRate(candidateTotal, duplicateCollapses) {
if (candidateTotal <= 0) {
return null;
}
return round2(Math.min(1, Math.max(0, duplicateCollapses / candidateTotal)));
}
computeMechanismCoherenceScore(finalResponse, signals) {
const mechanismBlock = finalResponse.debug?.answer_structure_v11?.mechanism_block;
const mechanismStatus = mechanismBlock?.status;
const mechanismNotes = extractTextList(mechanismBlock?.mechanism_notes);
const hasProblemMechanism = signals.problem_mechanism_summaries.length > 0;
let score = 0;
if (mechanismStatus === "grounded" && hasProblemMechanism && mechanismNotes.length > 0) {
score = 5;
}
else if ((mechanismStatus === "limited" || mechanismStatus === "unresolved") && (hasProblemMechanism || mechanismNotes.length > 0)) {
score = 3;
}
else if (hasProblemMechanism || mechanismNotes.length > 0) {
score = 2;
}
if (mechanismStatus === "grounded" && !hasProblemMechanism) {
score = Math.min(score, 2);
}
if (signals.limitation_reason_codes.includes("missing_mechanism")) {
score -= 1;
}
return clampScore(score);
}
computeProblemClarityScore(finalResponse, signals) {
const structure = finalResponse.debug?.answer_structure_v11;
const answerSummary = String(structure?.answer_summary ?? "").trim();
const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? "").trim();
const recommendedActions = extractTextList(structure?.next_step_block?.recommended_actions);
const clarificationQuestions = extractTextList(structure?.next_step_block?.clarification_questions);
const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations);
let score = 0;
if (answerSummary.length > 20)
score += 1;
if (directAnswer.length > 20)
score += 1;
if (hasDomainAnchors(`${answerSummary} ${directAnswer}`))
score += 1;
if (recommendedActions.length > 0 || clarificationQuestions.length > 0)
score += 1;
if (signals.problem_units_total > 0 || signals.problem_centric_answer_applied)
score += 1;
if ((signals.minimum_evidence_failed || signals.degraded_to === "clarification") && uncertaintyLimitations.length === 0) {
score -= 1;
}
if (signals.entity_leakage_detected) {
score -= 1;
}
return clampScore(score);
}
computeAssistantMetrics(input) {
const diagnostics = input.diagnostics;
const total = Math.max(1, diagnostics.length);
const signatureCounter = diagnostics.reduce((acc, item) => {
acc[item.signature] = (acc[item.signature] ?? 0) + 1;
return acc;
}, {});
const uniqueSignatures = Object.keys(signatureCounter).length;
const genericCases = diagnostics.filter((item) => item.is_generic).length;
const falseConfidenceCases = diagnostics.filter((item) => item.is_false_confident).length;
const broadCases = diagnostics.filter((item) => item.is_broad_answer !== null);
const broadAnswerCases = broadCases.filter((item) => item.is_broad_answer === true).length;
const followupCases = diagnostics.filter((item) => item.followup_retention_score !== null);
const avgActionability = diagnostics.length > 0
? diagnostics.reduce((acc, item) => acc + item.accountant_actionability_score, 0) / diagnostics.length
: null;
const avgMechanism = diagnostics.length > 0 ? diagnostics.reduce((acc, item) => acc + item.mechanism_specificity_score, 0) / diagnostics.length : null;
const avgFollowup = followupCases.length > 0
? followupCases.reduce((acc, item) => acc + Number(item.followup_retention_score ?? 0), 0) / followupCases.length
: null;
const raw = {
retrieval_differentiation_rate: round2(uniqueSignatures / total),
generic_explanation_rate: round2(genericCases / total),
accountant_actionability_score: avgActionability === null ? null : round2(avgActionability),
false_confidence_rate: round2(falseConfidenceCases / total),
broad_answer_rate: broadCases.length > 0 ? round2(broadAnswerCases / broadCases.length) : null,
mechanism_specificity_score: avgMechanism === null ? null : round2(avgMechanism),
followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup)
};
const rubric_bands = {
retrieval_differentiation_rate: rubricBandForMetric("retrieval_differentiation_rate", raw.retrieval_differentiation_rate),
generic_explanation_rate: rubricBandForMetric("generic_explanation_rate", raw.generic_explanation_rate),
accountant_actionability_score: rubricBandForMetric("accountant_actionability_score", raw.accountant_actionability_score),
false_confidence_rate: rubricBandForMetric("false_confidence_rate", raw.false_confidence_rate),
broad_answer_rate: rubricBandForMetric("broad_answer_rate", raw.broad_answer_rate),
mechanism_specificity_score: rubricBandForMetric("mechanism_specificity_score", raw.mechanism_specificity_score),
followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score)
};
return {
raw,
rubric_bands,
denominators: {
cases_total: diagnostics.length,
broad_cases_total: broadCases.length,
followup_cases_total: followupCases.length
},
signature_counts: signatureCounter
};
}
computeAssistantStage2Metrics(input) {
const diagnostics = input.diagnostics;
const signatureCounter = diagnostics.reduce((acc, item) => {
acc[item.signature] = (acc[item.signature] ?? 0) + 1;
return acc;
}, {});
const precisionValues = diagnostics
.map((item) => item.problem_unit_precision)
.filter((item) => typeof item === "number");
const recallValues = diagnostics
.map((item) => item.problem_unit_recall_proxy)
.filter((item) => typeof item === "number");
const collapseValues = diagnostics
.map((item) => item.duplicate_collapse_rate)
.filter((item) => typeof item === "number");
const mechanismValues = diagnostics.map((item) => item.mechanism_coherence_score);
const clarityValues = diagnostics.map((item) => item.problem_clarity_score);
const firstApplicable = diagnostics.filter((item) => item.problem_first_answer_applied !== null);
const firstApplied = firstApplicable.filter((item) => item.problem_first_answer_applied === true).length;
const leakageCases = diagnostics.filter((item) => item.entity_leakage).length;
const followupCases = diagnostics.filter((item) => item.suite_case.question_type === "followup" || item.turn_count > 1);
const candidateCases = diagnostics.filter((item) => item.signals.candidate_evidence_total > 0);
const expectedProblemCases = diagnostics.filter((item) => item.expected_problem_first);
const average = (values) => {
if (values.length === 0)
return null;
return round2(values.reduce((acc, item) => acc + item, 0) / values.length);
};
const raw = {
problem_unit_precision: average(precisionValues),
problem_unit_recall_proxy: average(recallValues),
duplicate_collapse_rate: average(collapseValues),
mechanism_coherence_score: average(mechanismValues),
problem_clarity_score: average(clarityValues),
problem_first_answer_rate: firstApplicable.length > 0 ? round2(firstApplied / firstApplicable.length) : null,
entity_leakage_rate: diagnostics.length > 0 ? round2(leakageCases / diagnostics.length) : null
};
const rubric_bands = {
problem_unit_precision: rubricBandForMetricStage2("problem_unit_precision", raw.problem_unit_precision),
problem_unit_recall_proxy: rubricBandForMetricStage2("problem_unit_recall_proxy", raw.problem_unit_recall_proxy),
duplicate_collapse_rate: rubricBandForMetricStage2("duplicate_collapse_rate", raw.duplicate_collapse_rate),
mechanism_coherence_score: rubricBandForMetricStage2("mechanism_coherence_score", raw.mechanism_coherence_score),
problem_clarity_score: rubricBandForMetricStage2("problem_clarity_score", raw.problem_clarity_score),
problem_first_answer_rate: rubricBandForMetricStage2("problem_first_answer_rate", raw.problem_first_answer_rate),
entity_leakage_rate: rubricBandForMetricStage2("entity_leakage_rate", raw.entity_leakage_rate)
};
return {
raw,
rubric_bands,
denominators: {
cases_total: diagnostics.length,
expected_problem_cases_total: expectedProblemCases.length,
followup_cases_total: followupCases.length,
candidate_cases_total: candidateCases.length,
precision_cases_total: precisionValues.length,
recall_cases_total: recallValues.length,
duplicate_collapse_cases_total: collapseValues.length,
problem_first_applicable_cases_total: firstApplicable.length
},
signature_counts: signatureCounter
};
}
buildAssistantComparisonReport(input) {
const baselineRef = readEvalReportByRef(input.baselineReportFile);
const baselinePath = baselineRef.resolved_path;
const baselineReport = baselineRef.report;
const currentReport = input.currentReport;
const metricKeys = [
"retrieval_differentiation_rate",
"generic_explanation_rate",
"accountant_actionability_score",
"false_confidence_rate",
"broad_answer_rate",
"mechanism_specificity_score",
"followup_context_retention_score"
];
const lowerIsBetter = new Set(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]);
const baselineRaw = (baselineReport.metrics ?? {}).raw ?? {};
const currentRaw = (currentReport.metrics ?? {}).raw ?? {};
const deltas = {};
for (const metric of metricKeys) {
const baseline = typeof baselineRaw[metric] === "number" ? Number(baselineRaw[metric]) : null;
const current = typeof currentRaw[metric] === "number" ? Number(currentRaw[metric]) : null;
const delta = baseline !== null && current !== null ? round2(current - baseline) : null;
let trend = "n/a";
if (baseline !== null && current !== null) {
const improved = lowerIsBetter.has(metric) ? current < baseline - 0.01 : current > baseline + 0.01;
const weakened = lowerIsBetter.has(metric) ? current > baseline + 0.01 : current < baseline - 0.01;
trend = improved ? "improved" : weakened ? "weakened" : "unchanged";
}
deltas[metric] = { baseline, current, delta, trend };
}
const baselineResults = Array.isArray(baselineReport.results) ? baselineReport.results : [];
const currentResults = Array.isArray(currentReport.results) ? currentReport.results : [];
const baselineByCase = new Map();
for (const row of baselineResults) {
baselineByCase.set(String(row.case_id ?? ""), row);
}
const improvedNotes = [];
const unchangedNotes = [];
const weakenedNotes = [];
for (const row of currentResults) {
const caseId = String(row.case_id ?? "");
const currentUsefulness = typeof row.accountant_usefulness_score === "number" ? Number(row.accountant_usefulness_score) : null;
const baselineRow = baselineByCase.get(caseId);
const baselineUsefulness = baselineRow && typeof baselineRow.accountant_usefulness_score === "number"
? Number(baselineRow.accountant_usefulness_score)
: null;
if (baselineUsefulness === null || currentUsefulness === null) {
continue;
}
const delta = round2(currentUsefulness - baselineUsefulness);
const note = `${caseId}: usefulness ${baselineUsefulness} -> ${currentUsefulness} (delta ${delta})`;
if (delta > 0.25) {
improvedNotes.push(note);
}
else if (delta < -0.25) {
weakenedNotes.push(note);
}
else {
unchangedNotes.push(note);
}
}
const comparisonId = `assistant-compare-${(0, nanoid_1.nanoid)(8)}`;
const comparisonReport = {
schema_version: ASSISTANT_STAGE1_COMPARISON_SCHEMA_VERSION,
comparison_id: comparisonId,
run_timestamp: new Date().toISOString(),
baseline_run_id: baselineReport.run_id ?? null,
current_run_id: currentReport.run_id ?? null,
eval_target: "assistant_stage1",
suite_id: currentReport.suite_id ?? baselineReport.suite_id ?? null,
suite_version: currentReport.suite_version ?? baselineReport.suite_version ?? null,
baseline_report_file: baselinePath,
current_report_file: currentReport.artifacts && typeof currentReport.artifacts === "object"
? currentReport.artifacts.run_report_json_path ?? null
: null,
metric_deltas: deltas,
scenario_notes_summary: {
improved: improvedNotes.length,
unchanged: unchangedNotes.length,
weakened: weakenedNotes.length
},
scenario_notes: {
improved: improvedNotes,
unchanged: unchangedNotes,
weakened: weakenedNotes
},
known_limitations: currentReport.known_limitations ?? [
"Comparison is run-to-run and depends on stable mock/runtime flags.",
"Metrics remain Stage 1 heuristic bands, not full product scorecards."
],
report_title: "Assistant Stage 1 Baseline vs Current"
};
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
const jsonPath = path_1.default.resolve(config_1.REPORTS_DIR, `${comparisonId}.json`);
const mdPath = path_1.default.resolve(config_1.REPORTS_DIR, `${comparisonId}.md`);
const jsonWritten = tryWriteJsonFile(jsonPath, comparisonReport);
const mdWritten = tryWriteTextFile(mdPath, buildAssistantComparisonMarkdownReport(comparisonReport));
const comparisonRef = jsonWritten ? jsonPath : putInMemoryEvalReport(comparisonReport);
return {
...comparisonReport,
artifacts: {
comparison_report_json_path: comparisonRef,
comparison_report_md_path: mdWritten ? mdPath : null
}
};
}
buildAssistantStage2ComparisonReport(input) {
const baselineRef = readEvalReportByRef(input.baselineReportFile);
const baselinePath = baselineRef.resolved_path;
const baselineReport = baselineRef.report;
const currentReport = input.currentReport;
const metricKeys = [
"problem_unit_precision",
"problem_unit_recall_proxy",
"duplicate_collapse_rate",
"mechanism_coherence_score",
"problem_clarity_score",
"problem_first_answer_rate",
"entity_leakage_rate"
];
const lowerIsBetter = new Set(["entity_leakage_rate"]);
const baselineRaw = (baselineReport.metrics ?? {}).raw ?? {};
const currentRaw = (currentReport.metrics ?? {}).raw ?? {};
const deltas = {};
for (const metric of metricKeys) {
const baseline = typeof baselineRaw[metric] === "number" ? Number(baselineRaw[metric]) : null;
const current = typeof currentRaw[metric] === "number" ? Number(currentRaw[metric]) : null;
const delta = baseline !== null && current !== null ? round2(current - baseline) : null;
let trend = "n/a";
if (baseline !== null && current !== null) {
const improved = lowerIsBetter.has(metric) ? current < baseline - 0.01 : current > baseline + 0.01;
const weakened = lowerIsBetter.has(metric) ? current > baseline + 0.01 : current < baseline - 0.01;
trend = improved ? "improved" : weakened ? "weakened" : "unchanged";
}
deltas[metric] = { baseline, current, delta, trend };
}
const baselineResults = Array.isArray(baselineReport.results) ? baselineReport.results : [];
const currentResults = Array.isArray(currentReport.results) ? currentReport.results : [];
const baselineByCase = new Map();
for (const row of baselineResults) {
baselineByCase.set(String(row.case_id ?? ""), row);
}
const improvedNotes = [];
const unchangedNotes = [];
const weakenedNotes = [];
const toComposite = (row) => {
if (!row || typeof row !== "object")
return null;
const metricSubscores = row.metric_subscores;
if (!metricSubscores)
return null;
const clarity = typeof metricSubscores.problem_clarity_score === "number" ? Number(metricSubscores.problem_clarity_score) : null;
const mechanism = typeof metricSubscores.mechanism_coherence_score === "number" ? Number(metricSubscores.mechanism_coherence_score) : null;
const firstRate = typeof metricSubscores.problem_first_answer_rate === "number" ? Number(metricSubscores.problem_first_answer_rate) : null;
const leakageRate = typeof metricSubscores.entity_leakage_rate === "number" ? Number(metricSubscores.entity_leakage_rate) : null;
if (clarity === null || mechanism === null || firstRate === null || leakageRate === null) {
return null;
}
return round2((clarity + mechanism + firstRate * 5 + (1 - leakageRate) * 5) / 4);
};
for (const row of currentResults) {
const caseId = String(row.case_id ?? "");
const currentComposite = toComposite(row);
const baselineComposite = toComposite(baselineByCase.get(caseId));
if (currentComposite === null || baselineComposite === null) {
continue;
}
const delta = round2(currentComposite - baselineComposite);
const note = `${caseId}: composite ${baselineComposite} -> ${currentComposite} (delta ${delta})`;
if (delta > 0.25) {
improvedNotes.push(note);
}
else if (delta < -0.25) {
weakenedNotes.push(note);
}
else {
unchangedNotes.push(note);
}
}
const comparisonId = `assistant-stage2-compare-${(0, nanoid_1.nanoid)(8)}`;
const comparisonReport = {
schema_version: ASSISTANT_STAGE2_COMPARISON_SCHEMA_VERSION,
comparison_id: comparisonId,
run_timestamp: new Date().toISOString(),
baseline_run_id: baselineReport.run_id ?? null,
current_run_id: currentReport.run_id ?? null,
eval_target: "assistant_stage2",
suite_id: currentReport.suite_id ?? baselineReport.suite_id ?? null,
suite_version: currentReport.suite_version ?? baselineReport.suite_version ?? null,
baseline_report_file: baselinePath,
current_report_file: currentReport.artifacts && typeof currentReport.artifacts === "object"
? currentReport.artifacts.run_report_json_path ?? null
: null,
metric_deltas: deltas,
scenario_notes_summary: {
improved: improvedNotes.length,
unchanged: unchangedNotes.length,
weakened: weakenedNotes.length
},
scenario_notes: {
improved: improvedNotes,
unchanged: unchangedNotes,
weakened: weakenedNotes
},
known_limitations: currentReport.known_limitations ?? [
"Stage 2 comparison remains run-to-run and depends on stable feature profile.",
"Metrics are Stage 2 Wave 5 heuristics, not final product scorecards."
],
report_title: "Assistant Stage 2 Baseline vs Current"
};
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
const jsonPath = path_1.default.resolve(config_1.REPORTS_DIR, `${comparisonId}.json`);
const mdPath = path_1.default.resolve(config_1.REPORTS_DIR, `${comparisonId}.md`);
const jsonWritten = tryWriteJsonFile(jsonPath, comparisonReport);
const mdWritten = tryWriteTextFile(mdPath, buildAssistantStage2ComparisonMarkdownReport(comparisonReport));
const comparisonRef = jsonWritten ? jsonPath : putInMemoryEvalReport(comparisonReport);
return {
...comparisonReport,
artifacts: {
comparison_report_json_path: comparisonRef,
comparison_report_md_path: mdWritten ? mdPath : null
}
};
}
async runAssistantStage1(payload) {
if (!config_1.FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1) {
throw new http_1.ApiError("ASSISTANT_STAGE1_EVAL_DISABLED", "Assistant Stage 1 eval target is disabled by FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1.", 409);
}
const suite = parseAssistantSuiteFile(payload.caseSetFile);
const suiteCases = suite.cases.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id));
const runId = typeof payload.runId === "string" && payload.runId.trim().length > 0 ? payload.runId.trim() : `assistant-stage1-${(0, nanoid_1.nanoid)(10)}`;
const analysisDate = normalizeAnalysisDate(payload.analysisDate);
const assistantService = new assistantService_1.AssistantService(this.normalizerService, new assistantSessionStore_1.AssistantSessionStore());
const diagnostics = [];
let requestsTotal = 0;
for (const suiteCase of suiteCases) {
const sessionId = `${runId}-${suiteCase.case_id}`;
const turnResponses = [];
const notes = [];
const limitations = [];
try {
for (const turn of suiteCase.turns) {
const response = (await assistantService.handleMessage({
session_id: sessionId,
user_message: turn.user_message,
message: turn.user_message,
mode: "assistant",
llmProvider: payload.normalizeConfig.llmProvider,
apiKey: payload.normalizeConfig.apiKey,
model: payload.normalizeConfig.model,
baseUrl: payload.normalizeConfig.baseUrl,
temperature: payload.normalizeConfig.temperature,
maxOutputTokens: payload.normalizeConfig.maxOutputTokens,
promptVersion: payload.normalizeConfig.promptVersion,
systemPrompt: payload.normalizeConfig.systemPrompt,
developerPrompt: payload.normalizeConfig.developerPrompt,
domainPrompt: payload.normalizeConfig.domainPrompt,
fewShotExamples: payload.normalizeConfig.fewShotExamples,
context: analysisDate
? {
period_hint: analysisDate,
analysis_context: {
as_of_date: analysisDate,
source: "eval_analysis_date"
}
}
: undefined,
useMock: payload.useMock
}));
turnResponses.push(response);
requestsTotal += 1;
}
}
catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
diagnostics.push({
suite_case: suiteCase,
session_id: sessionId,
trace_id: null,
final_reply_type: "backend_error",
turn_count: turnResponses.length,
narrowing_result: "failed",
signature: `backend_error|${suiteCase.scenario_tag}`,
is_generic: true,
is_false_confident: false,
is_broad_answer: suiteCase.broadness_level === "low" ? null : false,
followup_retention_score: suiteCase.question_type === "followup" || suiteCase.turns.length > 1 ? 0 : null,
evidence_quality_score: 0,
mechanism_specificity_score: 0,
genericness_score: 5,
accountant_actionability_score: 0,
accountant_usefulness_score: 0,
signals: {
broad_query_detected: suiteCase.broadness_level !== "low",
broad_result_flag: false,
narrowing_strength: null,
minimum_evidence_failed: true,
degraded_to: "clarification",
evidence_confidence: "low",
limitation_reason_codes: [],
mechanism_status: null,
source_refs: [],
routes: [],
followup_state_applied: false,
uncertainty_limitations_count: 0
},
limitations: [errorMessage],
notes: [`Case execution failed: ${errorMessage}`]
});
continue;
}
const finalResponse = turnResponses[turnResponses.length - 1];
const signals = this.collectAssistantSignals(finalResponse, turnResponses);
const structure = finalResponse.debug?.answer_structure_v11 ?? null;
const recommendedActions = extractTextList(structure?.next_step_block?.recommended_actions);
const clarificationQuestions = extractTextList(structure?.next_step_block?.clarification_questions);
const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes);
const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations);
const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? "");
const hasAnchors = hasDomainAnchors([directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" "));
let genericnessScore = 0;
if (!hasAnchors)
genericnessScore += 2;
if (mechanismNotes.length === 0)
genericnessScore += 1;
if (signals.source_refs.length === 0)
genericnessScore += 1;
if (recommendedActions.length === 0)
genericnessScore += 1;
genericnessScore = clampScore(genericnessScore);
let actionabilityScore = 0;
if (recommendedActions.length > 0)
actionabilityScore += 2;
if (recommendedActions.some((item) => hasDomainAnchors(item)))
actionabilityScore += 2;
if (clarificationQuestions.length > 0 && (finalResponse.reply_type === "clarification_required" || signals.degraded_to === "clarification")) {
actionabilityScore += 1;
}
if (signals.source_refs.length > 0 && actionabilityScore < 5) {
actionabilityScore += 1;
}
actionabilityScore = clampScore(actionabilityScore);
let evidenceQualityScore = 0;
if (signals.source_refs.length >= 3)
evidenceQualityScore += 2;
else if (signals.source_refs.length > 0)
evidenceQualityScore += 1;
if (signals.evidence_confidence === "high")
evidenceQualityScore += 2;
if (signals.evidence_confidence === "medium")
evidenceQualityScore += 1;
if (signals.minimum_evidence_failed)
evidenceQualityScore -= 2;
if (signals.limitation_reason_codes.includes("insufficient_detail"))
evidenceQualityScore -= 1;
if (signals.limitation_reason_codes.includes("missing_mechanism"))
evidenceQualityScore -= 1;
evidenceQualityScore = clampScore(evidenceQualityScore);
let mechanismSpecificityScore = 0;
if (signals.mechanism_status === "grounded" && mechanismNotes.length > 0 && !signals.limitation_reason_codes.includes("missing_mechanism")) {
mechanismSpecificityScore = 5;
}
else if (signals.mechanism_status === "limited" && mechanismNotes.length > 0) {
mechanismSpecificityScore = 3;
}
else if (mechanismNotes.length > 0) {
mechanismSpecificityScore = 2;
}
else {
mechanismSpecificityScore = 0;
}
const usefulnessScore = clampScore((actionabilityScore + (5 - genericnessScore) + evidenceQualityScore + mechanismSpecificityScore) / 4);
const isGeneric = genericnessScore >= 3;
const factualReply = finalResponse.reply_type === "factual" || finalResponse.reply_type === "factual_with_explanation";
const isFalseConfident = factualReply &&
(signals.minimum_evidence_failed ||
signals.degraded_to !== null ||
signals.evidence_confidence === "low" ||
(signals.limitation_reason_codes.length > 0 && signals.uncertainty_limitations_count === 0));
const isBroadCase = suiteCase.broadness_level !== "low" || signals.broad_query_detected;
const isBroadAnswer = isBroadCase
? factualReply && signals.degraded_to === null && !signals.minimum_evidence_failed
: null;
const isFollowupCase = suiteCase.question_type === "followup" || suiteCase.turns.length > 1;
let followupRetentionScore = null;
if (isFollowupCase) {
const finalTurnIndex = Number(finalResponse.debug?.investigation_state_snapshot?.turn_index ?? 0);
if (signals.followup_state_applied && finalTurnIndex >= suiteCase.turns.length) {
followupRetentionScore = 5;
}
else if (finalTurnIndex >= suiteCase.turns.length) {
followupRetentionScore = 3;
}
else {
followupRetentionScore = 0;
}
}
let narrowingResult = "not_required";
if (signals.degraded_to === "clarification" || finalResponse.reply_type === "clarification_required") {
narrowingResult = "clarification_requested";
}
else if (signals.broad_query_detected || signals.broad_result_flag) {
narrowingResult = signals.minimum_evidence_failed ? "failed" : "applied";
}
if (signals.minimum_evidence_failed) {
limitations.push("minimum_evidence_failed");
}
limitations.push(...signals.limitation_reason_codes.map((item) => `limitation_reason:${item}`));
if (signals.mechanism_status === "unresolved") {
limitations.push("mechanism_unresolved");
}
limitations.push(...uncertaintyLimitations);
if (isGeneric)
notes.push("genericness_high");
if (isFalseConfident)
notes.push("false_confidence_risk");
if (isBroadCase && isBroadAnswer)
notes.push("broad_answer_without_degradation");
if (followupRetentionScore !== null && followupRetentionScore < 3)
notes.push("followup_context_retention_weak");
diagnostics.push({
suite_case: suiteCase,
session_id: sessionId,
trace_id: finalResponse.debug?.trace_id ?? null,
final_reply_type: finalResponse.reply_type,
turn_count: suiteCase.turns.length,
narrowing_result: narrowingResult,
signature: [
finalResponse.reply_type,
signals.routes.sort().join(","),
signals.degraded_to ?? "none",
signals.mechanism_status ?? "unknown",
signals.source_refs.slice(0, 2).join(",")
].join("|"),
is_generic: isGeneric,
is_false_confident: isFalseConfident,
is_broad_answer: isBroadAnswer,
followup_retention_score: followupRetentionScore,
evidence_quality_score: evidenceQualityScore,
mechanism_specificity_score: mechanismSpecificityScore,
genericness_score: genericnessScore,
accountant_actionability_score: actionabilityScore,
accountant_usefulness_score: round2(usefulnessScore),
signals,
limitations: Array.from(new Set(limitations)),
notes
});
}
const metrics = this.computeAssistantMetrics({ diagnostics });
const caseRecords = diagnostics.map((item) => {
const signatureHits = metrics.signature_counts[item.signature] ?? 1;
const caseMetricVector = {
retrieval_differentiation_rate: signatureHits === 1 ? 1 : 0,
generic_explanation_rate: item.is_generic ? 1 : 0,
accountant_actionability_score: round2(item.accountant_actionability_score),
false_confidence_rate: item.is_false_confident ? 1 : 0,
broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0,
mechanism_specificity_score: round2(item.mechanism_specificity_score),
followup_context_retention_score: item.followup_retention_score === null ? null : round2(item.followup_retention_score)
};
return {
schema_version: stage1Contracts_1.ASSISTANT_EVAL_RECORD_SCHEMA_VERSION,
created_at: new Date().toISOString(),
case_id: item.suite_case.case_id,
scenario_tag: item.suite_case.scenario_tag,
session_id: item.session_id,
trace_id: item.trace_id,
question_type: item.suite_case.question_type,
broadness_level: item.suite_case.broadness_level,
narrowing_result: item.narrowing_result,
evidence_quality_score: round2(item.evidence_quality_score),
genericness_score: round2(item.genericness_score),
accountant_usefulness_score: round2(item.accountant_usefulness_score),
accountant_metrics: caseMetricVector,
raw_signals: {
final_reply_type: item.final_reply_type,
turn_count: item.turn_count,
broad_query_detected: item.signals.broad_query_detected,
broad_result_flag: item.signals.broad_result_flag,
narrowing_strength: item.signals.narrowing_strength,
minimum_evidence_failed: item.signals.minimum_evidence_failed,
degraded_to: item.signals.degraded_to,
evidence_confidence: item.signals.evidence_confidence,
limitation_reason_codes: item.signals.limitation_reason_codes,
mechanism_status: item.signals.mechanism_status,
source_refs: item.signals.source_refs,
routes: item.signals.routes,
followup_state_applied: item.signals.followup_state_applied
},
metric_subscores: caseMetricVector,
limitations: item.limitations,
notes: item.notes
};
});
const strongestSignals = Object.entries(metrics.rubric_bands)
.filter(([, band]) => band?.score === 5)
.map(([name]) => name);
const weakestSignals = Object.entries(metrics.rubric_bands)
.filter(([, band]) => band?.score === 0)
.map(([name]) => name);
const runTimestamp = new Date().toISOString();
const report = {
schema_version: ASSISTANT_STAGE1_RUN_SCHEMA_VERSION,
run_id: runId,
run_timestamp: runTimestamp,
eval_target: "assistant_stage1",
mode: payload.mode,
use_mock: Boolean(payload.useMock),
analysis_date: analysisDate,
prompt_version: payload.normalizeConfig.promptVersion ?? null,
suite_id: suite.suite_id,
suite_version: suite.suite_version,
suite_schema_version: suite.schema_version ?? null,
scenario_count: suite.scenario_count,
case_ids: suiteCases.map((item) => item.case_id),
cases_total: caseRecords.length,
feature_profile_snapshot: buildFeatureProfileSnapshot(),
code_version: buildCodeVersionMarker(),
metrics: {
raw: metrics.raw,
denominators: metrics.denominators
},
rubric_bands: metrics.rubric_bands,
subsets: {
broad_cases_total: metrics.denominators.broad_cases_total,
followup_cases_total: metrics.denominators.followup_cases_total
},
budget: {
requests_total: requestsTotal
},
results: caseRecords,
scenario_summary: {
improved_or_strong: caseRecords.filter((item) => Number(item.accountant_usefulness_score ?? 0) >= 4).length,
unchanged_or_mixed: caseRecords.filter((item) => {
const value = Number(item.accountant_usefulness_score ?? 0);
return value >= 2.5 && value < 4;
}).length,
weak_or_regressed: caseRecords.filter((item) => Number(item.accountant_usefulness_score ?? 0) < 2.5).length
},
improvement_hints: {
strongest_signals: strongestSignals.length > 0 ? strongestSignals.join(", ") : "none",
weakest_signals: weakestSignals.length > 0 ? weakestSignals.join(", ") : "none"
},
known_limitations: [
"Snapshot-only retrieval contour remains (no live verification core in Stage 1).",
"Metric mapping for genericness/false confidence is heuristic by design.",
"Stage 1 eval excludes Stage 2+ metrics (problem-unit/lifecycle/graph/investigation engine)."
],
report_title: "Assistant Stage 1 Eval Run"
};
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
const runJsonPath = path_1.default.resolve(config_1.REPORTS_DIR, `${runId}.json`);
const runMdPath = path_1.default.resolve(config_1.REPORTS_DIR, `${runId}.md`);
const compactReport = compactAssistantStage1Report(report);
const jsonWritten = tryWriteJsonFile(runJsonPath, compactReport);
const mdWritten = tryWriteTextFile(runMdPath, buildAssistantEvalMarkdownReport(compactReport));
const runReportRef = jsonWritten ? runJsonPath : putInMemoryEvalReport(compactReport);
report.artifacts = {
run_report_json_path: runReportRef,
run_report_md_path: mdWritten ? runMdPath : null
};
if (payload.compareWithReportFile) {
report.comparison = this.buildAssistantComparisonReport({
currentReport: report,
baselineReportFile: payload.compareWithReportFile
});
}
return report;
}
async runAssistantStage2(payload) {
if (!config_1.FEATURE_ASSISTANT_STAGE2_EVAL_V1) {
throw new http_1.ApiError("ASSISTANT_STAGE2_EVAL_DISABLED", "Assistant Stage 2 eval target is disabled by FEATURE_ASSISTANT_STAGE2_EVAL_V1.", 409);
}
const suite = parseAssistantStage2SuiteFile(payload.caseSetFile);
const suiteCases = suite.cases.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id));
const runId = typeof payload.runId === "string" && payload.runId.trim().length > 0 ? payload.runId.trim() : `assistant-stage2-${(0, nanoid_1.nanoid)(10)}`;
const analysisDate = normalizeAnalysisDate(payload.analysisDate);
const assistantService = new assistantService_1.AssistantService(this.normalizerService, new assistantSessionStore_1.AssistantSessionStore());
const diagnostics = [];
let requestsTotal = 0;
for (const suiteCase of suiteCases) {
const sessionId = `${runId}-${suiteCase.case_id}`;
const turnResponses = [];
const notes = [];
const limitations = [];
const expectedProblemUnitTypes = this.getExpectedProblemUnitTypes(suiteCase);
const expectedProblemFirst = suiteCase.expected_hints?.expected_problem_first ?? (suiteCase.broadness_level !== "low" || suiteCase.question_type !== "direct");
try {
for (const turn of suiteCase.turns) {
const response = (await assistantService.handleMessage({
session_id: sessionId,
user_message: turn.user_message,
message: turn.user_message,
mode: "assistant",
llmProvider: payload.normalizeConfig.llmProvider,
apiKey: payload.normalizeConfig.apiKey,
model: payload.normalizeConfig.model,
baseUrl: payload.normalizeConfig.baseUrl,
temperature: payload.normalizeConfig.temperature,
maxOutputTokens: payload.normalizeConfig.maxOutputTokens,
promptVersion: payload.normalizeConfig.promptVersion,
systemPrompt: payload.normalizeConfig.systemPrompt,
developerPrompt: payload.normalizeConfig.developerPrompt,
domainPrompt: payload.normalizeConfig.domainPrompt,
fewShotExamples: payload.normalizeConfig.fewShotExamples,
context: analysisDate
? {
period_hint: analysisDate,
analysis_context: {
as_of_date: analysisDate,
source: "eval_analysis_date"
}
}
: undefined,
useMock: payload.useMock
}));
turnResponses.push(response);
requestsTotal += 1;
}
}
catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
diagnostics.push({
suite_case: suiteCase,
session_id: sessionId,
trace_id: null,
final_reply_type: "backend_error",
turn_count: turnResponses.length,
signature: `backend_error|${suiteCase.scenario_tag}`,
expected_problem_unit_types: expectedProblemUnitTypes,
expected_problem_first: expectedProblemFirst,
problem_unit_precision: 0,
problem_unit_recall_proxy: expectedProblemUnitTypes.length > 0 ? 0 : null,
duplicate_collapse_rate: null,
mechanism_coherence_score: 0,
problem_clarity_score: 0,
problem_first_answer_applied: expectedProblemFirst ? false : null,
entity_leakage: false,
signals: {
broad_query_detected: suiteCase.broadness_level !== "low",
broad_result_flag: false,
narrowing_strength: null,
minimum_evidence_failed: true,
degraded_to: "clarification",
evidence_confidence: "low",
limitation_reason_codes: [],
mechanism_status: null,
source_refs: [],
routes: [],
followup_state_applied: false,
uncertainty_limitations_count: 0,
candidate_evidence_total: 0,
problem_units_total: 0,
problem_unit_types: [],
problem_mechanism_summaries: [],
duplicate_collapses_total: 0,
problem_centric_answer_applied: false,
problem_units_used_count: 0,
problem_answer_mode: null,
problem_unit_ids_used: [],
entity_leakage_detected: false
},
limitations: [errorMessage],
notes: [`Case execution failed: ${errorMessage}`]
});
continue;
}
const finalResponse = turnResponses[turnResponses.length - 1];
const signals = this.collectAssistantStage2Signals(finalResponse, turnResponses);
const problemUnitPrecision = this.computeProblemUnitPrecision(expectedProblemUnitTypes, signals.problem_unit_types);
const problemUnitRecallProxy = this.computeProblemUnitRecallProxy(expectedProblemUnitTypes, signals.problem_unit_types);
const duplicateCollapseRate = this.computeDuplicateCollapseRate(signals.candidate_evidence_total, signals.duplicate_collapses_total);
const mechanismCoherenceScore = this.computeMechanismCoherenceScore(finalResponse, signals);
const problemClarityScore = this.computeProblemClarityScore(finalResponse, signals);
const problemFirstAnswerApplied = expectedProblemFirst ? signals.problem_centric_answer_applied && signals.problem_units_used_count > 0 : null;
if (signals.problem_units_total === 0 && expectedProblemUnitTypes.length > 0) {
limitations.push("missing_problem_units");
}
if (signals.problem_centric_answer_applied && signals.problem_units_used_count <= 0) {
limitations.push("problem_mode_without_units");
}
limitations.push(...signals.limitation_reason_codes.map((item) => `limitation_reason:${item}`));
if (signals.entity_leakage_detected) {
limitations.push("entity_leakage_detected");
}
if (problemFirstAnswerApplied === false)
notes.push("problem_first_not_applied");
if (signals.problem_units_total === 0)
notes.push("problem_units_missing");
if (signals.problem_unit_types.length > 0)
notes.push(`problem_types:${signals.problem_unit_types.join(",")}`);
if (signals.entity_leakage_detected)
notes.push("entity_leakage");
if (signals.degraded_to === "clarification")
notes.push("clarification_degraded");
diagnostics.push({
suite_case: suiteCase,
session_id: sessionId,
trace_id: finalResponse.debug?.trace_id ?? null,
final_reply_type: finalResponse.reply_type,
turn_count: suiteCase.turns.length,
signature: [
finalResponse.reply_type,
signals.problem_answer_mode ?? "unknown",
signals.problem_unit_types.sort().join(","),
signals.degraded_to ?? "none"
].join("|"),
expected_problem_unit_types: expectedProblemUnitTypes,
expected_problem_first: expectedProblemFirst,
problem_unit_precision: problemUnitPrecision,
problem_unit_recall_proxy: problemUnitRecallProxy,
duplicate_collapse_rate: duplicateCollapseRate,
mechanism_coherence_score: mechanismCoherenceScore,
problem_clarity_score: problemClarityScore,
problem_first_answer_applied: problemFirstAnswerApplied,
entity_leakage: signals.entity_leakage_detected,
signals,
limitations: Array.from(new Set(limitations)),
notes
});
}
const metrics = this.computeAssistantStage2Metrics({ diagnostics });
const caseRecords = diagnostics.map((item) => {
const caseMetricVector = {
problem_unit_precision: item.problem_unit_precision,
problem_unit_recall_proxy: item.problem_unit_recall_proxy,
duplicate_collapse_rate: item.duplicate_collapse_rate,
mechanism_coherence_score: round2(item.mechanism_coherence_score),
problem_clarity_score: round2(item.problem_clarity_score),
problem_first_answer_rate: item.problem_first_answer_applied === null ? null : item.problem_first_answer_applied ? 1 : 0,
entity_leakage_rate: item.entity_leakage ? 1 : 0
};
return {
schema_version: stage2EvalContracts_1.ASSISTANT_STAGE2_EVAL_RECORD_SCHEMA_VERSION,
created_at: new Date().toISOString(),
case_id: item.suite_case.case_id,
scenario_tag: item.suite_case.scenario_tag,
session_id: item.session_id,
trace_id: item.trace_id,
question_type: item.suite_case.question_type,
broadness_level: item.suite_case.broadness_level,
expected_problem_unit_types: item.expected_problem_unit_types,
expected_problem_first: item.expected_problem_first,
problem_units_detected: item.signals.problem_units_total,
candidate_evidence_detected: item.signals.candidate_evidence_total,
duplicate_collapses_detected: item.signals.duplicate_collapses_total,
metric_subscores: caseMetricVector,
raw_signals: {
final_reply_type: item.final_reply_type,
turn_count: item.turn_count,
broad_query_detected: item.signals.broad_query_detected,
broad_result_flag: item.signals.broad_result_flag,
narrowing_strength: item.signals.narrowing_strength,
minimum_evidence_failed: item.signals.minimum_evidence_failed,
degraded_to: item.signals.degraded_to,
evidence_confidence: item.signals.evidence_confidence,
limitation_reason_codes: item.signals.limitation_reason_codes,
mechanism_status: item.signals.mechanism_status,
source_refs: item.signals.source_refs,
routes: item.signals.routes,
followup_state_applied: item.signals.followup_state_applied,
problem_units_total: item.signals.problem_units_total,
candidate_evidence_total: item.signals.candidate_evidence_total,
problem_unit_types: item.signals.problem_unit_types,
duplicate_collapses_total: item.signals.duplicate_collapses_total,
problem_centric_answer_applied: item.signals.problem_centric_answer_applied,
problem_units_used_count: item.signals.problem_units_used_count,
problem_answer_mode: item.signals.problem_answer_mode,
problem_unit_ids_used: item.signals.problem_unit_ids_used,
entity_leakage_detected: item.signals.entity_leakage_detected
},
limitations: item.limitations,
notes: item.notes
};
});
const strongestSignals = Object.entries(metrics.rubric_bands)
.filter(([, band]) => band?.score === 5)
.map(([name]) => name);
const weakestSignals = Object.entries(metrics.rubric_bands)
.filter(([, band]) => band?.score === 0)
.map(([name]) => name);
const runTimestamp = new Date().toISOString();
const report = {
schema_version: ASSISTANT_STAGE2_RUN_SCHEMA_VERSION,
run_id: runId,
run_timestamp: runTimestamp,
eval_target: "assistant_stage2",
mode: payload.mode,
use_mock: Boolean(payload.useMock),
analysis_date: analysisDate,
prompt_version: payload.normalizeConfig.promptVersion ?? null,
suite_id: suite.suite_id,
suite_version: suite.suite_version,
suite_schema_version: suite.schema_version ?? null,
scenario_count: suite.scenario_count,
case_ids: suiteCases.map((item) => item.case_id),
cases_total: caseRecords.length,
feature_profile_snapshot: buildFeatureProfileSnapshot(),
code_version: buildCodeVersionMarker(),
metrics: {
raw: metrics.raw,
denominators: metrics.denominators
},
rubric_bands: metrics.rubric_bands,
subsets: {
expected_problem_cases_total: metrics.denominators.expected_problem_cases_total,
followup_cases_total: metrics.denominators.followup_cases_total,
candidate_cases_total: metrics.denominators.candidate_cases_total
},
budget: {
requests_total: requestsTotal
},
results: caseRecords,
scenario_summary: {
improved_or_strong: caseRecords.filter((item) => {
const clarity = Number(item.metric_subscores.problem_clarity_score ?? 0);
const mechanism = Number(item.metric_subscores.mechanism_coherence_score ?? 0);
return clarity >= 4 && mechanism >= 3;
}).length,
unchanged_or_mixed: caseRecords.filter((item) => {
const clarity = Number(item.metric_subscores.problem_clarity_score ?? 0);
return clarity >= 2.5 && clarity < 4;
}).length,
weak_or_regressed: caseRecords.filter((item) => Number(item.metric_subscores.problem_clarity_score ?? 0) < 2.5).length
},
improvement_hints: {
strongest_signals: strongestSignals.length > 0 ? strongestSignals.join(", ") : "none",
weakest_signals: weakestSignals.length > 0 ? weakestSignals.join(", ") : "none"
},
known_limitations: [
"Stage 2 eval remains heuristic and scoped to problem-unit baseline (no graph/lifecycle/investigation runtime scoring).",
"problem_unit_recall_proxy uses suite expected types as lightweight proxy, not full ground-truth labeling.",
"Comparison quality depends on stable feature profile and reproducible mock/runtime setup."
],
report_title: "Assistant Stage 2 Eval Run"
};
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
const runJsonPath = path_1.default.resolve(config_1.REPORTS_DIR, `${runId}.json`);
const runMdPath = path_1.default.resolve(config_1.REPORTS_DIR, `${runId}.md`);
const compactReport = compactAssistantStage2Report(report);
const jsonWritten = tryWriteJsonFile(runJsonPath, compactReport);
const mdWritten = tryWriteTextFile(runMdPath, buildAssistantStage2EvalMarkdownReport(compactReport));
const runReportRef = jsonWritten ? runJsonPath : putInMemoryEvalReport(compactReport);
report.artifacts = {
run_report_json_path: runReportRef,
run_report_md_path: mdWritten ? runMdPath : null
};
if (payload.compareWithReportFile) {
report.comparison = this.buildAssistantStage2ComparisonReport({
currentReport: report,
baselineReportFile: payload.compareWithReportFile
});
}
return report;
}
async runAssistantP0(payload) {
if (!config_1.FEATURE_ASSISTANT_STAGE2_EVAL_V1) {
throw new http_1.ApiError("ASSISTANT_P0_EVAL_DISABLED", "Assistant P0 eval target is disabled by FEATURE_ASSISTANT_STAGE2_EVAL_V1.", 409);
}
const runner = new p0_eval_runner_1.P0EvalRunner(this.normalizerService);
return runner.run({
normalizeConfig: payload.normalizeConfig,
caseIds: payload.caseIds,
useMock: payload.useMock,
mode: payload.mode,
caseSetFile: payload.caseSetFile,
compareWithReportFile: payload.compareWithReportFile
});
}
async run(payload) {
const mode = payload.mode ?? "standard";
const evalTarget = payload.evalTarget ?? "normalizer";
const analysisDate = normalizeAnalysisDate(payload.analysisDate);
if (evalTarget === "assistant_stage1") {
return this.runAssistantStage1({
normalizeConfig: payload.normalizeConfig,
caseIds: payload.caseIds,
useMock: payload.useMock,
mode,
caseSetFile: payload.caseSetFile,
compareWithReportFile: payload.compareWithReportFile,
analysisDate: analysisDate ?? undefined,
runId: payload.runId
});
}
if (evalTarget === "assistant_stage2") {
return this.runAssistantStage2({
normalizeConfig: payload.normalizeConfig,
caseIds: payload.caseIds,
useMock: payload.useMock,
mode,
caseSetFile: payload.caseSetFile,
compareWithReportFile: payload.compareWithReportFile,
analysisDate: analysisDate ?? undefined,
runId: payload.runId
});
}
if (evalTarget === "assistant_p0") {
return this.runAssistantP0({
normalizeConfig: payload.normalizeConfig,
caseIds: payload.caseIds,
useMock: payload.useMock,
mode,
caseSetFile: payload.caseSetFile,
compareWithReportFile: payload.compareWithReportFile
});
}
const promptVersion = String(payload.normalizeConfig.promptVersion ?? "").toLowerCase();
const schemaVersion = String(payload.normalizeConfig.schemaVersion ?? "").toLowerCase();
const isV2 = promptVersion.startsWith("normalizer_v2") || schemaVersion === "v2" || schemaVersion === "v2_0_1" || schemaVersion === "v2_0_2";
const inlineQuestions = payload.rawQuestions ? parseRawQuestions(payload.rawQuestions) : [];
const inlineCases = inlineQuestions.map((question, index) => ({
case_id: formatCaseId("BQ", index),
raw_question: question,
expected: null
}));
if (isV2) {
const sourceCases = inlineCases.length > 0
? inlineCases
: payload.caseSetFile
? parseCaseSetFile(payload.caseSetFile).map((item) => ({
case_id: item.case_id,
raw_question: item.raw_question,
expected: item.expected
}))
: this.listCases().map((item) => ({
case_id: item.case_id,
raw_question: item.raw_question,
expected: item.expected
}));
const filtered = sourceCases.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id));
return this.runV2({
...payload,
mode,
analysisDate: analysisDate ?? undefined,
cases: filtered
});
}
if (inlineCases.length > 0) {
throw new Error("rawQuestions batch is supported for normalizer_v2 only.");
}
const casesSource = payload.caseSetFile ? parseCaseSetFile(payload.caseSetFile) : this.listCases();
const filteredCases = casesSource.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id));
const runId = `eval-${(0, nanoid_1.nanoid)(10)}`;
const results = [];
const mismatches = [];
const badConfidenceCases = [];
const classCounter = {};
let schemaPass = 0;
let intentPass = 0;
let routePass = 0;
let causalPass = 0;
let highConfidenceErrors = 0;
let requestsTotal = 0;
let retriesUsed = 0;
for (const item of filteredCases) {
const response = await this.normalizerService.normalize({
...payload.normalizeConfig,
userQuestion: item.raw_question,
context: {
period_hint: analysisDate ?? undefined,
analysis_context: analysisDate
? {
as_of_date: analysisDate,
source: "eval_analysis_date"
}
: undefined,
expected_route: item.expected.route_hint,
eval_label: runId,
case_id: item.case_id,
eval_mode: mode
},
retryPolicy: mode === "single-pass-strict" ? "single-pass-strict" : "default",
useMock: payload.useMock
});
const normalized = response.normalized && response.normalized.schema_version === "normalized_query_v1"
? response.normalized
: null;
const intentMatch = Boolean(normalized && item.expected.intent_class === normalized.intent_class);
const routeMatch = Boolean(normalized && item.expected.route_hint === normalized.route_hint);
const causalMatch = Boolean(normalized &&
item.expected.requires &&
item.expected.requires.needs_cross_entity_join === normalized.requires.needs_cross_entity_join &&
item.expected.requires.needs_causal_chain === normalized.requires.needs_causal_chain);
if (response.validation.passed)
schemaPass += 1;
if (intentMatch)
intentPass += 1;
if (routeMatch)
routePass += 1;
if (causalMatch || !item.expected.requires)
causalPass += 1;
const requestCount = Number(response.request_count_for_case ?? 0);
requestsTotal += requestCount;
if (requestCount > 1) {
retriesUsed += 1;
}
const classKey = String(item.expected.intent_class ?? "unknown");
if (!classCounter[classKey]) {
classCounter[classKey] = { total: 0, passed: 0 };
}
classCounter[classKey].total += 1;
if (intentMatch) {
classCounter[classKey].passed += 1;
}
const confidenceOverall = normalized?.confidence.overall ?? null;
const hasMismatch = !intentMatch || !routeMatch || (!causalMatch && Boolean(item.expected.requires));
if (confidenceOverall === "high" && hasMismatch) {
highConfidenceErrors += 1;
badConfidenceCases.push({
case_id: item.case_id,
confidence_overall: confidenceOverall,
intent_match: intentMatch,
route_match: routeMatch,
causal_match: causalMatch || !item.expected.requires,
trace_id: response.trace_id
});
}
if (hasMismatch || !response.validation.passed) {
mismatches.push({
case_id: item.case_id,
expected_intent_class: item.expected.intent_class ?? null,
actual_intent_class: normalized?.intent_class ?? null,
expected_route_hint: item.expected.route_hint ?? null,
actual_route_hint: normalized?.route_hint ?? null,
expected_requires: item.expected.requires ?? null,
actual_requires: normalized?.requires ?? null,
comment: shortMismatchComment({
intentMatch,
routeMatch,
causalMatch: causalMatch || !item.expected.requires,
validationPassed: response.validation.passed
}),
trace_id: response.trace_id
});
}
results.push({
case_id: item.case_id,
raw_question: item.raw_question,
validation_passed: response.validation.passed,
intent_match: intentMatch,
route_match: routeMatch,
causal_flags_match: causalMatch || !item.expected.requires,
expected_intent_class: item.expected.intent_class ?? null,
actual_intent_class: normalized?.intent_class ?? null,
expected_route_hint: item.expected.route_hint ?? null,
actual_route_hint: normalized?.route_hint ?? null,
expected_requires: item.expected.requires ?? null,
actual_requires: normalized?.requires ?? null,
confidence_overall: confidenceOverall,
trace_id: response.trace_id,
request_count_for_case: requestCount
});
}
const total = Math.max(1, filteredCases.length);
const metrics = {
schema_validation_pass_rate: Number(((schemaPass / total) * 100).toFixed(2)),
intent_class_accuracy: Number(((intentPass / total) * 100).toFixed(2)),
route_hint_accuracy: Number(((routePass / total) * 100).toFixed(2)),
causal_flag_accuracy: Number(((causalPass / total) * 100).toFixed(2)),
high_confidence_error_rate: Number(((highConfidenceErrors / total) * 100).toFixed(2))
};
const classAccuracy = Object.fromEntries(Object.entries(classCounter).map(([key, value]) => [
key,
{
total: value.total,
passed: value.passed,
accuracy_percent: Number(((value.passed / Math.max(1, value.total)) * 100).toFixed(2))
}
]));
const baselineAsMap = BASELINE_METRICS;
const baselineDelta = Object.fromEntries(Object.entries(metrics).map(([key, value]) => [key, Number((value - baselineAsMap[key]).toFixed(2))]));
const report = {
run_id: runId,
timestamp: new Date().toISOString(),
mode,
use_mock: Boolean(payload.useMock),
analysis_date: analysisDate,
prompt_version: payload.normalizeConfig.promptVersion ?? null,
dataset: {
source: payload.caseSetFile ? "file" : "data/eval_cases/*.json",
file: payload.caseSetFile ?? null
},
cases_total: filteredCases.length,
metrics,
baseline_metrics: BASELINE_METRICS,
baseline_delta: baselineDelta,
class_accuracy: classAccuracy,
budget: {
requests_total: requestsTotal,
retries_used: retriesUsed,
guidance: {
forensic_calls_max: 10,
final_eval_calls_max: 30,
target_total_calls_max: 40,
hard_cap_calls_max: 45
}
},
mismatches,
bad_confidence_cases: badConfidenceCases,
results
};
(0, files_1.ensureDir)(config_1.EVAL_CASES_DIR);
tryWriteJsonFile(path_1.default.resolve(config_1.EVAL_CASES_DIR, `${runId}.report.json`), report);
const shouldWriteV11Artifacts = mode === "single-pass-strict" &&
Boolean(payload.caseSetFile) &&
path_1.default.basename(String(payload.caseSetFile)).toLowerCase() === "normalizer_eval_v1_1_30cases.json";
if (shouldWriteV11Artifacts) {
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
tryWriteJsonFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_eval_v1_1_run.json"), report);
tryWriteTextFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_eval_v1_1_run.md"), buildMarkdownReport({
...report,
report_title: "LLM Normalizer v1.1 Eval Run"
}));
}
const shouldWriteV1121EvalArtifacts = mode === "single-pass-strict" &&
String(payload.normalizeConfig.promptVersion ?? "") === "normalizer_v1_1_2_1" &&
Boolean(payload.caseSetFile) &&
path_1.default.basename(String(payload.caseSetFile)).toLowerCase() === "normalizer_eval_v1_1_2_1_30cases.json";
if (shouldWriteV1121EvalArtifacts) {
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
tryWriteJsonFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_1_eval.json"), report);
tryWriteTextFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_1_eval.md"), buildMarkdownReport({
...report,
report_title: "LLM Normalizer v1.1.2.1 Eval Run"
}));
}
const shouldWriteV111MicroArtifacts = mode === "single-pass-strict" &&
String(payload.normalizeConfig.promptVersion ?? "") === "normalizer_v1_1_1" &&
isSameCaseSet(payload.caseIds, V111_MICRO_CASE_IDS);
if (shouldWriteV111MicroArtifacts) {
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
tryWriteJsonFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_1_micro_eval.json"), report);
tryWriteTextFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_1_micro_eval.md"), buildMarkdownReport({
...report,
report_title: "LLM Normalizer v1.1.1 Micro Eval"
}));
}
const shouldWriteV112MicroArtifacts = mode === "single-pass-strict" &&
String(payload.normalizeConfig.promptVersion ?? "") === "normalizer_v1_1_2" &&
isSameCaseSet(payload.caseIds, V112_MICRO_CASE_IDS);
if (shouldWriteV112MicroArtifacts) {
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
tryWriteJsonFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_micro_eval.json"), report);
tryWriteTextFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_micro_eval.md"), buildMarkdownReport({
...report,
report_title: "LLM Normalizer v1.1.2 Micro Eval"
}));
}
return report;
}
}
exports.EvalService = EvalService;