// Listing metadata: 2440 lines, 120 KiB, JavaScript.
"use strict";
|
||
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||
};
|
||
Object.defineProperty(exports, "__esModule", { value: true });
|
||
exports.EvalService = void 0;
|
||
const fs_1 = __importDefault(require("fs"));
|
||
const path_1 = __importDefault(require("path"));
|
||
const nanoid_1 = require("nanoid");
|
||
const config_1 = require("../config");
|
||
const p0_eval_runner_1 = require("../eval/p0_eval_runner");
|
||
const stage1Contracts_1 = require("../types/stage1Contracts");
|
||
const stage2EvalContracts_1 = require("../types/stage2EvalContracts");
|
||
const http_1 = require("../utils/http");
|
||
const assistantService_1 = require("./assistantService");
|
||
const assistantSessionStore_1 = require("./assistantSessionStore");
|
||
const files_1 = require("../utils/files");
|
||
// Baseline metric figures keyed by metric name.
// NOTE(review): values look like percentages (0-100); where they are consumed is outside this view.
const BASELINE_METRICS = {
    "schema_validation_pass_rate": 100,
    "intent_class_accuracy": 72.73,
    "route_hint_accuracy": 90.91,
    "causal_flag_accuracy": 81.82,
    "high_confidence_error_rate": 9.09
};

// Case-id list labelled "V111 micro".
const V111_MICRO_CASE_IDS = [
    "NQ-008",
    "V11-DD-005",
    "V11-OT-003",
    "V11-OT-004",
    "V11-OT-005"
];

// Case-id list labelled "V112 micro".
const V112_MICRO_CASE_IDS = [
    "NQ-002",
    "NQ-007",
    "V11-HA-004",
    "V11-OT-003",
    "V11-OT-005"
];
|
||
/**
 * Tells whether two case-id lists hold the same ids, ignoring order.
 *
 * Both lists are copied before sorting, so neither argument is mutated.
 *
 * @param {string[] | null | undefined} input - Candidate list; a falsy value never matches.
 * @param {string[]} target - Reference list.
 * @returns {boolean} True when both lists have equal length and equal sorted contents.
 */
function isSameCaseSet(input, target) {
    const sizesAgree = Boolean(input) && input.length === target.length;
    if (!sizesAgree) {
        return false;
    }
    const given = [...input].sort();
    const wanted = [...target].sort();
    for (let position = 0; position < given.length; position += 1) {
        if (given[position] !== wanted[position]) {
            return false;
        }
    }
    return true;
}
|
||
/**
 * Renders a number as a percentage string with two decimals, e.g. `72.73%`.
 *
 * Report data can come from parsed JSON, so the input is not guaranteed to be
 * a number. Instead of throwing on `null`, `undefined` or other non-numeric
 * input (which would abort the whole report), such values render as `n/a`.
 * Numeric strings are accepted and converted.
 *
 * @param {number | string | null | undefined} value - Value to format.
 * @returns {string} `<value>%` with two decimals, or `n/a` when the value is not a finite number.
 */
function formatPercent(value) {
    const numeric = typeof value === "string" && value.trim() !== "" ? Number(value) : value;
    if (typeof numeric !== "number" || !Number.isFinite(numeric)) {
        return "n/a";
    }
    return `${numeric.toFixed(2)}%`;
}
|
||
/**
 * Picks a one-sentence explanation for a case's evaluation outcome.
 *
 * Precedence: schema validation failure, then intent/route mismatches, then a
 * causal-flag mismatch; otherwise "No mismatch.".
 *
 * @param {{validationPassed: boolean, intentMatch: boolean, routeMatch: boolean, causalMatch: boolean}} input
 *   Per-case match flags.
 * @returns {string} Human-readable comment.
 */
function shortMismatchComment(input) {
    if (!input.validationPassed) {
        return "Schema validation failed for this case.";
    }
    const intentOk = Boolean(input.intentMatch);
    const routeOk = Boolean(input.routeMatch);
    if (intentOk && routeOk) {
        // Only the causal flags can still disagree at this point.
        return input.causalMatch
            ? "No mismatch."
            : "Causal flags are inconsistent with expected relationship depth.";
    }
    if (routeOk) {
        return "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket.";
    }
    if (intentOk) {
        return "Intent understood, but route_hint selected a weaker execution route.";
    }
    return "Both intent and route misclassified; likely lexical ambiguity in causal vs risk wording.";
}
|
||
/**
 * Renders a normalizer eval report object as a Markdown document.
 *
 * Sections, in order: header fields, metrics vs baseline table, per-class
 * accuracy table, request budget, mismatch list, bad-confidence list.
 * Missing report fields fall back to empty values or `n/a` placeholder rows.
 *
 * @param {object} report - Report object; fields are read defensively with defaults.
 * @returns {string} Markdown text ending with a newline.
 */
function buildMarkdownReport(report) {
    const currentMetrics = report.metrics ?? {};
    const baselineMetrics = report.baseline_metrics ?? {};
    const metricDeltas = report.baseline_delta ?? {};
    const perClass = report.class_accuracy ?? {};
    const mismatchList = Array.isArray(report.mismatches) ? report.mismatches : [];
    const badConfidenceList = Array.isArray(report.bad_confidence_cases) ? report.bad_confidence_cases : [];
    const budget = report.budget ?? {};

    // One table row per metric present in the current run.
    const metricLines = [];
    for (const name of Object.keys(currentMetrics)) {
        const now = Number(currentMetrics[name] ?? 0);
        const reference = Number(baselineMetrics[name] ?? 0);
        const diff = Number(metricDeltas[name] ?? 0);
        const plus = diff > 0 ? "+" : "";
        metricLines.push(`| ${name} | ${formatPercent(now)} | ${formatPercent(reference)} | ${plus}${diff.toFixed(2)} |`);
    }

    const classLines = [];
    for (const name of Object.keys(perClass)) {
        const stats = perClass[name];
        classLines.push(`| ${name} | ${stats.passed}/${stats.total} | ${formatPercent(stats.accuracy_percent)} |`);
    }

    const describeMismatch = (row) => `- ${row.case_id}: expected(${row.expected_intent_class} / ${row.expected_route_hint}) -> actual(${row.actual_intent_class} / ${row.actual_route_hint}). ${row.comment}`;
    const describeBadConfidence = (row) => `- ${row.case_id}: confidence=${row.confidence_overall}, intent_match=${row.intent_match}, route_match=${row.route_match}`;

    let mismatchText = "No mismatches.";
    if (mismatchList.length > 0) {
        mismatchText = mismatchList.map((row) => describeMismatch(row)).join("\n");
    }
    let badConfidenceText = "No bad-confidence cases.";
    if (badConfidenceList.length > 0) {
        badConfidenceText = badConfidenceList.map((row) => describeBadConfidence(row)).join("\n");
    }

    const lines = [];
    lines.push(`# ${String(report.report_title ?? "LLM Normalizer Eval Run")}`, "");
    lines.push(`- run_id: ${String(report.run_id ?? "")}`);
    lines.push(`- timestamp: ${String(report.timestamp ?? "")}`);
    lines.push(`- mode: ${String(report.mode ?? "")}`);
    lines.push(`- use_mock: ${String(report.use_mock ?? false)}`);
    lines.push(`- cases_total: ${String(report.cases_total ?? 0)}`);
    lines.push(`- prompt_version: ${String(report.prompt_version ?? "")}`, "");

    lines.push("## Metrics vs Baseline", "");
    lines.push("| Metric | Current | Baseline | Delta |", "|---|---:|---:|---:|");
    lines.push(metricLines.length > 0 ? metricLines.join("\n") : "| n/a | n/a | n/a | n/a |", "");

    lines.push("## Class Accuracy", "");
    lines.push("| Intent class | Passed/Total | Accuracy |", "|---|---:|---:|");
    lines.push(classLines.length > 0 ? classLines.join("\n") : "| n/a | n/a | n/a |", "");

    lines.push("## Budget", "");
    lines.push(`- requests_total: ${String(budget.requests_total ?? 0)}`);
    lines.push(`- retries_used: ${String(budget.retries_used ?? 0)}`, "");

    lines.push("## Mismatches", "", mismatchText, "");
    lines.push("## Bad Confidence Cases", "", badConfidenceText, "");

    return lines.join("\n");
}
|
||
/**
 * Loads eval cases from a JSON dataset file.
 *
 * Relative paths are resolved against `config.EVAL_DATASETS_DIR`. A leading
 * UTF-8 BOM is removed before parsing. Two layouts are accepted: a top-level
 * array of cases, or an object with a `cases` array.
 *
 * @param {string} inputPath - Absolute path, or path relative to the datasets directory.
 * @returns {Array} The case list found in the file.
 * @throws {Error} When the parsed JSON matches neither accepted layout.
 */
function parseCaseSetFile(inputPath) {
    let filePath = inputPath;
    if (!path_1.default.isAbsolute(inputPath)) {
        filePath = path_1.default.resolve(config_1.EVAL_DATASETS_DIR, inputPath);
    }
    const fileText = fs_1.default.readFileSync(filePath, "utf-8");
    const parsed = JSON.parse(fileText.replace(/^\uFEFF/, ""));
    if (Array.isArray(parsed)) {
        return parsed;
    }
    const wrapsCases = Boolean(parsed) && typeof parsed === "object" && Array.isArray(parsed.cases);
    if (wrapsCases) {
        return parsed.cases;
    }
    throw new Error(`Unsupported eval dataset format: ${filePath}`);
}
|
||
/**
 * Builds a case id of the form `<prefix>-NNN` from a zero-based index.
 *
 * @param {string} prefix - Text placed before the dash.
 * @param {number} index - Zero-based position; the id uses `index + 1`, left-padded with zeros to three digits.
 * @returns {string} The formatted case id.
 */
function formatCaseId(prefix, index) {
    const ordinal = String(index + 1);
    const padded = ordinal.padStart(3, "0");
    return `${prefix}-${padded}`;
}
|
||
/**
 * Splits a free-text blob into individual questions.
 *
 * Separators are tried in this order, and the first one that yields more than
 * one non-empty piece wins: semicolons, blank lines, then single newlines.
 * CRLF line endings are normalized to LF first; every piece is trimmed.
 *
 * @param {string} rawQuestions - Raw text containing one or more questions.
 * @returns {string[]} Trimmed, non-empty questions; empty array for blank input.
 */
function parseRawQuestions(rawQuestions) {
    const text = rawQuestions.replace(/\r\n/g, "\n").trim();
    if (text === "") {
        return [];
    }
    const splitOn = (separator) => text
        .split(separator)
        .map((piece) => piece.trim())
        .filter((piece) => piece.length > 0);
    for (const separator of [";", /\n\s*\n+/]) {
        const pieces = splitOn(separator);
        if (pieces.length > 1) {
            return pieces;
        }
    }
    const lines = splitOn("\n");
    return lines.length > 0 ? lines : [text];
}
|
||
/**
 * Reads `execution_readiness` from a fragment.
 *
 * @param {object} fragment - Fragment object.
 * @returns {*} The stored value when the key exists (even if undefined), otherwise `"executable"`.
 */
function executionReadinessOf(fragment) {
    if ("execution_readiness" in fragment) {
        return fragment.execution_readiness;
    }
    return "executable";
}
|
||
/**
 * Reads `soft_assumption_used` from a fragment.
 *
 * @param {object} fragment - Fragment object.
 * @returns {*} The stored value when the key exists, otherwise a fresh empty array.
 */
function softAssumptionsOf(fragment) {
    if ("soft_assumption_used" in fragment) {
        return fragment.soft_assumption_used;
    }
    return [];
}
|
||
/**
 * Reads `route_status` from a fragment.
 *
 * @param {object} fragment - Fragment object.
 * @returns {*} The stored value when the key exists, otherwise `null`.
 */
function routeStatusOf(fragment) {
    if ("route_status" in fragment) {
        return fragment.route_status;
    }
    return null;
}
|
||
/**
 * Reads `no_route_reason` from a fragment.
 *
 * @param {object} fragment - Fragment object.
 * @returns {*} The stored value when the key exists, otherwise `null`.
 */
function noRouteReasonOf(fragment) {
    if ("no_route_reason" in fragment) {
        return fragment.no_route_reason;
    }
    return null;
}
|
||
/**
 * Derives the expected in-scope label for a case from its `expected` block.
 *
 * Resolution order:
 *   1. an explicit boolean `expected_scope_in_scope`;
 *   2. `expected_no_route_reason === "out_of_scope"` means false;
 *   3. `expected_route_status === "routed"` or any boolean `clarification_required` means true;
 *   4. otherwise the case carries no scope label.
 *
 * @param {object | null | undefined} expected - Expectation block of an eval case.
 * @returns {boolean | null} Expected scope, or null when it cannot be derived.
 */
function expectedScopeInScope(expected) {
    if (!expected) {
        return null;
    }
    const explicit = expected.expected_scope_in_scope;
    if (typeof explicit === "boolean") {
        return explicit;
    }
    if (expected.expected_no_route_reason === "out_of_scope") {
        return false;
    }
    const expectsRouting = expected.expected_route_status === "routed";
    const hasClarificationLabel = typeof expected.clarification_required === "boolean";
    return expectsRouting || hasClarificationLabel ? true : null;
}
|
||
/**
 * Checks that a route decision's fields do not contradict each other.
 *
 * A `no_route` decision must carry a `no_route_reason` and must not claim an
 * executable readiness. Any other route must not carry a `no_route_reason`
 * and must not report `needs_clarification` or `no_route` readiness.
 *
 * @param {object} decision - Decision with `route`, `execution_readiness`, `no_route_reason`.
 * @returns {boolean} True when the combination is coherent.
 */
function isDecisionStateConsistent(decision) {
    const readiness = String(decision.execution_readiness ?? "");
    const reason = decision.no_route_reason ?? null;
    const isNoRoute = decision.route === "no_route";
    const hasReason = Boolean(reason);
    // A reason must be present exactly when the decision is no_route.
    if (isNoRoute !== hasReason) {
        return false;
    }
    const contradictoryReadiness = isNoRoute
        ? ["executable", "executable_with_soft_assumptions"]
        : ["needs_clarification", "no_route"];
    return !contradictoryReadiness.includes(readiness);
}
|
||
// Stage 1 assistant eval: default suite file name and schema version tags.
const DEFAULT_ASSISTANT_STAGE1_SUITE_FILE = "assistant_stage1_canonical_v0_1.json";
const ASSISTANT_STAGE1_RUN_SCHEMA_VERSION = "assistant_stage1_eval_run_v0_1";
const ASSISTANT_STAGE1_COMPARISON_SCHEMA_VERSION = "assistant_stage1_eval_comparison_v0_1";

// Stage 2 assistant eval: default suite file name and schema version tags.
const DEFAULT_ASSISTANT_STAGE2_SUITE_FILE = "assistant_stage2_canonical_v0_1.json";
const ASSISTANT_STAGE2_RUN_SCHEMA_VERSION = "assistant_stage2_eval_run_v0_1";
const ASSISTANT_STAGE2_COMPARISON_SCHEMA_VERSION = "assistant_stage2_eval_comparison_v0_1";

// Reports held in process memory are addressed by refs that start with this prefix.
const INMEM_EVAL_REPORT_PREFIX = "inmem_eval_report:";
// Process-local store: ref string -> report object.
const INMEM_EVAL_REPORTS = new Map();
|
||
/**
 * Tells whether a thrown value is an "out of disk space" error.
 *
 * @param {*} error - Any caught value; may be null or undefined.
 * @returns {boolean} True when `error.code` is exactly `"ENOSPC"`.
 */
function isNoSpaceError(error) {
    if (error === null || error === undefined) {
        return false;
    }
    return error.code === "ENOSPC";
}
|
||
/**
 * Writes a value as JSON via `files.writeJsonFile`, tolerating a full disk.
 *
 * @param {string} pathname - Destination path.
 * @param {*} value - Value handed to `writeJsonFile`.
 * @returns {boolean} True on success; false when the write failed with ENOSPC.
 * @throws Re-throws any error that is not an ENOSPC error.
 */
function tryWriteJsonFile(pathname, value) {
    const writeJson = files_1.writeJsonFile;
    try {
        writeJson(pathname, value);
    }
    catch (failure) {
        if (!isNoSpaceError(failure)) {
            throw failure;
        }
        // Out of disk space is an expected condition: report it instead of failing.
        return false;
    }
    return true;
}
|
||
/**
 * Writes UTF-8 text to a file synchronously, tolerating a full disk.
 *
 * @param {string} pathname - Destination path.
 * @param {string} value - Text content.
 * @returns {boolean} True on success; false when the write failed with ENOSPC.
 * @throws Re-throws any error that is not an ENOSPC error.
 */
function tryWriteTextFile(pathname, value) {
    try {
        fs_1.default.writeFileSync(pathname, value, "utf-8");
    }
    catch (failure) {
        if (!isNoSpaceError(failure)) {
            throw failure;
        }
        // Out of disk space is an expected condition: report it instead of failing.
        return false;
    }
    return true;
}
|
||
/**
 * Stores a report in the in-memory map under a freshly generated ref.
 *
 * @param {object} report - Report to keep in memory.
 * @returns {string} The ref: the in-memory prefix followed by a 12-character nanoid.
 */
function putInMemoryEvalReport(report) {
    const suffix = (0, nanoid_1.nanoid)(12);
    const ref = `${INMEM_EVAL_REPORT_PREFIX}${suffix}`;
    INMEM_EVAL_REPORTS.set(ref, report);
    return ref;
}
|
||
/**
 * Loads an eval report by reference.
 *
 * A ref that starts with the in-memory prefix is looked up in the in-memory
 * store. Any other ref is treated as a file path, resolved with
 * `resolveReadablePath`, and parsed as JSON.
 *
 * @param {string} ref - In-memory ref or file path.
 * @returns {{report: object, resolved_path: string}} The report plus the ref
 *   (in-memory case) or the resolved file path (file case).
 * @throws {Error} When an in-memory ref is unknown; file read and JSON parse errors propagate.
 */
function readEvalReportByRef(ref) {
    if (ref.startsWith(INMEM_EVAL_REPORT_PREFIX)) {
        const cached = INMEM_EVAL_REPORTS.get(ref);
        if (!cached) {
            throw new Error(`In-memory eval report not found: ${ref}`);
        }
        return {
            report: cached,
            resolved_path: ref
        };
    }
    const resolvedPath = resolveReadablePath(ref);
    // Drop a leading UTF-8 BOM before parsing, as the other JSON readers in
    // this file do; JSON.parse rejects text that starts with U+FEFF.
    const raw = fs_1.default.readFileSync(resolvedPath, "utf-8").replace(/^\uFEFF/, "");
    return {
        report: JSON.parse(raw),
        resolved_path: resolvedPath
    };
}
|
||
/**
 * Returns a copy of a stage 1 report whose `results` entries are reduced to a
 * few fields: `case_id`, `scenario_tag`, `accountant_usefulness_score` and
 * `accountant_metrics`.
 *
 * All other top-level report fields are copied unchanged. A non-array
 * `results` becomes an empty array.
 *
 * @param {object} report - Full stage 1 report.
 * @returns {object} Shallow copy of the report with compacted results.
 */
function compactAssistantStage1Report(report) {
    const fullResults = Array.isArray(report.results) ? report.results : [];
    const slimDown = (entry) => {
        const metrics = entry.accountant_metrics;
        const metricsIsObject = typeof metrics === "object" && metrics !== null;
        return {
            case_id: entry.case_id ?? null,
            scenario_tag: entry.scenario_tag ?? null,
            accountant_usefulness_score: entry.accountant_usefulness_score ?? null,
            accountant_metrics: metricsIsObject ? metrics : null
        };
    };
    return {
        ...report,
        results: fullResults.map((entry) => slimDown(entry))
    };
}
|
||
/**
 * Returns a copy of a stage 2 report whose `results` entries keep only
 * `case_id` and four metric subscores.
 *
 * Missing subscores become `null`. A non-array `results` becomes an empty array.
 *
 * @param {object} report - Full stage 2 report.
 * @returns {object} Shallow copy of the report with compacted results.
 */
function compactAssistantStage2Report(report) {
    const keptSubscores = [
        "problem_clarity_score",
        "mechanism_coherence_score",
        "problem_first_answer_rate",
        "entity_leakage_rate"
    ];
    const fullResults = Array.isArray(report.results) ? report.results : [];
    const slimDown = (entry) => {
        const subscores = entry.metric_subscores ?? {};
        const picked = {};
        for (const name of keptSubscores) {
            picked[name] = subscores[name] ?? null;
        }
        return {
            case_id: entry.case_id ?? null,
            metric_subscores: picked
        };
    };
    return {
        ...report,
        results: fullResults.map((entry) => slimDown(entry))
    };
}
|
||
// Problem-unit type names recognized by this module.
const KNOWN_PROBLEM_UNIT_TYPES = [
    "document_conflict",
    "broken_chain_segment",
    "lifecycle_anomaly_node",
    "unresolved_settlement_cluster",
    "period_risk_cluster",
    "cross_branch_inconsistency_cluster"
];
|
||
/**
 * Narrows an arbitrary value to a known problem-unit type.
 *
 * @param {*} value - Candidate type name.
 * @returns {string | null} The value itself when it is listed in `KNOWN_PROBLEM_UNIT_TYPES`, otherwise null.
 */
function toProblemUnitType(value) {
    if (KNOWN_PROBLEM_UNIT_TYPES.includes(value)) {
        return value;
    }
    return null;
}
|
||
/**
 * Rounds a number to two decimal places by going through `toFixed(2)`.
 *
 * @param {number} value - Number to round.
 * @returns {number} The value rounded to two decimals.
 */
function round2(value) {
    const twoDecimals = value.toFixed(2);
    return Number(twoDecimals);
}
|
||
/**
 * Restricts a score to the inclusive range [min, max].
 *
 * @param {number} value - Score to clamp; NaN maps to `min`.
 * @param {number} [min=0] - Lower bound.
 * @param {number} [max=5] - Upper bound.
 * @returns {number} The clamped score.
 */
function clampScore(value, min = 0, max = 5) {
    const belowRange = Number.isNaN(value) || value < min;
    if (belowRange) {
        return min;
    }
    return value > max ? max : value;
}
|
||
/**
 * Maps a raw stage 1 metric value to a rubric band score of 5, 3 or 0.
 *
 * Thresholds by metric family:
 *   - retrieval_differentiation_rate: >= 0.75 gives 5, >= 0.45 gives 3;
 *   - generic_explanation_rate, false_confidence_rate, broad_answer_rate:
 *     <= 0.25 gives 5, <= 0.45 gives 3;
 *   - accountant_actionability_score, mechanism_specificity_score,
 *     followup_context_retention_score: >= 4 gives 5, >= 2.5 gives 3.
 * Anything else, including unknown metric names, scores 0.
 *
 * @param {string} metric - Metric name.
 * @param {number} value - Raw metric value.
 * @returns {number} 5, 3 or 0.
 */
function rateToBandScore(metric, value) {
    const band = (meetsTop, meetsMiddle) => {
        if (meetsTop) {
            return 5;
        }
        return meetsMiddle ? 3 : 0;
    };
    switch (metric) {
        case "retrieval_differentiation_rate":
            return band(value >= 0.75, value >= 0.45);
        case "generic_explanation_rate":
        case "false_confidence_rate":
        case "broad_answer_rate":
            return band(value <= 0.25, value <= 0.45);
        case "accountant_actionability_score":
        case "mechanism_specificity_score":
        case "followup_context_retention_score":
            return band(value >= 4, value >= 2.5);
        default:
            return 0;
    }
}
|
||
/**
 * Looks up the stage 1 rubric band that corresponds to a raw metric value.
 *
 * A missing value (`null` or `undefined`) has no band. Previously only `null`
 * was treated as missing, so `undefined` was scored and reported as the
 * lowest band. A metric with no rubric entry now yields `null` instead of
 * throwing.
 *
 * @param {string} metric - Metric name used as the rubric key.
 * @param {number | null | undefined} value - Raw metric value.
 * @returns {object | null} The rubric entry whose `score` matches, or null.
 */
function rubricBandForMetric(metric, value) {
    if (value === null || value === undefined) {
        return null;
    }
    const score = rateToBandScore(metric, value);
    const bands = stage1Contracts_1.ACCOUNTANT_SCORING_RUBRIC_V01[metric];
    return bands?.find((item) => item.score === score) ?? null;
}
|
||
/**
 * Maps a raw stage 2 metric value to a rubric band score of 5, 3 or 0.
 *
 * Thresholds by metric family:
 *   - problem_unit_precision, problem_unit_recall_proxy,
 *     problem_first_answer_rate: >= 0.75 gives 5, >= 0.45 gives 3;
 *   - duplicate_collapse_rate: >= 0.2 gives 5, >= 0.08 gives 3;
 *   - entity_leakage_rate: <= 0.2 gives 5, <= 0.4 gives 3;
 *   - mechanism_coherence_score, problem_clarity_score: >= 4 gives 5, >= 2.5 gives 3.
 * Anything else, including unknown metric names, scores 0.
 *
 * @param {string} metric - Metric name.
 * @param {number} value - Raw metric value.
 * @returns {number} 5, 3 or 0.
 */
function rateToBandScoreStage2(metric, value) {
    const band = (meetsTop, meetsMiddle) => {
        if (meetsTop) {
            return 5;
        }
        return meetsMiddle ? 3 : 0;
    };
    switch (metric) {
        case "problem_unit_precision":
        case "problem_unit_recall_proxy":
        case "problem_first_answer_rate":
            return band(value >= 0.75, value >= 0.45);
        case "duplicate_collapse_rate":
            return band(value >= 0.2, value >= 0.08);
        case "entity_leakage_rate":
            return band(value <= 0.2, value <= 0.4);
        case "mechanism_coherence_score":
        case "problem_clarity_score":
            return band(value >= 4, value >= 2.5);
        default:
            return 0;
    }
}
|
||
/**
 * Looks up the stage 2 rubric band that corresponds to a raw metric value.
 *
 * A missing value (`null` or `undefined`) has no band. Previously only `null`
 * was treated as missing, so `undefined` was scored and reported as the
 * lowest band. A metric with no rubric entry now yields `null` instead of
 * throwing.
 *
 * @param {string} metric - Metric name used as the rubric key.
 * @param {number | null | undefined} value - Raw metric value.
 * @returns {object | null} The rubric entry whose `score` matches, or null.
 */
function rubricBandForMetricStage2(metric, value) {
    if (value === null || value === undefined) {
        return null;
    }
    const score = rateToBandScoreStage2(metric, value);
    const bands = stage2EvalContracts_1.ASSISTANT_STAGE2_SCORING_RUBRIC_V01[metric];
    return bands?.find((item) => item.score === score) ?? null;
}
|
||
/**
 * Captures the feature-flag values in effect for an eval run.
 *
 * `FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1` is taken straight from config. The
 * other flags read `process.env` first; some fall back to the stringified
 * config value, the rest fall back to `null`.
 *
 * @returns {object} Map of flag name to value.
 */
function buildFeatureProfileSnapshot() {
    const env = process.env;
    // Flags without a config default: env value or null.
    const envOnly = (flag) => env[flag] ?? null;
    // Flags with a config default: env value, else the config value as a string.
    const envThenConfig = (flag) => env[flag] ?? String(config_1[flag]);
    return {
        FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1: config_1.FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1,
        FEATURE_ASSISTANT_ANSWER_POLICY_V11: envThenConfig("FEATURE_ASSISTANT_ANSWER_POLICY_V11"),
        FEATURE_ASSISTANT_BROAD_GUARD_V1: envOnly("FEATURE_ASSISTANT_BROAD_GUARD_V1"),
        FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1: envOnly("FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1"),
        FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1: envOnly("FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1"),
        FEATURE_ASSISTANT_INVESTIGATION_STATE_V1: envOnly("FEATURE_ASSISTANT_INVESTIGATION_STATE_V1"),
        FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1: envOnly("FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1"),
        FEATURE_ASSISTANT_PROBLEM_UNITS_V1: envThenConfig("FEATURE_ASSISTANT_PROBLEM_UNITS_V1"),
        FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1: envThenConfig("FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1"),
        FEATURE_ASSISTANT_PROBLEM_UNIT_CONTINUITY_V1: envThenConfig("FEATURE_ASSISTANT_PROBLEM_UNIT_CONTINUITY_V1"),
        FEATURE_ASSISTANT_STAGE2_EVAL_V1: envThenConfig("FEATURE_ASSISTANT_STAGE2_EVAL_V1")
    };
}
|
||
/**
 * Reads code-version identifiers from environment variables.
 *
 * `git_commit` is the first defined of GIT_COMMIT, CI_COMMIT_SHA,
 * VERCEL_GIT_COMMIT_SHA, GITHUB_SHA. `build_marker` is the first defined of
 * BUILD_MARKER, BUILD_ID, npm_package_version. Either is null when none is set.
 *
 * @returns {{git_commit: string | null, build_marker: string | null}}
 */
function buildCodeVersionMarker() {
    const firstDefinedEnv = (names) => {
        for (const name of names) {
            const candidate = process.env[name];
            if (candidate !== undefined && candidate !== null) {
                return candidate;
            }
        }
        return null;
    };
    const commitVars = ["GIT_COMMIT", "CI_COMMIT_SHA", "VERCEL_GIT_COMMIT_SHA", "GITHUB_SHA"];
    const buildVars = ["BUILD_MARKER", "BUILD_ID", "npm_package_version"];
    return {
        git_commit: firstDefinedEnv(commitVars),
        build_marker: firstDefinedEnv(buildVars)
    };
}
|
||
/**
 * Resolves a possibly relative path to a file location.
 *
 * Absolute paths are returned untouched. Relative paths are tried against the
 * reports directory, the eval datasets directory, the eval cases directory and
 * finally the current working directory; the first location that exists wins.
 * When none exists, the reports-directory candidate is returned.
 *
 * @param {string} inputPath - Absolute or relative path.
 * @returns {string} The resolved path.
 */
function resolveReadablePath(inputPath) {
    const pathApi = path_1.default;
    if (pathApi.isAbsolute(inputPath)) {
        return inputPath;
    }
    const searchRoots = [config_1.REPORTS_DIR, config_1.EVAL_DATASETS_DIR, config_1.EVAL_CASES_DIR];
    const candidates = searchRoots.map((root) => pathApi.resolve(root, inputPath));
    candidates.push(pathApi.resolve(inputPath));
    const existing = candidates.find((candidate) => fs_1.default.existsSync(candidate));
    return existing ?? candidates[0];
}
|
||
/**
 * Loads and validates a stage 1 assistant eval suite file.
 *
 * Checks, in order: the document is an object; `cases` and `case_ids` are
 * arrays; `suite_id` and `suite_version` are non-blank strings;
 * `scenario_count` equals the number of cases; `case_ids` matches the ids in
 * `cases` regardless of order; every case has at least one turn.
 *
 * @param {string | null | undefined} inputPath - Suite path; defaults to the canonical stage 1 suite file.
 * @returns {object} The parsed suite.
 * @throws {Error} On the first failed check; file read and JSON errors propagate.
 */
function parseAssistantSuiteFile(inputPath) {
    const filePath = resolveReadablePath(inputPath ?? DEFAULT_ASSISTANT_STAGE1_SUITE_FILE);
    const fileText = fs_1.default.readFileSync(filePath, "utf-8");
    const suite = JSON.parse(fileText.replace(/^\uFEFF/, ""));
    const reject = (problem) => {
        throw new Error(`${problem}: ${filePath}`);
    };
    const isBlankOrNotString = (candidate) => typeof candidate !== "string" || candidate.trim() === "";

    if (!suite || typeof suite !== "object") {
        reject("Invalid assistant suite format");
    }
    if (!Array.isArray(suite.cases)) {
        reject("Assistant suite cases[] is required");
    }
    if (!Array.isArray(suite.case_ids)) {
        reject("Assistant suite case_ids[] is required");
    }
    if (isBlankOrNotString(suite.suite_id)) {
        reject("Assistant suite suite_id is required");
    }
    if (isBlankOrNotString(suite.suite_version)) {
        reject("Assistant suite suite_version is required");
    }
    if (suite.scenario_count !== suite.cases.length) {
        reject("Assistant suite scenario_count mismatch");
    }

    // Compare the declared ids with the ids present in cases[], ignoring order.
    const declared = [...suite.case_ids].sort();
    const present = suite.cases.map((entry) => entry.case_id).sort();
    const idsDiffer = declared.length !== present.length
        || declared.some((id, position) => id !== present[position]);
    if (idsDiffer) {
        reject("Assistant suite case_ids do not match cases[]");
    }

    for (const entry of suite.cases) {
        const hasTurns = Array.isArray(entry.turns) && entry.turns.length > 0;
        if (!hasTurns) {
            throw new Error(`Assistant suite case ${entry.case_id} must include at least one turn.`);
        }
    }
    return suite;
}
|
||
/**
 * Loads and validates a stage 2 assistant eval suite file.
 *
 * Checks, in order: the document is an object; `cases` and `case_ids` are
 * arrays; `suite_id` and `suite_version` are non-blank strings;
 * `scenario_count` equals the number of cases; `case_ids` matches the ids in
 * `cases` regardless of order; every case has at least one turn.
 *
 * @param {string | null | undefined} inputPath - Suite path; defaults to the canonical stage 2 suite file.
 * @returns {object} The parsed suite.
 * @throws {Error} On the first failed check; file read and JSON errors propagate.
 */
function parseAssistantStage2SuiteFile(inputPath) {
    const filePath = resolveReadablePath(inputPath ?? DEFAULT_ASSISTANT_STAGE2_SUITE_FILE);
    const fileText = fs_1.default.readFileSync(filePath, "utf-8");
    const suite = JSON.parse(fileText.replace(/^\uFEFF/, ""));
    if (!suite || typeof suite !== "object") {
        throw new Error(`Invalid assistant stage2 suite format: ${filePath}`);
    }

    const isFilledString = (candidate) => typeof candidate === "string" && candidate.trim() !== "";
    const idsAgree = () => {
        const declared = [...suite.case_ids].sort();
        const present = suite.cases.map((entry) => entry.case_id).sort();
        if (declared.length !== present.length) {
            return false;
        }
        return declared.every((id, position) => id === present[position]);
    };

    // Each rule is evaluated lazily and in order; later rules rely on earlier
    // ones having passed (for example cases[] being an array).
    const rules = [
        [() => Array.isArray(suite.cases), "Assistant stage2 suite cases[] is required"],
        [() => Array.isArray(suite.case_ids), "Assistant stage2 suite case_ids[] is required"],
        [() => isFilledString(suite.suite_id), "Assistant stage2 suite_id is required"],
        [() => isFilledString(suite.suite_version), "Assistant stage2 suite_version is required"],
        [() => suite.scenario_count === suite.cases.length, "Assistant stage2 scenario_count mismatch"],
        [idsAgree, "Assistant stage2 case_ids do not match cases[]"]
    ];
    for (const [holds, problem] of rules) {
        if (!holds()) {
            throw new Error(`${problem}: ${filePath}`);
        }
    }

    for (const entry of suite.cases) {
        if (!Array.isArray(entry.turns) || entry.turns.length === 0) {
            throw new Error(`Assistant stage2 case ${entry.case_id} must include at least one turn.`);
        }
    }
    return suite;
}
|
||
/**
 * Heuristic: does the text contain at least two of three kinds of domain anchor?
 *
 * The three signals are a year-like period (20xx, optionally with a month),
 * an accounting-object keyword (Russian or English), and a two-digit code
 * from a fixed list.
 *
 * @param {*} text - Text to inspect; null/undefined count as empty.
 * @returns {boolean} True when two or more signals are present.
 */
function hasDomainAnchors(text) {
    const source = String(text ?? "");
    if (source.trim() === "") {
        return false;
    }
    const signals = [
        /\b20\d{2}(?:[-./](?:0[1-9]|1[0-2]))?\b/,
        /(счет|контрагент|документ|ндс|ос|period|account|supplier|invoice|guid|объект)/i,
        /\b(?:01|02|03|04|08|10|19|20|25|26|41|43|44|50|51|52|57|60|62|68|69|70|71|73|76|90|91|94|97)\b/
    ];
    let hits = 0;
    for (const pattern of signals) {
        if (pattern.test(source)) {
            hits += 1;
        }
    }
    return hits >= 2;
}
|
||
/**
 * Heuristic: does the text expose technical identifiers?
 *
 * Flags the text when it contains a UUID, a hex run of 24 or more characters,
 * or more than one identifier keyword (guid, uuid, entity_id, source_ref,
 * canonical_ref, fragment_id).
 *
 * @param {*} text - Text to inspect; null/undefined count as empty.
 * @returns {boolean} True when leakage is detected.
 */
function detectEntityLeakage(text) {
    const source = String(text ?? "");
    if (source.trim() === "") {
        return false;
    }
    const occurrences = (pattern) => (source.match(pattern) ?? []).length;
    if (occurrences(/\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b/gi) > 0) {
        return true;
    }
    // A single identifier keyword is tolerated; two or more count as leakage.
    if (occurrences(/\b(?:guid|uuid|entity_id|source_ref|canonical_ref|fragment_id)\b/gi) > 1) {
        return true;
    }
    return occurrences(/\b[0-9a-f]{24,}\b/gi) > 0;
}
|
||
/**
 * Extracts the non-blank strings from an array, trimmed.
 *
 * @param {*} value - Expected to be an array; anything else yields an empty list.
 * @returns {string[]} Trimmed strings; non-string and blank entries are dropped.
 */
function extractTextList(value) {
    if (!Array.isArray(value)) {
        return [];
    }
    const texts = [];
    for (const entry of value) {
        if (typeof entry !== "string") {
            continue;
        }
        const trimmed = entry.trim();
        if (trimmed !== "") {
            texts.push(trimmed);
        }
    }
    return texts;
}
|
||
/**
 * Narrows a value to one of the narrowing strengths.
 *
 * @param {*} value - Candidate value.
 * @returns {"weak" | "medium" | "strong" | null} The value when recognized, otherwise null.
 */
function toNarrowingStrength(value) {
    switch (value) {
        case "weak":
        case "medium":
        case "strong":
            return value;
        default:
            return null;
    }
}
|
||
/**
 * Narrows a value to one of the degradation targets.
 *
 * @param {*} value - Candidate value.
 * @returns {"partial" | "clarification" | null} The value when recognized, otherwise null.
 */
function toDegradedTo(value) {
    switch (value) {
        case "partial":
        case "clarification":
            return value;
        default:
            return null;
    }
}
|
||
/**
 * Renders a stage 1 assistant eval run report as Markdown.
 *
 * Sections, in order: header fields, raw metrics with rubric bands, subset
 * totals, scenario summary, improvement hints. Missing fields fall back to
 * empty strings, zeros or `n/a`.
 *
 * @param {object} report - Run report; raw metrics are read from `report.metrics.raw`.
 * @returns {string} Markdown text ending with a newline.
 */
function buildAssistantEvalMarkdownReport(report) {
    const rawMetrics = (report.metrics ?? {}).raw ?? {};
    const bandByMetric = report.rubric_bands ?? {};
    const subsets = report.subsets ?? {};
    const scenarioSummary = report.scenario_summary ?? {};
    const hints = report.improvement_hints ?? {};

    const tableLines = [];
    for (const name of Object.keys(rawMetrics)) {
        const raw = rawMetrics[name];
        const band = bandByMetric[name];
        let rawCell = "n/a";
        if (raw !== null && raw !== undefined) {
            rawCell = String(raw);
        }
        let bandCell = "n/a";
        if (band) {
            bandCell = `${String(band.score)} (${String(band.label)})`;
        }
        tableLines.push(`| ${name} | ${rawCell} | ${bandCell} |`);
    }

    const lines = [];
    lines.push(`# ${String(report.report_title ?? "Assistant Stage 1 Eval Run")}`, "");
    lines.push(`- run_id: ${String(report.run_id ?? "")}`);
    lines.push(`- eval_target: ${String(report.eval_target ?? "")}`);
    lines.push(`- run_timestamp: ${String(report.run_timestamp ?? "")}`);
    lines.push(`- suite_id: ${String(report.suite_id ?? "")}`);
    lines.push(`- suite_version: ${String(report.suite_version ?? "")}`);
    lines.push(`- cases_total: ${String(report.cases_total ?? 0)}`, "");

    lines.push("## Raw Metrics and Rubric Bands", "");
    lines.push("| Metric | Raw | Rubric band |", "|---|---:|---|");
    lines.push(tableLines.length > 0 ? tableLines.join("\n") : "| n/a | n/a | n/a |", "");

    lines.push("## Subsets", "");
    lines.push(`- broad_cases_total: ${String(subsets.broad_cases_total ?? 0)}`);
    lines.push(`- followup_cases_total: ${String(subsets.followup_cases_total ?? 0)}`, "");

    lines.push("## Scenario Summary", "");
    lines.push(`- improved_or_strong: ${String(scenarioSummary.improved_or_strong ?? 0)}`);
    lines.push(`- unchanged_or_mixed: ${String(scenarioSummary.unchanged_or_mixed ?? 0)}`);
    lines.push(`- weak_or_regressed: ${String(scenarioSummary.weak_or_regressed ?? 0)}`, "");

    lines.push("## Improvement Hints", "");
    lines.push(`- strongest_signals: ${String(hints.strongest_signals ?? "n/a")}`);
    lines.push(`- weakest_signals: ${String(hints.weakest_signals ?? "n/a")}`, "");

    return lines.join("\n");
}
|
||
/**
 * Renders a stage 2 assistant eval run report as Markdown.
 *
 * Sections, in order: header fields, raw metrics with rubric bands, subset
 * totals, scenario summary. Missing fields fall back to empty strings, zeros
 * or `n/a`.
 *
 * @param {object} report - Run report; raw metrics are read from `report.metrics.raw`.
 * @returns {string} Markdown text ending with a newline.
 */
function buildAssistantStage2EvalMarkdownReport(report) {
    const rawMetrics = (report.metrics ?? {}).raw ?? {};
    const bandByMetric = report.rubric_bands ?? {};
    const subsets = report.subsets ?? {};
    const scenarioSummary = report.scenario_summary ?? {};

    const renderRow = (name) => {
        const raw = rawMetrics[name];
        const band = bandByMetric[name];
        const rawCell = raw === null || raw === undefined ? "n/a" : String(raw);
        const bandCell = band ? `${String(band.score)} (${String(band.label)})` : "n/a";
        return `| ${name} | ${rawCell} | ${bandCell} |`;
    };
    const tableBody = Object.keys(rawMetrics).map((name) => renderRow(name)).join("\n");

    const header = [
        `# ${String(report.report_title ?? "Assistant Stage 2 Eval Run")}`,
        "",
        `- run_id: ${String(report.run_id ?? "")}`,
        `- eval_target: ${String(report.eval_target ?? "")}`,
        `- run_timestamp: ${String(report.run_timestamp ?? "")}`,
        `- suite_id: ${String(report.suite_id ?? "")}`,
        `- suite_version: ${String(report.suite_version ?? "")}`,
        `- cases_total: ${String(report.cases_total ?? 0)}`,
        ""
    ];
    const metricsSection = [
        "## Raw Metrics and Rubric Bands",
        "",
        "| Metric | Raw | Rubric band |",
        "|---|---:|---|",
        tableBody || "| n/a | n/a | n/a |",
        ""
    ];
    const subsetsSection = [
        "## Subsets",
        "",
        `- expected_problem_cases_total: ${String(subsets.expected_problem_cases_total ?? 0)}`,
        `- followup_cases_total: ${String(subsets.followup_cases_total ?? 0)}`,
        `- candidate_cases_total: ${String(subsets.candidate_cases_total ?? 0)}`,
        ""
    ];
    const summarySection = [
        "## Scenario Summary",
        "",
        `- improved_or_strong: ${String(scenarioSummary.improved_or_strong ?? 0)}`,
        `- unchanged_or_mixed: ${String(scenarioSummary.unchanged_or_mixed ?? 0)}`,
        `- weak_or_regressed: ${String(scenarioSummary.weak_or_regressed ?? 0)}`,
        ""
    ];
    return [...header, ...metricsSection, ...subsetsSection, ...summarySection].join("\n");
}
|
||
/**
 * Renders a stage 1 baseline-vs-current comparison report as Markdown.
 *
 * Sections, in order: header fields, metric delta table, scenario notes
 * summary. Missing fields fall back to empty strings, zeros or `n/a`. A
 * metric whose delta entry is null or undefined renders as an all-`n/a` row
 * instead of throwing.
 *
 * @param {object} report - Comparison report with `metric_deltas` and `scenario_notes_summary`.
 * @returns {string} Markdown text ending with a newline.
 */
function buildAssistantComparisonMarkdownReport(report) {
    const metrics = (report.metric_deltas ?? {});
    const summary = (report.scenario_notes_summary ?? {});
    const rows = Object.keys(metrics)
        .map((key) => {
            // Guard against a null entry so one bad metric cannot abort the report.
            const row = metrics[key] ?? {};
            return `| ${key} | ${String(row.baseline ?? "n/a")} | ${String(row.current ?? "n/a")} | ${String(row.delta ?? "n/a")} | ${String(row.trend ?? "n/a")} |`;
        })
        .join("\n");
    return [
        `# ${String(report.report_title ?? "Assistant Stage 1 Baseline vs Current")}`,
        "",
        `- comparison_id: ${String(report.comparison_id ?? "")}`,
        `- baseline_run_id: ${String(report.baseline_run_id ?? "")}`,
        `- current_run_id: ${String(report.current_run_id ?? "")}`,
        `- suite_version: ${String(report.suite_version ?? "")}`,
        "",
        "## Metric Deltas",
        "",
        "| Metric | Baseline | Current | Delta | Trend |",
        "|---|---:|---:|---:|---|",
        rows || "| n/a | n/a | n/a | n/a | n/a |",
        "",
        "## Scenario Notes Summary",
        "",
        `- improved: ${String(summary.improved ?? 0)}`,
        `- unchanged: ${String(summary.unchanged ?? 0)}`,
        `- weakened: ${String(summary.weakened ?? 0)}`,
        ""
    ].join("\n");
}
|
||
/**
 * Renders a stage 2 baseline-vs-current comparison report as Markdown.
 *
 * Sections, in order: header fields, metric delta table, scenario notes
 * summary. Missing fields fall back to empty strings, zeros or `n/a`. A
 * metric whose delta entry is null or undefined renders as an all-`n/a` row
 * instead of throwing.
 *
 * @param {object} report - Comparison report with `metric_deltas` and `scenario_notes_summary`.
 * @returns {string} Markdown text ending with a newline.
 */
function buildAssistantStage2ComparisonMarkdownReport(report) {
    const metrics = (report.metric_deltas ?? {});
    const summary = (report.scenario_notes_summary ?? {});
    const rows = Object.keys(metrics)
        .map((key) => {
            // Guard against a null entry so one bad metric cannot abort the report.
            const row = metrics[key] ?? {};
            return `| ${key} | ${String(row.baseline ?? "n/a")} | ${String(row.current ?? "n/a")} | ${String(row.delta ?? "n/a")} | ${String(row.trend ?? "n/a")} |`;
        })
        .join("\n");
    return [
        `# ${String(report.report_title ?? "Assistant Stage 2 Baseline vs Current")}`,
        "",
        `- comparison_id: ${String(report.comparison_id ?? "")}`,
        `- baseline_run_id: ${String(report.baseline_run_id ?? "")}`,
        `- current_run_id: ${String(report.current_run_id ?? "")}`,
        `- suite_version: ${String(report.suite_version ?? "")}`,
        "",
        "## Metric Deltas",
        "",
        "| Metric | Baseline | Current | Delta | Trend |",
        "|---|---:|---:|---:|---|",
        rows || "| n/a | n/a | n/a | n/a | n/a |",
        "",
        "## Scenario Notes Summary",
        "",
        `- improved: ${String(summary.improved ?? 0)}`,
        `- unchanged: ${String(summary.unchanged ?? 0)}`,
        `- weakened: ${String(summary.weakened ?? 0)}`,
        ""
    ].join("\n");
}
|
||
class EvalService {
|
||
normalizerService;
|
||
constructor(normalizerService) {
|
||
this.normalizerService = normalizerService;
|
||
}
|
||
listCases() {
|
||
(0, files_1.ensureDir)(config_1.EVAL_CASES_DIR);
|
||
const files = fs_1.default
|
||
.readdirSync(config_1.EVAL_CASES_DIR)
|
||
.filter((item) => item.endsWith(".json") && !item.endsWith(".report.json"));
|
||
return files
|
||
.map((name) => {
|
||
const raw = fs_1.default.readFileSync(path_1.default.resolve(config_1.EVAL_CASES_DIR, name), "utf-8");
|
||
return JSON.parse(raw);
|
||
})
|
||
.sort((a, b) => a.case_id.localeCompare(b.case_id));
|
||
}
|
||
async runV2(payload) {
|
||
const runId = `eval-${(0, nanoid_1.nanoid)(10)}`;
|
||
const results = [];
|
||
const routeCounter = {};
|
||
const fallbackCounter = {};
|
||
let schemaPass = 0;
|
||
let inScopeMessages = 0;
|
||
let multiIntentMessages = 0;
|
||
let clarificationMessages = 0;
|
||
let totalFragments = 0;
|
||
let inScopeFragments = 0;
|
||
let outOfScopeFragments = 0;
|
||
let unclearFragments = 0;
|
||
let executableWithSoftAssumptionsFragments = 0;
|
||
let softAssumptionFragments = 0;
|
||
let routedFragments = 0;
|
||
let noRouteFragments = 0;
|
||
let requestsTotal = 0;
|
||
let retriesUsed = 0;
|
||
let clarificationLabeledCases = 0;
|
||
let clarificationTruePositive = 0;
|
||
let clarificationFalsePositive = 0;
|
||
let clarificationFalseNegative = 0;
|
||
let scopeLabeledCases = 0;
|
||
let scopeCorrectCases = 0;
|
||
let routeLabeledCases = 0;
|
||
let routeCorrectCases = 0;
|
||
let expectedRoutedCases = 0;
|
||
let noRouteTruePositive = 0;
|
||
let noRouteFalsePositive = 0;
|
||
let stateConsistencyChecks = 0;
|
||
let stateConsistencyPass = 0;
|
||
for (const item of payload.cases) {
|
||
const response = await this.normalizerService.normalize({
|
||
...payload.normalizeConfig,
|
||
userQuestion: item.raw_question,
|
||
context: {
|
||
eval_label: runId,
|
||
case_id: item.case_id,
|
||
eval_mode: payload.mode
|
||
},
|
||
retryPolicy: payload.mode === "single-pass-strict" ? "single-pass-strict" : "default",
|
||
useMock: payload.useMock
|
||
});
|
||
if (response.validation.passed) {
|
||
schemaPass += 1;
|
||
}
|
||
const requestCount = Number(response.request_count_for_case ?? 0);
|
||
requestsTotal += requestCount;
|
||
if (requestCount > 1) {
|
||
retriesUsed += 1;
|
||
}
|
||
const normalized = response.normalized &&
|
||
["normalized_query_v2", "normalized_query_v2_0_1", "normalized_query_v2_0_2"].includes(String(response.normalized.schema_version ?? ""))
|
||
? response.normalized
|
||
: null;
|
||
const routeSummary = response.route_hint_summary &&
|
||
response.route_hint_summary.mode === "deterministic_v2"
|
||
? response.route_hint_summary
|
||
: null;
|
||
if (normalized) {
|
||
if (normalized.message_in_scope) {
|
||
inScopeMessages += 1;
|
||
}
|
||
if (normalized.contains_multiple_tasks) {
|
||
multiIntentMessages += 1;
|
||
}
|
||
if (normalized.global_notes.needs_clarification) {
|
||
clarificationMessages += 1;
|
||
}
|
||
totalFragments += normalized.fragments.length;
|
||
const inScopeList = normalized.fragments.filter((fragment) => fragment.domain_relevance === "in_scope");
|
||
inScopeFragments += inScopeList.length;
|
||
outOfScopeFragments += normalized.fragments.filter((fragment) => fragment.domain_relevance === "out_of_scope").length;
|
||
unclearFragments += normalized.fragments.filter((fragment) => fragment.domain_relevance === "unclear").length;
|
||
for (const fragment of inScopeList) {
|
||
const readiness = executionReadinessOf(fragment);
|
||
if (readiness === "executable_with_soft_assumptions") {
|
||
executableWithSoftAssumptionsFragments += 1;
|
||
}
|
||
if (softAssumptionsOf(fragment).length > 0) {
|
||
softAssumptionFragments += 1;
|
||
}
|
||
}
|
||
}
|
||
const predictedClarification = Boolean(normalized?.global_notes?.needs_clarification);
|
||
const expectedClarification = typeof item.expected?.clarification_required === "boolean" ? item.expected.clarification_required : null;
|
||
if (expectedClarification !== null) {
|
||
clarificationLabeledCases += 1;
|
||
if (predictedClarification && expectedClarification)
|
||
clarificationTruePositive += 1;
|
||
if (predictedClarification && !expectedClarification)
|
||
clarificationFalsePositive += 1;
|
||
if (!predictedClarification && expectedClarification)
|
||
clarificationFalseNegative += 1;
|
||
}
|
||
const predictedScope = normalized ? normalized.message_in_scope : null;
|
||
const expectedScope = expectedScopeInScope(item.expected);
|
||
if (expectedScope !== null && predictedScope !== null) {
|
||
scopeLabeledCases += 1;
|
||
if (predictedScope === expectedScope) {
|
||
scopeCorrectCases += 1;
|
||
}
|
||
}
|
||
const predictedRouteStatus = routeSummary
|
||
? routeSummary.decisions.some((decision) => decision.route !== "no_route")
|
||
? "routed"
|
||
: "no_route"
|
||
: null;
|
||
const predictedNoRouteReason = routeSummary &&
|
||
routeSummary.decisions.length > 0 &&
|
||
routeSummary.decisions.every((decision) => decision.route === "no_route")
|
||
? (routeSummary.decisions[0]?.no_route_reason ?? null)
|
||
: null;
|
||
const expectedRouteStatus = item.expected?.expected_route_status ?? null;
|
||
const expectedNoRouteReason = item.expected?.expected_no_route_reason ?? null;
|
||
if (expectedRouteStatus) {
|
||
routeLabeledCases += 1;
|
||
if (predictedRouteStatus === expectedRouteStatus) {
|
||
routeCorrectCases += 1;
|
||
}
|
||
if (expectedRouteStatus === "routed") {
|
||
expectedRoutedCases += 1;
|
||
}
|
||
}
|
||
if (predictedRouteStatus === "no_route") {
|
||
if (expectedRouteStatus === "no_route") {
|
||
if (!expectedNoRouteReason || expectedNoRouteReason === predictedNoRouteReason) {
|
||
noRouteTruePositive += 1;
|
||
}
|
||
else {
|
||
noRouteFalsePositive += 1;
|
||
}
|
||
}
|
||
else if (expectedRouteStatus === "routed") {
|
||
noRouteFalsePositive += 1;
|
||
}
|
||
}
|
||
if (routeSummary) {
|
||
for (const decision of routeSummary.decisions) {
|
||
stateConsistencyChecks += 1;
|
||
if (isDecisionStateConsistent(decision)) {
|
||
stateConsistencyPass += 1;
|
||
}
|
||
routeCounter[decision.route] = (routeCounter[decision.route] ?? 0) + 1;
|
||
if (decision.route === "no_route") {
|
||
noRouteFragments += 1;
|
||
}
|
||
else {
|
||
routedFragments += 1;
|
||
}
|
||
}
|
||
const fallbackType = String(routeSummary.fallback?.type ?? "none");
|
||
fallbackCounter[fallbackType] = (fallbackCounter[fallbackType] ?? 0) + 1;
|
||
}
|
||
else {
|
||
fallbackCounter.none = (fallbackCounter.none ?? 0) + 1;
|
||
}
|
||
results.push({
|
||
case_id: item.case_id,
|
||
raw_question: item.raw_question,
|
||
validation_passed: response.validation.passed,
|
||
message_in_scope: normalized?.message_in_scope ?? null,
|
||
scope_confidence: normalized?.scope_confidence ?? null,
|
||
contains_multiple_tasks: normalized?.contains_multiple_tasks ?? null,
|
||
fragments_total: normalized?.fragments.length ?? 0,
|
||
in_scope_fragments: normalized ? normalized.fragments.filter((fragment) => fragment.domain_relevance === "in_scope").length : 0,
|
||
out_of_scope_fragments: normalized
|
||
? normalized.fragments.filter((fragment) => fragment.domain_relevance === "out_of_scope").length
|
||
: 0,
|
||
unclear_fragments: normalized ? normalized.fragments.filter((fragment) => fragment.domain_relevance === "unclear").length : 0,
|
||
fallback_type: routeSummary?.fallback?.type ?? "none",
|
||
predicted_route_status: predictedRouteStatus,
|
||
expected_route_status: expectedRouteStatus,
|
||
predicted_no_route_reason: predictedNoRouteReason,
|
||
expected_no_route_reason: expectedNoRouteReason,
|
||
predicted_clarification_required: predictedClarification,
|
||
expected_clarification_required: expectedClarification,
|
||
executable_with_soft_assumptions_fragments: normalized
|
||
? normalized.fragments.filter((fragment) => executionReadinessOf(fragment) === "executable_with_soft_assumptions")
|
||
.length
|
||
: 0,
|
||
trace_id: response.trace_id,
|
||
request_count_for_case: requestCount
|
||
});
|
||
}
|
||
const total = Math.max(1, payload.cases.length);
|
||
const totalFragmentsSafe = Math.max(1, totalFragments);
|
||
const totalRoutedDecisions = Math.max(1, routedFragments + noRouteFragments);
|
||
const precisionDenominator = clarificationTruePositive + clarificationFalsePositive;
|
||
const recallDenominator = clarificationTruePositive + clarificationFalseNegative;
|
||
const noRoutePrecisionDenominator = noRouteTruePositive + noRouteFalsePositive;
|
||
const metrics = {
|
||
schema_validation_pass_rate: Number(((schemaPass / total) * 100).toFixed(2)),
|
||
scope_detection_accuracy: scopeLabeledCases > 0 ? Number(((scopeCorrectCases / scopeLabeledCases) * 100).toFixed(2)) : null,
|
||
scope_in_scope_rate: Number(((inScopeMessages / total) * 100).toFixed(2)),
|
||
multi_intent_detected_rate: Number(((multiIntentMessages / total) * 100).toFixed(2)),
|
||
clarification_required_rate: Number(((clarificationMessages / total) * 100).toFixed(2)),
|
||
avg_fragments_per_message: Number((totalFragments / total).toFixed(2)),
|
||
out_of_scope_fragment_rate: Number(((outOfScopeFragments / totalFragmentsSafe) * 100).toFixed(2)),
|
||
routed_fragment_rate: Number(((routedFragments / totalRoutedDecisions) * 100).toFixed(2)),
|
||
no_route_fragment_rate: Number(((noRouteFragments / totalRoutedDecisions) * 100).toFixed(2)),
|
||
route_resolution_accuracy: routeLabeledCases > 0 ? Number(((routeCorrectCases / routeLabeledCases) * 100).toFixed(2)) : null,
|
||
no_route_precision: noRoutePrecisionDenominator > 0 ? Number(((noRouteTruePositive / noRoutePrecisionDenominator) * 100).toFixed(2)) : null,
|
||
false_no_route_rate: expectedRoutedCases > 0 ? Number(((noRouteFalsePositive / expectedRoutedCases) * 100).toFixed(2)) : null,
|
||
execution_state_consistency_rate: stateConsistencyChecks > 0 ? Number(((stateConsistencyPass / stateConsistencyChecks) * 100).toFixed(2)) : null,
|
||
executable_with_soft_assumptions_rate: Number(((executableWithSoftAssumptionsFragments / Math.max(1, inScopeFragments)) * 100).toFixed(2)),
|
||
soft_assumption_used_fragment_rate: Number(((softAssumptionFragments / Math.max(1, inScopeFragments)) * 100).toFixed(2)),
|
||
clarification_precision: precisionDenominator > 0 ? Number(((clarificationTruePositive / precisionDenominator) * 100).toFixed(2)) : null,
|
||
clarification_recall: recallDenominator > 0 ? Number(((clarificationTruePositive / recallDenominator) * 100).toFixed(2)) : null,
|
||
false_clarification_rate: clarificationLabeledCases > 0 ? Number(((clarificationFalsePositive / clarificationLabeledCases) * 100).toFixed(2)) : null
|
||
};
|
||
const report = {
|
||
run_id: runId,
|
||
timestamp: new Date().toISOString(),
|
||
mode: payload.mode,
|
||
use_mock: Boolean(payload.useMock),
|
||
prompt_version: payload.normalizeConfig.promptVersion ?? null,
|
||
schema_version: String(payload.normalizeConfig.schemaVersion ?? payload.normalizeConfig.promptVersion ?? "")
|
||
.toLowerCase()
|
||
.includes("v2_0_2")
|
||
? "v2_0_2"
|
||
: String(payload.normalizeConfig.schemaVersion ?? payload.normalizeConfig.promptVersion ?? "")
|
||
.toLowerCase()
|
||
.includes("v2_0_1")
|
||
? "v2_0_1"
|
||
: "v2",
|
||
dataset: {
|
||
source: payload.rawQuestions ? "inline_raw_questions" : payload.caseSetFile ? "file" : "data/eval_cases/*.json",
|
||
file: payload.caseSetFile ?? null,
|
||
raw_questions_count: payload.rawQuestions ? parseRawQuestions(payload.rawQuestions).length : null
|
||
},
|
||
cases_total: payload.cases.length,
|
||
metrics,
|
||
budget: {
|
||
requests_total: requestsTotal,
|
||
retries_used: retriesUsed
|
||
},
|
||
clarification_eval: {
|
||
labeled_cases: clarificationLabeledCases,
|
||
true_positive: clarificationTruePositive,
|
||
false_positive: clarificationFalsePositive,
|
||
false_negative: clarificationFalseNegative
|
||
},
|
||
route_eval: {
|
||
labeled_cases: routeLabeledCases,
|
||
correct_cases: routeCorrectCases,
|
||
expected_routed_cases: expectedRoutedCases,
|
||
no_route_true_positive: noRouteTruePositive,
|
||
no_route_false_positive: noRouteFalsePositive
|
||
},
|
||
scope_eval: {
|
||
labeled_cases: scopeLabeledCases,
|
||
correct_cases: scopeCorrectCases
|
||
},
|
||
execution_state_eval: {
|
||
checks_total: stateConsistencyChecks,
|
||
checks_passed: stateConsistencyPass
|
||
},
|
||
route_distribution: routeCounter,
|
||
fallback_distribution: fallbackCounter,
|
||
results
|
||
};
|
||
(0, files_1.ensureDir)(config_1.EVAL_CASES_DIR);
|
||
tryWriteJsonFile(path_1.default.resolve(config_1.EVAL_CASES_DIR, `${runId}.report.json`), report);
|
||
return report;
|
||
}
|
||
collectAssistantSignals(finalResponse, turnResponses) {
|
||
const debug = finalResponse.debug;
|
||
const retrievalResults = Array.isArray(debug?.retrieval_results) ? debug.retrieval_results : [];
|
||
const sourceRefSet = new Set();
|
||
const limitationCodeSet = new Set();
|
||
const routeSet = new Set();
|
||
const confidenceScores = [];
|
||
const narrowingOrder = { weak: 0, medium: 1, strong: 2 };
|
||
let broadQueryDetected = false;
|
||
let broadResultFlag = false;
|
||
let minimumEvidenceFailed = false;
|
||
let degradedTo = null;
|
||
let narrowingStrength = null;
|
||
for (const result of retrievalResults) {
|
||
routeSet.add(String(result.route ?? "unknown"));
|
||
const summary = result.summary ?? {};
|
||
if (summary.broad_query_detected === true)
|
||
broadQueryDetected = true;
|
||
if (summary.broad_result_flag === true)
|
||
broadResultFlag = true;
|
||
if (summary.minimum_evidence_failed === true)
|
||
minimumEvidenceFailed = true;
|
||
const degraded = toDegradedTo(summary.degraded_to);
|
||
if (degraded === "clarification") {
|
||
degradedTo = "clarification";
|
||
}
|
||
else if (!degradedTo && degraded === "partial") {
|
||
degradedTo = "partial";
|
||
}
|
||
const narrowed = toNarrowingStrength(summary.narrowing_strength);
|
||
if (narrowed && (!narrowingStrength || narrowingOrder[narrowed] < narrowingOrder[narrowingStrength])) {
|
||
narrowingStrength = narrowed;
|
||
}
|
||
if (result.confidence === "high")
|
||
confidenceScores.push(3);
|
||
if (result.confidence === "medium")
|
||
confidenceScores.push(2);
|
||
if (result.confidence === "low")
|
||
confidenceScores.push(1);
|
||
for (const evidence of Array.isArray(result.evidence) ? result.evidence : []) {
|
||
const canonicalRef = String(evidence.source_ref?.canonical_ref ?? "").trim();
|
||
if (canonicalRef) {
|
||
sourceRefSet.add(canonicalRef);
|
||
}
|
||
const reasonCode = String(evidence.limitation?.reason_code ?? "").trim();
|
||
if (reasonCode) {
|
||
limitationCodeSet.add(reasonCode);
|
||
}
|
||
if (evidence.confidence === "high")
|
||
confidenceScores.push(3);
|
||
if (evidence.confidence === "medium")
|
||
confidenceScores.push(2);
|
||
if (evidence.confidence === "low")
|
||
confidenceScores.push(1);
|
||
}
|
||
}
|
||
const averageConfidence = confidenceScores.length > 0 ? confidenceScores.reduce((acc, item) => acc + item, 0) / confidenceScores.length : null;
|
||
const evidenceConfidence = averageConfidence === null ? null : averageConfidence >= 2.6 ? "high" : averageConfidence >= 1.8 ? "medium" : "low";
|
||
const mechanismStatus = debug?.answer_structure_v11?.mechanism_block?.status === "grounded" ||
|
||
debug?.answer_structure_v11?.mechanism_block?.status === "limited" ||
|
||
debug?.answer_structure_v11?.mechanism_block?.status === "unresolved"
|
||
? debug.answer_structure_v11.mechanism_block.status
|
||
: null;
|
||
const followupStateApplied = turnResponses.some((item) => item.debug?.followup_state_usage?.applied === true);
|
||
const uncertaintyLimitationsCount = debug?.answer_structure_v11?.uncertainty_block?.limitations?.length ?? 0;
|
||
return {
|
||
broad_query_detected: broadQueryDetected,
|
||
broad_result_flag: broadResultFlag,
|
||
narrowing_strength: narrowingStrength,
|
||
minimum_evidence_failed: minimumEvidenceFailed,
|
||
degraded_to: degradedTo,
|
||
evidence_confidence: evidenceConfidence,
|
||
limitation_reason_codes: [...limitationCodeSet],
|
||
mechanism_status: mechanismStatus,
|
||
source_refs: [...sourceRefSet],
|
||
routes: [...routeSet],
|
||
followup_state_applied: followupStateApplied,
|
||
uncertainty_limitations_count: uncertaintyLimitationsCount
|
||
};
|
||
}
|
||
collectAssistantStage2Signals(finalResponse, turnResponses) {
|
||
const base = this.collectAssistantSignals(finalResponse, turnResponses);
|
||
const debug = finalResponse.debug;
|
||
const retrievalResults = Array.isArray(debug?.retrieval_results) ? debug.retrieval_results : [];
|
||
const typeSet = new Set();
|
||
const mechanismSummaries = new Set();
|
||
let candidateEvidenceTotal = 0;
|
||
let problemUnitsTotal = 0;
|
||
let duplicateCollapsesTotal = 0;
|
||
for (const result of retrievalResults) {
|
||
const candidates = Array.isArray(result.candidate_evidence) ? result.candidate_evidence : [];
|
||
candidateEvidenceTotal += candidates.length;
|
||
const problemUnits = Array.isArray(result.problem_units) ? result.problem_units : [];
|
||
problemUnitsTotal += problemUnits.length;
|
||
for (const unit of problemUnits) {
|
||
const unitType = toProblemUnitType(unit.problem_unit_type);
|
||
if (unitType) {
|
||
typeSet.add(unitType);
|
||
}
|
||
const mechanismSummary = String(unit.mechanism_summary ?? "").trim();
|
||
if (mechanismSummary) {
|
||
mechanismSummaries.add(mechanismSummary);
|
||
}
|
||
}
|
||
if (result.problem_unit_summary && typeof result.problem_unit_summary.duplicate_collapses === "number") {
|
||
duplicateCollapsesTotal += Number(result.problem_unit_summary.duplicate_collapses);
|
||
}
|
||
}
|
||
const answerMode = typeof debug?.problem_answer_mode === "string" ? debug.problem_answer_mode : null;
|
||
const unitsUsedCount = Number(debug?.problem_units_used_count ?? 0);
|
||
const unitIdsUsed = Array.isArray(debug?.problem_unit_ids_used)
|
||
? debug.problem_unit_ids_used
|
||
.map((item) => String(item ?? "").trim())
|
||
.filter(Boolean)
|
||
: [];
|
||
const problemCentricApplied = debug?.problem_centric_answer_applied === true || answerMode === "stage2_problem_centric_v1";
|
||
return {
|
||
...base,
|
||
candidate_evidence_total: candidateEvidenceTotal,
|
||
problem_units_total: problemUnitsTotal,
|
||
problem_unit_types: [...typeSet],
|
||
problem_mechanism_summaries: [...mechanismSummaries],
|
||
duplicate_collapses_total: duplicateCollapsesTotal,
|
||
problem_centric_answer_applied: problemCentricApplied,
|
||
problem_units_used_count: unitsUsedCount,
|
||
problem_answer_mode: answerMode,
|
||
problem_unit_ids_used: unitIdsUsed,
|
||
entity_leakage_detected: detectEntityLeakage(String(finalResponse.assistant_reply ?? ""))
|
||
};
|
||
}
|
||
getExpectedProblemUnitTypes(suiteCase) {
|
||
const expected = Array.isArray(suiteCase.expected_hints?.expected_problem_unit_types)
|
||
? suiteCase.expected_hints?.expected_problem_unit_types
|
||
: [];
|
||
const output = new Set();
|
||
for (const value of expected ?? []) {
|
||
const mapped = toProblemUnitType(value);
|
||
if (mapped) {
|
||
output.add(mapped);
|
||
}
|
||
}
|
||
return [...output];
|
||
}
|
||
computeProblemUnitPrecision(expectedTypes, detectedTypes) {
|
||
const uniqueExpected = [...new Set(expectedTypes)];
|
||
const uniqueDetected = [...new Set(detectedTypes)];
|
||
if (uniqueDetected.length === 0) {
|
||
return uniqueExpected.length === 0 ? 1 : 0;
|
||
}
|
||
if (uniqueExpected.length === 0) {
|
||
return 0;
|
||
}
|
||
const matchedDetected = uniqueDetected.filter((item) => uniqueExpected.includes(item)).length;
|
||
return round2(matchedDetected / uniqueDetected.length);
|
||
}
|
||
computeProblemUnitRecallProxy(expectedTypes, detectedTypes) {
|
||
const uniqueExpected = [...new Set(expectedTypes)];
|
||
const uniqueDetected = [...new Set(detectedTypes)];
|
||
if (uniqueExpected.length === 0) {
|
||
return null;
|
||
}
|
||
if (uniqueDetected.length === 0) {
|
||
return 0;
|
||
}
|
||
const matchedExpected = uniqueExpected.filter((item) => uniqueDetected.includes(item)).length;
|
||
return round2(matchedExpected / uniqueExpected.length);
|
||
}
|
||
computeDuplicateCollapseRate(candidateTotal, duplicateCollapses) {
|
||
if (candidateTotal <= 0) {
|
||
return null;
|
||
}
|
||
return round2(Math.min(1, Math.max(0, duplicateCollapses / candidateTotal)));
|
||
}
|
||
computeMechanismCoherenceScore(finalResponse, signals) {
|
||
const mechanismBlock = finalResponse.debug?.answer_structure_v11?.mechanism_block;
|
||
const mechanismStatus = mechanismBlock?.status;
|
||
const mechanismNotes = extractTextList(mechanismBlock?.mechanism_notes);
|
||
const hasProblemMechanism = signals.problem_mechanism_summaries.length > 0;
|
||
let score = 0;
|
||
if (mechanismStatus === "grounded" && hasProblemMechanism && mechanismNotes.length > 0) {
|
||
score = 5;
|
||
}
|
||
else if ((mechanismStatus === "limited" || mechanismStatus === "unresolved") && (hasProblemMechanism || mechanismNotes.length > 0)) {
|
||
score = 3;
|
||
}
|
||
else if (hasProblemMechanism || mechanismNotes.length > 0) {
|
||
score = 2;
|
||
}
|
||
if (mechanismStatus === "grounded" && !hasProblemMechanism) {
|
||
score = Math.min(score, 2);
|
||
}
|
||
if (signals.limitation_reason_codes.includes("missing_mechanism")) {
|
||
score -= 1;
|
||
}
|
||
return clampScore(score);
|
||
}
|
||
computeProblemClarityScore(finalResponse, signals) {
|
||
const structure = finalResponse.debug?.answer_structure_v11;
|
||
const answerSummary = String(structure?.answer_summary ?? "").trim();
|
||
const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? "").trim();
|
||
const recommendedActions = extractTextList(structure?.next_step_block?.recommended_actions);
|
||
const clarificationQuestions = extractTextList(structure?.next_step_block?.clarification_questions);
|
||
const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations);
|
||
let score = 0;
|
||
if (answerSummary.length > 20)
|
||
score += 1;
|
||
if (directAnswer.length > 20)
|
||
score += 1;
|
||
if (hasDomainAnchors(`${answerSummary} ${directAnswer}`))
|
||
score += 1;
|
||
if (recommendedActions.length > 0 || clarificationQuestions.length > 0)
|
||
score += 1;
|
||
if (signals.problem_units_total > 0 || signals.problem_centric_answer_applied)
|
||
score += 1;
|
||
if ((signals.minimum_evidence_failed || signals.degraded_to === "clarification") && uncertaintyLimitations.length === 0) {
|
||
score -= 1;
|
||
}
|
||
if (signals.entity_leakage_detected) {
|
||
score -= 1;
|
||
}
|
||
return clampScore(score);
|
||
}
|
||
computeAssistantMetrics(input) {
|
||
const diagnostics = input.diagnostics;
|
||
const total = Math.max(1, diagnostics.length);
|
||
const signatureCounter = diagnostics.reduce((acc, item) => {
|
||
acc[item.signature] = (acc[item.signature] ?? 0) + 1;
|
||
return acc;
|
||
}, {});
|
||
const uniqueSignatures = Object.keys(signatureCounter).length;
|
||
const genericCases = diagnostics.filter((item) => item.is_generic).length;
|
||
const falseConfidenceCases = diagnostics.filter((item) => item.is_false_confident).length;
|
||
const broadCases = diagnostics.filter((item) => item.is_broad_answer !== null);
|
||
const broadAnswerCases = broadCases.filter((item) => item.is_broad_answer === true).length;
|
||
const followupCases = diagnostics.filter((item) => item.followup_retention_score !== null);
|
||
const avgActionability = diagnostics.length > 0
|
||
? diagnostics.reduce((acc, item) => acc + item.accountant_actionability_score, 0) / diagnostics.length
|
||
: null;
|
||
const avgMechanism = diagnostics.length > 0 ? diagnostics.reduce((acc, item) => acc + item.mechanism_specificity_score, 0) / diagnostics.length : null;
|
||
const avgFollowup = followupCases.length > 0
|
||
? followupCases.reduce((acc, item) => acc + Number(item.followup_retention_score ?? 0), 0) / followupCases.length
|
||
: null;
|
||
const raw = {
|
||
retrieval_differentiation_rate: round2(uniqueSignatures / total),
|
||
generic_explanation_rate: round2(genericCases / total),
|
||
accountant_actionability_score: avgActionability === null ? null : round2(avgActionability),
|
||
false_confidence_rate: round2(falseConfidenceCases / total),
|
||
broad_answer_rate: broadCases.length > 0 ? round2(broadAnswerCases / broadCases.length) : null,
|
||
mechanism_specificity_score: avgMechanism === null ? null : round2(avgMechanism),
|
||
followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup)
|
||
};
|
||
const rubric_bands = {
|
||
retrieval_differentiation_rate: rubricBandForMetric("retrieval_differentiation_rate", raw.retrieval_differentiation_rate),
|
||
generic_explanation_rate: rubricBandForMetric("generic_explanation_rate", raw.generic_explanation_rate),
|
||
accountant_actionability_score: rubricBandForMetric("accountant_actionability_score", raw.accountant_actionability_score),
|
||
false_confidence_rate: rubricBandForMetric("false_confidence_rate", raw.false_confidence_rate),
|
||
broad_answer_rate: rubricBandForMetric("broad_answer_rate", raw.broad_answer_rate),
|
||
mechanism_specificity_score: rubricBandForMetric("mechanism_specificity_score", raw.mechanism_specificity_score),
|
||
followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score)
|
||
};
|
||
return {
|
||
raw,
|
||
rubric_bands,
|
||
denominators: {
|
||
cases_total: diagnostics.length,
|
||
broad_cases_total: broadCases.length,
|
||
followup_cases_total: followupCases.length
|
||
},
|
||
signature_counts: signatureCounter
|
||
};
|
||
}
|
||
computeAssistantStage2Metrics(input) {
|
||
const diagnostics = input.diagnostics;
|
||
const signatureCounter = diagnostics.reduce((acc, item) => {
|
||
acc[item.signature] = (acc[item.signature] ?? 0) + 1;
|
||
return acc;
|
||
}, {});
|
||
const precisionValues = diagnostics
|
||
.map((item) => item.problem_unit_precision)
|
||
.filter((item) => typeof item === "number");
|
||
const recallValues = diagnostics
|
||
.map((item) => item.problem_unit_recall_proxy)
|
||
.filter((item) => typeof item === "number");
|
||
const collapseValues = diagnostics
|
||
.map((item) => item.duplicate_collapse_rate)
|
||
.filter((item) => typeof item === "number");
|
||
const mechanismValues = diagnostics.map((item) => item.mechanism_coherence_score);
|
||
const clarityValues = diagnostics.map((item) => item.problem_clarity_score);
|
||
const firstApplicable = diagnostics.filter((item) => item.problem_first_answer_applied !== null);
|
||
const firstApplied = firstApplicable.filter((item) => item.problem_first_answer_applied === true).length;
|
||
const leakageCases = diagnostics.filter((item) => item.entity_leakage).length;
|
||
const followupCases = diagnostics.filter((item) => item.suite_case.question_type === "followup" || item.turn_count > 1);
|
||
const candidateCases = diagnostics.filter((item) => item.signals.candidate_evidence_total > 0);
|
||
const expectedProblemCases = diagnostics.filter((item) => item.expected_problem_first);
|
||
const average = (values) => {
|
||
if (values.length === 0)
|
||
return null;
|
||
return round2(values.reduce((acc, item) => acc + item, 0) / values.length);
|
||
};
|
||
const raw = {
|
||
problem_unit_precision: average(precisionValues),
|
||
problem_unit_recall_proxy: average(recallValues),
|
||
duplicate_collapse_rate: average(collapseValues),
|
||
mechanism_coherence_score: average(mechanismValues),
|
||
problem_clarity_score: average(clarityValues),
|
||
problem_first_answer_rate: firstApplicable.length > 0 ? round2(firstApplied / firstApplicable.length) : null,
|
||
entity_leakage_rate: diagnostics.length > 0 ? round2(leakageCases / diagnostics.length) : null
|
||
};
|
||
const rubric_bands = {
|
||
problem_unit_precision: rubricBandForMetricStage2("problem_unit_precision", raw.problem_unit_precision),
|
||
problem_unit_recall_proxy: rubricBandForMetricStage2("problem_unit_recall_proxy", raw.problem_unit_recall_proxy),
|
||
duplicate_collapse_rate: rubricBandForMetricStage2("duplicate_collapse_rate", raw.duplicate_collapse_rate),
|
||
mechanism_coherence_score: rubricBandForMetricStage2("mechanism_coherence_score", raw.mechanism_coherence_score),
|
||
problem_clarity_score: rubricBandForMetricStage2("problem_clarity_score", raw.problem_clarity_score),
|
||
problem_first_answer_rate: rubricBandForMetricStage2("problem_first_answer_rate", raw.problem_first_answer_rate),
|
||
entity_leakage_rate: rubricBandForMetricStage2("entity_leakage_rate", raw.entity_leakage_rate)
|
||
};
|
||
return {
|
||
raw,
|
||
rubric_bands,
|
||
denominators: {
|
||
cases_total: diagnostics.length,
|
||
expected_problem_cases_total: expectedProblemCases.length,
|
||
followup_cases_total: followupCases.length,
|
||
candidate_cases_total: candidateCases.length,
|
||
precision_cases_total: precisionValues.length,
|
||
recall_cases_total: recallValues.length,
|
||
duplicate_collapse_cases_total: collapseValues.length,
|
||
problem_first_applicable_cases_total: firstApplicable.length
|
||
},
|
||
signature_counts: signatureCounter
|
||
};
|
||
}
|
||
buildAssistantComparisonReport(input) {
|
||
const baselineRef = readEvalReportByRef(input.baselineReportFile);
|
||
const baselinePath = baselineRef.resolved_path;
|
||
const baselineReport = baselineRef.report;
|
||
const currentReport = input.currentReport;
|
||
const metricKeys = [
|
||
"retrieval_differentiation_rate",
|
||
"generic_explanation_rate",
|
||
"accountant_actionability_score",
|
||
"false_confidence_rate",
|
||
"broad_answer_rate",
|
||
"mechanism_specificity_score",
|
||
"followup_context_retention_score"
|
||
];
|
||
const lowerIsBetter = new Set(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]);
|
||
const baselineRaw = (baselineReport.metrics ?? {}).raw ?? {};
|
||
const currentRaw = (currentReport.metrics ?? {}).raw ?? {};
|
||
const deltas = {};
|
||
for (const metric of metricKeys) {
|
||
const baseline = typeof baselineRaw[metric] === "number" ? Number(baselineRaw[metric]) : null;
|
||
const current = typeof currentRaw[metric] === "number" ? Number(currentRaw[metric]) : null;
|
||
const delta = baseline !== null && current !== null ? round2(current - baseline) : null;
|
||
let trend = "n/a";
|
||
if (baseline !== null && current !== null) {
|
||
const improved = lowerIsBetter.has(metric) ? current < baseline - 0.01 : current > baseline + 0.01;
|
||
const weakened = lowerIsBetter.has(metric) ? current > baseline + 0.01 : current < baseline - 0.01;
|
||
trend = improved ? "improved" : weakened ? "weakened" : "unchanged";
|
||
}
|
||
deltas[metric] = { baseline, current, delta, trend };
|
||
}
|
||
const baselineResults = Array.isArray(baselineReport.results) ? baselineReport.results : [];
|
||
const currentResults = Array.isArray(currentReport.results) ? currentReport.results : [];
|
||
const baselineByCase = new Map();
|
||
for (const row of baselineResults) {
|
||
baselineByCase.set(String(row.case_id ?? ""), row);
|
||
}
|
||
const improvedNotes = [];
|
||
const unchangedNotes = [];
|
||
const weakenedNotes = [];
|
||
for (const row of currentResults) {
|
||
const caseId = String(row.case_id ?? "");
|
||
const currentUsefulness = typeof row.accountant_usefulness_score === "number" ? Number(row.accountant_usefulness_score) : null;
|
||
const baselineRow = baselineByCase.get(caseId);
|
||
const baselineUsefulness = baselineRow && typeof baselineRow.accountant_usefulness_score === "number"
|
||
? Number(baselineRow.accountant_usefulness_score)
|
||
: null;
|
||
if (baselineUsefulness === null || currentUsefulness === null) {
|
||
continue;
|
||
}
|
||
const delta = round2(currentUsefulness - baselineUsefulness);
|
||
const note = `${caseId}: usefulness ${baselineUsefulness} -> ${currentUsefulness} (delta ${delta})`;
|
||
if (delta > 0.25) {
|
||
improvedNotes.push(note);
|
||
}
|
||
else if (delta < -0.25) {
|
||
weakenedNotes.push(note);
|
||
}
|
||
else {
|
||
unchangedNotes.push(note);
|
||
}
|
||
}
|
||
const comparisonId = `assistant-compare-${(0, nanoid_1.nanoid)(8)}`;
|
||
const comparisonReport = {
|
||
schema_version: ASSISTANT_STAGE1_COMPARISON_SCHEMA_VERSION,
|
||
comparison_id: comparisonId,
|
||
run_timestamp: new Date().toISOString(),
|
||
baseline_run_id: baselineReport.run_id ?? null,
|
||
current_run_id: currentReport.run_id ?? null,
|
||
eval_target: "assistant_stage1",
|
||
suite_id: currentReport.suite_id ?? baselineReport.suite_id ?? null,
|
||
suite_version: currentReport.suite_version ?? baselineReport.suite_version ?? null,
|
||
baseline_report_file: baselinePath,
|
||
current_report_file: currentReport.artifacts && typeof currentReport.artifacts === "object"
|
||
? currentReport.artifacts.run_report_json_path ?? null
|
||
: null,
|
||
metric_deltas: deltas,
|
||
scenario_notes_summary: {
|
||
improved: improvedNotes.length,
|
||
unchanged: unchangedNotes.length,
|
||
weakened: weakenedNotes.length
|
||
},
|
||
scenario_notes: {
|
||
improved: improvedNotes,
|
||
unchanged: unchangedNotes,
|
||
weakened: weakenedNotes
|
||
},
|
||
known_limitations: currentReport.known_limitations ?? [
|
||
"Comparison is run-to-run and depends on stable mock/runtime flags.",
|
||
"Metrics remain Stage 1 heuristic bands, not full product scorecards."
|
||
],
|
||
report_title: "Assistant Stage 1 Baseline vs Current"
|
||
};
|
||
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
|
||
const jsonPath = path_1.default.resolve(config_1.REPORTS_DIR, `${comparisonId}.json`);
|
||
const mdPath = path_1.default.resolve(config_1.REPORTS_DIR, `${comparisonId}.md`);
|
||
const jsonWritten = tryWriteJsonFile(jsonPath, comparisonReport);
|
||
const mdWritten = tryWriteTextFile(mdPath, buildAssistantComparisonMarkdownReport(comparisonReport));
|
||
const comparisonRef = jsonWritten ? jsonPath : putInMemoryEvalReport(comparisonReport);
|
||
return {
|
||
...comparisonReport,
|
||
artifacts: {
|
||
comparison_report_json_path: comparisonRef,
|
||
comparison_report_md_path: mdWritten ? mdPath : null
|
||
}
|
||
};
|
||
}
|
||
buildAssistantStage2ComparisonReport(input) {
|
||
const baselineRef = readEvalReportByRef(input.baselineReportFile);
|
||
const baselinePath = baselineRef.resolved_path;
|
||
const baselineReport = baselineRef.report;
|
||
const currentReport = input.currentReport;
|
||
const metricKeys = [
|
||
"problem_unit_precision",
|
||
"problem_unit_recall_proxy",
|
||
"duplicate_collapse_rate",
|
||
"mechanism_coherence_score",
|
||
"problem_clarity_score",
|
||
"problem_first_answer_rate",
|
||
"entity_leakage_rate"
|
||
];
|
||
const lowerIsBetter = new Set(["entity_leakage_rate"]);
|
||
const baselineRaw = (baselineReport.metrics ?? {}).raw ?? {};
|
||
const currentRaw = (currentReport.metrics ?? {}).raw ?? {};
|
||
const deltas = {};
|
||
for (const metric of metricKeys) {
|
||
const baseline = typeof baselineRaw[metric] === "number" ? Number(baselineRaw[metric]) : null;
|
||
const current = typeof currentRaw[metric] === "number" ? Number(currentRaw[metric]) : null;
|
||
const delta = baseline !== null && current !== null ? round2(current - baseline) : null;
|
||
let trend = "n/a";
|
||
if (baseline !== null && current !== null) {
|
||
const improved = lowerIsBetter.has(metric) ? current < baseline - 0.01 : current > baseline + 0.01;
|
||
const weakened = lowerIsBetter.has(metric) ? current > baseline + 0.01 : current < baseline - 0.01;
|
||
trend = improved ? "improved" : weakened ? "weakened" : "unchanged";
|
||
}
|
||
deltas[metric] = { baseline, current, delta, trend };
|
||
}
|
||
const baselineResults = Array.isArray(baselineReport.results) ? baselineReport.results : [];
|
||
const currentResults = Array.isArray(currentReport.results) ? currentReport.results : [];
|
||
const baselineByCase = new Map();
|
||
for (const row of baselineResults) {
|
||
baselineByCase.set(String(row.case_id ?? ""), row);
|
||
}
|
||
const improvedNotes = [];
|
||
const unchangedNotes = [];
|
||
const weakenedNotes = [];
|
||
const toComposite = (row) => {
|
||
if (!row || typeof row !== "object")
|
||
return null;
|
||
const metricSubscores = row.metric_subscores;
|
||
if (!metricSubscores)
|
||
return null;
|
||
const clarity = typeof metricSubscores.problem_clarity_score === "number" ? Number(metricSubscores.problem_clarity_score) : null;
|
||
const mechanism = typeof metricSubscores.mechanism_coherence_score === "number" ? Number(metricSubscores.mechanism_coherence_score) : null;
|
||
const firstRate = typeof metricSubscores.problem_first_answer_rate === "number" ? Number(metricSubscores.problem_first_answer_rate) : null;
|
||
const leakageRate = typeof metricSubscores.entity_leakage_rate === "number" ? Number(metricSubscores.entity_leakage_rate) : null;
|
||
if (clarity === null || mechanism === null || firstRate === null || leakageRate === null) {
|
||
return null;
|
||
}
|
||
return round2((clarity + mechanism + firstRate * 5 + (1 - leakageRate) * 5) / 4);
|
||
};
|
||
for (const row of currentResults) {
|
||
const caseId = String(row.case_id ?? "");
|
||
const currentComposite = toComposite(row);
|
||
const baselineComposite = toComposite(baselineByCase.get(caseId));
|
||
if (currentComposite === null || baselineComposite === null) {
|
||
continue;
|
||
}
|
||
const delta = round2(currentComposite - baselineComposite);
|
||
const note = `${caseId}: composite ${baselineComposite} -> ${currentComposite} (delta ${delta})`;
|
||
if (delta > 0.25) {
|
||
improvedNotes.push(note);
|
||
}
|
||
else if (delta < -0.25) {
|
||
weakenedNotes.push(note);
|
||
}
|
||
else {
|
||
unchangedNotes.push(note);
|
||
}
|
||
}
|
||
const comparisonId = `assistant-stage2-compare-${(0, nanoid_1.nanoid)(8)}`;
|
||
const comparisonReport = {
|
||
schema_version: ASSISTANT_STAGE2_COMPARISON_SCHEMA_VERSION,
|
||
comparison_id: comparisonId,
|
||
run_timestamp: new Date().toISOString(),
|
||
baseline_run_id: baselineReport.run_id ?? null,
|
||
current_run_id: currentReport.run_id ?? null,
|
||
eval_target: "assistant_stage2",
|
||
suite_id: currentReport.suite_id ?? baselineReport.suite_id ?? null,
|
||
suite_version: currentReport.suite_version ?? baselineReport.suite_version ?? null,
|
||
baseline_report_file: baselinePath,
|
||
current_report_file: currentReport.artifacts && typeof currentReport.artifacts === "object"
|
||
? currentReport.artifacts.run_report_json_path ?? null
|
||
: null,
|
||
metric_deltas: deltas,
|
||
scenario_notes_summary: {
|
||
improved: improvedNotes.length,
|
||
unchanged: unchangedNotes.length,
|
||
weakened: weakenedNotes.length
|
||
},
|
||
scenario_notes: {
|
||
improved: improvedNotes,
|
||
unchanged: unchangedNotes,
|
||
weakened: weakenedNotes
|
||
},
|
||
known_limitations: currentReport.known_limitations ?? [
|
||
"Stage 2 comparison remains run-to-run and depends on stable feature profile.",
|
||
"Metrics are Stage 2 Wave 5 heuristics, not final product scorecards."
|
||
],
|
||
report_title: "Assistant Stage 2 Baseline vs Current"
|
||
};
|
||
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
|
||
const jsonPath = path_1.default.resolve(config_1.REPORTS_DIR, `${comparisonId}.json`);
|
||
const mdPath = path_1.default.resolve(config_1.REPORTS_DIR, `${comparisonId}.md`);
|
||
const jsonWritten = tryWriteJsonFile(jsonPath, comparisonReport);
|
||
const mdWritten = tryWriteTextFile(mdPath, buildAssistantStage2ComparisonMarkdownReport(comparisonReport));
|
||
const comparisonRef = jsonWritten ? jsonPath : putInMemoryEvalReport(comparisonReport);
|
||
return {
|
||
...comparisonReport,
|
||
artifacts: {
|
||
comparison_report_json_path: comparisonRef,
|
||
comparison_report_md_path: mdWritten ? mdPath : null
|
||
}
|
||
};
|
||
}
|
||
async runAssistantStage1(payload) {
|
||
if (!config_1.FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1) {
|
||
throw new http_1.ApiError("ASSISTANT_STAGE1_EVAL_DISABLED", "Assistant Stage 1 eval target is disabled by FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1.", 409);
|
||
}
|
||
const suite = parseAssistantSuiteFile(payload.caseSetFile);
|
||
const suiteCases = suite.cases.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id));
|
||
const runId = `assistant-stage1-${(0, nanoid_1.nanoid)(10)}`;
|
||
const assistantService = new assistantService_1.AssistantService(this.normalizerService, new assistantSessionStore_1.AssistantSessionStore());
|
||
const diagnostics = [];
|
||
let requestsTotal = 0;
|
||
for (const suiteCase of suiteCases) {
|
||
const sessionId = `${runId}-${suiteCase.case_id}`;
|
||
const turnResponses = [];
|
||
const notes = [];
|
||
const limitations = [];
|
||
try {
|
||
for (const turn of suiteCase.turns) {
|
||
const response = (await assistantService.handleMessage({
|
||
session_id: sessionId,
|
||
user_message: turn.user_message,
|
||
message: turn.user_message,
|
||
mode: "assistant",
|
||
apiKey: payload.normalizeConfig.apiKey,
|
||
model: payload.normalizeConfig.model,
|
||
baseUrl: payload.normalizeConfig.baseUrl,
|
||
temperature: payload.normalizeConfig.temperature,
|
||
maxOutputTokens: payload.normalizeConfig.maxOutputTokens,
|
||
promptVersion: payload.normalizeConfig.promptVersion,
|
||
systemPrompt: payload.normalizeConfig.systemPrompt,
|
||
developerPrompt: payload.normalizeConfig.developerPrompt,
|
||
domainPrompt: payload.normalizeConfig.domainPrompt,
|
||
fewShotExamples: payload.normalizeConfig.fewShotExamples,
|
||
useMock: payload.useMock
|
||
}));
|
||
turnResponses.push(response);
|
||
requestsTotal += 1;
|
||
}
|
||
}
|
||
catch (error) {
|
||
const errorMessage = error instanceof Error ? error.message : String(error);
|
||
diagnostics.push({
|
||
suite_case: suiteCase,
|
||
session_id: sessionId,
|
||
trace_id: null,
|
||
final_reply_type: "backend_error",
|
||
turn_count: turnResponses.length,
|
||
narrowing_result: "failed",
|
||
signature: `backend_error|${suiteCase.scenario_tag}`,
|
||
is_generic: true,
|
||
is_false_confident: false,
|
||
is_broad_answer: suiteCase.broadness_level === "low" ? null : false,
|
||
followup_retention_score: suiteCase.question_type === "followup" || suiteCase.turns.length > 1 ? 0 : null,
|
||
evidence_quality_score: 0,
|
||
mechanism_specificity_score: 0,
|
||
genericness_score: 5,
|
||
accountant_actionability_score: 0,
|
||
accountant_usefulness_score: 0,
|
||
signals: {
|
||
broad_query_detected: suiteCase.broadness_level !== "low",
|
||
broad_result_flag: false,
|
||
narrowing_strength: null,
|
||
minimum_evidence_failed: true,
|
||
degraded_to: "clarification",
|
||
evidence_confidence: "low",
|
||
limitation_reason_codes: [],
|
||
mechanism_status: null,
|
||
source_refs: [],
|
||
routes: [],
|
||
followup_state_applied: false,
|
||
uncertainty_limitations_count: 0
|
||
},
|
||
limitations: [errorMessage],
|
||
notes: [`Case execution failed: ${errorMessage}`]
|
||
});
|
||
continue;
|
||
}
|
||
const finalResponse = turnResponses[turnResponses.length - 1];
|
||
const signals = this.collectAssistantSignals(finalResponse, turnResponses);
|
||
const structure = finalResponse.debug?.answer_structure_v11 ?? null;
|
||
const recommendedActions = extractTextList(structure?.next_step_block?.recommended_actions);
|
||
const clarificationQuestions = extractTextList(structure?.next_step_block?.clarification_questions);
|
||
const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes);
|
||
const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations);
|
||
const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? "");
|
||
const hasAnchors = hasDomainAnchors([directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" "));
|
||
let genericnessScore = 0;
|
||
if (!hasAnchors)
|
||
genericnessScore += 2;
|
||
if (mechanismNotes.length === 0)
|
||
genericnessScore += 1;
|
||
if (signals.source_refs.length === 0)
|
||
genericnessScore += 1;
|
||
if (recommendedActions.length === 0)
|
||
genericnessScore += 1;
|
||
genericnessScore = clampScore(genericnessScore);
|
||
let actionabilityScore = 0;
|
||
if (recommendedActions.length > 0)
|
||
actionabilityScore += 2;
|
||
if (recommendedActions.some((item) => hasDomainAnchors(item)))
|
||
actionabilityScore += 2;
|
||
if (clarificationQuestions.length > 0 && (finalResponse.reply_type === "clarification_required" || signals.degraded_to === "clarification")) {
|
||
actionabilityScore += 1;
|
||
}
|
||
if (signals.source_refs.length > 0 && actionabilityScore < 5) {
|
||
actionabilityScore += 1;
|
||
}
|
||
actionabilityScore = clampScore(actionabilityScore);
|
||
let evidenceQualityScore = 0;
|
||
if (signals.source_refs.length >= 3)
|
||
evidenceQualityScore += 2;
|
||
else if (signals.source_refs.length > 0)
|
||
evidenceQualityScore += 1;
|
||
if (signals.evidence_confidence === "high")
|
||
evidenceQualityScore += 2;
|
||
if (signals.evidence_confidence === "medium")
|
||
evidenceQualityScore += 1;
|
||
if (signals.minimum_evidence_failed)
|
||
evidenceQualityScore -= 2;
|
||
if (signals.limitation_reason_codes.includes("insufficient_detail"))
|
||
evidenceQualityScore -= 1;
|
||
if (signals.limitation_reason_codes.includes("missing_mechanism"))
|
||
evidenceQualityScore -= 1;
|
||
evidenceQualityScore = clampScore(evidenceQualityScore);
|
||
let mechanismSpecificityScore = 0;
|
||
if (signals.mechanism_status === "grounded" && mechanismNotes.length > 0 && !signals.limitation_reason_codes.includes("missing_mechanism")) {
|
||
mechanismSpecificityScore = 5;
|
||
}
|
||
else if (signals.mechanism_status === "limited" && mechanismNotes.length > 0) {
|
||
mechanismSpecificityScore = 3;
|
||
}
|
||
else if (mechanismNotes.length > 0) {
|
||
mechanismSpecificityScore = 2;
|
||
}
|
||
else {
|
||
mechanismSpecificityScore = 0;
|
||
}
|
||
const usefulnessScore = clampScore((actionabilityScore + (5 - genericnessScore) + evidenceQualityScore + mechanismSpecificityScore) / 4);
|
||
const isGeneric = genericnessScore >= 3;
|
||
const factualReply = finalResponse.reply_type === "factual" || finalResponse.reply_type === "factual_with_explanation";
|
||
const isFalseConfident = factualReply &&
|
||
(signals.minimum_evidence_failed ||
|
||
signals.degraded_to !== null ||
|
||
signals.evidence_confidence === "low" ||
|
||
(signals.limitation_reason_codes.length > 0 && signals.uncertainty_limitations_count === 0));
|
||
const isBroadCase = suiteCase.broadness_level !== "low" || signals.broad_query_detected;
|
||
const isBroadAnswer = isBroadCase
|
||
? factualReply && signals.degraded_to === null && !signals.minimum_evidence_failed
|
||
: null;
|
||
const isFollowupCase = suiteCase.question_type === "followup" || suiteCase.turns.length > 1;
|
||
let followupRetentionScore = null;
|
||
if (isFollowupCase) {
|
||
const finalTurnIndex = Number(finalResponse.debug?.investigation_state_snapshot?.turn_index ?? 0);
|
||
if (signals.followup_state_applied && finalTurnIndex >= suiteCase.turns.length) {
|
||
followupRetentionScore = 5;
|
||
}
|
||
else if (finalTurnIndex >= suiteCase.turns.length) {
|
||
followupRetentionScore = 3;
|
||
}
|
||
else {
|
||
followupRetentionScore = 0;
|
||
}
|
||
}
|
||
let narrowingResult = "not_required";
|
||
if (signals.degraded_to === "clarification" || finalResponse.reply_type === "clarification_required") {
|
||
narrowingResult = "clarification_requested";
|
||
}
|
||
else if (signals.broad_query_detected || signals.broad_result_flag) {
|
||
narrowingResult = signals.minimum_evidence_failed ? "failed" : "applied";
|
||
}
|
||
if (signals.minimum_evidence_failed) {
|
||
limitations.push("minimum_evidence_failed");
|
||
}
|
||
limitations.push(...signals.limitation_reason_codes.map((item) => `limitation_reason:${item}`));
|
||
if (signals.mechanism_status === "unresolved") {
|
||
limitations.push("mechanism_unresolved");
|
||
}
|
||
limitations.push(...uncertaintyLimitations);
|
||
if (isGeneric)
|
||
notes.push("genericness_high");
|
||
if (isFalseConfident)
|
||
notes.push("false_confidence_risk");
|
||
if (isBroadCase && isBroadAnswer)
|
||
notes.push("broad_answer_without_degradation");
|
||
if (followupRetentionScore !== null && followupRetentionScore < 3)
|
||
notes.push("followup_context_retention_weak");
|
||
diagnostics.push({
|
||
suite_case: suiteCase,
|
||
session_id: sessionId,
|
||
trace_id: finalResponse.debug?.trace_id ?? null,
|
||
final_reply_type: finalResponse.reply_type,
|
||
turn_count: suiteCase.turns.length,
|
||
narrowing_result: narrowingResult,
|
||
signature: [
|
||
finalResponse.reply_type,
|
||
signals.routes.sort().join(","),
|
||
signals.degraded_to ?? "none",
|
||
signals.mechanism_status ?? "unknown",
|
||
signals.source_refs.slice(0, 2).join(",")
|
||
].join("|"),
|
||
is_generic: isGeneric,
|
||
is_false_confident: isFalseConfident,
|
||
is_broad_answer: isBroadAnswer,
|
||
followup_retention_score: followupRetentionScore,
|
||
evidence_quality_score: evidenceQualityScore,
|
||
mechanism_specificity_score: mechanismSpecificityScore,
|
||
genericness_score: genericnessScore,
|
||
accountant_actionability_score: actionabilityScore,
|
||
accountant_usefulness_score: round2(usefulnessScore),
|
||
signals,
|
||
limitations: Array.from(new Set(limitations)),
|
||
notes
|
||
});
|
||
}
|
||
const metrics = this.computeAssistantMetrics({ diagnostics });
|
||
const caseRecords = diagnostics.map((item) => {
|
||
const signatureHits = metrics.signature_counts[item.signature] ?? 1;
|
||
const caseMetricVector = {
|
||
retrieval_differentiation_rate: signatureHits === 1 ? 1 : 0,
|
||
generic_explanation_rate: item.is_generic ? 1 : 0,
|
||
accountant_actionability_score: round2(item.accountant_actionability_score),
|
||
false_confidence_rate: item.is_false_confident ? 1 : 0,
|
||
broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0,
|
||
mechanism_specificity_score: round2(item.mechanism_specificity_score),
|
||
followup_context_retention_score: item.followup_retention_score === null ? null : round2(item.followup_retention_score)
|
||
};
|
||
return {
|
||
schema_version: stage1Contracts_1.ASSISTANT_EVAL_RECORD_SCHEMA_VERSION,
|
||
created_at: new Date().toISOString(),
|
||
case_id: item.suite_case.case_id,
|
||
scenario_tag: item.suite_case.scenario_tag,
|
||
session_id: item.session_id,
|
||
trace_id: item.trace_id,
|
||
question_type: item.suite_case.question_type,
|
||
broadness_level: item.suite_case.broadness_level,
|
||
narrowing_result: item.narrowing_result,
|
||
evidence_quality_score: round2(item.evidence_quality_score),
|
||
genericness_score: round2(item.genericness_score),
|
||
accountant_usefulness_score: round2(item.accountant_usefulness_score),
|
||
accountant_metrics: caseMetricVector,
|
||
raw_signals: {
|
||
final_reply_type: item.final_reply_type,
|
||
turn_count: item.turn_count,
|
||
broad_query_detected: item.signals.broad_query_detected,
|
||
broad_result_flag: item.signals.broad_result_flag,
|
||
narrowing_strength: item.signals.narrowing_strength,
|
||
minimum_evidence_failed: item.signals.minimum_evidence_failed,
|
||
degraded_to: item.signals.degraded_to,
|
||
evidence_confidence: item.signals.evidence_confidence,
|
||
limitation_reason_codes: item.signals.limitation_reason_codes,
|
||
mechanism_status: item.signals.mechanism_status,
|
||
source_refs: item.signals.source_refs,
|
||
routes: item.signals.routes,
|
||
followup_state_applied: item.signals.followup_state_applied
|
||
},
|
||
metric_subscores: caseMetricVector,
|
||
limitations: item.limitations,
|
||
notes: item.notes
|
||
};
|
||
});
|
||
const strongestSignals = Object.entries(metrics.rubric_bands)
|
||
.filter(([, band]) => band?.score === 5)
|
||
.map(([name]) => name);
|
||
const weakestSignals = Object.entries(metrics.rubric_bands)
|
||
.filter(([, band]) => band?.score === 0)
|
||
.map(([name]) => name);
|
||
const runTimestamp = new Date().toISOString();
|
||
const report = {
|
||
schema_version: ASSISTANT_STAGE1_RUN_SCHEMA_VERSION,
|
||
run_id: runId,
|
||
run_timestamp: runTimestamp,
|
||
eval_target: "assistant_stage1",
|
||
mode: payload.mode,
|
||
use_mock: Boolean(payload.useMock),
|
||
prompt_version: payload.normalizeConfig.promptVersion ?? null,
|
||
suite_id: suite.suite_id,
|
||
suite_version: suite.suite_version,
|
||
suite_schema_version: suite.schema_version ?? null,
|
||
scenario_count: suite.scenario_count,
|
||
case_ids: suiteCases.map((item) => item.case_id),
|
||
cases_total: caseRecords.length,
|
||
feature_profile_snapshot: buildFeatureProfileSnapshot(),
|
||
code_version: buildCodeVersionMarker(),
|
||
metrics: {
|
||
raw: metrics.raw,
|
||
denominators: metrics.denominators
|
||
},
|
||
rubric_bands: metrics.rubric_bands,
|
||
subsets: {
|
||
broad_cases_total: metrics.denominators.broad_cases_total,
|
||
followup_cases_total: metrics.denominators.followup_cases_total
|
||
},
|
||
budget: {
|
||
requests_total: requestsTotal
|
||
},
|
||
results: caseRecords,
|
||
scenario_summary: {
|
||
improved_or_strong: caseRecords.filter((item) => Number(item.accountant_usefulness_score ?? 0) >= 4).length,
|
||
unchanged_or_mixed: caseRecords.filter((item) => {
|
||
const value = Number(item.accountant_usefulness_score ?? 0);
|
||
return value >= 2.5 && value < 4;
|
||
}).length,
|
||
weak_or_regressed: caseRecords.filter((item) => Number(item.accountant_usefulness_score ?? 0) < 2.5).length
|
||
},
|
||
improvement_hints: {
|
||
strongest_signals: strongestSignals.length > 0 ? strongestSignals.join(", ") : "none",
|
||
weakest_signals: weakestSignals.length > 0 ? weakestSignals.join(", ") : "none"
|
||
},
|
||
known_limitations: [
|
||
"Snapshot-only retrieval contour remains (no live verification core in Stage 1).",
|
||
"Metric mapping for genericness/false confidence is heuristic by design.",
|
||
"Stage 1 eval excludes Stage 2+ metrics (problem-unit/lifecycle/graph/investigation engine)."
|
||
],
|
||
report_title: "Assistant Stage 1 Eval Run"
|
||
};
|
||
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
|
||
const runJsonPath = path_1.default.resolve(config_1.REPORTS_DIR, `${runId}.json`);
|
||
const runMdPath = path_1.default.resolve(config_1.REPORTS_DIR, `${runId}.md`);
|
||
const compactReport = compactAssistantStage1Report(report);
|
||
const jsonWritten = tryWriteJsonFile(runJsonPath, compactReport);
|
||
const mdWritten = tryWriteTextFile(runMdPath, buildAssistantEvalMarkdownReport(compactReport));
|
||
const runReportRef = jsonWritten ? runJsonPath : putInMemoryEvalReport(compactReport);
|
||
report.artifacts = {
|
||
run_report_json_path: runReportRef,
|
||
run_report_md_path: mdWritten ? runMdPath : null
|
||
};
|
||
if (payload.compareWithReportFile) {
|
||
report.comparison = this.buildAssistantComparisonReport({
|
||
currentReport: report,
|
||
baselineReportFile: payload.compareWithReportFile
|
||
});
|
||
}
|
||
return report;
|
||
}
|
||
async runAssistantStage2(payload) {
|
||
if (!config_1.FEATURE_ASSISTANT_STAGE2_EVAL_V1) {
|
||
throw new http_1.ApiError("ASSISTANT_STAGE2_EVAL_DISABLED", "Assistant Stage 2 eval target is disabled by FEATURE_ASSISTANT_STAGE2_EVAL_V1.", 409);
|
||
}
|
||
const suite = parseAssistantStage2SuiteFile(payload.caseSetFile);
|
||
const suiteCases = suite.cases.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id));
|
||
const runId = `assistant-stage2-${(0, nanoid_1.nanoid)(10)}`;
|
||
const assistantService = new assistantService_1.AssistantService(this.normalizerService, new assistantSessionStore_1.AssistantSessionStore());
|
||
const diagnostics = [];
|
||
let requestsTotal = 0;
|
||
for (const suiteCase of suiteCases) {
|
||
const sessionId = `${runId}-${suiteCase.case_id}`;
|
||
const turnResponses = [];
|
||
const notes = [];
|
||
const limitations = [];
|
||
const expectedProblemUnitTypes = this.getExpectedProblemUnitTypes(suiteCase);
|
||
const expectedProblemFirst = suiteCase.expected_hints?.expected_problem_first ?? (suiteCase.broadness_level !== "low" || suiteCase.question_type !== "direct");
|
||
try {
|
||
for (const turn of suiteCase.turns) {
|
||
const response = (await assistantService.handleMessage({
|
||
session_id: sessionId,
|
||
user_message: turn.user_message,
|
||
message: turn.user_message,
|
||
mode: "assistant",
|
||
apiKey: payload.normalizeConfig.apiKey,
|
||
model: payload.normalizeConfig.model,
|
||
baseUrl: payload.normalizeConfig.baseUrl,
|
||
temperature: payload.normalizeConfig.temperature,
|
||
maxOutputTokens: payload.normalizeConfig.maxOutputTokens,
|
||
promptVersion: payload.normalizeConfig.promptVersion,
|
||
systemPrompt: payload.normalizeConfig.systemPrompt,
|
||
developerPrompt: payload.normalizeConfig.developerPrompt,
|
||
domainPrompt: payload.normalizeConfig.domainPrompt,
|
||
fewShotExamples: payload.normalizeConfig.fewShotExamples,
|
||
useMock: payload.useMock
|
||
}));
|
||
turnResponses.push(response);
|
||
requestsTotal += 1;
|
||
}
|
||
}
|
||
catch (error) {
|
||
const errorMessage = error instanceof Error ? error.message : String(error);
|
||
diagnostics.push({
|
||
suite_case: suiteCase,
|
||
session_id: sessionId,
|
||
trace_id: null,
|
||
final_reply_type: "backend_error",
|
||
turn_count: turnResponses.length,
|
||
signature: `backend_error|${suiteCase.scenario_tag}`,
|
||
expected_problem_unit_types: expectedProblemUnitTypes,
|
||
expected_problem_first: expectedProblemFirst,
|
||
problem_unit_precision: 0,
|
||
problem_unit_recall_proxy: expectedProblemUnitTypes.length > 0 ? 0 : null,
|
||
duplicate_collapse_rate: null,
|
||
mechanism_coherence_score: 0,
|
||
problem_clarity_score: 0,
|
||
problem_first_answer_applied: expectedProblemFirst ? false : null,
|
||
entity_leakage: false,
|
||
signals: {
|
||
broad_query_detected: suiteCase.broadness_level !== "low",
|
||
broad_result_flag: false,
|
||
narrowing_strength: null,
|
||
minimum_evidence_failed: true,
|
||
degraded_to: "clarification",
|
||
evidence_confidence: "low",
|
||
limitation_reason_codes: [],
|
||
mechanism_status: null,
|
||
source_refs: [],
|
||
routes: [],
|
||
followup_state_applied: false,
|
||
uncertainty_limitations_count: 0,
|
||
candidate_evidence_total: 0,
|
||
problem_units_total: 0,
|
||
problem_unit_types: [],
|
||
problem_mechanism_summaries: [],
|
||
duplicate_collapses_total: 0,
|
||
problem_centric_answer_applied: false,
|
||
problem_units_used_count: 0,
|
||
problem_answer_mode: null,
|
||
problem_unit_ids_used: [],
|
||
entity_leakage_detected: false
|
||
},
|
||
limitations: [errorMessage],
|
||
notes: [`Case execution failed: ${errorMessage}`]
|
||
});
|
||
continue;
|
||
}
|
||
const finalResponse = turnResponses[turnResponses.length - 1];
|
||
const signals = this.collectAssistantStage2Signals(finalResponse, turnResponses);
|
||
const problemUnitPrecision = this.computeProblemUnitPrecision(expectedProblemUnitTypes, signals.problem_unit_types);
|
||
const problemUnitRecallProxy = this.computeProblemUnitRecallProxy(expectedProblemUnitTypes, signals.problem_unit_types);
|
||
const duplicateCollapseRate = this.computeDuplicateCollapseRate(signals.candidate_evidence_total, signals.duplicate_collapses_total);
|
||
const mechanismCoherenceScore = this.computeMechanismCoherenceScore(finalResponse, signals);
|
||
const problemClarityScore = this.computeProblemClarityScore(finalResponse, signals);
|
||
const problemFirstAnswerApplied = expectedProblemFirst ? signals.problem_centric_answer_applied && signals.problem_units_used_count > 0 : null;
|
||
if (signals.problem_units_total === 0 && expectedProblemUnitTypes.length > 0) {
|
||
limitations.push("missing_problem_units");
|
||
}
|
||
if (signals.problem_centric_answer_applied && signals.problem_units_used_count <= 0) {
|
||
limitations.push("problem_mode_without_units");
|
||
}
|
||
limitations.push(...signals.limitation_reason_codes.map((item) => `limitation_reason:${item}`));
|
||
if (signals.entity_leakage_detected) {
|
||
limitations.push("entity_leakage_detected");
|
||
}
|
||
if (problemFirstAnswerApplied === false)
|
||
notes.push("problem_first_not_applied");
|
||
if (signals.problem_units_total === 0)
|
||
notes.push("problem_units_missing");
|
||
if (signals.problem_unit_types.length > 0)
|
||
notes.push(`problem_types:${signals.problem_unit_types.join(",")}`);
|
||
if (signals.entity_leakage_detected)
|
||
notes.push("entity_leakage");
|
||
if (signals.degraded_to === "clarification")
|
||
notes.push("clarification_degraded");
|
||
diagnostics.push({
|
||
suite_case: suiteCase,
|
||
session_id: sessionId,
|
||
trace_id: finalResponse.debug?.trace_id ?? null,
|
||
final_reply_type: finalResponse.reply_type,
|
||
turn_count: suiteCase.turns.length,
|
||
signature: [
|
||
finalResponse.reply_type,
|
||
signals.problem_answer_mode ?? "unknown",
|
||
signals.problem_unit_types.sort().join(","),
|
||
signals.degraded_to ?? "none"
|
||
].join("|"),
|
||
expected_problem_unit_types: expectedProblemUnitTypes,
|
||
expected_problem_first: expectedProblemFirst,
|
||
problem_unit_precision: problemUnitPrecision,
|
||
problem_unit_recall_proxy: problemUnitRecallProxy,
|
||
duplicate_collapse_rate: duplicateCollapseRate,
|
||
mechanism_coherence_score: mechanismCoherenceScore,
|
||
problem_clarity_score: problemClarityScore,
|
||
problem_first_answer_applied: problemFirstAnswerApplied,
|
||
entity_leakage: signals.entity_leakage_detected,
|
||
signals,
|
||
limitations: Array.from(new Set(limitations)),
|
||
notes
|
||
});
|
||
}
|
||
const metrics = this.computeAssistantStage2Metrics({ diagnostics });
|
||
const caseRecords = diagnostics.map((item) => {
|
||
const caseMetricVector = {
|
||
problem_unit_precision: item.problem_unit_precision,
|
||
problem_unit_recall_proxy: item.problem_unit_recall_proxy,
|
||
duplicate_collapse_rate: item.duplicate_collapse_rate,
|
||
mechanism_coherence_score: round2(item.mechanism_coherence_score),
|
||
problem_clarity_score: round2(item.problem_clarity_score),
|
||
problem_first_answer_rate: item.problem_first_answer_applied === null ? null : item.problem_first_answer_applied ? 1 : 0,
|
||
entity_leakage_rate: item.entity_leakage ? 1 : 0
|
||
};
|
||
return {
|
||
schema_version: stage2EvalContracts_1.ASSISTANT_STAGE2_EVAL_RECORD_SCHEMA_VERSION,
|
||
created_at: new Date().toISOString(),
|
||
case_id: item.suite_case.case_id,
|
||
scenario_tag: item.suite_case.scenario_tag,
|
||
session_id: item.session_id,
|
||
trace_id: item.trace_id,
|
||
question_type: item.suite_case.question_type,
|
||
broadness_level: item.suite_case.broadness_level,
|
||
expected_problem_unit_types: item.expected_problem_unit_types,
|
||
expected_problem_first: item.expected_problem_first,
|
||
problem_units_detected: item.signals.problem_units_total,
|
||
candidate_evidence_detected: item.signals.candidate_evidence_total,
|
||
duplicate_collapses_detected: item.signals.duplicate_collapses_total,
|
||
metric_subscores: caseMetricVector,
|
||
raw_signals: {
|
||
final_reply_type: item.final_reply_type,
|
||
turn_count: item.turn_count,
|
||
broad_query_detected: item.signals.broad_query_detected,
|
||
broad_result_flag: item.signals.broad_result_flag,
|
||
narrowing_strength: item.signals.narrowing_strength,
|
||
minimum_evidence_failed: item.signals.minimum_evidence_failed,
|
||
degraded_to: item.signals.degraded_to,
|
||
evidence_confidence: item.signals.evidence_confidence,
|
||
limitation_reason_codes: item.signals.limitation_reason_codes,
|
||
mechanism_status: item.signals.mechanism_status,
|
||
source_refs: item.signals.source_refs,
|
||
routes: item.signals.routes,
|
||
followup_state_applied: item.signals.followup_state_applied,
|
||
problem_units_total: item.signals.problem_units_total,
|
||
candidate_evidence_total: item.signals.candidate_evidence_total,
|
||
problem_unit_types: item.signals.problem_unit_types,
|
||
duplicate_collapses_total: item.signals.duplicate_collapses_total,
|
||
problem_centric_answer_applied: item.signals.problem_centric_answer_applied,
|
||
problem_units_used_count: item.signals.problem_units_used_count,
|
||
problem_answer_mode: item.signals.problem_answer_mode,
|
||
problem_unit_ids_used: item.signals.problem_unit_ids_used,
|
||
entity_leakage_detected: item.signals.entity_leakage_detected
|
||
},
|
||
limitations: item.limitations,
|
||
notes: item.notes
|
||
};
|
||
});
|
||
const strongestSignals = Object.entries(metrics.rubric_bands)
|
||
.filter(([, band]) => band?.score === 5)
|
||
.map(([name]) => name);
|
||
const weakestSignals = Object.entries(metrics.rubric_bands)
|
||
.filter(([, band]) => band?.score === 0)
|
||
.map(([name]) => name);
|
||
const runTimestamp = new Date().toISOString();
|
||
const report = {
|
||
schema_version: ASSISTANT_STAGE2_RUN_SCHEMA_VERSION,
|
||
run_id: runId,
|
||
run_timestamp: runTimestamp,
|
||
eval_target: "assistant_stage2",
|
||
mode: payload.mode,
|
||
use_mock: Boolean(payload.useMock),
|
||
prompt_version: payload.normalizeConfig.promptVersion ?? null,
|
||
suite_id: suite.suite_id,
|
||
suite_version: suite.suite_version,
|
||
suite_schema_version: suite.schema_version ?? null,
|
||
scenario_count: suite.scenario_count,
|
||
case_ids: suiteCases.map((item) => item.case_id),
|
||
cases_total: caseRecords.length,
|
||
feature_profile_snapshot: buildFeatureProfileSnapshot(),
|
||
code_version: buildCodeVersionMarker(),
|
||
metrics: {
|
||
raw: metrics.raw,
|
||
denominators: metrics.denominators
|
||
},
|
||
rubric_bands: metrics.rubric_bands,
|
||
subsets: {
|
||
expected_problem_cases_total: metrics.denominators.expected_problem_cases_total,
|
||
followup_cases_total: metrics.denominators.followup_cases_total,
|
||
candidate_cases_total: metrics.denominators.candidate_cases_total
|
||
},
|
||
budget: {
|
||
requests_total: requestsTotal
|
||
},
|
||
results: caseRecords,
|
||
scenario_summary: {
|
||
improved_or_strong: caseRecords.filter((item) => {
|
||
const clarity = Number(item.metric_subscores.problem_clarity_score ?? 0);
|
||
const mechanism = Number(item.metric_subscores.mechanism_coherence_score ?? 0);
|
||
return clarity >= 4 && mechanism >= 3;
|
||
}).length,
|
||
unchanged_or_mixed: caseRecords.filter((item) => {
|
||
const clarity = Number(item.metric_subscores.problem_clarity_score ?? 0);
|
||
return clarity >= 2.5 && clarity < 4;
|
||
}).length,
|
||
weak_or_regressed: caseRecords.filter((item) => Number(item.metric_subscores.problem_clarity_score ?? 0) < 2.5).length
|
||
},
|
||
improvement_hints: {
|
||
strongest_signals: strongestSignals.length > 0 ? strongestSignals.join(", ") : "none",
|
||
weakest_signals: weakestSignals.length > 0 ? weakestSignals.join(", ") : "none"
|
||
},
|
||
known_limitations: [
|
||
"Stage 2 eval remains heuristic and scoped to problem-unit baseline (no graph/lifecycle/investigation runtime scoring).",
|
||
"problem_unit_recall_proxy uses suite expected types as lightweight proxy, not full ground-truth labeling.",
|
||
"Comparison quality depends on stable feature profile and reproducible mock/runtime setup."
|
||
],
|
||
report_title: "Assistant Stage 2 Eval Run"
|
||
};
|
||
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
|
||
const runJsonPath = path_1.default.resolve(config_1.REPORTS_DIR, `${runId}.json`);
|
||
const runMdPath = path_1.default.resolve(config_1.REPORTS_DIR, `${runId}.md`);
|
||
const compactReport = compactAssistantStage2Report(report);
|
||
const jsonWritten = tryWriteJsonFile(runJsonPath, compactReport);
|
||
const mdWritten = tryWriteTextFile(runMdPath, buildAssistantStage2EvalMarkdownReport(compactReport));
|
||
const runReportRef = jsonWritten ? runJsonPath : putInMemoryEvalReport(compactReport);
|
||
report.artifacts = {
|
||
run_report_json_path: runReportRef,
|
||
run_report_md_path: mdWritten ? runMdPath : null
|
||
};
|
||
if (payload.compareWithReportFile) {
|
||
report.comparison = this.buildAssistantStage2ComparisonReport({
|
||
currentReport: report,
|
||
baselineReportFile: payload.compareWithReportFile
|
||
});
|
||
}
|
||
return report;
|
||
}
|
||
async runAssistantP0(payload) {
|
||
if (!config_1.FEATURE_ASSISTANT_STAGE2_EVAL_V1) {
|
||
throw new http_1.ApiError("ASSISTANT_P0_EVAL_DISABLED", "Assistant P0 eval target is disabled by FEATURE_ASSISTANT_STAGE2_EVAL_V1.", 409);
|
||
}
|
||
const runner = new p0_eval_runner_1.P0EvalRunner(this.normalizerService);
|
||
return runner.run({
|
||
normalizeConfig: payload.normalizeConfig,
|
||
caseIds: payload.caseIds,
|
||
useMock: payload.useMock,
|
||
mode: payload.mode,
|
||
caseSetFile: payload.caseSetFile,
|
||
compareWithReportFile: payload.compareWithReportFile
|
||
});
|
||
}
|
||
async run(payload) {
|
||
const mode = payload.mode ?? "standard";
|
||
const evalTarget = payload.evalTarget ?? "normalizer";
|
||
if (evalTarget === "assistant_stage1") {
|
||
return this.runAssistantStage1({
|
||
normalizeConfig: payload.normalizeConfig,
|
||
caseIds: payload.caseIds,
|
||
useMock: payload.useMock,
|
||
mode,
|
||
caseSetFile: payload.caseSetFile,
|
||
compareWithReportFile: payload.compareWithReportFile
|
||
});
|
||
}
|
||
if (evalTarget === "assistant_stage2") {
|
||
return this.runAssistantStage2({
|
||
normalizeConfig: payload.normalizeConfig,
|
||
caseIds: payload.caseIds,
|
||
useMock: payload.useMock,
|
||
mode,
|
||
caseSetFile: payload.caseSetFile,
|
||
compareWithReportFile: payload.compareWithReportFile
|
||
});
|
||
}
|
||
if (evalTarget === "assistant_p0") {
|
||
return this.runAssistantP0({
|
||
normalizeConfig: payload.normalizeConfig,
|
||
caseIds: payload.caseIds,
|
||
useMock: payload.useMock,
|
||
mode,
|
||
caseSetFile: payload.caseSetFile,
|
||
compareWithReportFile: payload.compareWithReportFile
|
||
});
|
||
}
|
||
const promptVersion = String(payload.normalizeConfig.promptVersion ?? "").toLowerCase();
|
||
const schemaVersion = String(payload.normalizeConfig.schemaVersion ?? "").toLowerCase();
|
||
const isV2 = promptVersion.startsWith("normalizer_v2") || schemaVersion === "v2" || schemaVersion === "v2_0_1" || schemaVersion === "v2_0_2";
|
||
const inlineQuestions = payload.rawQuestions ? parseRawQuestions(payload.rawQuestions) : [];
|
||
const inlineCases = inlineQuestions.map((question, index) => ({
|
||
case_id: formatCaseId("BQ", index),
|
||
raw_question: question,
|
||
expected: null
|
||
}));
|
||
if (isV2) {
|
||
const sourceCases = inlineCases.length > 0
|
||
? inlineCases
|
||
: payload.caseSetFile
|
||
? parseCaseSetFile(payload.caseSetFile).map((item) => ({
|
||
case_id: item.case_id,
|
||
raw_question: item.raw_question,
|
||
expected: item.expected
|
||
}))
|
||
: this.listCases().map((item) => ({
|
||
case_id: item.case_id,
|
||
raw_question: item.raw_question,
|
||
expected: item.expected
|
||
}));
|
||
const filtered = sourceCases.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id));
|
||
return this.runV2({
|
||
...payload,
|
||
mode,
|
||
cases: filtered
|
||
});
|
||
}
|
||
if (inlineCases.length > 0) {
|
||
throw new Error("rawQuestions batch is supported for normalizer_v2 only.");
|
||
}
|
||
const casesSource = payload.caseSetFile ? parseCaseSetFile(payload.caseSetFile) : this.listCases();
|
||
const filteredCases = casesSource.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id));
|
||
const runId = `eval-${(0, nanoid_1.nanoid)(10)}`;
|
||
const results = [];
|
||
const mismatches = [];
|
||
const badConfidenceCases = [];
|
||
const classCounter = {};
|
||
let schemaPass = 0;
|
||
let intentPass = 0;
|
||
let routePass = 0;
|
||
let causalPass = 0;
|
||
let highConfidenceErrors = 0;
|
||
let requestsTotal = 0;
|
||
let retriesUsed = 0;
|
||
for (const item of filteredCases) {
|
||
const response = await this.normalizerService.normalize({
|
||
...payload.normalizeConfig,
|
||
userQuestion: item.raw_question,
|
||
context: {
|
||
expected_route: item.expected.route_hint,
|
||
eval_label: runId,
|
||
case_id: item.case_id,
|
||
eval_mode: mode
|
||
},
|
||
retryPolicy: mode === "single-pass-strict" ? "single-pass-strict" : "default",
|
||
useMock: payload.useMock
|
||
});
|
||
const normalized = response.normalized && response.normalized.schema_version === "normalized_query_v1"
|
||
? response.normalized
|
||
: null;
|
||
const intentMatch = Boolean(normalized && item.expected.intent_class === normalized.intent_class);
|
||
const routeMatch = Boolean(normalized && item.expected.route_hint === normalized.route_hint);
|
||
const causalMatch = Boolean(normalized &&
|
||
item.expected.requires &&
|
||
item.expected.requires.needs_cross_entity_join === normalized.requires.needs_cross_entity_join &&
|
||
item.expected.requires.needs_causal_chain === normalized.requires.needs_causal_chain);
|
||
if (response.validation.passed)
|
||
schemaPass += 1;
|
||
if (intentMatch)
|
||
intentPass += 1;
|
||
if (routeMatch)
|
||
routePass += 1;
|
||
if (causalMatch || !item.expected.requires)
|
||
causalPass += 1;
|
||
const requestCount = Number(response.request_count_for_case ?? 0);
|
||
requestsTotal += requestCount;
|
||
if (requestCount > 1) {
|
||
retriesUsed += 1;
|
||
}
|
||
const classKey = String(item.expected.intent_class ?? "unknown");
|
||
if (!classCounter[classKey]) {
|
||
classCounter[classKey] = { total: 0, passed: 0 };
|
||
}
|
||
classCounter[classKey].total += 1;
|
||
if (intentMatch) {
|
||
classCounter[classKey].passed += 1;
|
||
}
|
||
const confidenceOverall = normalized?.confidence.overall ?? null;
|
||
const hasMismatch = !intentMatch || !routeMatch || (!causalMatch && Boolean(item.expected.requires));
|
||
if (confidenceOverall === "high" && hasMismatch) {
|
||
highConfidenceErrors += 1;
|
||
badConfidenceCases.push({
|
||
case_id: item.case_id,
|
||
confidence_overall: confidenceOverall,
|
||
intent_match: intentMatch,
|
||
route_match: routeMatch,
|
||
causal_match: causalMatch || !item.expected.requires,
|
||
trace_id: response.trace_id
|
||
});
|
||
}
|
||
if (hasMismatch || !response.validation.passed) {
|
||
mismatches.push({
|
||
case_id: item.case_id,
|
||
expected_intent_class: item.expected.intent_class ?? null,
|
||
actual_intent_class: normalized?.intent_class ?? null,
|
||
expected_route_hint: item.expected.route_hint ?? null,
|
||
actual_route_hint: normalized?.route_hint ?? null,
|
||
expected_requires: item.expected.requires ?? null,
|
||
actual_requires: normalized?.requires ?? null,
|
||
comment: shortMismatchComment({
|
||
intentMatch,
|
||
routeMatch,
|
||
causalMatch: causalMatch || !item.expected.requires,
|
||
validationPassed: response.validation.passed
|
||
}),
|
||
trace_id: response.trace_id
|
||
});
|
||
}
|
||
results.push({
|
||
case_id: item.case_id,
|
||
raw_question: item.raw_question,
|
||
validation_passed: response.validation.passed,
|
||
intent_match: intentMatch,
|
||
route_match: routeMatch,
|
||
causal_flags_match: causalMatch || !item.expected.requires,
|
||
expected_intent_class: item.expected.intent_class ?? null,
|
||
actual_intent_class: normalized?.intent_class ?? null,
|
||
expected_route_hint: item.expected.route_hint ?? null,
|
||
actual_route_hint: normalized?.route_hint ?? null,
|
||
expected_requires: item.expected.requires ?? null,
|
||
actual_requires: normalized?.requires ?? null,
|
||
confidence_overall: confidenceOverall,
|
||
trace_id: response.trace_id,
|
||
request_count_for_case: requestCount
|
||
});
|
||
}
|
||
const total = Math.max(1, filteredCases.length);
|
||
const metrics = {
|
||
schema_validation_pass_rate: Number(((schemaPass / total) * 100).toFixed(2)),
|
||
intent_class_accuracy: Number(((intentPass / total) * 100).toFixed(2)),
|
||
route_hint_accuracy: Number(((routePass / total) * 100).toFixed(2)),
|
||
causal_flag_accuracy: Number(((causalPass / total) * 100).toFixed(2)),
|
||
high_confidence_error_rate: Number(((highConfidenceErrors / total) * 100).toFixed(2))
|
||
};
|
||
const classAccuracy = Object.fromEntries(Object.entries(classCounter).map(([key, value]) => [
|
||
key,
|
||
{
|
||
total: value.total,
|
||
passed: value.passed,
|
||
accuracy_percent: Number(((value.passed / Math.max(1, value.total)) * 100).toFixed(2))
|
||
}
|
||
]));
|
||
const baselineAsMap = BASELINE_METRICS;
|
||
const baselineDelta = Object.fromEntries(Object.entries(metrics).map(([key, value]) => [key, Number((value - baselineAsMap[key]).toFixed(2))]));
|
||
const report = {
|
||
run_id: runId,
|
||
timestamp: new Date().toISOString(),
|
||
mode,
|
||
use_mock: Boolean(payload.useMock),
|
||
prompt_version: payload.normalizeConfig.promptVersion ?? null,
|
||
dataset: {
|
||
source: payload.caseSetFile ? "file" : "data/eval_cases/*.json",
|
||
file: payload.caseSetFile ?? null
|
||
},
|
||
cases_total: filteredCases.length,
|
||
metrics,
|
||
baseline_metrics: BASELINE_METRICS,
|
||
baseline_delta: baselineDelta,
|
||
class_accuracy: classAccuracy,
|
||
budget: {
|
||
requests_total: requestsTotal,
|
||
retries_used: retriesUsed,
|
||
guidance: {
|
||
forensic_calls_max: 10,
|
||
final_eval_calls_max: 30,
|
||
target_total_calls_max: 40,
|
||
hard_cap_calls_max: 45
|
||
}
|
||
},
|
||
mismatches,
|
||
bad_confidence_cases: badConfidenceCases,
|
||
results
|
||
};
|
||
(0, files_1.ensureDir)(config_1.EVAL_CASES_DIR);
|
||
tryWriteJsonFile(path_1.default.resolve(config_1.EVAL_CASES_DIR, `${runId}.report.json`), report);
|
||
const shouldWriteV11Artifacts = mode === "single-pass-strict" &&
|
||
Boolean(payload.caseSetFile) &&
|
||
path_1.default.basename(String(payload.caseSetFile)).toLowerCase() === "normalizer_eval_v1_1_30cases.json";
|
||
if (shouldWriteV11Artifacts) {
|
||
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
|
||
tryWriteJsonFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_eval_v1_1_run.json"), report);
|
||
tryWriteTextFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_eval_v1_1_run.md"), buildMarkdownReport({
|
||
...report,
|
||
report_title: "LLM Normalizer v1.1 Eval Run"
|
||
}));
|
||
}
|
||
const shouldWriteV1121EvalArtifacts = mode === "single-pass-strict" &&
|
||
String(payload.normalizeConfig.promptVersion ?? "") === "normalizer_v1_1_2_1" &&
|
||
Boolean(payload.caseSetFile) &&
|
||
path_1.default.basename(String(payload.caseSetFile)).toLowerCase() === "normalizer_eval_v1_1_2_1_30cases.json";
|
||
if (shouldWriteV1121EvalArtifacts) {
|
||
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
|
||
tryWriteJsonFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_1_eval.json"), report);
|
||
tryWriteTextFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_1_eval.md"), buildMarkdownReport({
|
||
...report,
|
||
report_title: "LLM Normalizer v1.1.2.1 Eval Run"
|
||
}));
|
||
}
|
||
const shouldWriteV111MicroArtifacts = mode === "single-pass-strict" &&
|
||
String(payload.normalizeConfig.promptVersion ?? "") === "normalizer_v1_1_1" &&
|
||
isSameCaseSet(payload.caseIds, V111_MICRO_CASE_IDS);
|
||
if (shouldWriteV111MicroArtifacts) {
|
||
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
|
||
tryWriteJsonFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_1_micro_eval.json"), report);
|
||
tryWriteTextFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_1_micro_eval.md"), buildMarkdownReport({
|
||
...report,
|
||
report_title: "LLM Normalizer v1.1.1 Micro Eval"
|
||
}));
|
||
}
|
||
const shouldWriteV112MicroArtifacts = mode === "single-pass-strict" &&
|
||
String(payload.normalizeConfig.promptVersion ?? "") === "normalizer_v1_1_2" &&
|
||
isSameCaseSet(payload.caseIds, V112_MICRO_CASE_IDS);
|
||
if (shouldWriteV112MicroArtifacts) {
|
||
(0, files_1.ensureDir)(config_1.REPORTS_DIR);
|
||
tryWriteJsonFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_micro_eval.json"), report);
|
||
tryWriteTextFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_micro_eval.md"), buildMarkdownReport({
|
||
...report,
|
||
report_title: "LLM Normalizer v1.1.2 Micro Eval"
|
||
}));
|
||
}
|
||
return report;
|
||
}
|
||
}
|
||
// Publish the EvalService class as a named CommonJS export.
exports.EvalService = EvalService;
|