"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.EvalService = void 0; const fs_1 = __importDefault(require("fs")); const path_1 = __importDefault(require("path")); const nanoid_1 = require("nanoid"); const config_1 = require("../config"); const p0_eval_runner_1 = require("../eval/p0_eval_runner"); const stage1Contracts_1 = require("../types/stage1Contracts"); const stage2EvalContracts_1 = require("../types/stage2EvalContracts"); const http_1 = require("../utils/http"); const assistantService_1 = require("./assistantService"); const assistantSessionStore_1 = require("./assistantSessionStore"); const files_1 = require("../utils/files"); const BASELINE_METRICS = { schema_validation_pass_rate: 100, intent_class_accuracy: 72.73, route_hint_accuracy: 90.91, causal_flag_accuracy: 81.82, high_confidence_error_rate: 9.09 }; const V111_MICRO_CASE_IDS = ["NQ-008", "V11-DD-005", "V11-OT-003", "V11-OT-004", "V11-OT-005"]; const V112_MICRO_CASE_IDS = ["NQ-002", "NQ-007", "V11-HA-004", "V11-OT-003", "V11-OT-005"]; function isSameCaseSet(input, target) { if (!input || input.length !== target.length) { return false; } const left = [...input].sort(); const right = [...target].sort(); return left.every((value, index) => value === right[index]); } function formatPercent(value) { return `${value.toFixed(2)}%`; } function shortMismatchComment(input) { if (!input.validationPassed) { return "Schema validation failed for this case."; } if (!input.intentMatch && input.routeMatch) { return "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket."; } if (input.intentMatch && !input.routeMatch) { return "Intent understood, but route_hint selected a weaker execution route."; } if (!input.intentMatch && !input.routeMatch) { return "Both intent and route misclassified; likely lexical ambiguity in causal vs risk wording."; } if (!input.causalMatch) { return "Causal flags are inconsistent with expected relationship depth."; } return "No mismatch."; } function buildMarkdownReport(report) { const metrics = (report.metrics ?? {}); const baseline = (report.baseline_metrics ?? {}); const delta = (report.baseline_delta ?? {}); const classAccuracy = (report.class_accuracy ?? {}); const mismatches = Array.isArray(report.mismatches) ? report.mismatches : []; const badConfidenceCases = Array.isArray(report.bad_confidence_cases) ? report.bad_confidence_cases : []; const budget = (report.budget ?? {}); const metricRows = Object.keys(metrics) .map((key) => { const current = Number(metrics[key] ?? 0); const base = Number(baseline[key] ?? 0); const d = Number(delta[key] ?? 0); const sign = d > 0 ? "+" : ""; return `| ${key} | ${formatPercent(current)} | ${formatPercent(base)} | ${sign}${d.toFixed(2)} |`; }) .join("\n"); const classRows = Object.keys(classAccuracy) .map((key) => { const row = classAccuracy[key]; return `| ${key} | ${row.passed}/${row.total} | ${formatPercent(row.accuracy_percent)} |`; }) .join("\n"); const mismatchRows = mismatches.length === 0 ? "No mismatches." : mismatches .map((item) => { const row = item; return `- ${row.case_id}: expected(${row.expected_intent_class} / ${row.expected_route_hint}) -> actual(${row.actual_intent_class} / ${row.actual_route_hint}). ${row.comment}`; }) .join("\n"); const badConfidenceRows = badConfidenceCases.length === 0 ? "No bad-confidence cases." : badConfidenceCases .map((item) => { const row = item; return `- ${row.case_id}: confidence=${row.confidence_overall}, intent_match=${row.intent_match}, route_match=${row.route_match}`; }) .join("\n"); return [ `# ${String(report.report_title ?? "LLM Normalizer Eval Run")}`, "", `- run_id: ${String(report.run_id ?? "")}`, `- timestamp: ${String(report.timestamp ?? "")}`, `- mode: ${String(report.mode ?? "")}`, `- use_mock: ${String(report.use_mock ?? false)}`, `- cases_total: ${String(report.cases_total ?? 0)}`, `- prompt_version: ${String(report.prompt_version ?? "")}`, "", "## Metrics vs Baseline", "", "| Metric | Current | Baseline | Delta |", "|---|---:|---:|---:|", metricRows || "| n/a | n/a | n/a | n/a |", "", "## Class Accuracy", "", "| Intent class | Passed/Total | Accuracy |", "|---|---:|---:|", classRows || "| n/a | n/a | n/a |", "", "## Budget", "", `- requests_total: ${String(budget.requests_total ?? 0)}`, `- retries_used: ${String(budget.retries_used ?? 0)}`, "", "## Mismatches", "", mismatchRows, "", "## Bad Confidence Cases", "", badConfidenceRows, "" ].join("\n"); } function parseCaseSetFile(inputPath) { const filePath = path_1.default.isAbsolute(inputPath) ? inputPath : path_1.default.resolve(config_1.EVAL_DATASETS_DIR, inputPath); const raw = fs_1.default.readFileSync(filePath, "utf-8").replace(/^\uFEFF/, ""); const parsed = JSON.parse(raw); if (Array.isArray(parsed)) { return parsed; } if (parsed && typeof parsed === "object" && Array.isArray(parsed.cases)) { return parsed.cases; } throw new Error(`Unsupported eval dataset format: ${filePath}`); } function formatCaseId(prefix, index) { return `${prefix}-${String(index + 1).padStart(3, "0")}`; } function parseRawQuestions(rawQuestions) { const text = rawQuestions.replace(/\r\n/g, "\n").trim(); if (!text) { return []; } const bySemicolon = text .split(";") .map((item) => item.trim()) .filter(Boolean); if (bySemicolon.length > 1) { return bySemicolon; } const byBlankLine = text .split(/\n\s*\n+/) .map((item) => item.trim()) .filter(Boolean); if (byBlankLine.length > 1) { return byBlankLine; } const byLine = text .split("\n") .map((item) => item.trim()) .filter(Boolean); return byLine.length > 0 ? byLine : [text]; } function executionReadinessOf(fragment) { return "execution_readiness" in fragment ? fragment.execution_readiness : "executable"; } function softAssumptionsOf(fragment) { return "soft_assumption_used" in fragment ? fragment.soft_assumption_used : []; } function routeStatusOf(fragment) { return "route_status" in fragment ? fragment.route_status : null; } function noRouteReasonOf(fragment) { return "no_route_reason" in fragment ? fragment.no_route_reason : null; } function expectedScopeInScope(expected) { if (!expected) { return null; } if (typeof expected.expected_scope_in_scope === "boolean") { return expected.expected_scope_in_scope; } if (expected.expected_no_route_reason === "out_of_scope") { return false; } if (expected.expected_route_status === "routed") { return true; } if (typeof expected.clarification_required === "boolean") { return true; } return null; } function isDecisionStateConsistent(decision) { const readiness = String(decision.execution_readiness ?? ""); const noRouteReason = decision.no_route_reason ?? null; if (decision.route === "no_route") { if (!noRouteReason) { return false; } return readiness !== "executable" && readiness !== "executable_with_soft_assumptions"; } if (noRouteReason) { return false; } return readiness !== "needs_clarification" && readiness !== "no_route"; } const DEFAULT_ASSISTANT_STAGE1_SUITE_FILE = "assistant_stage1_canonical_v0_1.json"; const ASSISTANT_STAGE1_RUN_SCHEMA_VERSION = "assistant_stage1_eval_run_v0_1"; const ASSISTANT_STAGE1_COMPARISON_SCHEMA_VERSION = "assistant_stage1_eval_comparison_v0_1"; const DEFAULT_ASSISTANT_STAGE2_SUITE_FILE = "assistant_stage2_canonical_v0_1.json"; const ASSISTANT_STAGE2_RUN_SCHEMA_VERSION = "assistant_stage2_eval_run_v0_1"; const ASSISTANT_STAGE2_COMPARISON_SCHEMA_VERSION = "assistant_stage2_eval_comparison_v0_1"; const INMEM_EVAL_REPORT_PREFIX = "inmem_eval_report:"; const INMEM_EVAL_REPORTS = new Map(); function isNoSpaceError(error) { const code = error?.code; return code === "ENOSPC"; } function tryWriteJsonFile(pathname, value) { try { (0, files_1.writeJsonFile)(pathname, value); return true; } catch (error) { if (isNoSpaceError(error)) { return false; } throw error; } } function tryWriteTextFile(pathname, value) { try { fs_1.default.writeFileSync(pathname, value, "utf-8"); return true; } catch (error) { if (isNoSpaceError(error)) { return false; } throw error; } } function putInMemoryEvalReport(report) { const key = `${INMEM_EVAL_REPORT_PREFIX}${(0, nanoid_1.nanoid)(12)}`; INMEM_EVAL_REPORTS.set(key, report); return key; } function readEvalReportByRef(ref) { if (ref.startsWith(INMEM_EVAL_REPORT_PREFIX)) { const report = INMEM_EVAL_REPORTS.get(ref); if (!report) { throw new Error(`In-memory eval report not found: ${ref}`); } return { report, resolved_path: ref }; } const resolvedPath = resolveReadablePath(ref); const report = JSON.parse(fs_1.default.readFileSync(resolvedPath, "utf-8")); return { report, resolved_path: resolvedPath }; } function compactAssistantStage1Report(report) { const results = Array.isArray(report.results) ? report.results : []; const compactResults = results.map((item) => ({ case_id: item.case_id ?? null, scenario_tag: item.scenario_tag ?? null, accountant_usefulness_score: item.accountant_usefulness_score ?? null, accountant_metrics: typeof item.accountant_metrics === "object" && item.accountant_metrics !== null ? item.accountant_metrics : null })); return { ...report, results: compactResults }; } function compactAssistantStage2Report(report) { const results = Array.isArray(report.results) ? report.results : []; const compactResults = results.map((item) => { const metricSubscores = (item.metric_subscores ?? {}); return { case_id: item.case_id ?? null, metric_subscores: { problem_clarity_score: metricSubscores.problem_clarity_score ?? null, mechanism_coherence_score: metricSubscores.mechanism_coherence_score ?? null, problem_first_answer_rate: metricSubscores.problem_first_answer_rate ?? null, entity_leakage_rate: metricSubscores.entity_leakage_rate ?? null } }; }); return { ...report, results: compactResults }; } const KNOWN_PROBLEM_UNIT_TYPES = [ "document_conflict", "broken_chain_segment", "lifecycle_anomaly_node", "unresolved_settlement_cluster", "period_risk_cluster", "cross_branch_inconsistency_cluster" ]; function toProblemUnitType(value) { return KNOWN_PROBLEM_UNIT_TYPES.includes(value) ? value : null; } function round2(value) { return Number(value.toFixed(2)); } function clampScore(value, min = 0, max = 5) { if (Number.isNaN(value)) { return min; } if (value < min) return min; if (value > max) return max; return value; } function rateToBandScore(metric, value) { if (metric === "retrieval_differentiation_rate") { if (value >= 0.75) return 5; if (value >= 0.45) return 3; return 0; } if (metric === "generic_explanation_rate" || metric === "false_confidence_rate" || metric === "broad_answer_rate") { if (value <= 0.25) return 5; if (value <= 0.45) return 3; return 0; } if (metric === "accountant_actionability_score" || metric === "mechanism_specificity_score" || metric === "followup_context_retention_score") { if (value >= 4) return 5; if (value >= 2.5) return 3; return 0; } return 0; } function rubricBandForMetric(metric, value) { if (value === null) { return null; } const score = rateToBandScore(metric, value); return stage1Contracts_1.ACCOUNTANT_SCORING_RUBRIC_V01[metric].find((item) => item.score === score) ?? null; } function rateToBandScoreStage2(metric, value) { if (metric === "problem_unit_precision" || metric === "problem_unit_recall_proxy" || metric === "problem_first_answer_rate") { if (value >= 0.75) return 5; if (value >= 0.45) return 3; return 0; } if (metric === "duplicate_collapse_rate") { if (value >= 0.2) return 5; if (value >= 0.08) return 3; return 0; } if (metric === "entity_leakage_rate") { if (value <= 0.2) return 5; if (value <= 0.4) return 3; return 0; } if (metric === "mechanism_coherence_score" || metric === "problem_clarity_score") { if (value >= 4) return 5; if (value >= 2.5) return 3; return 0; } return 0; } function rubricBandForMetricStage2(metric, value) { if (value === null) { return null; } const score = rateToBandScoreStage2(metric, value); return stage2EvalContracts_1.ASSISTANT_STAGE2_SCORING_RUBRIC_V01[metric].find((item) => item.score === score) ?? null; } function buildFeatureProfileSnapshot() { return { FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1: config_1.FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1, FEATURE_ASSISTANT_ANSWER_POLICY_V11: process.env.FEATURE_ASSISTANT_ANSWER_POLICY_V11 ?? String(config_1.FEATURE_ASSISTANT_ANSWER_POLICY_V11), FEATURE_ASSISTANT_BROAD_GUARD_V1: process.env.FEATURE_ASSISTANT_BROAD_GUARD_V1 ?? null, FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1: process.env.FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1 ?? null, FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1: process.env.FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1 ?? null, FEATURE_ASSISTANT_INVESTIGATION_STATE_V1: process.env.FEATURE_ASSISTANT_INVESTIGATION_STATE_V1 ?? null, FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1: process.env.FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1 ?? null, FEATURE_ASSISTANT_PROBLEM_UNITS_V1: process.env.FEATURE_ASSISTANT_PROBLEM_UNITS_V1 ?? String(config_1.FEATURE_ASSISTANT_PROBLEM_UNITS_V1), FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1: process.env.FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1 ?? String(config_1.FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1), FEATURE_ASSISTANT_PROBLEM_UNIT_CONTINUITY_V1: process.env.FEATURE_ASSISTANT_PROBLEM_UNIT_CONTINUITY_V1 ?? String(config_1.FEATURE_ASSISTANT_PROBLEM_UNIT_CONTINUITY_V1), FEATURE_ASSISTANT_STAGE2_EVAL_V1: process.env.FEATURE_ASSISTANT_STAGE2_EVAL_V1 ?? String(config_1.FEATURE_ASSISTANT_STAGE2_EVAL_V1) }; } function buildCodeVersionMarker() { return { git_commit: process.env.GIT_COMMIT ?? process.env.CI_COMMIT_SHA ?? process.env.VERCEL_GIT_COMMIT_SHA ?? process.env.GITHUB_SHA ?? null, build_marker: process.env.BUILD_MARKER ?? process.env.BUILD_ID ?? process.env.npm_package_version ?? null }; } function resolveReadablePath(inputPath) { if (path_1.default.isAbsolute(inputPath)) { return inputPath; } const candidates = [ path_1.default.resolve(config_1.REPORTS_DIR, inputPath), path_1.default.resolve(config_1.EVAL_DATASETS_DIR, inputPath), path_1.default.resolve(config_1.EVAL_CASES_DIR, inputPath), path_1.default.resolve(inputPath) ]; for (const candidate of candidates) { if (fs_1.default.existsSync(candidate)) { return candidate; } } return candidates[0]; } function parseAssistantSuiteFile(inputPath) { const filePath = resolveReadablePath(inputPath ?? DEFAULT_ASSISTANT_STAGE1_SUITE_FILE); const raw = fs_1.default.readFileSync(filePath, "utf-8").replace(/^\uFEFF/, ""); const parsed = JSON.parse(raw); if (!parsed || typeof parsed !== "object") { throw new Error(`Invalid assistant suite format: ${filePath}`); } if (!Array.isArray(parsed.cases)) { throw new Error(`Assistant suite cases[] is required: ${filePath}`); } if (!Array.isArray(parsed.case_ids)) { throw new Error(`Assistant suite case_ids[] is required: ${filePath}`); } if (typeof parsed.suite_id !== "string" || !parsed.suite_id.trim()) { throw new Error(`Assistant suite suite_id is required: ${filePath}`); } if (typeof parsed.suite_version !== "string" || !parsed.suite_version.trim()) { throw new Error(`Assistant suite suite_version is required: ${filePath}`); } if (parsed.scenario_count !== parsed.cases.length) { throw new Error(`Assistant suite scenario_count mismatch: ${filePath}`); } const declaredIds = [...parsed.case_ids].sort(); const actualIds = parsed.cases.map((item) => item.case_id).sort(); const idsMatch = declaredIds.length === actualIds.length && declaredIds.every((item, index) => item === actualIds[index]); if (!idsMatch) { throw new Error(`Assistant suite case_ids do not match cases[]: ${filePath}`); } for (const item of parsed.cases) { if (!Array.isArray(item.turns) || item.turns.length === 0) { throw new Error(`Assistant suite case ${item.case_id} must include at least one turn.`); } } return parsed; } function parseAssistantStage2SuiteFile(inputPath) { const filePath = resolveReadablePath(inputPath ?? DEFAULT_ASSISTANT_STAGE2_SUITE_FILE); const raw = fs_1.default.readFileSync(filePath, "utf-8").replace(/^\uFEFF/, ""); const parsed = JSON.parse(raw); if (!parsed || typeof parsed !== "object") { throw new Error(`Invalid assistant stage2 suite format: ${filePath}`); } if (!Array.isArray(parsed.cases)) { throw new Error(`Assistant stage2 suite cases[] is required: ${filePath}`); } if (!Array.isArray(parsed.case_ids)) { throw new Error(`Assistant stage2 suite case_ids[] is required: ${filePath}`); } if (typeof parsed.suite_id !== "string" || !parsed.suite_id.trim()) { throw new Error(`Assistant stage2 suite_id is required: ${filePath}`); } if (typeof parsed.suite_version !== "string" || !parsed.suite_version.trim()) { throw new Error(`Assistant stage2 suite_version is required: ${filePath}`); } if (parsed.scenario_count !== parsed.cases.length) { throw new Error(`Assistant stage2 scenario_count mismatch: ${filePath}`); } const declaredIds = [...parsed.case_ids].sort(); const actualIds = parsed.cases.map((item) => item.case_id).sort(); const idsMatch = declaredIds.length === actualIds.length && declaredIds.every((item, index) => item === actualIds[index]); if (!idsMatch) { throw new Error(`Assistant stage2 case_ids do not match cases[]: ${filePath}`); } for (const item of parsed.cases) { if (!Array.isArray(item.turns) || item.turns.length === 0) { throw new Error(`Assistant stage2 case ${item.case_id} must include at least one turn.`); } } return parsed; } function hasDomainAnchors(text) { const source = String(text ?? ""); if (!source.trim()) { return false; } const hasPeriod = /\b20\d{2}(?:[-./](?:0[1-9]|1[0-2]))?\b/.test(source); const hasAccountingObject = /(счет|контрагент|документ|ндс|ос|period|account|supplier|invoice|guid|объект)/i.test(source); const hasAccountCode = /\b(?:01|02|03|04|08|10|19|20|25|26|41|43|44|50|51|52|57|60|62|68|69|70|71|73|76|90|91|94|97)\b/.test(source); const hits = [hasPeriod, hasAccountingObject, hasAccountCode].filter(Boolean).length; return hits >= 2; } function detectEntityLeakage(text) { const source = String(text ?? ""); if (!source.trim()) { return false; } const uuidHits = source.match(/\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b/gi)?.length ?? 0; const guidHits = source.match(/\b(?:guid|uuid|entity_id|source_ref|canonical_ref|fragment_id)\b/gi)?.length ?? 0; const longHexHits = source.match(/\b[0-9a-f]{24,}\b/gi)?.length ?? 0; return uuidHits > 0 || guidHits > 1 || longHexHits > 0; } function extractTextList(value) { if (!Array.isArray(value)) { return []; } return value .map((item) => (typeof item === "string" ? item.trim() : "")) .filter(Boolean); } function toNarrowingStrength(value) { if (value === "weak" || value === "medium" || value === "strong") { return value; } return null; } function toDegradedTo(value) { if (value === "partial" || value === "clarification") { return value; } return null; } function buildAssistantEvalMarkdownReport(report) { const metrics = (report.metrics ?? {}).raw ?? {}; const bands = (report.rubric_bands ?? {}); const subsets = (report.subsets ?? {}); const scenarioSummary = (report.scenario_summary ?? {}); const improvementHints = (report.improvement_hints ?? {}); const rows = Object.keys(metrics) .map((key) => { const rawValue = metrics[key]; const band = bands[key]; const rawPrintable = rawValue === null || rawValue === undefined ? "n/a" : String(rawValue); const bandPrintable = band ? `${String(band.score)} (${String(band.label)})` : "n/a"; return `| ${key} | ${rawPrintable} | ${bandPrintable} |`; }) .join("\n"); return [ `# ${String(report.report_title ?? "Assistant Stage 1 Eval Run")}`, "", `- run_id: ${String(report.run_id ?? "")}`, `- eval_target: ${String(report.eval_target ?? "")}`, `- run_timestamp: ${String(report.run_timestamp ?? "")}`, `- suite_id: ${String(report.suite_id ?? "")}`, `- suite_version: ${String(report.suite_version ?? "")}`, `- cases_total: ${String(report.cases_total ?? 0)}`, "", "## Raw Metrics and Rubric Bands", "", "| Metric | Raw | Rubric band |", "|---|---:|---|", rows || "| n/a | n/a | n/a |", "", "## Subsets", "", `- broad_cases_total: ${String(subsets.broad_cases_total ?? 0)}`, `- followup_cases_total: ${String(subsets.followup_cases_total ?? 0)}`, "", "## Scenario Summary", "", `- improved_or_strong: ${String(scenarioSummary.improved_or_strong ?? 0)}`, `- unchanged_or_mixed: ${String(scenarioSummary.unchanged_or_mixed ?? 0)}`, `- weak_or_regressed: ${String(scenarioSummary.weak_or_regressed ?? 0)}`, "", "## Improvement Hints", "", `- strongest_signals: ${String(improvementHints.strongest_signals ?? "n/a")}`, `- weakest_signals: ${String(improvementHints.weakest_signals ?? "n/a")}`, "" ].join("\n"); } function buildAssistantStage2EvalMarkdownReport(report) { const metrics = (report.metrics ?? {}).raw ?? {}; const bands = (report.rubric_bands ?? {}); const subsets = (report.subsets ?? {}); const scenarioSummary = (report.scenario_summary ?? {}); const rows = Object.keys(metrics) .map((key) => { const rawValue = metrics[key]; const band = bands[key]; const rawPrintable = rawValue === null || rawValue === undefined ? "n/a" : String(rawValue); const bandPrintable = band ? `${String(band.score)} (${String(band.label)})` : "n/a"; return `| ${key} | ${rawPrintable} | ${bandPrintable} |`; }) .join("\n"); return [ `# ${String(report.report_title ?? "Assistant Stage 2 Eval Run")}`, "", `- run_id: ${String(report.run_id ?? "")}`, `- eval_target: ${String(report.eval_target ?? "")}`, `- run_timestamp: ${String(report.run_timestamp ?? "")}`, `- suite_id: ${String(report.suite_id ?? "")}`, `- suite_version: ${String(report.suite_version ?? "")}`, `- cases_total: ${String(report.cases_total ?? 0)}`, "", "## Raw Metrics and Rubric Bands", "", "| Metric | Raw | Rubric band |", "|---|---:|---|", rows || "| n/a | n/a | n/a |", "", "## Subsets", "", `- expected_problem_cases_total: ${String(subsets.expected_problem_cases_total ?? 0)}`, `- followup_cases_total: ${String(subsets.followup_cases_total ?? 0)}`, `- candidate_cases_total: ${String(subsets.candidate_cases_total ?? 0)}`, "", "## Scenario Summary", "", `- improved_or_strong: ${String(scenarioSummary.improved_or_strong ?? 0)}`, `- unchanged_or_mixed: ${String(scenarioSummary.unchanged_or_mixed ?? 0)}`, `- weak_or_regressed: ${String(scenarioSummary.weak_or_regressed ?? 0)}`, "" ].join("\n"); } function buildAssistantComparisonMarkdownReport(report) { const metrics = (report.metric_deltas ?? {}); const summary = (report.scenario_notes_summary ?? {}); const rows = Object.keys(metrics) .map((key) => { const row = metrics[key]; return `| ${key} | ${String(row.baseline ?? "n/a")} | ${String(row.current ?? "n/a")} | ${String(row.delta ?? "n/a")} | ${String(row.trend ?? "n/a")} |`; }) .join("\n"); return [ `# ${String(report.report_title ?? "Assistant Stage 1 Baseline vs Current")}`, "", `- comparison_id: ${String(report.comparison_id ?? "")}`, `- baseline_run_id: ${String(report.baseline_run_id ?? "")}`, `- current_run_id: ${String(report.current_run_id ?? "")}`, `- suite_version: ${String(report.suite_version ?? "")}`, "", "## Metric Deltas", "", "| Metric | Baseline | Current | Delta | Trend |", "|---|---:|---:|---:|---|", rows || "| n/a | n/a | n/a | n/a | n/a |", "", "## Scenario Notes Summary", "", `- improved: ${String(summary.improved ?? 0)}`, `- unchanged: ${String(summary.unchanged ?? 0)}`, `- weakened: ${String(summary.weakened ?? 0)}`, "" ].join("\n"); } function buildAssistantStage2ComparisonMarkdownReport(report) { const metrics = (report.metric_deltas ?? {}); const summary = (report.scenario_notes_summary ?? {}); const rows = Object.keys(metrics) .map((key) => { const row = metrics[key]; return `| ${key} | ${String(row.baseline ?? "n/a")} | ${String(row.current ?? "n/a")} | ${String(row.delta ?? "n/a")} | ${String(row.trend ?? "n/a")} |`; }) .join("\n"); return [ `# ${String(report.report_title ?? "Assistant Stage 2 Baseline vs Current")}`, "", `- comparison_id: ${String(report.comparison_id ?? "")}`, `- baseline_run_id: ${String(report.baseline_run_id ?? "")}`, `- current_run_id: ${String(report.current_run_id ?? "")}`, `- suite_version: ${String(report.suite_version ?? "")}`, "", "## Metric Deltas", "", "| Metric | Baseline | Current | Delta | Trend |", "|---|---:|---:|---:|---|", rows || "| n/a | n/a | n/a | n/a | n/a |", "", "## Scenario Notes Summary", "", `- improved: ${String(summary.improved ?? 0)}`, `- unchanged: ${String(summary.unchanged ?? 0)}`, `- weakened: ${String(summary.weakened ?? 0)}`, "" ].join("\n"); } class EvalService { normalizerService; constructor(normalizerService) { this.normalizerService = normalizerService; } listCases() { (0, files_1.ensureDir)(config_1.EVAL_CASES_DIR); const files = fs_1.default .readdirSync(config_1.EVAL_CASES_DIR) .filter((item) => item.endsWith(".json") && !item.endsWith(".report.json")); return files .map((name) => { const raw = fs_1.default.readFileSync(path_1.default.resolve(config_1.EVAL_CASES_DIR, name), "utf-8"); return JSON.parse(raw); }) .sort((a, b) => a.case_id.localeCompare(b.case_id)); } async runV2(payload) { const runId = `eval-${(0, nanoid_1.nanoid)(10)}`; const results = []; const routeCounter = {}; const fallbackCounter = {}; let schemaPass = 0; let inScopeMessages = 0; let multiIntentMessages = 0; let clarificationMessages = 0; let totalFragments = 0; let inScopeFragments = 0; let outOfScopeFragments = 0; let unclearFragments = 0; let executableWithSoftAssumptionsFragments = 0; let softAssumptionFragments = 0; let routedFragments = 0; let noRouteFragments = 0; let requestsTotal = 0; let retriesUsed = 0; let clarificationLabeledCases = 0; let clarificationTruePositive = 0; let clarificationFalsePositive = 0; let clarificationFalseNegative = 0; let scopeLabeledCases = 0; let scopeCorrectCases = 0; let routeLabeledCases = 0; let routeCorrectCases = 0; let expectedRoutedCases = 0; let noRouteTruePositive = 0; let noRouteFalsePositive = 0; let stateConsistencyChecks = 0; let stateConsistencyPass = 0; for (const item of payload.cases) { const response = await this.normalizerService.normalize({ ...payload.normalizeConfig, userQuestion: item.raw_question, context: { eval_label: runId, case_id: item.case_id, eval_mode: payload.mode }, retryPolicy: payload.mode === "single-pass-strict" ? "single-pass-strict" : "default", useMock: payload.useMock }); if (response.validation.passed) { schemaPass += 1; } const requestCount = Number(response.request_count_for_case ?? 0); requestsTotal += requestCount; if (requestCount > 1) { retriesUsed += 1; } const normalized = response.normalized && ["normalized_query_v2", "normalized_query_v2_0_1", "normalized_query_v2_0_2"].includes(String(response.normalized.schema_version ?? "")) ? response.normalized : null; const routeSummary = response.route_hint_summary && response.route_hint_summary.mode === "deterministic_v2" ? response.route_hint_summary : null; if (normalized) { if (normalized.message_in_scope) { inScopeMessages += 1; } if (normalized.contains_multiple_tasks) { multiIntentMessages += 1; } if (normalized.global_notes.needs_clarification) { clarificationMessages += 1; } totalFragments += normalized.fragments.length; const inScopeList = normalized.fragments.filter((fragment) => fragment.domain_relevance === "in_scope"); inScopeFragments += inScopeList.length; outOfScopeFragments += normalized.fragments.filter((fragment) => fragment.domain_relevance === "out_of_scope").length; unclearFragments += normalized.fragments.filter((fragment) => fragment.domain_relevance === "unclear").length; for (const fragment of inScopeList) { const readiness = executionReadinessOf(fragment); if (readiness === "executable_with_soft_assumptions") { executableWithSoftAssumptionsFragments += 1; } if (softAssumptionsOf(fragment).length > 0) { softAssumptionFragments += 1; } } } const predictedClarification = Boolean(normalized?.global_notes?.needs_clarification); const expectedClarification = typeof item.expected?.clarification_required === "boolean" ? item.expected.clarification_required : null; if (expectedClarification !== null) { clarificationLabeledCases += 1; if (predictedClarification && expectedClarification) clarificationTruePositive += 1; if (predictedClarification && !expectedClarification) clarificationFalsePositive += 1; if (!predictedClarification && expectedClarification) clarificationFalseNegative += 1; } const predictedScope = normalized ? normalized.message_in_scope : null; const expectedScope = expectedScopeInScope(item.expected); if (expectedScope !== null && predictedScope !== null) { scopeLabeledCases += 1; if (predictedScope === expectedScope) { scopeCorrectCases += 1; } } const predictedRouteStatus = routeSummary ? routeSummary.decisions.some((decision) => decision.route !== "no_route") ? "routed" : "no_route" : null; const predictedNoRouteReason = routeSummary && routeSummary.decisions.length > 0 && routeSummary.decisions.every((decision) => decision.route === "no_route") ? (routeSummary.decisions[0]?.no_route_reason ?? null) : null; const expectedRouteStatus = item.expected?.expected_route_status ?? null; const expectedNoRouteReason = item.expected?.expected_no_route_reason ?? null; if (expectedRouteStatus) { routeLabeledCases += 1; if (predictedRouteStatus === expectedRouteStatus) { routeCorrectCases += 1; } if (expectedRouteStatus === "routed") { expectedRoutedCases += 1; } } if (predictedRouteStatus === "no_route") { if (expectedRouteStatus === "no_route") { if (!expectedNoRouteReason || expectedNoRouteReason === predictedNoRouteReason) { noRouteTruePositive += 1; } else { noRouteFalsePositive += 1; } } else if (expectedRouteStatus === "routed") { noRouteFalsePositive += 1; } } if (routeSummary) { for (const decision of routeSummary.decisions) { stateConsistencyChecks += 1; if (isDecisionStateConsistent(decision)) { stateConsistencyPass += 1; } routeCounter[decision.route] = (routeCounter[decision.route] ?? 0) + 1; if (decision.route === "no_route") { noRouteFragments += 1; } else { routedFragments += 1; } } const fallbackType = String(routeSummary.fallback?.type ?? "none"); fallbackCounter[fallbackType] = (fallbackCounter[fallbackType] ?? 0) + 1; } else { fallbackCounter.none = (fallbackCounter.none ?? 0) + 1; } results.push({ case_id: item.case_id, raw_question: item.raw_question, validation_passed: response.validation.passed, message_in_scope: normalized?.message_in_scope ?? null, scope_confidence: normalized?.scope_confidence ?? null, contains_multiple_tasks: normalized?.contains_multiple_tasks ?? null, fragments_total: normalized?.fragments.length ?? 0, in_scope_fragments: normalized ? normalized.fragments.filter((fragment) => fragment.domain_relevance === "in_scope").length : 0, out_of_scope_fragments: normalized ? normalized.fragments.filter((fragment) => fragment.domain_relevance === "out_of_scope").length : 0, unclear_fragments: normalized ? normalized.fragments.filter((fragment) => fragment.domain_relevance === "unclear").length : 0, fallback_type: routeSummary?.fallback?.type ?? "none", predicted_route_status: predictedRouteStatus, expected_route_status: expectedRouteStatus, predicted_no_route_reason: predictedNoRouteReason, expected_no_route_reason: expectedNoRouteReason, predicted_clarification_required: predictedClarification, expected_clarification_required: expectedClarification, executable_with_soft_assumptions_fragments: normalized ? normalized.fragments.filter((fragment) => executionReadinessOf(fragment) === "executable_with_soft_assumptions") .length : 0, trace_id: response.trace_id, request_count_for_case: requestCount }); } const total = Math.max(1, payload.cases.length); const totalFragmentsSafe = Math.max(1, totalFragments); const totalRoutedDecisions = Math.max(1, routedFragments + noRouteFragments); const precisionDenominator = clarificationTruePositive + clarificationFalsePositive; const recallDenominator = clarificationTruePositive + clarificationFalseNegative; const noRoutePrecisionDenominator = noRouteTruePositive + noRouteFalsePositive; const metrics = { schema_validation_pass_rate: Number(((schemaPass / total) * 100).toFixed(2)), scope_detection_accuracy: scopeLabeledCases > 0 ? Number(((scopeCorrectCases / scopeLabeledCases) * 100).toFixed(2)) : null, scope_in_scope_rate: Number(((inScopeMessages / total) * 100).toFixed(2)), multi_intent_detected_rate: Number(((multiIntentMessages / total) * 100).toFixed(2)), clarification_required_rate: Number(((clarificationMessages / total) * 100).toFixed(2)), avg_fragments_per_message: Number((totalFragments / total).toFixed(2)), out_of_scope_fragment_rate: Number(((outOfScopeFragments / totalFragmentsSafe) * 100).toFixed(2)), routed_fragment_rate: Number(((routedFragments / totalRoutedDecisions) * 100).toFixed(2)), no_route_fragment_rate: Number(((noRouteFragments / totalRoutedDecisions) * 100).toFixed(2)), route_resolution_accuracy: routeLabeledCases > 0 ? Number(((routeCorrectCases / routeLabeledCases) * 100).toFixed(2)) : null, no_route_precision: noRoutePrecisionDenominator > 0 ? Number(((noRouteTruePositive / noRoutePrecisionDenominator) * 100).toFixed(2)) : null, false_no_route_rate: expectedRoutedCases > 0 ? Number(((noRouteFalsePositive / expectedRoutedCases) * 100).toFixed(2)) : null, execution_state_consistency_rate: stateConsistencyChecks > 0 ? Number(((stateConsistencyPass / stateConsistencyChecks) * 100).toFixed(2)) : null, executable_with_soft_assumptions_rate: Number(((executableWithSoftAssumptionsFragments / Math.max(1, inScopeFragments)) * 100).toFixed(2)), soft_assumption_used_fragment_rate: Number(((softAssumptionFragments / Math.max(1, inScopeFragments)) * 100).toFixed(2)), clarification_precision: precisionDenominator > 0 ? Number(((clarificationTruePositive / precisionDenominator) * 100).toFixed(2)) : null, clarification_recall: recallDenominator > 0 ? Number(((clarificationTruePositive / recallDenominator) * 100).toFixed(2)) : null, false_clarification_rate: clarificationLabeledCases > 0 ? Number(((clarificationFalsePositive / clarificationLabeledCases) * 100).toFixed(2)) : null }; const report = { run_id: runId, timestamp: new Date().toISOString(), mode: payload.mode, use_mock: Boolean(payload.useMock), prompt_version: payload.normalizeConfig.promptVersion ?? null, schema_version: String(payload.normalizeConfig.schemaVersion ?? payload.normalizeConfig.promptVersion ?? "") .toLowerCase() .includes("v2_0_2") ? "v2_0_2" : String(payload.normalizeConfig.schemaVersion ?? payload.normalizeConfig.promptVersion ?? "") .toLowerCase() .includes("v2_0_1") ? "v2_0_1" : "v2", dataset: { source: payload.rawQuestions ? "inline_raw_questions" : payload.caseSetFile ? "file" : "data/eval_cases/*.json", file: payload.caseSetFile ?? null, raw_questions_count: payload.rawQuestions ? parseRawQuestions(payload.rawQuestions).length : null }, cases_total: payload.cases.length, metrics, budget: { requests_total: requestsTotal, retries_used: retriesUsed }, clarification_eval: { labeled_cases: clarificationLabeledCases, true_positive: clarificationTruePositive, false_positive: clarificationFalsePositive, false_negative: clarificationFalseNegative }, route_eval: { labeled_cases: routeLabeledCases, correct_cases: routeCorrectCases, expected_routed_cases: expectedRoutedCases, no_route_true_positive: noRouteTruePositive, no_route_false_positive: noRouteFalsePositive }, scope_eval: { labeled_cases: scopeLabeledCases, correct_cases: scopeCorrectCases }, execution_state_eval: { checks_total: stateConsistencyChecks, checks_passed: stateConsistencyPass }, route_distribution: routeCounter, fallback_distribution: fallbackCounter, results }; (0, files_1.ensureDir)(config_1.EVAL_CASES_DIR); tryWriteJsonFile(path_1.default.resolve(config_1.EVAL_CASES_DIR, `${runId}.report.json`), report); return report; } collectAssistantSignals(finalResponse, turnResponses) { const debug = finalResponse.debug; const retrievalResults = Array.isArray(debug?.retrieval_results) ? debug.retrieval_results : []; const sourceRefSet = new Set(); const limitationCodeSet = new Set(); const routeSet = new Set(); const confidenceScores = []; const narrowingOrder = { weak: 0, medium: 1, strong: 2 }; let broadQueryDetected = false; let broadResultFlag = false; let minimumEvidenceFailed = false; let degradedTo = null; let narrowingStrength = null; for (const result of retrievalResults) { routeSet.add(String(result.route ?? "unknown")); const summary = result.summary ?? {}; if (summary.broad_query_detected === true) broadQueryDetected = true; if (summary.broad_result_flag === true) broadResultFlag = true; if (summary.minimum_evidence_failed === true) minimumEvidenceFailed = true; const degraded = toDegradedTo(summary.degraded_to); if (degraded === "clarification") { degradedTo = "clarification"; } else if (!degradedTo && degraded === "partial") { degradedTo = "partial"; } const narrowed = toNarrowingStrength(summary.narrowing_strength); if (narrowed && (!narrowingStrength || narrowingOrder[narrowed] < narrowingOrder[narrowingStrength])) { narrowingStrength = narrowed; } if (result.confidence === "high") confidenceScores.push(3); if (result.confidence === "medium") confidenceScores.push(2); if (result.confidence === "low") confidenceScores.push(1); for (const evidence of Array.isArray(result.evidence) ? result.evidence : []) { const canonicalRef = String(evidence.source_ref?.canonical_ref ?? "").trim(); if (canonicalRef) { sourceRefSet.add(canonicalRef); } const reasonCode = String(evidence.limitation?.reason_code ?? "").trim(); if (reasonCode) { limitationCodeSet.add(reasonCode); } if (evidence.confidence === "high") confidenceScores.push(3); if (evidence.confidence === "medium") confidenceScores.push(2); if (evidence.confidence === "low") confidenceScores.push(1); } } const averageConfidence = confidenceScores.length > 0 ? confidenceScores.reduce((acc, item) => acc + item, 0) / confidenceScores.length : null; const evidenceConfidence = averageConfidence === null ? null : averageConfidence >= 2.6 ? "high" : averageConfidence >= 1.8 ? "medium" : "low"; const mechanismStatus = debug?.answer_structure_v11?.mechanism_block?.status === "grounded" || debug?.answer_structure_v11?.mechanism_block?.status === "limited" || debug?.answer_structure_v11?.mechanism_block?.status === "unresolved" ? debug.answer_structure_v11.mechanism_block.status : null; const followupStateApplied = turnResponses.some((item) => item.debug?.followup_state_usage?.applied === true); const uncertaintyLimitationsCount = debug?.answer_structure_v11?.uncertainty_block?.limitations?.length ?? 0; return { broad_query_detected: broadQueryDetected, broad_result_flag: broadResultFlag, narrowing_strength: narrowingStrength, minimum_evidence_failed: minimumEvidenceFailed, degraded_to: degradedTo, evidence_confidence: evidenceConfidence, limitation_reason_codes: [...limitationCodeSet], mechanism_status: mechanismStatus, source_refs: [...sourceRefSet], routes: [...routeSet], followup_state_applied: followupStateApplied, uncertainty_limitations_count: uncertaintyLimitationsCount }; } collectAssistantStage2Signals(finalResponse, turnResponses) { const base = this.collectAssistantSignals(finalResponse, turnResponses); const debug = finalResponse.debug; const retrievalResults = Array.isArray(debug?.retrieval_results) ? debug.retrieval_results : []; const typeSet = new Set(); const mechanismSummaries = new Set(); let candidateEvidenceTotal = 0; let problemUnitsTotal = 0; let duplicateCollapsesTotal = 0; for (const result of retrievalResults) { const candidates = Array.isArray(result.candidate_evidence) ? result.candidate_evidence : []; candidateEvidenceTotal += candidates.length; const problemUnits = Array.isArray(result.problem_units) ? result.problem_units : []; problemUnitsTotal += problemUnits.length; for (const unit of problemUnits) { const unitType = toProblemUnitType(unit.problem_unit_type); if (unitType) { typeSet.add(unitType); } const mechanismSummary = String(unit.mechanism_summary ?? "").trim(); if (mechanismSummary) { mechanismSummaries.add(mechanismSummary); } } if (result.problem_unit_summary && typeof result.problem_unit_summary.duplicate_collapses === "number") { duplicateCollapsesTotal += Number(result.problem_unit_summary.duplicate_collapses); } } const answerMode = typeof debug?.problem_answer_mode === "string" ? debug.problem_answer_mode : null; const unitsUsedCount = Number(debug?.problem_units_used_count ?? 0); const unitIdsUsed = Array.isArray(debug?.problem_unit_ids_used) ? debug.problem_unit_ids_used .map((item) => String(item ?? "").trim()) .filter(Boolean) : []; const problemCentricApplied = debug?.problem_centric_answer_applied === true || answerMode === "stage2_problem_centric_v1"; return { ...base, candidate_evidence_total: candidateEvidenceTotal, problem_units_total: problemUnitsTotal, problem_unit_types: [...typeSet], problem_mechanism_summaries: [...mechanismSummaries], duplicate_collapses_total: duplicateCollapsesTotal, problem_centric_answer_applied: problemCentricApplied, problem_units_used_count: unitsUsedCount, problem_answer_mode: answerMode, problem_unit_ids_used: unitIdsUsed, entity_leakage_detected: detectEntityLeakage(String(finalResponse.assistant_reply ?? "")) }; } getExpectedProblemUnitTypes(suiteCase) { const expected = Array.isArray(suiteCase.expected_hints?.expected_problem_unit_types) ? suiteCase.expected_hints?.expected_problem_unit_types : []; const output = new Set(); for (const value of expected ?? []) { const mapped = toProblemUnitType(value); if (mapped) { output.add(mapped); } } return [...output]; } computeProblemUnitPrecision(expectedTypes, detectedTypes) { const uniqueExpected = [...new Set(expectedTypes)]; const uniqueDetected = [...new Set(detectedTypes)]; if (uniqueDetected.length === 0) { return uniqueExpected.length === 0 ? 1 : 0; } if (uniqueExpected.length === 0) { return 0; } const matchedDetected = uniqueDetected.filter((item) => uniqueExpected.includes(item)).length; return round2(matchedDetected / uniqueDetected.length); } computeProblemUnitRecallProxy(expectedTypes, detectedTypes) { const uniqueExpected = [...new Set(expectedTypes)]; const uniqueDetected = [...new Set(detectedTypes)]; if (uniqueExpected.length === 0) { return null; } if (uniqueDetected.length === 0) { return 0; } const matchedExpected = uniqueExpected.filter((item) => uniqueDetected.includes(item)).length; return round2(matchedExpected / uniqueExpected.length); } computeDuplicateCollapseRate(candidateTotal, duplicateCollapses) { if (candidateTotal <= 0) { return null; } return round2(Math.min(1, Math.max(0, duplicateCollapses / candidateTotal))); } computeMechanismCoherenceScore(finalResponse, signals) { const mechanismBlock = finalResponse.debug?.answer_structure_v11?.mechanism_block; const mechanismStatus = mechanismBlock?.status; const mechanismNotes = extractTextList(mechanismBlock?.mechanism_notes); const hasProblemMechanism = signals.problem_mechanism_summaries.length > 0; let score = 0; if (mechanismStatus === "grounded" && hasProblemMechanism && mechanismNotes.length > 0) { score = 5; } else if ((mechanismStatus === "limited" || mechanismStatus === "unresolved") && (hasProblemMechanism || mechanismNotes.length > 0)) { score = 3; } else if (hasProblemMechanism || mechanismNotes.length > 0) { score = 2; } if (mechanismStatus === "grounded" && !hasProblemMechanism) { score = Math.min(score, 2); } if (signals.limitation_reason_codes.includes("missing_mechanism")) { score -= 1; } return clampScore(score); } computeProblemClarityScore(finalResponse, signals) { const structure = finalResponse.debug?.answer_structure_v11; const answerSummary = String(structure?.answer_summary ?? "").trim(); const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? "").trim(); const recommendedActions = extractTextList(structure?.next_step_block?.recommended_actions); const clarificationQuestions = extractTextList(structure?.next_step_block?.clarification_questions); const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations); let score = 0; if (answerSummary.length > 20) score += 1; if (directAnswer.length > 20) score += 1; if (hasDomainAnchors(`${answerSummary} ${directAnswer}`)) score += 1; if (recommendedActions.length > 0 || clarificationQuestions.length > 0) score += 1; if (signals.problem_units_total > 0 || signals.problem_centric_answer_applied) score += 1; if ((signals.minimum_evidence_failed || signals.degraded_to === "clarification") && uncertaintyLimitations.length === 0) { score -= 1; } if (signals.entity_leakage_detected) { score -= 1; } return clampScore(score); } computeAssistantMetrics(input) { const diagnostics = input.diagnostics; const total = Math.max(1, diagnostics.length); const signatureCounter = diagnostics.reduce((acc, item) => { acc[item.signature] = (acc[item.signature] ?? 0) + 1; return acc; }, {}); const uniqueSignatures = Object.keys(signatureCounter).length; const genericCases = diagnostics.filter((item) => item.is_generic).length; const falseConfidenceCases = diagnostics.filter((item) => item.is_false_confident).length; const broadCases = diagnostics.filter((item) => item.is_broad_answer !== null); const broadAnswerCases = broadCases.filter((item) => item.is_broad_answer === true).length; const followupCases = diagnostics.filter((item) => item.followup_retention_score !== null); const avgActionability = diagnostics.length > 0 ? diagnostics.reduce((acc, item) => acc + item.accountant_actionability_score, 0) / diagnostics.length : null; const avgMechanism = diagnostics.length > 0 ? diagnostics.reduce((acc, item) => acc + item.mechanism_specificity_score, 0) / diagnostics.length : null; const avgFollowup = followupCases.length > 0 ? followupCases.reduce((acc, item) => acc + Number(item.followup_retention_score ?? 0), 0) / followupCases.length : null; const raw = { retrieval_differentiation_rate: round2(uniqueSignatures / total), generic_explanation_rate: round2(genericCases / total), accountant_actionability_score: avgActionability === null ? null : round2(avgActionability), false_confidence_rate: round2(falseConfidenceCases / total), broad_answer_rate: broadCases.length > 0 ? round2(broadAnswerCases / broadCases.length) : null, mechanism_specificity_score: avgMechanism === null ? null : round2(avgMechanism), followup_context_retention_score: avgFollowup === null ? null : round2(avgFollowup) }; const rubric_bands = { retrieval_differentiation_rate: rubricBandForMetric("retrieval_differentiation_rate", raw.retrieval_differentiation_rate), generic_explanation_rate: rubricBandForMetric("generic_explanation_rate", raw.generic_explanation_rate), accountant_actionability_score: rubricBandForMetric("accountant_actionability_score", raw.accountant_actionability_score), false_confidence_rate: rubricBandForMetric("false_confidence_rate", raw.false_confidence_rate), broad_answer_rate: rubricBandForMetric("broad_answer_rate", raw.broad_answer_rate), mechanism_specificity_score: rubricBandForMetric("mechanism_specificity_score", raw.mechanism_specificity_score), followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score) }; return { raw, rubric_bands, denominators: { cases_total: diagnostics.length, broad_cases_total: broadCases.length, followup_cases_total: followupCases.length }, signature_counts: signatureCounter }; } computeAssistantStage2Metrics(input) { const diagnostics = input.diagnostics; const signatureCounter = diagnostics.reduce((acc, item) => { acc[item.signature] = (acc[item.signature] ?? 0) + 1; return acc; }, {}); const precisionValues = diagnostics .map((item) => item.problem_unit_precision) .filter((item) => typeof item === "number"); const recallValues = diagnostics .map((item) => item.problem_unit_recall_proxy) .filter((item) => typeof item === "number"); const collapseValues = diagnostics .map((item) => item.duplicate_collapse_rate) .filter((item) => typeof item === "number"); const mechanismValues = diagnostics.map((item) => item.mechanism_coherence_score); const clarityValues = diagnostics.map((item) => item.problem_clarity_score); const firstApplicable = diagnostics.filter((item) => item.problem_first_answer_applied !== null); const firstApplied = firstApplicable.filter((item) => item.problem_first_answer_applied === true).length; const leakageCases = diagnostics.filter((item) => item.entity_leakage).length; const followupCases = diagnostics.filter((item) => item.suite_case.question_type === "followup" || item.turn_count > 1); const candidateCases = diagnostics.filter((item) => item.signals.candidate_evidence_total > 0); const expectedProblemCases = diagnostics.filter((item) => item.expected_problem_first); const average = (values) => { if (values.length === 0) return null; return round2(values.reduce((acc, item) => acc + item, 0) / values.length); }; const raw = { problem_unit_precision: average(precisionValues), problem_unit_recall_proxy: average(recallValues), duplicate_collapse_rate: average(collapseValues), mechanism_coherence_score: average(mechanismValues), problem_clarity_score: average(clarityValues), problem_first_answer_rate: firstApplicable.length > 0 ? round2(firstApplied / firstApplicable.length) : null, entity_leakage_rate: diagnostics.length > 0 ? round2(leakageCases / diagnostics.length) : null }; const rubric_bands = { problem_unit_precision: rubricBandForMetricStage2("problem_unit_precision", raw.problem_unit_precision), problem_unit_recall_proxy: rubricBandForMetricStage2("problem_unit_recall_proxy", raw.problem_unit_recall_proxy), duplicate_collapse_rate: rubricBandForMetricStage2("duplicate_collapse_rate", raw.duplicate_collapse_rate), mechanism_coherence_score: rubricBandForMetricStage2("mechanism_coherence_score", raw.mechanism_coherence_score), problem_clarity_score: rubricBandForMetricStage2("problem_clarity_score", raw.problem_clarity_score), problem_first_answer_rate: rubricBandForMetricStage2("problem_first_answer_rate", raw.problem_first_answer_rate), entity_leakage_rate: rubricBandForMetricStage2("entity_leakage_rate", raw.entity_leakage_rate) }; return { raw, rubric_bands, denominators: { cases_total: diagnostics.length, expected_problem_cases_total: expectedProblemCases.length, followup_cases_total: followupCases.length, candidate_cases_total: candidateCases.length, precision_cases_total: precisionValues.length, recall_cases_total: recallValues.length, duplicate_collapse_cases_total: collapseValues.length, problem_first_applicable_cases_total: firstApplicable.length }, signature_counts: signatureCounter }; } buildAssistantComparisonReport(input) { const baselineRef = readEvalReportByRef(input.baselineReportFile); const baselinePath = baselineRef.resolved_path; const baselineReport = baselineRef.report; const currentReport = input.currentReport; const metricKeys = [ "retrieval_differentiation_rate", "generic_explanation_rate", "accountant_actionability_score", "false_confidence_rate", "broad_answer_rate", "mechanism_specificity_score", "followup_context_retention_score" ]; const lowerIsBetter = new Set(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]); const baselineRaw = (baselineReport.metrics ?? {}).raw ?? {}; const currentRaw = (currentReport.metrics ?? {}).raw ?? {}; const deltas = {}; for (const metric of metricKeys) { const baseline = typeof baselineRaw[metric] === "number" ? Number(baselineRaw[metric]) : null; const current = typeof currentRaw[metric] === "number" ? Number(currentRaw[metric]) : null; const delta = baseline !== null && current !== null ? round2(current - baseline) : null; let trend = "n/a"; if (baseline !== null && current !== null) { const improved = lowerIsBetter.has(metric) ? current < baseline - 0.01 : current > baseline + 0.01; const weakened = lowerIsBetter.has(metric) ? current > baseline + 0.01 : current < baseline - 0.01; trend = improved ? "improved" : weakened ? "weakened" : "unchanged"; } deltas[metric] = { baseline, current, delta, trend }; } const baselineResults = Array.isArray(baselineReport.results) ? baselineReport.results : []; const currentResults = Array.isArray(currentReport.results) ? currentReport.results : []; const baselineByCase = new Map(); for (const row of baselineResults) { baselineByCase.set(String(row.case_id ?? ""), row); } const improvedNotes = []; const unchangedNotes = []; const weakenedNotes = []; for (const row of currentResults) { const caseId = String(row.case_id ?? ""); const currentUsefulness = typeof row.accountant_usefulness_score === "number" ? Number(row.accountant_usefulness_score) : null; const baselineRow = baselineByCase.get(caseId); const baselineUsefulness = baselineRow && typeof baselineRow.accountant_usefulness_score === "number" ? Number(baselineRow.accountant_usefulness_score) : null; if (baselineUsefulness === null || currentUsefulness === null) { continue; } const delta = round2(currentUsefulness - baselineUsefulness); const note = `${caseId}: usefulness ${baselineUsefulness} -> ${currentUsefulness} (delta ${delta})`; if (delta > 0.25) { improvedNotes.push(note); } else if (delta < -0.25) { weakenedNotes.push(note); } else { unchangedNotes.push(note); } } const comparisonId = `assistant-compare-${(0, nanoid_1.nanoid)(8)}`; const comparisonReport = { schema_version: ASSISTANT_STAGE1_COMPARISON_SCHEMA_VERSION, comparison_id: comparisonId, run_timestamp: new Date().toISOString(), baseline_run_id: baselineReport.run_id ?? null, current_run_id: currentReport.run_id ?? null, eval_target: "assistant_stage1", suite_id: currentReport.suite_id ?? baselineReport.suite_id ?? null, suite_version: currentReport.suite_version ?? baselineReport.suite_version ?? null, baseline_report_file: baselinePath, current_report_file: currentReport.artifacts && typeof currentReport.artifacts === "object" ? currentReport.artifacts.run_report_json_path ?? null : null, metric_deltas: deltas, scenario_notes_summary: { improved: improvedNotes.length, unchanged: unchangedNotes.length, weakened: weakenedNotes.length }, scenario_notes: { improved: improvedNotes, unchanged: unchangedNotes, weakened: weakenedNotes }, known_limitations: currentReport.known_limitations ?? [ "Comparison is run-to-run and depends on stable mock/runtime flags.", "Metrics remain Stage 1 heuristic bands, not full product scorecards." ], report_title: "Assistant Stage 1 Baseline vs Current" }; (0, files_1.ensureDir)(config_1.REPORTS_DIR); const jsonPath = path_1.default.resolve(config_1.REPORTS_DIR, `${comparisonId}.json`); const mdPath = path_1.default.resolve(config_1.REPORTS_DIR, `${comparisonId}.md`); const jsonWritten = tryWriteJsonFile(jsonPath, comparisonReport); const mdWritten = tryWriteTextFile(mdPath, buildAssistantComparisonMarkdownReport(comparisonReport)); const comparisonRef = jsonWritten ? jsonPath : putInMemoryEvalReport(comparisonReport); return { ...comparisonReport, artifacts: { comparison_report_json_path: comparisonRef, comparison_report_md_path: mdWritten ? mdPath : null } }; } buildAssistantStage2ComparisonReport(input) { const baselineRef = readEvalReportByRef(input.baselineReportFile); const baselinePath = baselineRef.resolved_path; const baselineReport = baselineRef.report; const currentReport = input.currentReport; const metricKeys = [ "problem_unit_precision", "problem_unit_recall_proxy", "duplicate_collapse_rate", "mechanism_coherence_score", "problem_clarity_score", "problem_first_answer_rate", "entity_leakage_rate" ]; const lowerIsBetter = new Set(["entity_leakage_rate"]); const baselineRaw = (baselineReport.metrics ?? {}).raw ?? {}; const currentRaw = (currentReport.metrics ?? {}).raw ?? {}; const deltas = {}; for (const metric of metricKeys) { const baseline = typeof baselineRaw[metric] === "number" ? Number(baselineRaw[metric]) : null; const current = typeof currentRaw[metric] === "number" ? Number(currentRaw[metric]) : null; const delta = baseline !== null && current !== null ? round2(current - baseline) : null; let trend = "n/a"; if (baseline !== null && current !== null) { const improved = lowerIsBetter.has(metric) ? current < baseline - 0.01 : current > baseline + 0.01; const weakened = lowerIsBetter.has(metric) ? current > baseline + 0.01 : current < baseline - 0.01; trend = improved ? "improved" : weakened ? "weakened" : "unchanged"; } deltas[metric] = { baseline, current, delta, trend }; } const baselineResults = Array.isArray(baselineReport.results) ? baselineReport.results : []; const currentResults = Array.isArray(currentReport.results) ? currentReport.results : []; const baselineByCase = new Map(); for (const row of baselineResults) { baselineByCase.set(String(row.case_id ?? ""), row); } const improvedNotes = []; const unchangedNotes = []; const weakenedNotes = []; const toComposite = (row) => { if (!row || typeof row !== "object") return null; const metricSubscores = row.metric_subscores; if (!metricSubscores) return null; const clarity = typeof metricSubscores.problem_clarity_score === "number" ? Number(metricSubscores.problem_clarity_score) : null; const mechanism = typeof metricSubscores.mechanism_coherence_score === "number" ? Number(metricSubscores.mechanism_coherence_score) : null; const firstRate = typeof metricSubscores.problem_first_answer_rate === "number" ? Number(metricSubscores.problem_first_answer_rate) : null; const leakageRate = typeof metricSubscores.entity_leakage_rate === "number" ? Number(metricSubscores.entity_leakage_rate) : null; if (clarity === null || mechanism === null || firstRate === null || leakageRate === null) { return null; } return round2((clarity + mechanism + firstRate * 5 + (1 - leakageRate) * 5) / 4); }; for (const row of currentResults) { const caseId = String(row.case_id ?? ""); const currentComposite = toComposite(row); const baselineComposite = toComposite(baselineByCase.get(caseId)); if (currentComposite === null || baselineComposite === null) { continue; } const delta = round2(currentComposite - baselineComposite); const note = `${caseId}: composite ${baselineComposite} -> ${currentComposite} (delta ${delta})`; if (delta > 0.25) { improvedNotes.push(note); } else if (delta < -0.25) { weakenedNotes.push(note); } else { unchangedNotes.push(note); } } const comparisonId = `assistant-stage2-compare-${(0, nanoid_1.nanoid)(8)}`; const comparisonReport = { schema_version: ASSISTANT_STAGE2_COMPARISON_SCHEMA_VERSION, comparison_id: comparisonId, run_timestamp: new Date().toISOString(), baseline_run_id: baselineReport.run_id ?? null, current_run_id: currentReport.run_id ?? null, eval_target: "assistant_stage2", suite_id: currentReport.suite_id ?? baselineReport.suite_id ?? null, suite_version: currentReport.suite_version ?? baselineReport.suite_version ?? null, baseline_report_file: baselinePath, current_report_file: currentReport.artifacts && typeof currentReport.artifacts === "object" ? currentReport.artifacts.run_report_json_path ?? null : null, metric_deltas: deltas, scenario_notes_summary: { improved: improvedNotes.length, unchanged: unchangedNotes.length, weakened: weakenedNotes.length }, scenario_notes: { improved: improvedNotes, unchanged: unchangedNotes, weakened: weakenedNotes }, known_limitations: currentReport.known_limitations ?? [ "Stage 2 comparison remains run-to-run and depends on stable feature profile.", "Metrics are Stage 2 Wave 5 heuristics, not final product scorecards." ], report_title: "Assistant Stage 2 Baseline vs Current" }; (0, files_1.ensureDir)(config_1.REPORTS_DIR); const jsonPath = path_1.default.resolve(config_1.REPORTS_DIR, `${comparisonId}.json`); const mdPath = path_1.default.resolve(config_1.REPORTS_DIR, `${comparisonId}.md`); const jsonWritten = tryWriteJsonFile(jsonPath, comparisonReport); const mdWritten = tryWriteTextFile(mdPath, buildAssistantStage2ComparisonMarkdownReport(comparisonReport)); const comparisonRef = jsonWritten ? jsonPath : putInMemoryEvalReport(comparisonReport); return { ...comparisonReport, artifacts: { comparison_report_json_path: comparisonRef, comparison_report_md_path: mdWritten ? mdPath : null } }; } async runAssistantStage1(payload) { if (!config_1.FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1) { throw new http_1.ApiError("ASSISTANT_STAGE1_EVAL_DISABLED", "Assistant Stage 1 eval target is disabled by FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1.", 409); } const suite = parseAssistantSuiteFile(payload.caseSetFile); const suiteCases = suite.cases.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id)); const runId = typeof payload.runId === "string" && payload.runId.trim().length > 0 ? payload.runId.trim() : `assistant-stage1-${(0, nanoid_1.nanoid)(10)}`; const assistantService = new assistantService_1.AssistantService(this.normalizerService, new assistantSessionStore_1.AssistantSessionStore()); const diagnostics = []; let requestsTotal = 0; for (const suiteCase of suiteCases) { const sessionId = `${runId}-${suiteCase.case_id}`; const turnResponses = []; const notes = []; const limitations = []; try { for (const turn of suiteCase.turns) { const response = (await assistantService.handleMessage({ session_id: sessionId, user_message: turn.user_message, message: turn.user_message, mode: "assistant", llmProvider: payload.normalizeConfig.llmProvider, apiKey: payload.normalizeConfig.apiKey, model: payload.normalizeConfig.model, baseUrl: payload.normalizeConfig.baseUrl, temperature: payload.normalizeConfig.temperature, maxOutputTokens: payload.normalizeConfig.maxOutputTokens, promptVersion: payload.normalizeConfig.promptVersion, systemPrompt: payload.normalizeConfig.systemPrompt, developerPrompt: payload.normalizeConfig.developerPrompt, domainPrompt: payload.normalizeConfig.domainPrompt, fewShotExamples: payload.normalizeConfig.fewShotExamples, useMock: payload.useMock })); turnResponses.push(response); requestsTotal += 1; } } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); diagnostics.push({ suite_case: suiteCase, session_id: sessionId, trace_id: null, final_reply_type: "backend_error", turn_count: turnResponses.length, narrowing_result: "failed", signature: `backend_error|${suiteCase.scenario_tag}`, is_generic: true, is_false_confident: false, is_broad_answer: suiteCase.broadness_level === "low" ? null : false, followup_retention_score: suiteCase.question_type === "followup" || suiteCase.turns.length > 1 ? 0 : null, evidence_quality_score: 0, mechanism_specificity_score: 0, genericness_score: 5, accountant_actionability_score: 0, accountant_usefulness_score: 0, signals: { broad_query_detected: suiteCase.broadness_level !== "low", broad_result_flag: false, narrowing_strength: null, minimum_evidence_failed: true, degraded_to: "clarification", evidence_confidence: "low", limitation_reason_codes: [], mechanism_status: null, source_refs: [], routes: [], followup_state_applied: false, uncertainty_limitations_count: 0 }, limitations: [errorMessage], notes: [`Case execution failed: ${errorMessage}`] }); continue; } const finalResponse = turnResponses[turnResponses.length - 1]; const signals = this.collectAssistantSignals(finalResponse, turnResponses); const structure = finalResponse.debug?.answer_structure_v11 ?? null; const recommendedActions = extractTextList(structure?.next_step_block?.recommended_actions); const clarificationQuestions = extractTextList(structure?.next_step_block?.clarification_questions); const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes); const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations); const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? ""); const hasAnchors = hasDomainAnchors([directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" ")); let genericnessScore = 0; if (!hasAnchors) genericnessScore += 2; if (mechanismNotes.length === 0) genericnessScore += 1; if (signals.source_refs.length === 0) genericnessScore += 1; if (recommendedActions.length === 0) genericnessScore += 1; genericnessScore = clampScore(genericnessScore); let actionabilityScore = 0; if (recommendedActions.length > 0) actionabilityScore += 2; if (recommendedActions.some((item) => hasDomainAnchors(item))) actionabilityScore += 2; if (clarificationQuestions.length > 0 && (finalResponse.reply_type === "clarification_required" || signals.degraded_to === "clarification")) { actionabilityScore += 1; } if (signals.source_refs.length > 0 && actionabilityScore < 5) { actionabilityScore += 1; } actionabilityScore = clampScore(actionabilityScore); let evidenceQualityScore = 0; if (signals.source_refs.length >= 3) evidenceQualityScore += 2; else if (signals.source_refs.length > 0) evidenceQualityScore += 1; if (signals.evidence_confidence === "high") evidenceQualityScore += 2; if (signals.evidence_confidence === "medium") evidenceQualityScore += 1; if (signals.minimum_evidence_failed) evidenceQualityScore -= 2; if (signals.limitation_reason_codes.includes("insufficient_detail")) evidenceQualityScore -= 1; if (signals.limitation_reason_codes.includes("missing_mechanism")) evidenceQualityScore -= 1; evidenceQualityScore = clampScore(evidenceQualityScore); let mechanismSpecificityScore = 0; if (signals.mechanism_status === "grounded" && mechanismNotes.length > 0 && !signals.limitation_reason_codes.includes("missing_mechanism")) { mechanismSpecificityScore = 5; } else if (signals.mechanism_status === "limited" && mechanismNotes.length > 0) { mechanismSpecificityScore = 3; } else if (mechanismNotes.length > 0) { mechanismSpecificityScore = 2; } else { mechanismSpecificityScore = 0; } const usefulnessScore = clampScore((actionabilityScore + (5 - genericnessScore) + evidenceQualityScore + mechanismSpecificityScore) / 4); const isGeneric = genericnessScore >= 3; const factualReply = finalResponse.reply_type === "factual" || finalResponse.reply_type === "factual_with_explanation"; const isFalseConfident = factualReply && (signals.minimum_evidence_failed || signals.degraded_to !== null || signals.evidence_confidence === "low" || (signals.limitation_reason_codes.length > 0 && signals.uncertainty_limitations_count === 0)); const isBroadCase = suiteCase.broadness_level !== "low" || signals.broad_query_detected; const isBroadAnswer = isBroadCase ? factualReply && signals.degraded_to === null && !signals.minimum_evidence_failed : null; const isFollowupCase = suiteCase.question_type === "followup" || suiteCase.turns.length > 1; let followupRetentionScore = null; if (isFollowupCase) { const finalTurnIndex = Number(finalResponse.debug?.investigation_state_snapshot?.turn_index ?? 0); if (signals.followup_state_applied && finalTurnIndex >= suiteCase.turns.length) { followupRetentionScore = 5; } else if (finalTurnIndex >= suiteCase.turns.length) { followupRetentionScore = 3; } else { followupRetentionScore = 0; } } let narrowingResult = "not_required"; if (signals.degraded_to === "clarification" || finalResponse.reply_type === "clarification_required") { narrowingResult = "clarification_requested"; } else if (signals.broad_query_detected || signals.broad_result_flag) { narrowingResult = signals.minimum_evidence_failed ? "failed" : "applied"; } if (signals.minimum_evidence_failed) { limitations.push("minimum_evidence_failed"); } limitations.push(...signals.limitation_reason_codes.map((item) => `limitation_reason:${item}`)); if (signals.mechanism_status === "unresolved") { limitations.push("mechanism_unresolved"); } limitations.push(...uncertaintyLimitations); if (isGeneric) notes.push("genericness_high"); if (isFalseConfident) notes.push("false_confidence_risk"); if (isBroadCase && isBroadAnswer) notes.push("broad_answer_without_degradation"); if (followupRetentionScore !== null && followupRetentionScore < 3) notes.push("followup_context_retention_weak"); diagnostics.push({ suite_case: suiteCase, session_id: sessionId, trace_id: finalResponse.debug?.trace_id ?? null, final_reply_type: finalResponse.reply_type, turn_count: suiteCase.turns.length, narrowing_result: narrowingResult, signature: [ finalResponse.reply_type, signals.routes.sort().join(","), signals.degraded_to ?? "none", signals.mechanism_status ?? "unknown", signals.source_refs.slice(0, 2).join(",") ].join("|"), is_generic: isGeneric, is_false_confident: isFalseConfident, is_broad_answer: isBroadAnswer, followup_retention_score: followupRetentionScore, evidence_quality_score: evidenceQualityScore, mechanism_specificity_score: mechanismSpecificityScore, genericness_score: genericnessScore, accountant_actionability_score: actionabilityScore, accountant_usefulness_score: round2(usefulnessScore), signals, limitations: Array.from(new Set(limitations)), notes }); } const metrics = this.computeAssistantMetrics({ diagnostics }); const caseRecords = diagnostics.map((item) => { const signatureHits = metrics.signature_counts[item.signature] ?? 1; const caseMetricVector = { retrieval_differentiation_rate: signatureHits === 1 ? 1 : 0, generic_explanation_rate: item.is_generic ? 1 : 0, accountant_actionability_score: round2(item.accountant_actionability_score), false_confidence_rate: item.is_false_confident ? 1 : 0, broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0, mechanism_specificity_score: round2(item.mechanism_specificity_score), followup_context_retention_score: item.followup_retention_score === null ? null : round2(item.followup_retention_score) }; return { schema_version: stage1Contracts_1.ASSISTANT_EVAL_RECORD_SCHEMA_VERSION, created_at: new Date().toISOString(), case_id: item.suite_case.case_id, scenario_tag: item.suite_case.scenario_tag, session_id: item.session_id, trace_id: item.trace_id, question_type: item.suite_case.question_type, broadness_level: item.suite_case.broadness_level, narrowing_result: item.narrowing_result, evidence_quality_score: round2(item.evidence_quality_score), genericness_score: round2(item.genericness_score), accountant_usefulness_score: round2(item.accountant_usefulness_score), accountant_metrics: caseMetricVector, raw_signals: { final_reply_type: item.final_reply_type, turn_count: item.turn_count, broad_query_detected: item.signals.broad_query_detected, broad_result_flag: item.signals.broad_result_flag, narrowing_strength: item.signals.narrowing_strength, minimum_evidence_failed: item.signals.minimum_evidence_failed, degraded_to: item.signals.degraded_to, evidence_confidence: item.signals.evidence_confidence, limitation_reason_codes: item.signals.limitation_reason_codes, mechanism_status: item.signals.mechanism_status, source_refs: item.signals.source_refs, routes: item.signals.routes, followup_state_applied: item.signals.followup_state_applied }, metric_subscores: caseMetricVector, limitations: item.limitations, notes: item.notes }; }); const strongestSignals = Object.entries(metrics.rubric_bands) .filter(([, band]) => band?.score === 5) .map(([name]) => name); const weakestSignals = Object.entries(metrics.rubric_bands) .filter(([, band]) => band?.score === 0) .map(([name]) => name); const runTimestamp = new Date().toISOString(); const report = { schema_version: ASSISTANT_STAGE1_RUN_SCHEMA_VERSION, run_id: runId, run_timestamp: runTimestamp, eval_target: "assistant_stage1", mode: payload.mode, use_mock: Boolean(payload.useMock), prompt_version: payload.normalizeConfig.promptVersion ?? null, suite_id: suite.suite_id, suite_version: suite.suite_version, suite_schema_version: suite.schema_version ?? null, scenario_count: suite.scenario_count, case_ids: suiteCases.map((item) => item.case_id), cases_total: caseRecords.length, feature_profile_snapshot: buildFeatureProfileSnapshot(), code_version: buildCodeVersionMarker(), metrics: { raw: metrics.raw, denominators: metrics.denominators }, rubric_bands: metrics.rubric_bands, subsets: { broad_cases_total: metrics.denominators.broad_cases_total, followup_cases_total: metrics.denominators.followup_cases_total }, budget: { requests_total: requestsTotal }, results: caseRecords, scenario_summary: { improved_or_strong: caseRecords.filter((item) => Number(item.accountant_usefulness_score ?? 0) >= 4).length, unchanged_or_mixed: caseRecords.filter((item) => { const value = Number(item.accountant_usefulness_score ?? 0); return value >= 2.5 && value < 4; }).length, weak_or_regressed: caseRecords.filter((item) => Number(item.accountant_usefulness_score ?? 0) < 2.5).length }, improvement_hints: { strongest_signals: strongestSignals.length > 0 ? strongestSignals.join(", ") : "none", weakest_signals: weakestSignals.length > 0 ? weakestSignals.join(", ") : "none" }, known_limitations: [ "Snapshot-only retrieval contour remains (no live verification core in Stage 1).", "Metric mapping for genericness/false confidence is heuristic by design.", "Stage 1 eval excludes Stage 2+ metrics (problem-unit/lifecycle/graph/investigation engine)." ], report_title: "Assistant Stage 1 Eval Run" }; (0, files_1.ensureDir)(config_1.REPORTS_DIR); const runJsonPath = path_1.default.resolve(config_1.REPORTS_DIR, `${runId}.json`); const runMdPath = path_1.default.resolve(config_1.REPORTS_DIR, `${runId}.md`); const compactReport = compactAssistantStage1Report(report); const jsonWritten = tryWriteJsonFile(runJsonPath, compactReport); const mdWritten = tryWriteTextFile(runMdPath, buildAssistantEvalMarkdownReport(compactReport)); const runReportRef = jsonWritten ? runJsonPath : putInMemoryEvalReport(compactReport); report.artifacts = { run_report_json_path: runReportRef, run_report_md_path: mdWritten ? runMdPath : null }; if (payload.compareWithReportFile) { report.comparison = this.buildAssistantComparisonReport({ currentReport: report, baselineReportFile: payload.compareWithReportFile }); } return report; } async runAssistantStage2(payload) { if (!config_1.FEATURE_ASSISTANT_STAGE2_EVAL_V1) { throw new http_1.ApiError("ASSISTANT_STAGE2_EVAL_DISABLED", "Assistant Stage 2 eval target is disabled by FEATURE_ASSISTANT_STAGE2_EVAL_V1.", 409); } const suite = parseAssistantStage2SuiteFile(payload.caseSetFile); const suiteCases = suite.cases.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id)); const runId = typeof payload.runId === "string" && payload.runId.trim().length > 0 ? payload.runId.trim() : `assistant-stage2-${(0, nanoid_1.nanoid)(10)}`; const assistantService = new assistantService_1.AssistantService(this.normalizerService, new assistantSessionStore_1.AssistantSessionStore()); const diagnostics = []; let requestsTotal = 0; for (const suiteCase of suiteCases) { const sessionId = `${runId}-${suiteCase.case_id}`; const turnResponses = []; const notes = []; const limitations = []; const expectedProblemUnitTypes = this.getExpectedProblemUnitTypes(suiteCase); const expectedProblemFirst = suiteCase.expected_hints?.expected_problem_first ?? (suiteCase.broadness_level !== "low" || suiteCase.question_type !== "direct"); try { for (const turn of suiteCase.turns) { const response = (await assistantService.handleMessage({ session_id: sessionId, user_message: turn.user_message, message: turn.user_message, mode: "assistant", llmProvider: payload.normalizeConfig.llmProvider, apiKey: payload.normalizeConfig.apiKey, model: payload.normalizeConfig.model, baseUrl: payload.normalizeConfig.baseUrl, temperature: payload.normalizeConfig.temperature, maxOutputTokens: payload.normalizeConfig.maxOutputTokens, promptVersion: payload.normalizeConfig.promptVersion, systemPrompt: payload.normalizeConfig.systemPrompt, developerPrompt: payload.normalizeConfig.developerPrompt, domainPrompt: payload.normalizeConfig.domainPrompt, fewShotExamples: payload.normalizeConfig.fewShotExamples, useMock: payload.useMock })); turnResponses.push(response); requestsTotal += 1; } } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); diagnostics.push({ suite_case: suiteCase, session_id: sessionId, trace_id: null, final_reply_type: "backend_error", turn_count: turnResponses.length, signature: `backend_error|${suiteCase.scenario_tag}`, expected_problem_unit_types: expectedProblemUnitTypes, expected_problem_first: expectedProblemFirst, problem_unit_precision: 0, problem_unit_recall_proxy: expectedProblemUnitTypes.length > 0 ? 0 : null, duplicate_collapse_rate: null, mechanism_coherence_score: 0, problem_clarity_score: 0, problem_first_answer_applied: expectedProblemFirst ? false : null, entity_leakage: false, signals: { broad_query_detected: suiteCase.broadness_level !== "low", broad_result_flag: false, narrowing_strength: null, minimum_evidence_failed: true, degraded_to: "clarification", evidence_confidence: "low", limitation_reason_codes: [], mechanism_status: null, source_refs: [], routes: [], followup_state_applied: false, uncertainty_limitations_count: 0, candidate_evidence_total: 0, problem_units_total: 0, problem_unit_types: [], problem_mechanism_summaries: [], duplicate_collapses_total: 0, problem_centric_answer_applied: false, problem_units_used_count: 0, problem_answer_mode: null, problem_unit_ids_used: [], entity_leakage_detected: false }, limitations: [errorMessage], notes: [`Case execution failed: ${errorMessage}`] }); continue; } const finalResponse = turnResponses[turnResponses.length - 1]; const signals = this.collectAssistantStage2Signals(finalResponse, turnResponses); const problemUnitPrecision = this.computeProblemUnitPrecision(expectedProblemUnitTypes, signals.problem_unit_types); const problemUnitRecallProxy = this.computeProblemUnitRecallProxy(expectedProblemUnitTypes, signals.problem_unit_types); const duplicateCollapseRate = this.computeDuplicateCollapseRate(signals.candidate_evidence_total, signals.duplicate_collapses_total); const mechanismCoherenceScore = this.computeMechanismCoherenceScore(finalResponse, signals); const problemClarityScore = this.computeProblemClarityScore(finalResponse, signals); const problemFirstAnswerApplied = expectedProblemFirst ? signals.problem_centric_answer_applied && signals.problem_units_used_count > 0 : null; if (signals.problem_units_total === 0 && expectedProblemUnitTypes.length > 0) { limitations.push("missing_problem_units"); } if (signals.problem_centric_answer_applied && signals.problem_units_used_count <= 0) { limitations.push("problem_mode_without_units"); } limitations.push(...signals.limitation_reason_codes.map((item) => `limitation_reason:${item}`)); if (signals.entity_leakage_detected) { limitations.push("entity_leakage_detected"); } if (problemFirstAnswerApplied === false) notes.push("problem_first_not_applied"); if (signals.problem_units_total === 0) notes.push("problem_units_missing"); if (signals.problem_unit_types.length > 0) notes.push(`problem_types:${signals.problem_unit_types.join(",")}`); if (signals.entity_leakage_detected) notes.push("entity_leakage"); if (signals.degraded_to === "clarification") notes.push("clarification_degraded"); diagnostics.push({ suite_case: suiteCase, session_id: sessionId, trace_id: finalResponse.debug?.trace_id ?? null, final_reply_type: finalResponse.reply_type, turn_count: suiteCase.turns.length, signature: [ finalResponse.reply_type, signals.problem_answer_mode ?? "unknown", signals.problem_unit_types.sort().join(","), signals.degraded_to ?? "none" ].join("|"), expected_problem_unit_types: expectedProblemUnitTypes, expected_problem_first: expectedProblemFirst, problem_unit_precision: problemUnitPrecision, problem_unit_recall_proxy: problemUnitRecallProxy, duplicate_collapse_rate: duplicateCollapseRate, mechanism_coherence_score: mechanismCoherenceScore, problem_clarity_score: problemClarityScore, problem_first_answer_applied: problemFirstAnswerApplied, entity_leakage: signals.entity_leakage_detected, signals, limitations: Array.from(new Set(limitations)), notes }); } const metrics = this.computeAssistantStage2Metrics({ diagnostics }); const caseRecords = diagnostics.map((item) => { const caseMetricVector = { problem_unit_precision: item.problem_unit_precision, problem_unit_recall_proxy: item.problem_unit_recall_proxy, duplicate_collapse_rate: item.duplicate_collapse_rate, mechanism_coherence_score: round2(item.mechanism_coherence_score), problem_clarity_score: round2(item.problem_clarity_score), problem_first_answer_rate: item.problem_first_answer_applied === null ? null : item.problem_first_answer_applied ? 1 : 0, entity_leakage_rate: item.entity_leakage ? 1 : 0 }; return { schema_version: stage2EvalContracts_1.ASSISTANT_STAGE2_EVAL_RECORD_SCHEMA_VERSION, created_at: new Date().toISOString(), case_id: item.suite_case.case_id, scenario_tag: item.suite_case.scenario_tag, session_id: item.session_id, trace_id: item.trace_id, question_type: item.suite_case.question_type, broadness_level: item.suite_case.broadness_level, expected_problem_unit_types: item.expected_problem_unit_types, expected_problem_first: item.expected_problem_first, problem_units_detected: item.signals.problem_units_total, candidate_evidence_detected: item.signals.candidate_evidence_total, duplicate_collapses_detected: item.signals.duplicate_collapses_total, metric_subscores: caseMetricVector, raw_signals: { final_reply_type: item.final_reply_type, turn_count: item.turn_count, broad_query_detected: item.signals.broad_query_detected, broad_result_flag: item.signals.broad_result_flag, narrowing_strength: item.signals.narrowing_strength, minimum_evidence_failed: item.signals.minimum_evidence_failed, degraded_to: item.signals.degraded_to, evidence_confidence: item.signals.evidence_confidence, limitation_reason_codes: item.signals.limitation_reason_codes, mechanism_status: item.signals.mechanism_status, source_refs: item.signals.source_refs, routes: item.signals.routes, followup_state_applied: item.signals.followup_state_applied, problem_units_total: item.signals.problem_units_total, candidate_evidence_total: item.signals.candidate_evidence_total, problem_unit_types: item.signals.problem_unit_types, duplicate_collapses_total: item.signals.duplicate_collapses_total, problem_centric_answer_applied: item.signals.problem_centric_answer_applied, problem_units_used_count: item.signals.problem_units_used_count, problem_answer_mode: item.signals.problem_answer_mode, problem_unit_ids_used: item.signals.problem_unit_ids_used, entity_leakage_detected: item.signals.entity_leakage_detected }, limitations: item.limitations, notes: item.notes }; }); const strongestSignals = Object.entries(metrics.rubric_bands) .filter(([, band]) => band?.score === 5) .map(([name]) => name); const weakestSignals = Object.entries(metrics.rubric_bands) .filter(([, band]) => band?.score === 0) .map(([name]) => name); const runTimestamp = new Date().toISOString(); const report = { schema_version: ASSISTANT_STAGE2_RUN_SCHEMA_VERSION, run_id: runId, run_timestamp: runTimestamp, eval_target: "assistant_stage2", mode: payload.mode, use_mock: Boolean(payload.useMock), prompt_version: payload.normalizeConfig.promptVersion ?? null, suite_id: suite.suite_id, suite_version: suite.suite_version, suite_schema_version: suite.schema_version ?? null, scenario_count: suite.scenario_count, case_ids: suiteCases.map((item) => item.case_id), cases_total: caseRecords.length, feature_profile_snapshot: buildFeatureProfileSnapshot(), code_version: buildCodeVersionMarker(), metrics: { raw: metrics.raw, denominators: metrics.denominators }, rubric_bands: metrics.rubric_bands, subsets: { expected_problem_cases_total: metrics.denominators.expected_problem_cases_total, followup_cases_total: metrics.denominators.followup_cases_total, candidate_cases_total: metrics.denominators.candidate_cases_total }, budget: { requests_total: requestsTotal }, results: caseRecords, scenario_summary: { improved_or_strong: caseRecords.filter((item) => { const clarity = Number(item.metric_subscores.problem_clarity_score ?? 0); const mechanism = Number(item.metric_subscores.mechanism_coherence_score ?? 0); return clarity >= 4 && mechanism >= 3; }).length, unchanged_or_mixed: caseRecords.filter((item) => { const clarity = Number(item.metric_subscores.problem_clarity_score ?? 0); return clarity >= 2.5 && clarity < 4; }).length, weak_or_regressed: caseRecords.filter((item) => Number(item.metric_subscores.problem_clarity_score ?? 0) < 2.5).length }, improvement_hints: { strongest_signals: strongestSignals.length > 0 ? strongestSignals.join(", ") : "none", weakest_signals: weakestSignals.length > 0 ? weakestSignals.join(", ") : "none" }, known_limitations: [ "Stage 2 eval remains heuristic and scoped to problem-unit baseline (no graph/lifecycle/investigation runtime scoring).", "problem_unit_recall_proxy uses suite expected types as lightweight proxy, not full ground-truth labeling.", "Comparison quality depends on stable feature profile and reproducible mock/runtime setup." ], report_title: "Assistant Stage 2 Eval Run" }; (0, files_1.ensureDir)(config_1.REPORTS_DIR); const runJsonPath = path_1.default.resolve(config_1.REPORTS_DIR, `${runId}.json`); const runMdPath = path_1.default.resolve(config_1.REPORTS_DIR, `${runId}.md`); const compactReport = compactAssistantStage2Report(report); const jsonWritten = tryWriteJsonFile(runJsonPath, compactReport); const mdWritten = tryWriteTextFile(runMdPath, buildAssistantStage2EvalMarkdownReport(compactReport)); const runReportRef = jsonWritten ? runJsonPath : putInMemoryEvalReport(compactReport); report.artifacts = { run_report_json_path: runReportRef, run_report_md_path: mdWritten ? runMdPath : null }; if (payload.compareWithReportFile) { report.comparison = this.buildAssistantStage2ComparisonReport({ currentReport: report, baselineReportFile: payload.compareWithReportFile }); } return report; } async runAssistantP0(payload) { if (!config_1.FEATURE_ASSISTANT_STAGE2_EVAL_V1) { throw new http_1.ApiError("ASSISTANT_P0_EVAL_DISABLED", "Assistant P0 eval target is disabled by FEATURE_ASSISTANT_STAGE2_EVAL_V1.", 409); } const runner = new p0_eval_runner_1.P0EvalRunner(this.normalizerService); return runner.run({ normalizeConfig: payload.normalizeConfig, caseIds: payload.caseIds, useMock: payload.useMock, mode: payload.mode, caseSetFile: payload.caseSetFile, compareWithReportFile: payload.compareWithReportFile }); } async run(payload) { const mode = payload.mode ?? "standard"; const evalTarget = payload.evalTarget ?? "normalizer"; if (evalTarget === "assistant_stage1") { return this.runAssistantStage1({ normalizeConfig: payload.normalizeConfig, caseIds: payload.caseIds, useMock: payload.useMock, mode, caseSetFile: payload.caseSetFile, compareWithReportFile: payload.compareWithReportFile, runId: payload.runId }); } if (evalTarget === "assistant_stage2") { return this.runAssistantStage2({ normalizeConfig: payload.normalizeConfig, caseIds: payload.caseIds, useMock: payload.useMock, mode, caseSetFile: payload.caseSetFile, compareWithReportFile: payload.compareWithReportFile, runId: payload.runId }); } if (evalTarget === "assistant_p0") { return this.runAssistantP0({ normalizeConfig: payload.normalizeConfig, caseIds: payload.caseIds, useMock: payload.useMock, mode, caseSetFile: payload.caseSetFile, compareWithReportFile: payload.compareWithReportFile }); } const promptVersion = String(payload.normalizeConfig.promptVersion ?? "").toLowerCase(); const schemaVersion = String(payload.normalizeConfig.schemaVersion ?? "").toLowerCase(); const isV2 = promptVersion.startsWith("normalizer_v2") || schemaVersion === "v2" || schemaVersion === "v2_0_1" || schemaVersion === "v2_0_2"; const inlineQuestions = payload.rawQuestions ? parseRawQuestions(payload.rawQuestions) : []; const inlineCases = inlineQuestions.map((question, index) => ({ case_id: formatCaseId("BQ", index), raw_question: question, expected: null })); if (isV2) { const sourceCases = inlineCases.length > 0 ? inlineCases : payload.caseSetFile ? parseCaseSetFile(payload.caseSetFile).map((item) => ({ case_id: item.case_id, raw_question: item.raw_question, expected: item.expected })) : this.listCases().map((item) => ({ case_id: item.case_id, raw_question: item.raw_question, expected: item.expected })); const filtered = sourceCases.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id)); return this.runV2({ ...payload, mode, cases: filtered }); } if (inlineCases.length > 0) { throw new Error("rawQuestions batch is supported for normalizer_v2 only."); } const casesSource = payload.caseSetFile ? parseCaseSetFile(payload.caseSetFile) : this.listCases(); const filteredCases = casesSource.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id)); const runId = `eval-${(0, nanoid_1.nanoid)(10)}`; const results = []; const mismatches = []; const badConfidenceCases = []; const classCounter = {}; let schemaPass = 0; let intentPass = 0; let routePass = 0; let causalPass = 0; let highConfidenceErrors = 0; let requestsTotal = 0; let retriesUsed = 0; for (const item of filteredCases) { const response = await this.normalizerService.normalize({ ...payload.normalizeConfig, userQuestion: item.raw_question, context: { expected_route: item.expected.route_hint, eval_label: runId, case_id: item.case_id, eval_mode: mode }, retryPolicy: mode === "single-pass-strict" ? "single-pass-strict" : "default", useMock: payload.useMock }); const normalized = response.normalized && response.normalized.schema_version === "normalized_query_v1" ? response.normalized : null; const intentMatch = Boolean(normalized && item.expected.intent_class === normalized.intent_class); const routeMatch = Boolean(normalized && item.expected.route_hint === normalized.route_hint); const causalMatch = Boolean(normalized && item.expected.requires && item.expected.requires.needs_cross_entity_join === normalized.requires.needs_cross_entity_join && item.expected.requires.needs_causal_chain === normalized.requires.needs_causal_chain); if (response.validation.passed) schemaPass += 1; if (intentMatch) intentPass += 1; if (routeMatch) routePass += 1; if (causalMatch || !item.expected.requires) causalPass += 1; const requestCount = Number(response.request_count_for_case ?? 0); requestsTotal += requestCount; if (requestCount > 1) { retriesUsed += 1; } const classKey = String(item.expected.intent_class ?? "unknown"); if (!classCounter[classKey]) { classCounter[classKey] = { total: 0, passed: 0 }; } classCounter[classKey].total += 1; if (intentMatch) { classCounter[classKey].passed += 1; } const confidenceOverall = normalized?.confidence.overall ?? null; const hasMismatch = !intentMatch || !routeMatch || (!causalMatch && Boolean(item.expected.requires)); if (confidenceOverall === "high" && hasMismatch) { highConfidenceErrors += 1; badConfidenceCases.push({ case_id: item.case_id, confidence_overall: confidenceOverall, intent_match: intentMatch, route_match: routeMatch, causal_match: causalMatch || !item.expected.requires, trace_id: response.trace_id }); } if (hasMismatch || !response.validation.passed) { mismatches.push({ case_id: item.case_id, expected_intent_class: item.expected.intent_class ?? null, actual_intent_class: normalized?.intent_class ?? null, expected_route_hint: item.expected.route_hint ?? null, actual_route_hint: normalized?.route_hint ?? null, expected_requires: item.expected.requires ?? null, actual_requires: normalized?.requires ?? null, comment: shortMismatchComment({ intentMatch, routeMatch, causalMatch: causalMatch || !item.expected.requires, validationPassed: response.validation.passed }), trace_id: response.trace_id }); } results.push({ case_id: item.case_id, raw_question: item.raw_question, validation_passed: response.validation.passed, intent_match: intentMatch, route_match: routeMatch, causal_flags_match: causalMatch || !item.expected.requires, expected_intent_class: item.expected.intent_class ?? null, actual_intent_class: normalized?.intent_class ?? null, expected_route_hint: item.expected.route_hint ?? null, actual_route_hint: normalized?.route_hint ?? null, expected_requires: item.expected.requires ?? null, actual_requires: normalized?.requires ?? null, confidence_overall: confidenceOverall, trace_id: response.trace_id, request_count_for_case: requestCount }); } const total = Math.max(1, filteredCases.length); const metrics = { schema_validation_pass_rate: Number(((schemaPass / total) * 100).toFixed(2)), intent_class_accuracy: Number(((intentPass / total) * 100).toFixed(2)), route_hint_accuracy: Number(((routePass / total) * 100).toFixed(2)), causal_flag_accuracy: Number(((causalPass / total) * 100).toFixed(2)), high_confidence_error_rate: Number(((highConfidenceErrors / total) * 100).toFixed(2)) }; const classAccuracy = Object.fromEntries(Object.entries(classCounter).map(([key, value]) => [ key, { total: value.total, passed: value.passed, accuracy_percent: Number(((value.passed / Math.max(1, value.total)) * 100).toFixed(2)) } ])); const baselineAsMap = BASELINE_METRICS; const baselineDelta = Object.fromEntries(Object.entries(metrics).map(([key, value]) => [key, Number((value - baselineAsMap[key]).toFixed(2))])); const report = { run_id: runId, timestamp: new Date().toISOString(), mode, use_mock: Boolean(payload.useMock), prompt_version: payload.normalizeConfig.promptVersion ?? null, dataset: { source: payload.caseSetFile ? "file" : "data/eval_cases/*.json", file: payload.caseSetFile ?? null }, cases_total: filteredCases.length, metrics, baseline_metrics: BASELINE_METRICS, baseline_delta: baselineDelta, class_accuracy: classAccuracy, budget: { requests_total: requestsTotal, retries_used: retriesUsed, guidance: { forensic_calls_max: 10, final_eval_calls_max: 30, target_total_calls_max: 40, hard_cap_calls_max: 45 } }, mismatches, bad_confidence_cases: badConfidenceCases, results }; (0, files_1.ensureDir)(config_1.EVAL_CASES_DIR); tryWriteJsonFile(path_1.default.resolve(config_1.EVAL_CASES_DIR, `${runId}.report.json`), report); const shouldWriteV11Artifacts = mode === "single-pass-strict" && Boolean(payload.caseSetFile) && path_1.default.basename(String(payload.caseSetFile)).toLowerCase() === "normalizer_eval_v1_1_30cases.json"; if (shouldWriteV11Artifacts) { (0, files_1.ensureDir)(config_1.REPORTS_DIR); tryWriteJsonFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_eval_v1_1_run.json"), report); tryWriteTextFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_eval_v1_1_run.md"), buildMarkdownReport({ ...report, report_title: "LLM Normalizer v1.1 Eval Run" })); } const shouldWriteV1121EvalArtifacts = mode === "single-pass-strict" && String(payload.normalizeConfig.promptVersion ?? "") === "normalizer_v1_1_2_1" && Boolean(payload.caseSetFile) && path_1.default.basename(String(payload.caseSetFile)).toLowerCase() === "normalizer_eval_v1_1_2_1_30cases.json"; if (shouldWriteV1121EvalArtifacts) { (0, files_1.ensureDir)(config_1.REPORTS_DIR); tryWriteJsonFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_1_eval.json"), report); tryWriteTextFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_1_eval.md"), buildMarkdownReport({ ...report, report_title: "LLM Normalizer v1.1.2.1 Eval Run" })); } const shouldWriteV111MicroArtifacts = mode === "single-pass-strict" && String(payload.normalizeConfig.promptVersion ?? "") === "normalizer_v1_1_1" && isSameCaseSet(payload.caseIds, V111_MICRO_CASE_IDS); if (shouldWriteV111MicroArtifacts) { (0, files_1.ensureDir)(config_1.REPORTS_DIR); tryWriteJsonFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_1_micro_eval.json"), report); tryWriteTextFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_1_micro_eval.md"), buildMarkdownReport({ ...report, report_title: "LLM Normalizer v1.1.1 Micro Eval" })); } const shouldWriteV112MicroArtifacts = mode === "single-pass-strict" && String(payload.normalizeConfig.promptVersion ?? "") === "normalizer_v1_1_2" && isSameCaseSet(payload.caseIds, V112_MICRO_CASE_IDS); if (shouldWriteV112MicroArtifacts) { (0, files_1.ensureDir)(config_1.REPORTS_DIR); tryWriteJsonFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_micro_eval.json"), report); tryWriteTextFile(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_micro_eval.md"), buildMarkdownReport({ ...report, report_title: "LLM Normalizer v1.1.2 Micro Eval" })); } return report; } } exports.EvalService = EvalService;