"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.EvalService = void 0; const fs_1 = __importDefault(require("fs")); const path_1 = __importDefault(require("path")); const nanoid_1 = require("nanoid"); const config_1 = require("../config"); const stage1Contracts_1 = require("../types/stage1Contracts"); const http_1 = require("../utils/http"); const assistantService_1 = require("./assistantService"); const assistantSessionStore_1 = require("./assistantSessionStore"); const files_1 = require("../utils/files"); const BASELINE_METRICS = { schema_validation_pass_rate: 100, intent_class_accuracy: 72.73, route_hint_accuracy: 90.91, causal_flag_accuracy: 81.82, high_confidence_error_rate: 9.09 }; const V111_MICRO_CASE_IDS = ["NQ-008", "V11-DD-005", "V11-OT-003", "V11-OT-004", "V11-OT-005"]; const V112_MICRO_CASE_IDS = ["NQ-002", "NQ-007", "V11-HA-004", "V11-OT-003", "V11-OT-005"]; function isSameCaseSet(input, target) { if (!input || input.length !== target.length) { return false; } const left = [...input].sort(); const right = [...target].sort(); return left.every((value, index) => value === right[index]); } function formatPercent(value) { return `${value.toFixed(2)}%`; } function shortMismatchComment(input) { if (!input.validationPassed) { return "Schema validation failed for this case."; } if (!input.intentMatch && input.routeMatch) { return "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket."; } if (input.intentMatch && !input.routeMatch) { return "Intent understood, but route_hint selected a weaker execution route."; } if (!input.intentMatch && !input.routeMatch) { return "Both intent and route misclassified; likely lexical ambiguity in causal vs risk wording."; } if (!input.causalMatch) { return "Causal flags are inconsistent with expected relationship depth."; } 
// Fall-through result of shortMismatchComment: none of the preceding checks applied.
return "No mismatch."; }
// Renders a normalizer eval report object as a Markdown document and returns it as a string.
// report: plain object; the fields read are report_title, run_id, timestamp, mode, use_mock,
// cases_total, prompt_version, metrics, baseline_metrics, baseline_delta, class_accuracy,
// mismatches, bad_confidence_cases and budget. Missing scalar fields fall back to the
// defaults spelled out below; missing objects fall back to {} and non-array lists to [].
function buildMarkdownReport(report) { const metrics = (report.metrics ?? {}); const baseline = (report.baseline_metrics ?? {}); const delta = (report.baseline_delta ?? {}); const classAccuracy = (report.class_accuracy ?? {}); const mismatches = Array.isArray(report.mismatches) ? report.mismatches : []; const badConfidenceCases = Array.isArray(report.bad_confidence_cases) ? report.bad_confidence_cases : []; const budget = (report.budget ?? {});
// One table row per key of report.metrics. Baseline and delta are looked up by the same key
// and default to 0. Current and baseline go through formatPercent; the delta is printed with
// two decimals and an explicit plus sign when positive, without a percent suffix.
const metricRows = Object.keys(metrics) .map((key) => { const current = Number(metrics[key] ?? 0); const base = Number(baseline[key] ?? 0); const d = Number(delta[key] ?? 0); const sign = d > 0 ? "+" : ""; return `| ${key} | ${formatPercent(current)} | ${formatPercent(base)} | ${sign}${d.toFixed(2)} |`; }) .join("\n");
// One table row per intent class.
// NOTE(review): row.accuracy_percent is passed to formatPercent unguarded, so an entry
// without a numeric accuracy_percent makes the whole render throw - confirm producers always set it.
const classRows = Object.keys(classAccuracy) .map((key) => { const row = classAccuracy[key]; return `| ${key} | ${row.passed}/${row.total} | ${formatPercent(row.accuracy_percent)} |`; }) .join("\n");
// Bullet list of mismatching cases, or a fixed placeholder sentence when there are none.
const mismatchRows = mismatches.length === 0 ? "No mismatches." : mismatches .map((item) => { const row = item; return `- ${row.case_id}: expected(${row.expected_intent_class} / ${row.expected_route_hint}) -> actual(${row.actual_intent_class} / ${row.actual_route_hint}). ${row.comment}`; }) .join("\n");
// Bullet list of bad-confidence cases, or a fixed placeholder sentence when there are none.
const badConfidenceRows = badConfidenceCases.length === 0 ? "No bad-confidence cases." : badConfidenceCases .map((item) => { const row = item; return `- ${row.case_id}: confidence=${row.confidence_overall}, intent_match=${row.intent_match}, route_match=${row.route_match}`; }) .join("\n");
// Assemble the document line by line; empty strings become blank lines after the join.
// Empty metric/class tables are replaced by a single n/a row.
return [ `# ${String(report.report_title ?? "LLM Normalizer Eval Run")}`, "", `- run_id: ${String(report.run_id ?? "")}`, `- timestamp: ${String(report.timestamp ?? "")}`, `- mode: ${String(report.mode ?? "")}`, `- use_mock: ${String(report.use_mock ?? false)}`, `- cases_total: ${String(report.cases_total ?? 0)}`, `- prompt_version: ${String(report.prompt_version ?? 
"")}`, "", "## Metrics vs Baseline", "", "| Metric | Current | Baseline | Delta |", "|---|---:|---:|---:|", metricRows || "| n/a | n/a | n/a | n/a |", "", "## Class Accuracy", "", "| Intent class | Passed/Total | Accuracy |", "|---|---:|---:|", classRows || "| n/a | n/a | n/a |", "", "## Budget", "", `- requests_total: ${String(budget.requests_total ?? 0)}`, `- retries_used: ${String(budget.retries_used ?? 0)}`, "", "## Mismatches", "", mismatchRows, "", "## Bad Confidence Cases", "", badConfidenceRows, "" ].join("\n"); }
// Loads a list of eval cases from a JSON file.
// inputPath: absolute path, or a path resolved against config_1.EVAL_DATASETS_DIR.
// The file is read synchronously as UTF-8 and a leading byte-order mark is removed before
// parsing. Accepts either a top-level array or an object with a cases array and returns
// that array. Throws Error for any other JSON shape; read and JSON.parse errors propagate.
function parseCaseSetFile(inputPath) { const filePath = path_1.default.isAbsolute(inputPath) ? inputPath : path_1.default.resolve(config_1.EVAL_DATASETS_DIR, inputPath); const raw = fs_1.default.readFileSync(filePath, "utf-8").replace(/^\uFEFF/, ""); const parsed = JSON.parse(raw); if (Array.isArray(parsed)) { return parsed; } if (parsed && typeof parsed === "object" && Array.isArray(parsed.cases)) { return parsed.cases; } throw new Error(`Unsupported eval dataset format: ${filePath}`); }
// Builds a case id from a prefix and a zero-based index: the index is shifted to one-based
// and left-padded with zeros to at least three characters (prefix RAW, index 0 gives RAW-001).
function formatCaseId(prefix, index) { return `${prefix}-${String(index + 1).padStart(3, "0")}`; }
// Splits a free-text blob into individual questions.
// rawQuestions: string; CRLF is normalized to LF and the text is trimmed first.
// Returns [] for empty text. Otherwise tries, in order, and returns the first split that
// yields more than one non-empty trimmed part: split on semicolons, split on blank lines.
// If neither applies, returns the non-empty trimmed lines.
// NOTE(review): the final fallback to [text] looks unreachable, because text is non-empty
// and trimmed at that point so the line split always yields at least one item - confirm.
function parseRawQuestions(rawQuestions) { const text = rawQuestions.replace(/\r\n/g, "\n").trim(); if (!text) { return []; } const bySemicolon = text .split(";") .map((item) => item.trim()) .filter(Boolean); if (bySemicolon.length > 1) { return bySemicolon; } const byBlankLine = text .split(/\n\s*\n+/) .map((item) => item.trim()) .filter(Boolean); if (byBlankLine.length > 1) { return byBlankLine; } const byLine = text .split("\n") .map((item) => item.trim()) .filter(Boolean); return byLine.length > 0 ? byLine : [text]; }
// Returns fragment.execution_readiness when that property exists on the fragment (own or
// inherited, per the in operator); otherwise the default string executable.
// Throws a TypeError when fragment is not an object, because in requires an object operand.
function executionReadinessOf(fragment) { return "execution_readiness" in fragment ? fragment.execution_readiness : "executable"; }
// Returns fragment.soft_assumption_used when the property exists, otherwise a new empty array.
function softAssumptionsOf(fragment) { return "soft_assumption_used" in fragment ? fragment.soft_assumption_used : []; }
// Returns fragment.route_status when the property exists, otherwise null.
function routeStatusOf(fragment) { return "route_status" in fragment ? 
fragment.route_status : null; }
// Returns fragment.no_route_reason when the property exists on the fragment, otherwise null.
function noRouteReasonOf(fragment) { return "no_route_reason" in fragment ? fragment.no_route_reason : null; }
// Derives the expected in-scope label of a case from its expectation record.
// Returns null when expected is falsy or carries no usable hint. Otherwise the first rule
// that applies wins:
//   1. an explicit boolean expected_scope_in_scope is returned as is;
//   2. expected_no_route_reason equal to out_of_scope gives false;
//   3. expected_route_status equal to routed gives true;
//   4. any boolean clarification_required (true or false) gives true.
function expectedScopeInScope(expected) { if (!expected) { return null; } if (typeof expected.expected_scope_in_scope === "boolean") { return expected.expected_scope_in_scope; } if (expected.expected_no_route_reason === "out_of_scope") { return false; } if (expected.expected_route_status === "routed") { return true; } if (typeof expected.clarification_required === "boolean") { return true; } return null; }
// Checks that one routing decision is internally coherent.
// For a no_route decision: a truthy no_route_reason is required and execution_readiness must
// not be executable or executable_with_soft_assumptions.
// For any other route: no_route_reason must be falsy and execution_readiness must not be
// needs_clarification or no_route.
// A missing execution_readiness is treated as the empty string, which passes both readiness checks.
function isDecisionStateConsistent(decision) { const readiness = String(decision.execution_readiness ?? ""); const noRouteReason = decision.no_route_reason ?? null; if (decision.route === "no_route") { if (!noRouteReason) { return false; } return readiness !== "executable" && readiness !== "executable_with_soft_assumptions"; } if (noRouteReason) { return false; } return readiness !== "needs_clarification" && readiness !== "no_route"; }
// Default suite file name used by parseAssistantSuiteFile when no path is given, and the
// schema_version markers for assistant eval artifacts (the comparison one is written into
// comparison reports by buildAssistantComparisonReport; the run one has no visible use here).
const DEFAULT_ASSISTANT_STAGE1_SUITE_FILE = "assistant_stage1_canonical_v0_1.json"; const ASSISTANT_STAGE1_RUN_SCHEMA_VERSION = "assistant_stage1_eval_run_v0_1"; const ASSISTANT_STAGE1_COMPARISON_SCHEMA_VERSION = "assistant_stage1_eval_comparison_v0_1";
// Rounds a number to two decimals via toFixed and converts the result back to a number.
function round2(value) { return Number(value.toFixed(2)); }
// Clamps value into the closed range [min, max] (defaults 0 and 5); NaN maps to min.
function clampScore(value, min = 0, max = 5) { if (Number.isNaN(value)) { return min; } if (value < min) return min; if (value > max) return max; return value; }
// Maps a raw metric value onto one of three band scores: 5, 3 or 0.
//   retrieval_differentiation_rate (higher is better): at least 0.75 gives 5, at least 0.45 gives 3.
//   generic_explanation_rate, false_confidence_rate, broad_answer_rate (lower is better):
//     at most 0.25 gives 5, at most 0.45 gives 3.
//   the three score metrics (higher is better): at least 4 gives 5, at least 2.5 gives 3.
// Unknown metric names always yield 0.
function rateToBandScore(metric, value) { if (metric === "retrieval_differentiation_rate") { if (value >= 0.75) return 5; if (value >= 0.45) return 3; return 0; } if (metric === "generic_explanation_rate" || metric === "false_confidence_rate" || metric === "broad_answer_rate") { if (value <= 0.25) return 5; if (value <= 0.45) return 3; return 0; } if (metric === "accountant_actionability_score" || metric === "mechanism_specificity_score" || metric === "followup_context_retention_score") { if (value >= 4) return 5; if (value 
>= 2.5) return 3; return 0; } return 0; }
// Looks up the rubric entry that corresponds to a raw metric value.
// Returns null when value is null. Otherwise converts the value to a band score with
// rateToBandScore and returns the entry of ACCOUNTANT_SCORING_RUBRIC_V01[metric] whose score
// field equals it, or null when no entry matches.
// Throws a TypeError when the rubric has no array for the given metric name.
function rubricBandForMetric(metric, value) { if (value === null) { return null; } const score = rateToBandScore(metric, value); return stage1Contracts_1.ACCOUNTANT_SCORING_RUBRIC_V01[metric].find((item) => item.score === score) ?? null; }
// Captures the feature-flag state at call time.
// The first flag is taken from config; the second from the environment with a stringified
// config value as fallback; the remaining five come straight from the environment and are
// null when unset. Value types therefore differ between entries (config value vs string vs null).
function buildFeatureProfileSnapshot() { return { FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1: config_1.FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1, FEATURE_ASSISTANT_ANSWER_POLICY_V11: process.env.FEATURE_ASSISTANT_ANSWER_POLICY_V11 ?? String(config_1.FEATURE_ASSISTANT_ANSWER_POLICY_V11), FEATURE_ASSISTANT_BROAD_GUARD_V1: process.env.FEATURE_ASSISTANT_BROAD_GUARD_V1 ?? null, FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1: process.env.FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1 ?? null, FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1: process.env.FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1 ?? null, FEATURE_ASSISTANT_INVESTIGATION_STATE_V1: process.env.FEATURE_ASSISTANT_INVESTIGATION_STATE_V1 ?? null, FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1: process.env.FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1 ?? null }; }
// Returns { git_commit, build_marker } read from environment variables.
// git_commit: first defined of GIT_COMMIT, CI_COMMIT_SHA, VERCEL_GIT_COMMIT_SHA, GITHUB_SHA.
// build_marker: first defined of BUILD_MARKER, BUILD_ID, npm_package_version.
// Either field is null when none of its variables is set.
function buildCodeVersionMarker() { return { git_commit: process.env.GIT_COMMIT ?? process.env.CI_COMMIT_SHA ?? process.env.VERCEL_GIT_COMMIT_SHA ?? process.env.GITHUB_SHA ?? null, build_marker: process.env.BUILD_MARKER ?? process.env.BUILD_ID ?? process.env.npm_package_version ?? null }; }
// Resolves a possibly relative file name to a path to read from.
// Absolute paths are returned unchanged. Relative paths are tried, in order, against
// config_1.REPORTS_DIR, config_1.EVAL_DATASETS_DIR, config_1.EVAL_CASES_DIR and finally the
// process working directory (single-argument path.resolve); the first candidate that exists wins.
// When none exists the REPORTS_DIR candidate is returned anyway, so the caller's read will
// fail with an error naming that path.
function resolveReadablePath(inputPath) { if (path_1.default.isAbsolute(inputPath)) { return inputPath; } const candidates = [ path_1.default.resolve(config_1.REPORTS_DIR, inputPath), path_1.default.resolve(config_1.EVAL_DATASETS_DIR, inputPath), path_1.default.resolve(config_1.EVAL_CASES_DIR, inputPath), path_1.default.resolve(inputPath) ]; for (const candidate of candidates) { if (fs_1.default.existsSync(candidate)) { return candidate; } } return candidates[0]; }
// Loads and validates an assistant suite file.
// inputPath: optional; a nullish value selects DEFAULT_ASSISTANT_STAGE1_SUITE_FILE. The name
// is resolved with resolveReadablePath.
function parseAssistantSuiteFile(inputPath) { const filePath = resolveReadablePath(inputPath ?? 
DEFAULT_ASSISTANT_STAGE1_SUITE_FILE);
// Read as UTF-8, drop a leading byte-order mark, parse as JSON.
const raw = fs_1.default.readFileSync(filePath, "utf-8").replace(/^\uFEFF/, ""); const parsed = JSON.parse(raw);
// Structural validation; each failure throws an Error that names the file:
// root must be an object, cases and case_ids must be arrays, suite_id and suite_version must
// be non-blank strings, and scenario_count must equal the number of cases.
if (!parsed || typeof parsed !== "object") { throw new Error(`Invalid assistant suite format: ${filePath}`); } if (!Array.isArray(parsed.cases)) { throw new Error(`Assistant suite cases[] is required: ${filePath}`); } if (!Array.isArray(parsed.case_ids)) { throw new Error(`Assistant suite case_ids[] is required: ${filePath}`); } if (typeof parsed.suite_id !== "string" || !parsed.suite_id.trim()) { throw new Error(`Assistant suite suite_id is required: ${filePath}`); } if (typeof parsed.suite_version !== "string" || !parsed.suite_version.trim()) { throw new Error(`Assistant suite suite_version is required: ${filePath}`); } if (parsed.scenario_count !== parsed.cases.length) { throw new Error(`Assistant suite scenario_count mismatch: ${filePath}`); }
// The declared id list must equal the ids found on the cases, compared as sorted copies
// (the parsed arrays themselves are not reordered).
const declaredIds = [...parsed.case_ids].sort(); const actualIds = parsed.cases.map((item) => item.case_id).sort(); const idsMatch = declaredIds.length === actualIds.length && declaredIds.every((item, index) => item === actualIds[index]); if (!idsMatch) { throw new Error(`Assistant suite case_ids do not match cases[]: ${filePath}`); }
// Every case needs a non-empty turns array. Returns the parsed suite object unchanged.
for (const item of parsed.cases) { if (!Array.isArray(item.turns) || item.turns.length === 0) { throw new Error(`Assistant suite case ${item.case_id} must include at least one turn.`); } } return parsed; }
// Heuristic: does the text carry at least two of three kinds of accounting anchors?
//   period  - a token 20NN, optionally followed by one of - . / and a month 01..12;
//   object  - any of a fixed list of Russian/English accounting keywords, case-insensitive;
//   account - a standalone two-digit code from a fixed list.
// Nullish input is treated as an empty string; blank text returns false.
// NOTE(review): the keyword pattern has no word boundaries, so its two-letter Cyrillic
// alternative also matches inside longer words - confirm this breadth is intended.
// NOTE(review): a bare period such as 2024-10 satisfies the period check and, because 10 is
// in the account-code list and the separator is a non-word character, the account check as
// well, so it alone reaches two hits - confirm.
function hasDomainAnchors(text) { const source = String(text ?? 
""); if (!source.trim()) { return false; } const hasPeriod = /\b20\d{2}(?:[-./](?:0[1-9]|1[0-2]))?\b/.test(source); const hasAccountingObject = /(счет|контрагент|документ|ндс|ос|period|account|supplier|invoice|guid|объект)/i.test(source); const hasAccountCode = /\b(?:01|02|03|04|08|10|19|20|25|26|41|43|44|50|51|52|57|60|62|68|69|70|71|73|76|90|91|94|97)\b/.test(source); const hits = [hasPeriod, hasAccountingObject, hasAccountCode].filter(Boolean).length; return hits >= 2; }
// Normalizes an unknown value into a list of non-empty trimmed strings.
// Non-arrays give []; non-string items and strings that are blank after trimming are dropped.
function extractTextList(value) { if (!Array.isArray(value)) { return []; } return value .map((item) => (typeof item === "string" ? item.trim() : "")) .filter(Boolean); }
// Passes through weak, medium or strong; any other value becomes null.
function toNarrowingStrength(value) { if (value === "weak" || value === "medium" || value === "strong") { return value; } return null; }
// Passes through partial or clarification; any other value becomes null.
function toDegradedTo(value) { if (value === "partial" || value === "clarification") { return value; } return null; }
// Renders an assistant eval run report as a Markdown string.
// report: plain object; reads report_title, run_id, eval_target, run_timestamp, suite_id,
// suite_version, cases_total, metrics.raw, rubric_bands, subsets, scenario_summary and
// improvement_hints. Missing sections fall back to {} and missing scalars to the defaults below.
function buildAssistantEvalMarkdownReport(report) { const metrics = (report.metrics ?? {}).raw ?? {}; const bands = (report.rubric_bands ?? {}); const subsets = (report.subsets ?? {}); const scenarioSummary = (report.scenario_summary ?? {}); const improvementHints = (report.improvement_hints ?? {});
// One table row per raw metric: nullish raw values print as n/a, and the band prints as
// score (label) or n/a when there is no band for that key.
const rows = Object.keys(metrics) .map((key) => { const rawValue = metrics[key]; const band = bands[key]; const rawPrintable = rawValue === null || rawValue === undefined ? "n/a" : String(rawValue); const bandPrintable = band ? `${String(band.score)} (${String(band.label)})` : "n/a"; return `| ${key} | ${rawPrintable} | ${bandPrintable} |`; }) .join("\n");
// Assemble the document; empty strings become blank lines after the join.
return [ `# ${String(report.report_title ?? "Assistant Stage 1 Eval Run")}`, "", `- run_id: ${String(report.run_id ?? "")}`, `- eval_target: ${String(report.eval_target ?? "")}`, `- run_timestamp: ${String(report.run_timestamp ?? "")}`, `- suite_id: ${String(report.suite_id ?? "")}`, `- suite_version: ${String(report.suite_version ?? "")}`, `- cases_total: ${String(report.cases_total ?? 
0)}`, "", "## Raw Metrics and Rubric Bands", "", "| Metric | Raw | Rubric band |", "|---|---:|---|", rows || "| n/a | n/a | n/a |", "", "## Subsets", "", `- broad_cases_total: ${String(subsets.broad_cases_total ?? 0)}`, `- followup_cases_total: ${String(subsets.followup_cases_total ?? 0)}`, "", "## Scenario Summary", "", `- improved_or_strong: ${String(scenarioSummary.improved_or_strong ?? 0)}`, `- unchanged_or_mixed: ${String(scenarioSummary.unchanged_or_mixed ?? 0)}`, `- weak_or_regressed: ${String(scenarioSummary.weak_or_regressed ?? 0)}`, "", "## Improvement Hints", "", `- strongest_signals: ${String(improvementHints.strongest_signals ?? "n/a")}`, `- weakest_signals: ${String(improvementHints.weakest_signals ?? "n/a")}`, "" ].join("\n"); }
// Renders a baseline-vs-current comparison report as a Markdown string.
// report: plain object; reads report_title, comparison_id, baseline_run_id, current_run_id,
// suite_version, metric_deltas (one { baseline, current, delta, trend } record per metric)
// and scenario_notes_summary. Nullish record fields print as n/a.
function buildAssistantComparisonMarkdownReport(report) { const metrics = (report.metric_deltas ?? {}); const summary = (report.scenario_notes_summary ?? {}); const rows = Object.keys(metrics) .map((key) => { const row = metrics[key]; return `| ${key} | ${String(row.baseline ?? "n/a")} | ${String(row.current ?? "n/a")} | ${String(row.delta ?? "n/a")} | ${String(row.trend ?? "n/a")} |`; }) .join("\n"); return [ `# ${String(report.report_title ?? "Assistant Stage 1 Baseline vs Current")}`, "", `- comparison_id: ${String(report.comparison_id ?? "")}`, `- baseline_run_id: ${String(report.baseline_run_id ?? "")}`, `- current_run_id: ${String(report.current_run_id ?? "")}`, `- suite_version: ${String(report.suite_version ?? "")}`, "", "## Metric Deltas", "", "| Metric | Baseline | Current | Delta | Trend |", "|---|---:|---:|---:|---|", rows || "| n/a | n/a | n/a | n/a | n/a |", "", "## Scenario Notes Summary", "", `- improved: ${String(summary.improved ?? 0)}`, `- unchanged: ${String(summary.unchanged ?? 0)}`, `- weakened: ${String(summary.weakened ?? 
0)}`, "" ].join("\n"); }
// Service that runs eval suites and writes their reports to disk.
// It holds one collaborator, normalizerService, whose normalize(...) method is awaited once
// per case by runV2 and which is also handed to the AssistantService built further down.
class EvalService { normalizerService; constructor(normalizerService) { this.normalizerService = normalizerService; }
// Reads every eval case stored as a .json file directly inside config_1.EVAL_CASES_DIR
// (the directory is ensured first via files_1.ensureDir) and returns the parsed objects
// sorted by case_id using localeCompare.
// Files ending in .report.json are skipped; runV2 writes its reports into the same directory.
// NOTE(review): unlike parseCaseSetFile no byte-order mark is stripped before JSON.parse,
// and a case without a string case_id makes the sort throw - confirm inputs are controlled.
listCases() { (0, files_1.ensureDir)(config_1.EVAL_CASES_DIR); const files = fs_1.default .readdirSync(config_1.EVAL_CASES_DIR) .filter((item) => item.endsWith(".json") && !item.endsWith(".report.json")); return files .map((name) => { const raw = fs_1.default.readFileSync(path_1.default.resolve(config_1.EVAL_CASES_DIR, name), "utf-8"); return JSON.parse(raw); }) .sort((a, b) => a.case_id.localeCompare(b.case_id)); }
// Runs the normalizer over payload.cases one case at a time (each call is awaited before
// the next starts), aggregates scope / clarification / routing / consistency counters,
// derives percentage metrics, writes the report as JSON into config_1.EVAL_CASES_DIR and
// returns the report object.
// payload fields read: cases (items with case_id, raw_question and optional expected),
// normalizeConfig, mode, useMock, rawQuestions, caseSetFile.
// An error thrown by normalize for any case aborts the whole run; nothing is written then.
async runV2(payload) { const runId = `eval-${(0, nanoid_1.nanoid)(10)}`; const results = []; const routeCounter = {}; const fallbackCounter = {};
// Run-wide counters, filled inside the per-case loop below.
let schemaPass = 0; let inScopeMessages = 0; let multiIntentMessages = 0; let clarificationMessages = 0; let totalFragments = 0; let inScopeFragments = 0; let outOfScopeFragments = 0; let unclearFragments = 0; let executableWithSoftAssumptionsFragments = 0; let softAssumptionFragments = 0; let routedFragments = 0; let noRouteFragments = 0; let requestsTotal = 0; let retriesUsed = 0; let clarificationLabeledCases = 0; let clarificationTruePositive = 0; let clarificationFalsePositive = 0; let clarificationFalseNegative = 0; let scopeLabeledCases = 0; let scopeCorrectCases = 0; let routeLabeledCases = 0; let routeCorrectCases = 0; let expectedRoutedCases = 0; let noRouteTruePositive = 0; let noRouteFalsePositive = 0; let stateConsistencyChecks = 0; let stateConsistencyPass = 0;
// The caller's normalizeConfig is spread first, so the question, context, retry policy and
// mock flag set here override same-named keys in it.
for (const item of payload.cases) { const response = await this.normalizerService.normalize({ ...payload.normalizeConfig, userQuestion: item.raw_question, context: { eval_label: runId, case_id: item.case_id, eval_mode: payload.mode }, retryPolicy: payload.mode === "single-pass-strict" ? "single-pass-strict" : "default", useMock: payload.useMock }); if (response.validation.passed) { schemaPass += 1; } const requestCount = Number(response.request_count_for_case ?? 
0); requestsTotal += requestCount;
// NOTE(review): this adds one per case that needed more than one request, not the number
// of extra requests, so retries_used is really a count of retried cases - confirm intent.
if (requestCount > 1) { retriesUsed += 1; }
// Only responses carrying one of the three known v2 schema versions are analysed further;
// anything else is treated as if no normalized payload had been returned.
const normalized = response.normalized && ["normalized_query_v2", "normalized_query_v2_0_1", "normalized_query_v2_0_2"].includes(String(response.normalized.schema_version ?? "")) ? response.normalized : null;
// Likewise only route summaries produced in deterministic_v2 mode are considered.
const routeSummary = response.route_hint_summary && response.route_hint_summary.mode === "deterministic_v2" ? response.route_hint_summary : null;
// Message-level and fragment-level tallies. The two soft-assumption counters look only at
// fragments whose domain_relevance is in_scope.
if (normalized) { if (normalized.message_in_scope) { inScopeMessages += 1; } if (normalized.contains_multiple_tasks) { multiIntentMessages += 1; } if (normalized.global_notes.needs_clarification) { clarificationMessages += 1; } totalFragments += normalized.fragments.length; const inScopeList = normalized.fragments.filter((fragment) => fragment.domain_relevance === "in_scope"); inScopeFragments += inScopeList.length; outOfScopeFragments += normalized.fragments.filter((fragment) => fragment.domain_relevance === "out_of_scope").length; unclearFragments += normalized.fragments.filter((fragment) => fragment.domain_relevance === "unclear").length; for (const fragment of inScopeList) { const readiness = executionReadinessOf(fragment); if (readiness === "executable_with_soft_assumptions") { executableWithSoftAssumptionsFragments += 1; } if (softAssumptionsOf(fragment).length > 0) { softAssumptionFragments += 1; } } }
// Clarification confusion counts, taken only for cases that carry a boolean label.
// A case with no usable normalized payload counts as predicting no clarification.
const predictedClarification = Boolean(normalized?.global_notes?.needs_clarification); const expectedClarification = typeof item.expected?.clarification_required === "boolean" ? item.expected.clarification_required : null; if (expectedClarification !== null) { clarificationLabeledCases += 1; if (predictedClarification && expectedClarification) clarificationTruePositive += 1; if (predictedClarification && !expectedClarification) clarificationFalsePositive += 1; if (!predictedClarification && expectedClarification) clarificationFalseNegative += 1; } const predictedScope = normalized ? 
normalized.message_in_scope : null;
// Scope accuracy is scored only when both a label and a prediction exist, so cases without
// a usable normalized payload drop out of the denominator instead of counting as wrong.
const expectedScope = expectedScopeInScope(item.expected); if (expectedScope !== null && predictedScope !== null) { scopeLabeledCases += 1; if (predictedScope === expectedScope) { scopeCorrectCases += 1; } }
// A case counts as routed when at least one decision has a route other than no_route.
// A summary with an empty decisions list therefore predicts no_route.
const predictedRouteStatus = routeSummary ? routeSummary.decisions.some((decision) => decision.route !== "no_route") ? "routed" : "no_route" : null;
// When every decision is no_route, the reason of the first decision represents the case.
const predictedNoRouteReason = routeSummary && routeSummary.decisions.length > 0 && routeSummary.decisions.every((decision) => decision.route === "no_route") ? (routeSummary.decisions[0]?.no_route_reason ?? null) : null; const expectedRouteStatus = item.expected?.expected_route_status ?? null; const expectedNoRouteReason = item.expected?.expected_no_route_reason ?? null;
// Route accuracy: labeled cases only; a null prediction can never equal the label.
if (expectedRouteStatus) { routeLabeledCases += 1; if (predictedRouteStatus === expectedRouteStatus) { routeCorrectCases += 1; } if (expectedRouteStatus === "routed") { expectedRoutedCases += 1; } }
// no_route confusion counts. A predicted no_route is a true positive when the label is
// no_route and the labeled reason is absent or equal to the predicted one; a differing
// reason, or a label of routed, is a false positive. Unlabeled cases are not counted.
// NOTE(review): reason mismatches on expected-no_route cases land in the same false-positive
// counter that is later divided by expectedRoutedCases for false_no_route_rate - confirm.
if (predictedRouteStatus === "no_route") { if (expectedRouteStatus === "no_route") { if (!expectedNoRouteReason || expectedNoRouteReason === predictedNoRouteReason) { noRouteTruePositive += 1; } else { noRouteFalsePositive += 1; } } else if (expectedRouteStatus === "routed") { noRouteFalsePositive += 1; } }
// Per-decision tallies: state consistency, route histogram, routed vs no_route counts.
// The fallback histogram gets exactly one entry per case, with none as the default bucket.
if (routeSummary) { for (const decision of routeSummary.decisions) { stateConsistencyChecks += 1; if (isDecisionStateConsistent(decision)) { stateConsistencyPass += 1; } routeCounter[decision.route] = (routeCounter[decision.route] ?? 0) + 1; if (decision.route === "no_route") { noRouteFragments += 1; } else { routedFragments += 1; } } const fallbackType = String(routeSummary.fallback?.type ?? "none"); fallbackCounter[fallbackType] = (fallbackCounter[fallbackType] ?? 0) + 1; } else { fallbackCounter.none = (fallbackCounter.none ?? 0) + 1; }
// Per-case result row stored in the report.
results.push({ case_id: item.case_id, raw_question: item.raw_question, validation_passed: response.validation.passed, message_in_scope: normalized?.message_in_scope ?? 
null, scope_confidence: normalized?.scope_confidence ?? null, contains_multiple_tasks: normalized?.contains_multiple_tasks ?? null, fragments_total: normalized?.fragments.length ?? 0, in_scope_fragments: normalized ? normalized.fragments.filter((fragment) => fragment.domain_relevance === "in_scope").length : 0, out_of_scope_fragments: normalized ? normalized.fragments.filter((fragment) => fragment.domain_relevance === "out_of_scope").length : 0, unclear_fragments: normalized ? normalized.fragments.filter((fragment) => fragment.domain_relevance === "unclear").length : 0, fallback_type: routeSummary?.fallback?.type ?? "none", predicted_route_status: predictedRouteStatus, expected_route_status: expectedRouteStatus, predicted_no_route_reason: predictedNoRouteReason, expected_no_route_reason: expectedNoRouteReason, predicted_clarification_required: predictedClarification, expected_clarification_required: expectedClarification,
// NOTE(review): this per-row count scans all fragments, while the run-wide counter of the
// same name only scans in_scope fragments - the two can disagree; confirm which is intended.
executable_with_soft_assumptions_fragments: normalized ? normalized.fragments.filter((fragment) => executionReadinessOf(fragment) === "executable_with_soft_assumptions") .length : 0, trace_id: response.trace_id, request_count_for_case: requestCount }); }
// Denominators. The ones clamped with Math.max(1, ...) make an empty run report 0 rather
// than NaN; the unclamped ones are checked for zero where they are used and yield null.
const total = Math.max(1, payload.cases.length); const totalFragmentsSafe = Math.max(1, totalFragments); const totalRoutedDecisions = Math.max(1, routedFragments + noRouteFragments); const precisionDenominator = clarificationTruePositive + clarificationFalsePositive; const recallDenominator = clarificationTruePositive + clarificationFalseNegative; const noRoutePrecisionDenominator = noRouteTruePositive + noRouteFalsePositive;
// All rates below are percentages rounded to two decimals (avg_fragments_per_message is a
// plain average). Metrics that depend on labels are null when no labeled case exists.
const metrics = { schema_validation_pass_rate: Number(((schemaPass / total) * 100).toFixed(2)), scope_detection_accuracy: scopeLabeledCases > 0 ? 
Number(((scopeCorrectCases / scopeLabeledCases) * 100).toFixed(2)) : null,
// Message-level rates over all cases of the run.
scope_in_scope_rate: Number(((inScopeMessages / total) * 100).toFixed(2)), multi_intent_detected_rate: Number(((multiIntentMessages / total) * 100).toFixed(2)), clarification_required_rate: Number(((clarificationMessages / total) * 100).toFixed(2)), avg_fragments_per_message: Number((totalFragments / total).toFixed(2)),
// Fragment-level and decision-level rates.
out_of_scope_fragment_rate: Number(((outOfScopeFragments / totalFragmentsSafe) * 100).toFixed(2)), routed_fragment_rate: Number(((routedFragments / totalRoutedDecisions) * 100).toFixed(2)), no_route_fragment_rate: Number(((noRouteFragments / totalRoutedDecisions) * 100).toFixed(2)),
// Label-dependent routing metrics; null when the respective denominator is zero.
route_resolution_accuracy: routeLabeledCases > 0 ? Number(((routeCorrectCases / routeLabeledCases) * 100).toFixed(2)) : null, no_route_precision: noRoutePrecisionDenominator > 0 ? Number(((noRouteTruePositive / noRoutePrecisionDenominator) * 100).toFixed(2)) : null,
// NOTE(review): the numerator counts every no_route false positive, including reason
// mismatches on cases labeled no_route, while the denominator counts only cases labeled
// routed, so this value can exceed what the name suggests - confirm.
false_no_route_rate: expectedRoutedCases > 0 ? Number(((noRouteFalsePositive / expectedRoutedCases) * 100).toFixed(2)) : null, execution_state_consistency_rate: stateConsistencyChecks > 0 ? Number(((stateConsistencyPass / stateConsistencyChecks) * 100).toFixed(2)) : null,
// Soft-assumption rates are relative to in_scope fragments (denominator clamped to 1).
executable_with_soft_assumptions_rate: Number(((executableWithSoftAssumptionsFragments / Math.max(1, inScopeFragments)) * 100).toFixed(2)), soft_assumption_used_fragment_rate: Number(((softAssumptionFragments / Math.max(1, inScopeFragments)) * 100).toFixed(2)),
// Clarification precision / recall / false-positive share over labeled cases.
clarification_precision: precisionDenominator > 0 ? Number(((clarificationTruePositive / precisionDenominator) * 100).toFixed(2)) : null, clarification_recall: recallDenominator > 0 ? Number(((clarificationTruePositive / recallDenominator) * 100).toFixed(2)) : null, false_clarification_rate: clarificationLabeledCases > 0 ? 
Number(((clarificationFalsePositive / clarificationLabeledCases) * 100).toFixed(2)) : null };
// Report envelope. schema_version is derived by substring search in the lower-cased
// schemaVersion (or, failing that, promptVersion) of the normalize config: v2_0_2 is
// checked first, then v2_0_1, otherwise plain v2. dataset.source records where the cases
// came from: inline raw questions win over a case-set file, which wins over the default
// directory glob string.
const report = { run_id: runId, timestamp: new Date().toISOString(), mode: payload.mode, use_mock: Boolean(payload.useMock), prompt_version: payload.normalizeConfig.promptVersion ?? null, schema_version: String(payload.normalizeConfig.schemaVersion ?? payload.normalizeConfig.promptVersion ?? "") .toLowerCase() .includes("v2_0_2") ? "v2_0_2" : String(payload.normalizeConfig.schemaVersion ?? payload.normalizeConfig.promptVersion ?? "") .toLowerCase() .includes("v2_0_1") ? "v2_0_1" : "v2", dataset: { source: payload.rawQuestions ? "inline_raw_questions" : payload.caseSetFile ? "file" : "data/eval_cases/*.json", file: payload.caseSetFile ?? null, raw_questions_count: payload.rawQuestions ? parseRawQuestions(payload.rawQuestions).length : null }, cases_total: payload.cases.length, metrics, budget: { requests_total: requestsTotal, retries_used: retriesUsed }, clarification_eval: { labeled_cases: clarificationLabeledCases, true_positive: clarificationTruePositive, false_positive: clarificationFalsePositive, false_negative: clarificationFalseNegative }, route_eval: { labeled_cases: routeLabeledCases, correct_cases: routeCorrectCases, expected_routed_cases: expectedRoutedCases, no_route_true_positive: noRouteTruePositive, no_route_false_positive: noRouteFalsePositive }, scope_eval: { labeled_cases: scopeLabeledCases, correct_cases: scopeCorrectCases }, execution_state_eval: { checks_total: stateConsistencyChecks, checks_passed: stateConsistencyPass }, route_distribution: routeCounter, fallback_distribution: fallbackCounter, results };
// Persist next to the case files as <runId>.report.json, then hand the report to the caller.
(0, files_1.ensureDir)(config_1.EVAL_CASES_DIR); (0, files_1.writeJsonFile)(path_1.default.resolve(config_1.EVAL_CASES_DIR, `${runId}.report.json`), report); return report; }
// Condenses the debug payload of an assistant conversation into a flat signal record.
// finalResponse: response object whose debug.retrieval_results and debug.answer_structure_v11
// are inspected; a missing debug or a non-array retrieval_results is treated as no results.
// turnResponses: array of per-turn responses, used only to detect follow-up state usage.
// Returns an object with broad-query flags, narrowing strength, evidence gate / degradation
// state, an aggregated evidence confidence, de-duplicated limitation codes, source refs and
// routes, the mechanism status, and the number of uncertainty limitations.
collectAssistantSignals(finalResponse, turnResponses) { const debug = finalResponse.debug; const retrievalResults = Array.isArray(debug?.retrieval_results) ? 
debug.retrieval_results : [];
// Accumulators. Sets de-duplicate while keeping first-seen order; confidenceScores collects
// 3 / 2 / 1 for high / medium / low and is averaged after the loop.
const sourceRefSet = new Set(); const limitationCodeSet = new Set(); const routeSet = new Set(); const confidenceScores = []; const narrowingOrder = { weak: 0, medium: 1, strong: 2 }; let broadQueryDetected = false; let broadResultFlag = false; let minimumEvidenceFailed = false; let degradedTo = null; let narrowingStrength = null;
// Each boolean flag becomes true as soon as any retrieval result reports it as exactly true.
for (const result of retrievalResults) { routeSet.add(String(result.route ?? "unknown")); const summary = result.summary ?? {}; if (summary.broad_query_detected === true) broadQueryDetected = true; if (summary.broad_result_flag === true) broadResultFlag = true; if (summary.minimum_evidence_failed === true) minimumEvidenceFailed = true;
// clarification overrides partial: once any result degraded to clarification that value
// sticks, and partial is recorded only while nothing has been recorded yet.
const degraded = toDegradedTo(summary.degraded_to); if (degraded === "clarification") { degradedTo = "clarification"; } else if (!degradedTo && degraded === "partial") { degradedTo = "partial"; }
// Keeps the lowest-ranked strength seen across results (weak below medium below strong).
// NOTE(review): the comparison selects the weakest narrowing, not the strongest - confirm
// that worst-case aggregation is what the downstream scoring expects.
const narrowed = toNarrowingStrength(summary.narrowing_strength); if (narrowed && (!narrowingStrength || narrowingOrder[narrowed] < narrowingOrder[narrowingStrength])) { narrowingStrength = narrowed; }
// Result-level confidence contributes one score; unrecognised values contribute nothing.
if (result.confidence === "high") confidenceScores.push(3); if (result.confidence === "medium") confidenceScores.push(2); if (result.confidence === "low") confidenceScores.push(1);
// Every evidence item contributes its canonical source ref, its limitation reason code
// (both trimmed, blanks ignored) and one more confidence score of its own.
for (const evidence of Array.isArray(result.evidence) ? result.evidence : []) { const canonicalRef = String(evidence.source_ref?.canonical_ref ?? "").trim(); if (canonicalRef) { sourceRefSet.add(canonicalRef); } const reasonCode = String(evidence.limitation?.reason_code ?? "").trim(); if (reasonCode) { limitationCodeSet.add(reasonCode); } if (evidence.confidence === "high") confidenceScores.push(3); if (evidence.confidence === "medium") confidenceScores.push(2); if (evidence.confidence === "low") confidenceScores.push(1); } }
// Mean of all collected scores, or null when nothing carried a recognised confidence.
const averageConfidence = confidenceScores.length > 0 ? 
confidenceScores.reduce((acc, item) => acc + item, 0) / confidenceScores.length : null;
// Bucket the mean back into a label: at least 2.6 is high, at least 1.8 is medium, else low.
const evidenceConfidence = averageConfidence === null ? null : averageConfidence >= 2.6 ? "high" : averageConfidence >= 1.8 ? "medium" : "low";
// Only the three known mechanism statuses are passed through; anything else becomes null.
const mechanismStatus = debug?.answer_structure_v11?.mechanism_block?.status === "grounded" || debug?.answer_structure_v11?.mechanism_block?.status === "limited" || debug?.answer_structure_v11?.mechanism_block?.status === "unresolved" ? debug.answer_structure_v11.mechanism_block.status : null;
// True when any turn (not just the final one) reports that follow-up state was applied.
const followupStateApplied = turnResponses.some((item) => item.debug?.followup_state_usage?.applied === true); const uncertaintyLimitationsCount = debug?.answer_structure_v11?.uncertainty_block?.limitations?.length ?? 0; return { broad_query_detected: broadQueryDetected, broad_result_flag: broadResultFlag, narrowing_strength: narrowingStrength, minimum_evidence_failed: minimumEvidenceFailed, degraded_to: degradedTo, evidence_confidence: evidenceConfidence, limitation_reason_codes: [...limitationCodeSet], mechanism_status: mechanismStatus, source_refs: [...sourceRefSet], routes: [...routeSet], followup_state_applied: followupStateApplied, uncertainty_limitations_count: uncertaintyLimitationsCount }; }
// Aggregates per-case diagnostics into raw metrics plus their rubric bands.
// input.diagnostics: array of records; the fields read are signature, is_generic,
// is_false_confident, is_broad_answer, followup_retention_score,
// accountant_actionability_score and mechanism_specificity_score.
// Returns { raw, rubric_bands, denominators, signature_counts }.
// Rates use a denominator of at least 1, so an empty diagnostics list yields 0 for them
// and null for the averaged scores.
computeAssistantMetrics(input) { const diagnostics = input.diagnostics; const total = Math.max(1, diagnostics.length);
// Histogram of signature values; its number of distinct keys drives the differentiation rate.
const signatureCounter = diagnostics.reduce((acc, item) => { acc[item.signature] = (acc[item.signature] ?? 
0) + 1; return acc; }, {}); const uniqueSignatures = Object.keys(signatureCounter).length; const genericCases = diagnostics.filter((item) => item.is_generic).length; const falseConfidenceCases = diagnostics.filter((item) => item.is_false_confident).length;
// Subsets: a case belongs to the broad subset when is_broad_answer is anything but null,
// and to the follow-up subset when followup_retention_score is anything but null.
// NOTE(review): the strict comparison with null means an undefined field still counts as
// labeled - confirm producers always emit an explicit null for not applicable.
const broadCases = diagnostics.filter((item) => item.is_broad_answer !== null); const broadAnswerCases = broadCases.filter((item) => item.is_broad_answer === true).length; const followupCases = diagnostics.filter((item) => item.followup_retention_score !== null);
// Plain arithmetic means; null when the respective set is empty.
const avgActionability = diagnostics.length > 0 ? diagnostics.reduce((acc, item) => acc + item.accountant_actionability_score, 0) / diagnostics.length : null; const avgMechanism = diagnostics.length > 0 ? diagnostics.reduce((acc, item) => acc + item.mechanism_specificity_score, 0) / diagnostics.length : null; const avgFollowup = followupCases.length > 0 ? followupCases.reduce((acc, item) => acc + Number(item.followup_retention_score ?? 0), 0) / followupCases.length : null;
// Raw metrics, rounded to two decimals. The three rate metrics over all cases are fractions
// of total; broad_answer_rate is a fraction of the broad subset and null when it is empty.
const raw = { retrieval_differentiation_rate: round2(uniqueSignatures / total), generic_explanation_rate: round2(genericCases / total), accountant_actionability_score: avgActionability === null ? null : round2(avgActionability), false_confidence_rate: round2(falseConfidenceCases / total), broad_answer_rate: broadCases.length > 0 ? round2(broadAnswerCases / broadCases.length) : null, mechanism_specificity_score: avgMechanism === null ? null : round2(avgMechanism), followup_context_retention_score: avgFollowup === null ? 
null : round2(avgFollowup) };
// One rubric band per raw metric; rubricBandForMetric returns null for a null raw value.
const rubric_bands = { retrieval_differentiation_rate: rubricBandForMetric("retrieval_differentiation_rate", raw.retrieval_differentiation_rate), generic_explanation_rate: rubricBandForMetric("generic_explanation_rate", raw.generic_explanation_rate), accountant_actionability_score: rubricBandForMetric("accountant_actionability_score", raw.accountant_actionability_score), false_confidence_rate: rubricBandForMetric("false_confidence_rate", raw.false_confidence_rate), broad_answer_rate: rubricBandForMetric("broad_answer_rate", raw.broad_answer_rate), mechanism_specificity_score: rubricBandForMetric("mechanism_specificity_score", raw.mechanism_specificity_score), followup_context_retention_score: rubricBandForMetric("followup_context_retention_score", raw.followup_context_retention_score) }; return { raw, rubric_bands, denominators: { cases_total: diagnostics.length, broad_cases_total: broadCases.length, followup_cases_total: followupCases.length }, signature_counts: signatureCounter }; }
// Compares a freshly produced assistant eval report with a stored baseline report, writes
// the comparison as JSON and Markdown into config_1.REPORTS_DIR and returns the comparison
// object extended with the two artifact paths.
// input.baselineReportFile: file name or path, resolved with resolveReadablePath.
// input.currentReport: in-memory report object of the current run.
// Read and JSON.parse errors for the baseline file propagate to the caller.
// NOTE(review): the baseline is parsed without removing a byte-order mark, unlike the suite
// and case-set loaders in this file - a BOM-prefixed report would fail to parse; confirm.
buildAssistantComparisonReport(input) { const baselinePath = resolveReadablePath(input.baselineReportFile); const baselineReport = JSON.parse(fs_1.default.readFileSync(baselinePath, "utf-8")); const currentReport = input.currentReport; const metricKeys = [ "retrieval_differentiation_rate", "generic_explanation_rate", "accountant_actionability_score", "false_confidence_rate", "broad_answer_rate", "mechanism_specificity_score", "followup_context_retention_score" ];
// Metrics for which a decrease counts as an improvement.
const lowerIsBetter = new Set(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]); const baselineRaw = (baselineReport.metrics ?? {}).raw ?? {}; const currentRaw = (currentReport.metrics ?? {}).raw ?? {}; const deltas = {};
// Values that are not numbers on either side are treated as missing (null).
for (const metric of metricKeys) { const baseline = typeof baselineRaw[metric] === "number" ? Number(baselineRaw[metric]) : null; const current = typeof currentRaw[metric] === "number" ? 
Number(currentRaw[metric]) : null; const delta = baseline !== null && current !== null ? round2(current - baseline) : null; let trend = "n/a"; if (baseline !== null && current !== null) { const improved = lowerIsBetter.has(metric) ? current < baseline - 0.01 : current > baseline + 0.01; const weakened = lowerIsBetter.has(metric) ? current > baseline + 0.01 : current < baseline - 0.01; trend = improved ? "improved" : weakened ? "weakened" : "unchanged"; } deltas[metric] = { baseline, current, delta, trend }; } const baselineResults = Array.isArray(baselineReport.results) ? baselineReport.results : []; const currentResults = Array.isArray(currentReport.results) ? currentReport.results : []; const baselineByCase = new Map(); for (const row of baselineResults) { baselineByCase.set(String(row.case_id ?? ""), row); } const improvedNotes = []; const unchangedNotes = []; const weakenedNotes = []; for (const row of currentResults) { const caseId = String(row.case_id ?? ""); const currentUsefulness = typeof row.accountant_usefulness_score === "number" ? Number(row.accountant_usefulness_score) : null; const baselineRow = baselineByCase.get(caseId); const baselineUsefulness = baselineRow && typeof baselineRow.accountant_usefulness_score === "number" ? Number(baselineRow.accountant_usefulness_score) : null; if (baselineUsefulness === null || currentUsefulness === null) { continue; } const delta = round2(currentUsefulness - baselineUsefulness); const note = `${caseId}: usefulness ${baselineUsefulness} -> ${currentUsefulness} (delta ${delta})`; if (delta > 0.25) { improvedNotes.push(note); } else if (delta < -0.25) { weakenedNotes.push(note); } else { unchangedNotes.push(note); } } const comparisonId = `assistant-compare-${(0, nanoid_1.nanoid)(8)}`; const comparisonReport = { schema_version: ASSISTANT_STAGE1_COMPARISON_SCHEMA_VERSION, comparison_id: comparisonId, run_timestamp: new Date().toISOString(), baseline_run_id: baselineReport.run_id ?? 
null, current_run_id: currentReport.run_id ?? null, eval_target: "assistant_stage1", suite_id: currentReport.suite_id ?? baselineReport.suite_id ?? null, suite_version: currentReport.suite_version ?? baselineReport.suite_version ?? null, baseline_report_file: baselinePath, current_report_file: currentReport.artifacts && typeof currentReport.artifacts === "object" ? currentReport.artifacts.run_report_json_path ?? null : null, metric_deltas: deltas, scenario_notes_summary: { improved: improvedNotes.length, unchanged: unchangedNotes.length, weakened: weakenedNotes.length }, scenario_notes: { improved: improvedNotes, unchanged: unchangedNotes, weakened: weakenedNotes }, known_limitations: currentReport.known_limitations ?? [ "Comparison is run-to-run and depends on stable mock/runtime flags.", "Metrics remain Stage 1 heuristic bands, not full product scorecards." ], report_title: "Assistant Stage 1 Baseline vs Current" }; (0, files_1.ensureDir)(config_1.REPORTS_DIR); const jsonPath = path_1.default.resolve(config_1.REPORTS_DIR, `${comparisonId}.json`); const mdPath = path_1.default.resolve(config_1.REPORTS_DIR, `${comparisonId}.md`); (0, files_1.writeJsonFile)(jsonPath, comparisonReport); fs_1.default.writeFileSync(mdPath, buildAssistantComparisonMarkdownReport(comparisonReport), "utf-8"); return { ...comparisonReport, artifacts: { comparison_report_json_path: jsonPath, comparison_report_md_path: mdPath } }; } async runAssistantStage1(payload) { if (!config_1.FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1) { throw new http_1.ApiError("ASSISTANT_STAGE1_EVAL_DISABLED", "Assistant Stage 1 eval target is disabled by FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1.", 409); } const suite = parseAssistantSuiteFile(payload.caseSetFile); const suiteCases = suite.cases.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id)); const runId = `assistant-stage1-${(0, nanoid_1.nanoid)(10)}`; const assistantService = new assistantService_1.AssistantService(this.normalizerService, new 
assistantSessionStore_1.AssistantSessionStore()); const diagnostics = []; let requestsTotal = 0; for (const suiteCase of suiteCases) { const sessionId = `${runId}-${suiteCase.case_id}`; const turnResponses = []; const notes = []; const limitations = []; try { for (const turn of suiteCase.turns) { const response = await assistantService.handleMessage({ session_id: sessionId, user_message: turn.user_message, message: turn.user_message, mode: "assistant", apiKey: payload.normalizeConfig.apiKey, model: payload.normalizeConfig.model, baseUrl: payload.normalizeConfig.baseUrl, temperature: payload.normalizeConfig.temperature, maxOutputTokens: payload.normalizeConfig.maxOutputTokens, promptVersion: payload.normalizeConfig.promptVersion, systemPrompt: payload.normalizeConfig.systemPrompt, developerPrompt: payload.normalizeConfig.developerPrompt, domainPrompt: payload.normalizeConfig.domainPrompt, fewShotExamples: payload.normalizeConfig.fewShotExamples, useMock: payload.useMock }); turnResponses.push(response); requestsTotal += 1; } } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); diagnostics.push({ suite_case: suiteCase, session_id: sessionId, trace_id: null, final_reply_type: "backend_error", turn_count: turnResponses.length, narrowing_result: "failed", signature: `backend_error|${suiteCase.scenario_tag}`, is_generic: true, is_false_confident: false, is_broad_answer: suiteCase.broadness_level === "low" ? null : false, followup_retention_score: suiteCase.question_type === "followup" || suiteCase.turns.length > 1 ? 
0 : null, evidence_quality_score: 0, mechanism_specificity_score: 0, genericness_score: 5, accountant_actionability_score: 0, accountant_usefulness_score: 0, signals: { broad_query_detected: suiteCase.broadness_level !== "low", broad_result_flag: false, narrowing_strength: null, minimum_evidence_failed: true, degraded_to: "clarification", evidence_confidence: "low", limitation_reason_codes: [], mechanism_status: null, source_refs: [], routes: [], followup_state_applied: false, uncertainty_limitations_count: 0 }, limitations: [errorMessage], notes: [`Case execution failed: ${errorMessage}`] }); continue; } const finalResponse = turnResponses[turnResponses.length - 1]; const signals = this.collectAssistantSignals(finalResponse, turnResponses); const structure = finalResponse.debug?.answer_structure_v11 ?? null; const recommendedActions = extractTextList(structure?.next_step_block?.recommended_actions); const clarificationQuestions = extractTextList(structure?.next_step_block?.clarification_questions); const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes); const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations); const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? 
""); const hasAnchors = hasDomainAnchors([directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" ")); let genericnessScore = 0; if (!hasAnchors) genericnessScore += 2; if (mechanismNotes.length === 0) genericnessScore += 1; if (signals.source_refs.length === 0) genericnessScore += 1; if (recommendedActions.length === 0) genericnessScore += 1; genericnessScore = clampScore(genericnessScore); let actionabilityScore = 0; if (recommendedActions.length > 0) actionabilityScore += 2; if (recommendedActions.some((item) => hasDomainAnchors(item))) actionabilityScore += 2; if (clarificationQuestions.length > 0 && (finalResponse.reply_type === "clarification_required" || signals.degraded_to === "clarification")) { actionabilityScore += 1; } if (signals.source_refs.length > 0 && actionabilityScore < 5) { actionabilityScore += 1; } actionabilityScore = clampScore(actionabilityScore); let evidenceQualityScore = 0; if (signals.source_refs.length >= 3) evidenceQualityScore += 2; else if (signals.source_refs.length > 0) evidenceQualityScore += 1; if (signals.evidence_confidence === "high") evidenceQualityScore += 2; if (signals.evidence_confidence === "medium") evidenceQualityScore += 1; if (signals.minimum_evidence_failed) evidenceQualityScore -= 2; if (signals.limitation_reason_codes.includes("insufficient_detail")) evidenceQualityScore -= 1; if (signals.limitation_reason_codes.includes("missing_mechanism")) evidenceQualityScore -= 1; evidenceQualityScore = clampScore(evidenceQualityScore); let mechanismSpecificityScore = 0; if (signals.mechanism_status === "grounded" && mechanismNotes.length > 0 && !signals.limitation_reason_codes.includes("missing_mechanism")) { mechanismSpecificityScore = 5; } else if (signals.mechanism_status === "limited" && mechanismNotes.length > 0) { mechanismSpecificityScore = 3; } else if (mechanismNotes.length > 0) { mechanismSpecificityScore = 2; } else { mechanismSpecificityScore = 0; } const usefulnessScore = 
clampScore((actionabilityScore + (5 - genericnessScore) + evidenceQualityScore + mechanismSpecificityScore) / 4); const isGeneric = genericnessScore >= 3; const factualReply = finalResponse.reply_type === "factual" || finalResponse.reply_type === "factual_with_explanation"; const isFalseConfident = factualReply && (signals.minimum_evidence_failed || signals.degraded_to !== null || signals.evidence_confidence === "low" || (signals.limitation_reason_codes.length > 0 && signals.uncertainty_limitations_count === 0)); const isBroadCase = suiteCase.broadness_level !== "low" || signals.broad_query_detected; const isBroadAnswer = isBroadCase ? factualReply && signals.degraded_to === null && !signals.minimum_evidence_failed : null; const isFollowupCase = suiteCase.question_type === "followup" || suiteCase.turns.length > 1; let followupRetentionScore = null; if (isFollowupCase) { const finalTurnIndex = Number(finalResponse.debug?.investigation_state_snapshot?.turn_index ?? 0); if (signals.followup_state_applied && finalTurnIndex >= suiteCase.turns.length) { followupRetentionScore = 5; } else if (finalTurnIndex >= suiteCase.turns.length) { followupRetentionScore = 3; } else { followupRetentionScore = 0; } } let narrowingResult = "not_required"; if (signals.degraded_to === "clarification" || finalResponse.reply_type === "clarification_required") { narrowingResult = "clarification_requested"; } else if (signals.broad_query_detected || signals.broad_result_flag) { narrowingResult = signals.minimum_evidence_failed ? 
"failed" : "applied"; } if (signals.minimum_evidence_failed) { limitations.push("minimum_evidence_failed"); } limitations.push(...signals.limitation_reason_codes.map((item) => `limitation_reason:${item}`)); if (signals.mechanism_status === "unresolved") { limitations.push("mechanism_unresolved"); } limitations.push(...uncertaintyLimitations); if (isGeneric) notes.push("genericness_high"); if (isFalseConfident) notes.push("false_confidence_risk"); if (isBroadCase && isBroadAnswer) notes.push("broad_answer_without_degradation"); if (followupRetentionScore !== null && followupRetentionScore < 3) notes.push("followup_context_retention_weak"); diagnostics.push({ suite_case: suiteCase, session_id: sessionId, trace_id: finalResponse.debug?.trace_id ?? null, final_reply_type: finalResponse.reply_type, turn_count: suiteCase.turns.length, narrowing_result: narrowingResult, signature: [ finalResponse.reply_type, signals.routes.sort().join(","), signals.degraded_to ?? "none", signals.mechanism_status ?? "unknown", signals.source_refs.slice(0, 2).join(",") ].join("|"), is_generic: isGeneric, is_false_confident: isFalseConfident, is_broad_answer: isBroadAnswer, followup_retention_score: followupRetentionScore, evidence_quality_score: evidenceQualityScore, mechanism_specificity_score: mechanismSpecificityScore, genericness_score: genericnessScore, accountant_actionability_score: actionabilityScore, accountant_usefulness_score: round2(usefulnessScore), signals, limitations: Array.from(new Set(limitations)), notes }); } const metrics = this.computeAssistantMetrics({ diagnostics }); const caseRecords = diagnostics.map((item) => { const signatureHits = metrics.signature_counts[item.signature] ?? 1; const caseMetricVector = { retrieval_differentiation_rate: signatureHits === 1 ? 1 : 0, generic_explanation_rate: item.is_generic ? 1 : 0, accountant_actionability_score: round2(item.accountant_actionability_score), false_confidence_rate: item.is_false_confident ? 
1 : 0, broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0, mechanism_specificity_score: round2(item.mechanism_specificity_score), followup_context_retention_score: item.followup_retention_score === null ? null : round2(item.followup_retention_score) }; return { schema_version: stage1Contracts_1.ASSISTANT_EVAL_RECORD_SCHEMA_VERSION, created_at: new Date().toISOString(), case_id: item.suite_case.case_id, scenario_tag: item.suite_case.scenario_tag, session_id: item.session_id, trace_id: item.trace_id, question_type: item.suite_case.question_type, broadness_level: item.suite_case.broadness_level, narrowing_result: item.narrowing_result, evidence_quality_score: round2(item.evidence_quality_score), genericness_score: round2(item.genericness_score), accountant_usefulness_score: round2(item.accountant_usefulness_score), accountant_metrics: caseMetricVector, raw_signals: { final_reply_type: item.final_reply_type, turn_count: item.turn_count, broad_query_detected: item.signals.broad_query_detected, broad_result_flag: item.signals.broad_result_flag, narrowing_strength: item.signals.narrowing_strength, minimum_evidence_failed: item.signals.minimum_evidence_failed, degraded_to: item.signals.degraded_to, evidence_confidence: item.signals.evidence_confidence, limitation_reason_codes: item.signals.limitation_reason_codes, mechanism_status: item.signals.mechanism_status, source_refs: item.signals.source_refs, routes: item.signals.routes, followup_state_applied: item.signals.followup_state_applied }, metric_subscores: caseMetricVector, limitations: item.limitations, notes: item.notes }; }); const strongestSignals = Object.entries(metrics.rubric_bands) .filter(([, band]) => band?.score === 5) .map(([name]) => name); const weakestSignals = Object.entries(metrics.rubric_bands) .filter(([, band]) => band?.score === 0) .map(([name]) => name); const runTimestamp = new Date().toISOString(); const report = { schema_version: 
ASSISTANT_STAGE1_RUN_SCHEMA_VERSION, run_id: runId, run_timestamp: runTimestamp, eval_target: "assistant_stage1", mode: payload.mode, use_mock: Boolean(payload.useMock), prompt_version: payload.normalizeConfig.promptVersion ?? null, suite_id: suite.suite_id, suite_version: suite.suite_version, suite_schema_version: suite.schema_version ?? null, scenario_count: suite.scenario_count, case_ids: suiteCases.map((item) => item.case_id), cases_total: caseRecords.length, feature_profile_snapshot: buildFeatureProfileSnapshot(), code_version: buildCodeVersionMarker(), metrics: { raw: metrics.raw, denominators: metrics.denominators }, rubric_bands: metrics.rubric_bands, subsets: { broad_cases_total: metrics.denominators.broad_cases_total, followup_cases_total: metrics.denominators.followup_cases_total }, budget: { requests_total: requestsTotal }, results: caseRecords, scenario_summary: { improved_or_strong: caseRecords.filter((item) => Number(item.accountant_usefulness_score ?? 0) >= 4).length, unchanged_or_mixed: caseRecords.filter((item) => { const value = Number(item.accountant_usefulness_score ?? 0); return value >= 2.5 && value < 4; }).length, weak_or_regressed: caseRecords.filter((item) => Number(item.accountant_usefulness_score ?? 0) < 2.5).length }, improvement_hints: { strongest_signals: strongestSignals.length > 0 ? strongestSignals.join(", ") : "none", weakest_signals: weakestSignals.length > 0 ? weakestSignals.join(", ") : "none" }, known_limitations: [ "Snapshot-only retrieval contour remains (no live verification core in Stage 1).", "Metric mapping for genericness/false confidence is heuristic by design.", "Stage 1 eval excludes Stage 2+ metrics (problem-unit/lifecycle/graph/investigation engine)." 
], report_title: "Assistant Stage 1 Eval Run" }; (0, files_1.ensureDir)(config_1.REPORTS_DIR); const runJsonPath = path_1.default.resolve(config_1.REPORTS_DIR, `${runId}.json`); const runMdPath = path_1.default.resolve(config_1.REPORTS_DIR, `${runId}.md`); (0, files_1.writeJsonFile)(runJsonPath, report); fs_1.default.writeFileSync(runMdPath, buildAssistantEvalMarkdownReport(report), "utf-8"); report.artifacts = { run_report_json_path: runJsonPath, run_report_md_path: runMdPath }; if (payload.compareWithReportFile) { report.comparison = this.buildAssistantComparisonReport({ currentReport: report, baselineReportFile: payload.compareWithReportFile }); } return report; } async run(payload) { const mode = payload.mode ?? "standard"; const evalTarget = payload.evalTarget ?? "normalizer"; if (evalTarget === "assistant_stage1") { return this.runAssistantStage1({ normalizeConfig: payload.normalizeConfig, caseIds: payload.caseIds, useMock: payload.useMock, mode, caseSetFile: payload.caseSetFile, compareWithReportFile: payload.compareWithReportFile }); } const promptVersion = String(payload.normalizeConfig.promptVersion ?? "").toLowerCase(); const schemaVersion = String(payload.normalizeConfig.schemaVersion ?? "").toLowerCase(); const isV2 = promptVersion.startsWith("normalizer_v2") || schemaVersion === "v2" || schemaVersion === "v2_0_1" || schemaVersion === "v2_0_2"; const inlineQuestions = payload.rawQuestions ? parseRawQuestions(payload.rawQuestions) : []; const inlineCases = inlineQuestions.map((question, index) => ({ case_id: formatCaseId("BQ", index), raw_question: question, expected: null })); if (isV2) { const sourceCases = inlineCases.length > 0 ? inlineCases : payload.caseSetFile ? 
parseCaseSetFile(payload.caseSetFile).map((item) => ({ case_id: item.case_id, raw_question: item.raw_question, expected: item.expected })) : this.listCases().map((item) => ({ case_id: item.case_id, raw_question: item.raw_question, expected: item.expected })); const filtered = sourceCases.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id)); return this.runV2({ ...payload, mode, cases: filtered }); } if (inlineCases.length > 0) { throw new Error("rawQuestions batch is supported for normalizer_v2 only."); } const casesSource = payload.caseSetFile ? parseCaseSetFile(payload.caseSetFile) : this.listCases(); const filteredCases = casesSource.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id)); const runId = `eval-${(0, nanoid_1.nanoid)(10)}`; const results = []; const mismatches = []; const badConfidenceCases = []; const classCounter = {}; let schemaPass = 0; let intentPass = 0; let routePass = 0; let causalPass = 0; let highConfidenceErrors = 0; let requestsTotal = 0; let retriesUsed = 0; for (const item of filteredCases) { const response = await this.normalizerService.normalize({ ...payload.normalizeConfig, userQuestion: item.raw_question, context: { expected_route: item.expected.route_hint, eval_label: runId, case_id: item.case_id, eval_mode: mode }, retryPolicy: mode === "single-pass-strict" ? "single-pass-strict" : "default", useMock: payload.useMock }); const normalized = response.normalized && response.normalized.schema_version === "normalized_query_v1" ? 
response.normalized : null; const intentMatch = Boolean(normalized && item.expected.intent_class === normalized.intent_class); const routeMatch = Boolean(normalized && item.expected.route_hint === normalized.route_hint); const causalMatch = Boolean(normalized && item.expected.requires && item.expected.requires.needs_cross_entity_join === normalized.requires.needs_cross_entity_join && item.expected.requires.needs_causal_chain === normalized.requires.needs_causal_chain); if (response.validation.passed) schemaPass += 1; if (intentMatch) intentPass += 1; if (routeMatch) routePass += 1; if (causalMatch || !item.expected.requires) causalPass += 1; const requestCount = Number(response.request_count_for_case ?? 0); requestsTotal += requestCount; if (requestCount > 1) { retriesUsed += 1; } const classKey = String(item.expected.intent_class ?? "unknown"); if (!classCounter[classKey]) { classCounter[classKey] = { total: 0, passed: 0 }; } classCounter[classKey].total += 1; if (intentMatch) { classCounter[classKey].passed += 1; } const confidenceOverall = normalized?.confidence.overall ?? null; const hasMismatch = !intentMatch || !routeMatch || (!causalMatch && Boolean(item.expected.requires)); if (confidenceOverall === "high" && hasMismatch) { highConfidenceErrors += 1; badConfidenceCases.push({ case_id: item.case_id, confidence_overall: confidenceOverall, intent_match: intentMatch, route_match: routeMatch, causal_match: causalMatch || !item.expected.requires, trace_id: response.trace_id }); } if (hasMismatch || !response.validation.passed) { mismatches.push({ case_id: item.case_id, expected_intent_class: item.expected.intent_class ?? null, actual_intent_class: normalized?.intent_class ?? null, expected_route_hint: item.expected.route_hint ?? null, actual_route_hint: normalized?.route_hint ?? null, expected_requires: item.expected.requires ?? null, actual_requires: normalized?.requires ?? 
null, comment: shortMismatchComment({ intentMatch, routeMatch, causalMatch: causalMatch || !item.expected.requires, validationPassed: response.validation.passed }), trace_id: response.trace_id }); } results.push({ case_id: item.case_id, raw_question: item.raw_question, validation_passed: response.validation.passed, intent_match: intentMatch, route_match: routeMatch, causal_flags_match: causalMatch || !item.expected.requires, expected_intent_class: item.expected.intent_class ?? null, actual_intent_class: normalized?.intent_class ?? null, expected_route_hint: item.expected.route_hint ?? null, actual_route_hint: normalized?.route_hint ?? null, expected_requires: item.expected.requires ?? null, actual_requires: normalized?.requires ?? null, confidence_overall: confidenceOverall, trace_id: response.trace_id, request_count_for_case: requestCount }); } const total = Math.max(1, filteredCases.length); const metrics = { schema_validation_pass_rate: Number(((schemaPass / total) * 100).toFixed(2)), intent_class_accuracy: Number(((intentPass / total) * 100).toFixed(2)), route_hint_accuracy: Number(((routePass / total) * 100).toFixed(2)), causal_flag_accuracy: Number(((causalPass / total) * 100).toFixed(2)), high_confidence_error_rate: Number(((highConfidenceErrors / total) * 100).toFixed(2)) }; const classAccuracy = Object.fromEntries(Object.entries(classCounter).map(([key, value]) => [ key, { total: value.total, passed: value.passed, accuracy_percent: Number(((value.passed / Math.max(1, value.total)) * 100).toFixed(2)) } ])); const baselineAsMap = BASELINE_METRICS; const baselineDelta = Object.fromEntries(Object.entries(metrics).map(([key, value]) => [key, Number((value - baselineAsMap[key]).toFixed(2))])); const report = { run_id: runId, timestamp: new Date().toISOString(), mode, use_mock: Boolean(payload.useMock), prompt_version: payload.normalizeConfig.promptVersion ?? null, dataset: { source: payload.caseSetFile ? 
"file" : "data/eval_cases/*.json", file: payload.caseSetFile ?? null }, cases_total: filteredCases.length, metrics, baseline_metrics: BASELINE_METRICS, baseline_delta: baselineDelta, class_accuracy: classAccuracy, budget: { requests_total: requestsTotal, retries_used: retriesUsed, guidance: { forensic_calls_max: 10, final_eval_calls_max: 30, target_total_calls_max: 40, hard_cap_calls_max: 45 } }, mismatches, bad_confidence_cases: badConfidenceCases, results }; (0, files_1.ensureDir)(config_1.EVAL_CASES_DIR); (0, files_1.writeJsonFile)(path_1.default.resolve(config_1.EVAL_CASES_DIR, `${runId}.report.json`), report); const shouldWriteV11Artifacts = mode === "single-pass-strict" && Boolean(payload.caseSetFile) && path_1.default.basename(String(payload.caseSetFile)).toLowerCase() === "normalizer_eval_v1_1_30cases.json"; if (shouldWriteV11Artifacts) { (0, files_1.ensureDir)(config_1.REPORTS_DIR); (0, files_1.writeJsonFile)(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_eval_v1_1_run.json"), report); fs_1.default.writeFileSync(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_eval_v1_1_run.md"), buildMarkdownReport({ ...report, report_title: "LLM Normalizer v1.1 Eval Run" }), "utf-8"); } const shouldWriteV1121EvalArtifacts = mode === "single-pass-strict" && String(payload.normalizeConfig.promptVersion ?? 
"") === "normalizer_v1_1_2_1" && Boolean(payload.caseSetFile) && path_1.default.basename(String(payload.caseSetFile)).toLowerCase() === "normalizer_eval_v1_1_2_1_30cases.json"; if (shouldWriteV1121EvalArtifacts) { (0, files_1.ensureDir)(config_1.REPORTS_DIR); (0, files_1.writeJsonFile)(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_1_eval.json"), report); fs_1.default.writeFileSync(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_1_eval.md"), buildMarkdownReport({ ...report, report_title: "LLM Normalizer v1.1.2.1 Eval Run" }), "utf-8"); } const shouldWriteV111MicroArtifacts = mode === "single-pass-strict" && String(payload.normalizeConfig.promptVersion ?? "") === "normalizer_v1_1_1" && isSameCaseSet(payload.caseIds, V111_MICRO_CASE_IDS); if (shouldWriteV111MicroArtifacts) { (0, files_1.ensureDir)(config_1.REPORTS_DIR); (0, files_1.writeJsonFile)(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_1_micro_eval.json"), report); fs_1.default.writeFileSync(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_1_micro_eval.md"), buildMarkdownReport({ ...report, report_title: "LLM Normalizer v1.1.1 Micro Eval" }), "utf-8"); } const shouldWriteV112MicroArtifacts = mode === "single-pass-strict" && String(payload.normalizeConfig.promptVersion ?? "") === "normalizer_v1_1_2" && isSameCaseSet(payload.caseIds, V112_MICRO_CASE_IDS); if (shouldWriteV112MicroArtifacts) { (0, files_1.ensureDir)(config_1.REPORTS_DIR); (0, files_1.writeJsonFile)(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_micro_eval.json"), report); fs_1.default.writeFileSync(path_1.default.resolve(config_1.REPORTS_DIR, "normalizer_v1_1_2_micro_eval.md"), buildMarkdownReport({ ...report, report_title: "LLM Normalizer v1.1.2 Micro Eval" }), "utf-8"); } return report; } } exports.EvalService = EvalService;