NODEDC_1C/llm_normalizer/backend/src/services/evalService.ts

import fs from "fs";
import path from "path";
import { nanoid } from "nanoid";
import {
  EVAL_CASES_DIR,
  EVAL_DATASETS_DIR,
  FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1,
  FEATURE_ASSISTANT_ANSWER_POLICY_V11,
  REPORTS_DIR
} from "../config";
import type { AssistantMessageResponsePayload } from "../types/assistant";
import type { EvalTarget, AssistantStage1SuiteCase, AssistantStage1SuiteFile } from "../types/assistantEval";
import type {
  EvalRunMode,
  NoRouteReason,
  NormalizeRequestPayload,
  NormalizedQueryV1,
  NormalizedQueryV2,
  NormalizedQueryV2_0_1,
  NormalizedQueryV2_0_2
} from "../types/normalizer";
import {
  ACCOUNTANT_SCORING_RUBRIC_V01,
  ASSISTANT_EVAL_RECORD_SCHEMA_VERSION,
  type AccountantMetricName,
  type AccountantMetricRubricBand,
  type AssistantEvalMetricVector,
  type AssistantEvalNarrowingResult,
  type AssistantEvalRecord
} from "../types/stage1Contracts";
import { ApiError } from "../utils/http";
import { AssistantService } from "./assistantService";
import { AssistantSessionStore } from "./assistantSessionStore";
import { NormalizerService } from "./normalizerService";
import { ensureDir, writeJsonFile } from "../utils/files";

/**
 * One eval case as stored in a dataset/case JSON file: the question text plus
 * the optional labels the run is scored against.
 */
interface EvalCaseFile {
  /** Case identifier; `listCases` sorts by it and result rows are reported under it. */
  case_id: string;
  /** Question text; `runV2` sends it to the normalizer as `userQuestion`. */
  raw_question: string;
  /** Expected-outcome labels. Every label is optional; a missing label is simply not scored. */
  expected: {
    // Not read in the portion of the file reviewed here.
    intent_class?: string;
    // Not read in the portion of the file reviewed here.
    route_hint?: string;
    /** Explicit scope label; when it is a boolean, `expectedScopeInScope` returns it as-is. */
    expected_scope_in_scope?: boolean;
    /** Compared in `runV2` against the normalizer's `global_notes.needs_clarification`. */
    clarification_required?: boolean;
    /** Compared in `runV2` against the route status derived from the route decisions. */
    expected_route_status?: "routed" | "no_route";
    /**
     * When set, a predicted `no_route` only counts as a true positive in `runV2`
     * if its reason matches. The value "out_of_scope" also makes
     * `expectedScopeInScope` return `false`.
     */
    expected_no_route_reason?: NoRouteReason;
    // Not read in the portion of the file reviewed here.
    expected_execution_readiness?: "executable" | "executable_with_soft_assumptions" | "needs_clarification" | "no_route";
    // Not read in the portion of the file reviewed here.
    requires?: {
      needs_cross_entity_join?: boolean;
      needs_causal_chain?: boolean;
    };
    // Not read in the portion of the file reviewed here.
    accounts_mentioned?: string[];
    // Not read in the portion of the file reviewed here.
    expected_output_shape?: string;
  };
}

/**
 * Case shape consumed by the eval runners (`runV2` iterates a list of these).
 * Same as `EvalCaseFile`, except that `expected` may be `null`.
 */
interface EvalInputCase {
  case_id: string;
  raw_question: string;
  // `null` means "no labels"; label-based checks then skip the case.
  // Presumably used for inline raw questions — confirm against the caller.
  expected: EvalCaseFile["expected"] | null;
}

/**
 * Metric set of the reference ("baseline") run.
 * The values in `BASELINE_METRICS` look like percentages on a 0-100 scale.
 */
interface BaselineMetrics {
  schema_validation_pass_rate: number;
  intent_class_accuracy: number;
  route_hint_accuracy: number;
  causal_flag_accuracy: number;
  high_confidence_error_rate: number;
}

// Hard-coded baseline figures.
// NOTE(review): presumably captured from an earlier reference run and used to
// compute the per-metric deltas shown in the Markdown report — the consumer is
// outside the portion of the file reviewed here; confirm before changing.
const BASELINE_METRICS: BaselineMetrics = {
  schema_validation_pass_rate: 100,
  intent_class_accuracy: 72.73,
  route_hint_accuracy: 90.91,
  causal_flag_accuracy: 81.82,
  high_confidence_error_rate: 9.09
};

// Fixed case-id lists for two "micro" case sets.
// NOTE(review): presumably matched against a requested case list via
// `isSameCaseSet` to recognise these runs — the usage is not visible here.
const V111_MICRO_CASE_IDS = ["NQ-008", "V11-DD-005", "V11-OT-003", "V11-OT-004", "V11-OT-005"];
const V112_MICRO_CASE_IDS = ["NQ-002", "NQ-007", "V11-HA-004", "V11-OT-003", "V11-OT-005"];

/**
 * Tells whether `input` contains exactly the same ids as `target`, ignoring order.
 * Duplicates matter: the two lists must be equal element by element once sorted.
 * Neither argument is mutated.
 *
 * @param input - Candidate id list; a missing list never matches.
 * @param target - Reference id list.
 * @returns `true` when both lists have the same length and the same sorted content.
 */
function isSameCaseSet(input: string[] | undefined, target: string[]): boolean {
  if (!input) {
    return false;
  }
  if (input.length !== target.length) {
    return false;
  }
  // Sort copies so the callers' arrays keep their order.
  const sortedInput = Array.from(input).sort();
  const sortedTarget = Array.from(target).sort();
  for (let position = 0; position < sortedInput.length; position += 1) {
    if (sortedInput[position] !== sortedTarget[position]) {
      return false;
    }
  }
  return true;
}

/**
 * Formats a number with two decimals followed by a percent sign.
 * The value is printed as given; it is not multiplied by 100.
 *
 * @param value - Number to print.
 * @returns For example `"72.73%"`.
 */
function formatPercent(value: number): string {
  const twoDecimals = value.toFixed(2);
  return `${twoDecimals}%`;
}

/**
 * Picks a one-line diagnostic for a case based on which checks passed.
 * Precedence: schema validation failure, then intent/route mismatches, then
 * the causal-flag mismatch; if everything matched the result is "No mismatch.".
 *
 * @param input - Outcome flags of the individual checks for one case.
 * @returns Human-readable comment for the mismatch list.
 */
function shortMismatchComment(input: { intentMatch: boolean; routeMatch: boolean; causalMatch: boolean; validationPassed: boolean }): string {
  const { intentMatch, routeMatch, causalMatch, validationPassed } = input;

  if (!validationPassed) {
    return "Schema validation failed for this case.";
  }

  if (!intentMatch) {
    return routeMatch
      ? "Route chosen correctly, but intent_class drifted into a neighboring taxonomy bucket."
      : "Both intent and route misclassified; likely lexical ambiguity in causal vs risk wording.";
  }

  if (!routeMatch) {
    return "Intent understood, but route_hint selected a weaker execution route.";
  }

  // Intent and route both matched; only the causal flags can still be off.
  return causalMatch ? "No mismatch." : "Causal flags are inconsistent with expected relationship depth.";
}

/**
 * Renders a normalizer eval report object as a Markdown document.
 *
 * Sections, in order: run header, metrics vs baseline table, per-class accuracy
 * table, request budget, mismatch list and bad-confidence list. Missing report
 * fields fall back to empty strings / zeros, and an empty table gets a single
 * "n/a" row. Every metric value is printed through `formatPercent`.
 *
 * @param report - Loosely typed report; only the keys read below are used.
 * @returns The Markdown text, lines joined with "\n".
 */
function buildMarkdownReport(report: Record<string, unknown>): string {
  const recordOf = <T>(value: unknown): Record<string, T> => (value ?? {}) as Record<string, T>;
  const listOf = (value: unknown): unknown[] => (Array.isArray(value) ? value : []);

  const metrics = recordOf<number>(report.metrics);
  const baseline = recordOf<number>(report.baseline_metrics);
  const delta = recordOf<number>(report.baseline_delta);
  const classAccuracy = recordOf<{ total: number; passed: number; accuracy_percent: number }>(report.class_accuracy);
  const budget = recordOf<unknown>(report.budget);
  const mismatches = listOf(report.mismatches);
  const badConfidenceCases = listOf(report.bad_confidence_cases);

  // One table row per metric; a positive delta gets an explicit "+" sign.
  const metricLines: string[] = [];
  for (const name of Object.keys(metrics)) {
    const change = Number(delta[name] ?? 0);
    const currentCell = formatPercent(Number(metrics[name] ?? 0));
    const baselineCell = formatPercent(Number(baseline[name] ?? 0));
    const signedChange = `${change > 0 ? "+" : ""}${change.toFixed(2)}`;
    metricLines.push(`| ${name} | ${currentCell} | ${baselineCell} | ${signedChange} |`);
  }
  const metricTable = metricLines.join("\n");

  const classTable = Object.entries(classAccuracy)
    .map(([name, row]) => `| ${name} | ${row.passed}/${row.total} | ${formatPercent(row.accuracy_percent)} |`)
    .join("\n");

  let mismatchText = "No mismatches.";
  if (mismatches.length > 0) {
    mismatchText = mismatches
      .map((entry) => {
        const row = entry as Record<string, unknown>;
        return `- ${row.case_id}: expected(${row.expected_intent_class} / ${row.expected_route_hint}) -> actual(${row.actual_intent_class} / ${row.actual_route_hint}). ${row.comment}`;
      })
      .join("\n");
  }

  let badConfidenceText = "No bad-confidence cases.";
  if (badConfidenceCases.length > 0) {
    badConfidenceText = badConfidenceCases
      .map((entry) => {
        const row = entry as Record<string, unknown>;
        return `- ${row.case_id}: confidence=${row.confidence_overall}, intent_match=${row.intent_match}, route_match=${row.route_match}`;
      })
      .join("\n");
  }

  const headerLines = [
    `# ${String(report.report_title ?? "LLM Normalizer Eval Run")}`,
    "",
    `- run_id: ${String(report.run_id ?? "")}`,
    `- timestamp: ${String(report.timestamp ?? "")}`,
    `- mode: ${String(report.mode ?? "")}`,
    `- use_mock: ${String(report.use_mock ?? false)}`,
    `- cases_total: ${String(report.cases_total ?? 0)}`,
    `- prompt_version: ${String(report.prompt_version ?? "")}`,
    ""
  ];

  const metricsSection = [
    "## Metrics vs Baseline",
    "",
    "| Metric | Current | Baseline | Delta |",
    "|---|---:|---:|---:|",
    metricTable || "| n/a | n/a | n/a | n/a |",
    ""
  ];

  const classSection = [
    "## Class Accuracy",
    "",
    "| Intent class | Passed/Total | Accuracy |",
    "|---|---:|---:|",
    classTable || "| n/a | n/a | n/a |",
    ""
  ];

  const budgetSection = [
    "## Budget",
    "",
    `- requests_total: ${String(budget.requests_total ?? 0)}`,
    `- retries_used: ${String(budget.retries_used ?? 0)}`,
    ""
  ];

  const mismatchSection = ["## Mismatches", "", mismatchText, ""];
  const badConfidenceSection = ["## Bad Confidence Cases", "", badConfidenceText, ""];

  return [
    ...headerLines,
    ...metricsSection,
    ...classSection,
    ...budgetSection,
    ...mismatchSection,
    ...badConfidenceSection
  ].join("\n");
}

/**
 * Reads an eval dataset file and returns its cases.
 *
 * Two layouts are accepted: a top-level JSON array of cases, or an object with
 * a `cases` array. A leading UTF-8 BOM is removed before parsing. The entries
 * themselves are not validated.
 *
 * @param inputPath - Absolute path, or a path relative to `EVAL_DATASETS_DIR`.
 * @returns The cases found in the file.
 * @throws Error when the parsed JSON matches neither layout; file-system and
 *   JSON syntax errors propagate unchanged.
 */
function parseCaseSetFile(inputPath: string): EvalCaseFile[] {
  let filePath = inputPath;
  if (!path.isAbsolute(inputPath)) {
    filePath = path.resolve(EVAL_DATASETS_DIR, inputPath);
  }

  const text = fs.readFileSync(filePath, "utf-8").replace(/^\uFEFF/, "");
  const parsed: unknown = JSON.parse(text);

  if (Array.isArray(parsed)) {
    return parsed as EvalCaseFile[];
  }

  if (typeof parsed === "object" && parsed !== null) {
    const nested = (parsed as { cases?: unknown }).cases;
    if (Array.isArray(nested)) {
      return nested as EvalCaseFile[];
    }
  }

  throw new Error(`Unsupported eval dataset format: ${filePath}`);
}

/**
 * Builds a case id of the form `<prefix>-NNN` from a zero-based index.
 * The number is one-based and left-padded with zeros to at least three digits.
 *
 * @param prefix - Text placed before the dash.
 * @param index - Zero-based position of the case.
 * @returns For example `formatCaseId("RAW", 0)` gives `"RAW-001"`.
 */
function formatCaseId(prefix: string, index: number): string {
  const ordinal = String(index + 1);
  const padded = ordinal.padStart(3, "0");
  return `${prefix}-${padded}`;
}

/**
 * Splits a free-form block of text into individual questions.
 *
 * Separators are tried in order and the first one that yields more than one
 * non-empty piece wins: semicolons, then blank lines, then single line breaks.
 * Windows line endings are normalised first and every piece is trimmed.
 *
 * @param rawQuestions - Raw text holding one or more questions.
 * @returns The questions in input order; an empty list for blank input.
 */
function parseRawQuestions(rawQuestions: string): string[] {
  const normalized = rawQuestions.replace(/\r\n/g, "\n").trim();
  if (normalized === "") {
    return [];
  }

  const piecesBy = (separator: string | RegExp): string[] =>
    normalized
      .split(separator)
      .map((piece) => piece.trim())
      .filter((piece) => piece.length > 0);

  // Coarser separators take priority over plain line breaks.
  const prioritySeparators: Array<string | RegExp> = [";", /\n\s*\n+/];
  for (const separator of prioritySeparators) {
    const pieces = piecesBy(separator);
    if (pieces.length > 1) {
      return pieces;
    }
  }

  const lines = piecesBy("\n");
  return lines.length > 0 ? lines : [normalized];
}

/**
 * A fragment from any of the three v2-family normalized query schemas.
 * The accessor helpers below probe with `in` for fields that only some of
 * these fragment shapes declare.
 */
type V2FamilyFragment =
  | NormalizedQueryV2["fragments"][number]
  | NormalizedQueryV2_0_1["fragments"][number]
  | NormalizedQueryV2_0_2["fragments"][number];

/**
 * Returns the fragment's `execution_readiness`, or `"executable"` when the
 * fragment has no such property.
 *
 * @param fragment - Fragment from any v2-family schema.
 * @returns The readiness value carried by the fragment, or the default.
 */
function executionReadinessOf(fragment: V2FamilyFragment): string {
  if ("execution_readiness" in fragment) {
    return fragment.execution_readiness;
  }
  return "executable";
}

/**
 * Returns the fragment's `soft_assumption_used` list, or an empty list when
 * the fragment has no such property.
 *
 * @param fragment - Fragment from any v2-family schema.
 * @returns The list stored on the fragment (not copied), or `[]`.
 */
function softAssumptionsOf(fragment: V2FamilyFragment): string[] {
  if ("soft_assumption_used" in fragment) {
    return fragment.soft_assumption_used;
  }
  return [];
}

/**
 * Returns the fragment's `route_status`, or `null` when the fragment has no
 * such property.
 *
 * @param fragment - Fragment from any v2-family schema.
 * @returns The route status carried by the fragment, or `null`.
 */
function routeStatusOf(fragment: V2FamilyFragment): "routed" | "no_route" | null {
  if ("route_status" in fragment) {
    return fragment.route_status;
  }
  return null;
}

/**
 * Returns the fragment's `no_route_reason`, or `null` when the fragment has
 * no such property.
 *
 * @param fragment - Fragment from any v2-family schema.
 * @returns The reason carried by the fragment, or `null`.
 */
function noRouteReasonOf(fragment: V2FamilyFragment): NoRouteReason | null {
  if ("no_route_reason" in fragment) {
    return fragment.no_route_reason;
  }
  return null;
}

/**
 * Derives the expected "message is in scope" label for a case.
 *
 * Precedence:
 * 1. an explicit boolean `expected_scope_in_scope`;
 * 2. `expected_no_route_reason === "out_of_scope"` gives `false`;
 * 3. `expected_route_status === "routed"`, or any boolean
 *    `clarification_required` label, gives `true`;
 * 4. otherwise the case has no scope label.
 *
 * @param expected - Expectation labels of the case, or `null` when unlabeled.
 * @returns The expected scope flag, or `null` when it cannot be derived.
 */
function expectedScopeInScope(expected: EvalCaseFile["expected"] | null): boolean | null {
  if (!expected) {
    return null;
  }

  const explicitLabel = expected.expected_scope_in_scope;
  if (typeof explicitLabel === "boolean") {
    return explicitLabel;
  }

  if (expected.expected_no_route_reason === "out_of_scope") {
    return false;
  }

  const expectsRouting = expected.expected_route_status === "routed";
  const hasClarificationLabel = typeof expected.clarification_required === "boolean";
  return (expectsRouting || hasClarificationLabel) ? true : null;
}

/**
 * Checks that a route decision's fields do not contradict each other.
 *
 * - A `no_route` decision must carry a reason and must not claim an executable
 *   readiness ("executable" / "executable_with_soft_assumptions").
 * - Any other route must carry no reason and must not have a readiness of
 *   "needs_clarification" or "no_route".
 *
 * A missing readiness is treated as an empty string, which passes both
 * readiness checks.
 *
 * @param decision - Route decision taken from the route hint summary.
 * @returns `true` when the combination of fields is coherent.
 */
function isDecisionStateConsistent(decision: {
  route: string;
  execution_readiness?: string | null;
  no_route_reason?: NoRouteReason | null;
}): boolean {
  const readiness = String(decision.execution_readiness ?? "");
  const hasReason = Boolean(decision.no_route_reason ?? null);
  const isNoRoute = decision.route === "no_route";

  // A reason must be present exactly when the decision is a no_route.
  if (isNoRoute !== hasReason) {
    return false;
  }

  const contradictoryReadiness = isNoRoute
    ? ["executable", "executable_with_soft_assumptions"]
    : ["needs_clarification", "no_route"];
  return !contradictoryReadiness.includes(readiness);
}

// Suite file that `parseAssistantSuiteFile` loads when the caller passes no path.
const DEFAULT_ASSISTANT_STAGE1_SUITE_FILE = "assistant_stage1_canonical_v0_1.json";
// Schema version tags.
// NOTE(review): presumably stamped into the assistant run report and the
// baseline-vs-current comparison report — the usage is outside the portion of
// the file reviewed here.
const ASSISTANT_STAGE1_RUN_SCHEMA_VERSION = "assistant_stage1_eval_run_v0_1";
const ASSISTANT_STAGE1_COMPARISON_SCHEMA_VERSION = "assistant_stage1_eval_comparison_v0_1";

// Name of any metric in the assistant eval metric vector.
type AssistantMetricKey = keyof AssistantEvalMetricVector;

/**
 * Flattened per-case signal summary for an assistant eval case.
 * NOTE(review): the code that fills this in is outside the portion of the file
 * reviewed here; presumably the values come from the assistant's response
 * debug data — confirm the field meanings against the producer.
 */
interface AssistantCaseSignalSummary {
  broad_query_detected: boolean;
  broad_result_flag: boolean;
  // `null` when no recognised value is available (see `toNarrowingStrength`).
  narrowing_strength: "weak" | "medium" | "strong" | null;
  minimum_evidence_failed: boolean;
  // `null` when no recognised value is available (see `toDegradedTo`).
  degraded_to: "partial" | "clarification" | null;
  evidence_confidence: "high" | "medium" | "low" | null;
  limitation_reason_codes: string[];
  mechanism_status: "grounded" | "limited" | "unresolved" | null;
  source_refs: string[];
  routes: string[];
  followup_state_applied: boolean;
  uncertainty_limitations_count: number;
}

/**
 * Per-case diagnostics record for an assistant eval run: the suite case, the
 * session/trace identifiers, classification flags, individual scores and the
 * signal summary.
 * NOTE(review): the producer and the scoring scale are outside the portion of
 * the file reviewed here; `clampScore` defaults to a 0-5 range, which is
 * presumably the scale of the `*_score` fields — confirm.
 */
interface AssistantCaseDiagnostics {
  suite_case: AssistantStage1SuiteCase;
  session_id: string;
  // Nullable by type.
  trace_id: string | null;
  final_reply_type: string;
  turn_count: number;
  narrowing_result: AssistantEvalNarrowingResult;
  signature: string;
  is_generic: boolean;
  is_false_confident: boolean;
  // Nullable by type; presumably `null` when the broad-answer check does not apply — confirm.
  is_broad_answer: boolean | null;
  // Nullable by type; presumably `null` for cases without follow-up turns — confirm.
  followup_retention_score: number | null;
  evidence_quality_score: number;
  mechanism_specificity_score: number;
  genericness_score: number;
  accountant_actionability_score: number;
  accountant_usefulness_score: number;
  signals: AssistantCaseSignalSummary;
  limitations: string[];
  notes: string[];
}

/**
 * Rounds a number to two decimal places by formatting it with `toFixed(2)`
 * and parsing the result back.
 *
 * @param value - Number to round.
 * @returns The value with at most two fractional digits.
 */
function round2(value: number): number {
  const fixedText = value.toFixed(2);
  return Number(fixedText);
}

/**
 * Restricts a score to the inclusive range `[min, max]`.
 * `NaN` is mapped to `min`.
 *
 * @param value - Score to clamp.
 * @param min - Lower bound, 0 by default.
 * @param max - Upper bound, 5 by default.
 * @returns `value` when it lies inside the range, otherwise the nearer bound.
 */
function clampScore(value: number, min = 0, max = 5): number {
  if (Number.isNaN(value) || value < min) {
    return min;
  }
  return value > max ? max : value;
}

/**
 * Maps a raw metric value to a rubric band score of 0, 3 or 5.
 *
 * Thresholds by metric group:
 * - `retrieval_differentiation_rate` (higher is better): 5 from 0.75, 3 from 0.45.
 * - generic explanation / false confidence / broad answer rates (lower is
 *   better): 5 up to 0.25, 3 up to 0.45.
 * - actionability / mechanism specificity / follow-up retention scores
 *   (higher is better): 5 from 4, 3 from 2.5.
 *
 * Any other metric, and any value that fails every comparison (such as `NaN`),
 * scores 0.
 *
 * @param metric - Metric being scored.
 * @param value - Raw metric value.
 * @returns The band score.
 */
function rateToBandScore(metric: AssistantMetricKey, value: number): 0 | 3 | 5 {
  const band = (reachesTop: boolean, reachesMiddle: boolean): 0 | 3 | 5 => {
    if (reachesTop) return 5;
    if (reachesMiddle) return 3;
    return 0;
  };

  switch (metric) {
    case "retrieval_differentiation_rate":
      return band(value >= 0.75, value >= 0.45);
    case "generic_explanation_rate":
    case "false_confidence_rate":
    case "broad_answer_rate":
      return band(value <= 0.25, value <= 0.45);
    case "accountant_actionability_score":
    case "mechanism_specificity_score":
    case "followup_context_retention_score":
      return band(value >= 4, value >= 2.5);
    default:
      return 0;
  }
}

/**
 * Looks up the rubric band that corresponds to a raw metric value.
 *
 * The value is first converted to a band score with `rateToBandScore`; the
 * first entry of the metric's rubric carrying that score is returned.
 *
 * @param metric - Metric whose rubric is consulted.
 * @param value - Raw metric value, or `null` when the metric was not measured.
 * @returns The matching rubric band, or `null` when `value` is `null` or the
 *   rubric has no entry for the computed score.
 */
function rubricBandForMetric(metric: AssistantMetricKey, value: number | null): AccountantMetricRubricBand | null {
  if (value === null) {
    return null;
  }
  const bandScore = rateToBandScore(metric, value);
  const matchingBand = ACCOUNTANT_SCORING_RUBRIC_V01[metric].find((band) => band.score === bandScore);
  return matchingBand ?? null;
}

/**
 * Captures the feature-flag state relevant to an assistant eval run.
 *
 * - `FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1` is taken from the config constant.
 * - `FEATURE_ASSISTANT_ANSWER_POLICY_V11` prefers the environment variable and
 *   falls back to the stringified config constant.
 * - The remaining flags are read from the environment only and are `null`
 *   when unset.
 *
 * @returns A plain object keyed by flag name.
 */
function buildFeatureProfileSnapshot(): Record<string, unknown> {
  const envOrNull = (name: string): string | null => process.env[name] ?? null;

  const snapshot: Record<string, unknown> = {};
  snapshot.FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1 = FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1;
  snapshot.FEATURE_ASSISTANT_ANSWER_POLICY_V11 =
    process.env.FEATURE_ASSISTANT_ANSWER_POLICY_V11 ?? String(FEATURE_ASSISTANT_ANSWER_POLICY_V11);

  const environmentOnlyFlags = [
    "FEATURE_ASSISTANT_BROAD_GUARD_V1",
    "FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1",
    "FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1",
    "FEATURE_ASSISTANT_INVESTIGATION_STATE_V1",
    "FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1"
  ];
  for (const flagName of environmentOnlyFlags) {
    snapshot[flagName] = envOrNull(flagName);
  }

  return snapshot;
}

/**
 * Reads code-version identifiers from the environment.
 *
 * `git_commit` is the first variable that is set among `GIT_COMMIT`,
 * `CI_COMMIT_SHA`, `VERCEL_GIT_COMMIT_SHA` and `GITHUB_SHA`; `build_marker`
 * is the first among `BUILD_MARKER`, `BUILD_ID` and `npm_package_version`.
 * An empty string counts as set.
 *
 * @returns Both markers, each `null` when none of its variables is set.
 */
function buildCodeVersionMarker(): Record<string, string | null> {
  const firstSet = (variableNames: string[]): string | null => {
    for (const variableName of variableNames) {
      const candidate = process.env[variableName];
      if (candidate !== undefined && candidate !== null) {
        return candidate;
      }
    }
    return null;
  };

  return {
    git_commit: firstSet(["GIT_COMMIT", "CI_COMMIT_SHA", "VERCEL_GIT_COMMIT_SHA", "GITHUB_SHA"]),
    build_marker: firstSet(["BUILD_MARKER", "BUILD_ID", "npm_package_version"])
  };
}

/**
 * Resolves a possibly relative path to a location on disk.
 *
 * Absolute paths are returned unchanged. A relative path is tried against
 * `REPORTS_DIR`, `EVAL_DATASETS_DIR`, `EVAL_CASES_DIR` and finally the current
 * working directory; the first candidate that exists wins. When none exists
 * the `REPORTS_DIR` candidate is returned, so a later read fails on that path.
 *
 * @param inputPath - Absolute or relative path.
 * @returns The resolved path, which is not guaranteed to exist.
 */
function resolveReadablePath(inputPath: string): string {
  if (path.isAbsolute(inputPath)) {
    return inputPath;
  }

  const searchRoots = [REPORTS_DIR, EVAL_DATASETS_DIR, EVAL_CASES_DIR];
  const candidates = searchRoots.map((root) => path.resolve(root, inputPath));
  candidates.push(path.resolve(inputPath));

  const firstExisting = candidates.find((candidate) => fs.existsSync(candidate));
  return firstExisting ?? candidates[0];
}

/**
 * Loads and validates an assistant stage-1 suite file.
 *
 * The path is resolved with `resolveReadablePath`; when no path is given the
 * default suite file name is used. A leading UTF-8 BOM is stripped before
 * parsing. Checks run in this order and the first failure throws:
 * the root is an object, `cases` is an array, `case_ids` is an array,
 * `suite_id` and `suite_version` are non-blank strings, `scenario_count`
 * equals the number of cases, `case_ids` holds exactly the ids of `cases`
 * (order-insensitive), and every case has at least one turn.
 *
 * @param inputPath - Optional suite file path.
 * @returns The parsed suite.
 * @throws Error describing the first failed check; file-system and JSON
 *   syntax errors propagate unchanged.
 */
function parseAssistantSuiteFile(inputPath?: string): AssistantStage1SuiteFile {
  const filePath = resolveReadablePath(inputPath ?? DEFAULT_ASSISTANT_STAGE1_SUITE_FILE);
  const text = fs.readFileSync(filePath, "utf-8").replace(/^\uFEFF/, "");
  const suite = JSON.parse(text) as AssistantStage1SuiteFile;

  const isNonBlankString = (candidate: unknown): boolean => typeof candidate === "string" && candidate.trim() !== "";

  if (!suite || typeof suite !== "object") {
    throw new Error(`Invalid assistant suite format: ${filePath}`);
  }
  if (!Array.isArray(suite.cases)) {
    throw new Error(`Assistant suite cases[] is required: ${filePath}`);
  }
  if (!Array.isArray(suite.case_ids)) {
    throw new Error(`Assistant suite case_ids[] is required: ${filePath}`);
  }
  if (!isNonBlankString(suite.suite_id)) {
    throw new Error(`Assistant suite suite_id is required: ${filePath}`);
  }
  if (!isNonBlankString(suite.suite_version)) {
    throw new Error(`Assistant suite suite_version is required: ${filePath}`);
  }
  if (suite.scenario_count !== suite.cases.length) {
    throw new Error(`Assistant suite scenario_count mismatch: ${filePath}`);
  }

  // Compare the declared and actual ids as sorted lists so order is irrelevant.
  const declaredIds = Array.from(suite.case_ids).sort();
  const actualIds = suite.cases.map((suiteCase) => suiteCase.case_id).sort();
  let idsMatch = declaredIds.length === actualIds.length;
  for (let position = 0; idsMatch && position < declaredIds.length; position += 1) {
    idsMatch = declaredIds[position] === actualIds[position];
  }
  if (!idsMatch) {
    throw new Error(`Assistant suite case_ids do not match cases[]: ${filePath}`);
  }

  for (const suiteCase of suite.cases) {
    const hasTurns = Array.isArray(suiteCase.turns) && suiteCase.turns.length > 0;
    if (!hasTurns) {
      throw new Error(`Assistant suite case ${suiteCase.case_id} must include at least one turn.`);
    }
  }

  return suite;
}

/**
 * Heuristic check that a text is anchored in the accounting domain.
 *
 * Three kinds of anchor are looked for: a period (a 20xx year, optionally
 * followed by a month), an accounting-object keyword (Russian or English),
 * and a two-digit code from a fixed list. The text qualifies when at least
 * two kinds are present.
 *
 * @param text - Text to inspect; nullish input is treated as empty.
 * @returns `true` when two or more anchor kinds are found.
 */
function hasDomainAnchors(text: string): boolean {
  const source = String(text ?? "");
  if (source.trim() === "") {
    return false;
  }

  const anchorPatterns: RegExp[] = [
    // Period: 20xx, optionally with -MM, .MM or /MM.
    /\b20\d{2}(?:[-./](?:0[1-9]|1[0-2]))?\b/,
    // Accounting-object keyword, case-insensitive.
    /(счет|контрагент|документ|ндс|ос|period|account|supplier|invoice|guid|объект)/i,
    // Stand-alone two-digit code from the fixed list.
    /\b(?:01|02|03|04|08|10|19|20|25|26|41|43|44|50|51|52|57|60|62|68|69|70|71|73|76|90|91|94|97)\b/
  ];

  let anchorKindsFound = 0;
  for (const pattern of anchorPatterns) {
    if (pattern.test(source)) {
      anchorKindsFound += 1;
    }
  }
  return anchorKindsFound >= 2;
}

/**
 * Extracts the non-empty strings from an arbitrary value.
 *
 * Non-array input yields an empty list. Within an array, non-string entries
 * are dropped, strings are trimmed, and strings that are empty after trimming
 * are dropped too.
 *
 * @param value - Value expected to be an array of strings.
 * @returns Trimmed, non-empty strings in their original order.
 */
function extractTextList(value: unknown): string[] {
  if (!Array.isArray(value)) {
    return [];
  }

  const texts: string[] = [];
  for (const entry of value) {
    if (typeof entry !== "string") {
      continue;
    }
    const trimmed = entry.trim();
    if (trimmed !== "") {
      texts.push(trimmed);
    }
  }
  return texts;
}

/**
 * Validates an untyped value as a narrowing strength.
 *
 * @param value - Arbitrary value.
 * @returns The value itself when it is exactly "weak", "medium" or "strong";
 *   otherwise `null`.
 */
function toNarrowingStrength(value: unknown): "weak" | "medium" | "strong" | null {
  const knownLevels = ["weak", "medium", "strong"] as const;
  const matched = knownLevels.find((level) => level === value);
  return matched ?? null;
}

/**
 * Validates an untyped value as a degradation target.
 *
 * @param value - Arbitrary value.
 * @returns The value itself when it is exactly "partial" or "clarification";
 *   otherwise `null`.
 */
function toDegradedTo(value: unknown): "partial" | "clarification" | null {
  const knownTargets = ["partial", "clarification"] as const;
  const matched = knownTargets.find((target) => target === value);
  return matched ?? null;
}

/**
 * Renders an assistant stage-1 eval run report as Markdown.
 *
 * Sections, in order: run header, raw metrics with their rubric bands, subset
 * sizes, scenario summary and improvement hints. Metrics are read from
 * `report.metrics.raw`; a null/undefined metric or a missing band is printed
 * as "n/a", and an empty metric table gets a single "n/a" row. Missing header
 * fields print as empty strings and missing counters as 0.
 *
 * @param report - Loosely typed report; only the keys read below are used.
 * @returns The Markdown text, lines joined with "\n".
 */
function buildAssistantEvalMarkdownReport(report: Record<string, unknown>): string {
  const metricsHolder = (report.metrics ?? {}) as { raw?: Record<string, number | null> };
  const rawMetrics = metricsHolder.raw ?? {};
  const bands = (report.rubric_bands ?? {}) as Record<string, { score?: number; label?: string } | null>;
  const subsets = (report.subsets ?? {}) as Record<string, unknown>;
  const scenarioSummary = (report.scenario_summary ?? {}) as Record<string, unknown>;
  const improvementHints = (report.improvement_hints ?? {}) as Record<string, unknown>;

  // Bullet line "- label: value" with a fallback for null/undefined values.
  const bullet = (label: string, value: unknown, fallback: unknown): string => `- ${label}: ${String(value ?? fallback)}`;

  const metricLines: string[] = [];
  for (const name of Object.keys(rawMetrics)) {
    const rawValue = rawMetrics[name];
    const band = bands[name];
    const rawCell = rawValue === null || rawValue === undefined ? "n/a" : String(rawValue);
    const bandCell = band ? `${String(band.score)} (${String(band.label)})` : "n/a";
    metricLines.push(`| ${name} | ${rawCell} | ${bandCell} |`);
  }
  const metricTable = metricLines.join("\n");

  const lines: string[] = [];
  lines.push(`# ${String(report.report_title ?? "Assistant Stage 1 Eval Run")}`, "");
  lines.push(
    bullet("run_id", report.run_id, ""),
    bullet("eval_target", report.eval_target, ""),
    bullet("run_timestamp", report.run_timestamp, ""),
    bullet("suite_id", report.suite_id, ""),
    bullet("suite_version", report.suite_version, ""),
    bullet("cases_total", report.cases_total, 0),
    ""
  );
  lines.push(
    "## Raw Metrics and Rubric Bands",
    "",
    "| Metric | Raw | Rubric band |",
    "|---|---:|---|",
    metricTable || "| n/a | n/a | n/a |",
    ""
  );
  lines.push(
    "## Subsets",
    "",
    bullet("broad_cases_total", subsets.broad_cases_total, 0),
    bullet("followup_cases_total", subsets.followup_cases_total, 0),
    ""
  );
  lines.push(
    "## Scenario Summary",
    "",
    bullet("improved_or_strong", scenarioSummary.improved_or_strong, 0),
    bullet("unchanged_or_mixed", scenarioSummary.unchanged_or_mixed, 0),
    bullet("weak_or_regressed", scenarioSummary.weak_or_regressed, 0),
    ""
  );
  lines.push(
    "## Improvement Hints",
    "",
    bullet("strongest_signals", improvementHints.strongest_signals, "n/a"),
    bullet("weakest_signals", improvementHints.weakest_signals, "n/a"),
    ""
  );

  return lines.join("\n");
}

/**
 * Renders a baseline-vs-current assistant eval comparison as Markdown.
 *
 * Sections, in order: comparison header, per-metric delta table and the
 * scenario notes summary. A null/undefined cell in the delta table is printed
 * as "n/a", and an empty table gets a single "n/a" row. Missing header fields
 * print as empty strings and missing counters as 0.
 *
 * @param report - Loosely typed comparison report; only the keys read below are used.
 * @returns The Markdown text, lines joined with "\n".
 */
function buildAssistantComparisonMarkdownReport(report: Record<string, unknown>): string {
  const metricDeltas = (report.metric_deltas ?? {}) as Record<
    string,
    { baseline?: number | null; current?: number | null; delta?: number | null; trend?: string }
  >;
  const notesSummary = (report.scenario_notes_summary ?? {}) as Record<string, unknown>;

  const cell = (value: unknown): string => String(value ?? "n/a");
  const bullet = (label: string, value: unknown, fallback: unknown): string => `- ${label}: ${String(value ?? fallback)}`;

  const deltaLines: string[] = [];
  for (const name of Object.keys(metricDeltas)) {
    const entry = metricDeltas[name];
    const cells = [name, cell(entry.baseline), cell(entry.current), cell(entry.delta), cell(entry.trend)];
    deltaLines.push(`| ${cells.join(" | ")} |`);
  }
  const deltaTable = deltaLines.join("\n");

  const lines: string[] = [];
  lines.push(`# ${String(report.report_title ?? "Assistant Stage 1 Baseline vs Current")}`, "");
  lines.push(
    bullet("comparison_id", report.comparison_id, ""),
    bullet("baseline_run_id", report.baseline_run_id, ""),
    bullet("current_run_id", report.current_run_id, ""),
    bullet("suite_version", report.suite_version, ""),
    ""
  );
  lines.push(
    "## Metric Deltas",
    "",
    "| Metric | Baseline | Current | Delta | Trend |",
    "|---|---:|---:|---:|---|",
    deltaTable || "| n/a | n/a | n/a | n/a | n/a |",
    ""
  );
  lines.push(
    "## Scenario Notes Summary",
    "",
    bullet("improved", notesSummary.improved, 0),
    bullet("unchanged", notesSummary.unchanged, 0),
    bullet("weakened", notesSummary.weakened, 0),
    ""
  );

  return lines.join("\n");
}

export class EvalService {
  /** @param normalizerService - Normalizer that `runV2` calls once per eval case. */
  constructor(private readonly normalizerService: NormalizerService) {}

  /**
   * Loads every eval case stored as a JSON file in `EVAL_CASES_DIR`.
   *
   * `ensureDir` is called on the directory first (presumably creating it when
   * missing — the helper is defined elsewhere). Files ending in `.report.json`
   * are skipped. A leading UTF-8 BOM is stripped before parsing, matching the
   * other JSON loaders in this file; without that, `JSON.parse` rejects files
   * saved by BOM-writing editors.
   *
   * @returns The parsed cases sorted by `case_id`. Entries are not validated.
   * @throws File-system and JSON syntax errors propagate unchanged.
   */
  public listCases(): EvalCaseFile[] {
    ensureDir(EVAL_CASES_DIR);
    const files = fs
      .readdirSync(EVAL_CASES_DIR)
      .filter((item) => item.endsWith(".json") && !item.endsWith(".report.json"));
    return files
      .map((name) => {
        // A BOM is not valid JSON whitespace, so remove it before parsing.
        const raw = fs.readFileSync(path.resolve(EVAL_CASES_DIR, name), "utf-8").replace(/^\uFEFF/, "");
        return JSON.parse(raw) as EvalCaseFile;
      })
      .sort((a, b) => a.case_id.localeCompare(b.case_id));
  }

  private async runV2(payload: {
    normalizeConfig: Omit<NormalizeRequestPayload, "userQuestion" | "context">;
    useMock?: boolean;
    mode: EvalRunMode;
    caseSetFile?: string;
    rawQuestions?: string;
    cases: EvalInputCase[];
  }): Promise<Record<string, unknown>> {
    const runId = `eval-${nanoid(10)}`;
    const results: Array<Record<string, unknown>> = [];
    const routeCounter: Record<string, number> = {};
    const fallbackCounter: Record<string, number> = {};

    let schemaPass = 0;
    let inScopeMessages = 0;
    let multiIntentMessages = 0;
    let clarificationMessages = 0;
    let totalFragments = 0;
    let inScopeFragments = 0;
    let outOfScopeFragments = 0;
    let unclearFragments = 0;
    let executableWithSoftAssumptionsFragments = 0;
    let softAssumptionFragments = 0;
    let routedFragments = 0;
    let noRouteFragments = 0;
    let requestsTotal = 0;
    let retriesUsed = 0;
    let clarificationLabeledCases = 0;
    let clarificationTruePositive = 0;
    let clarificationFalsePositive = 0;
    let clarificationFalseNegative = 0;
    let scopeLabeledCases = 0;
    let scopeCorrectCases = 0;
    let routeLabeledCases = 0;
    let routeCorrectCases = 0;
    let expectedRoutedCases = 0;
    let noRouteTruePositive = 0;
    let noRouteFalsePositive = 0;
    let stateConsistencyChecks = 0;
    let stateConsistencyPass = 0;

    for (const item of payload.cases) {
      const response = await this.normalizerService.normalize({
        ...payload.normalizeConfig,
        userQuestion: item.raw_question,
        context: {
          eval_label: runId,
          case_id: item.case_id,
          eval_mode: payload.mode
        },
        retryPolicy: payload.mode === "single-pass-strict" ? "single-pass-strict" : "default",
        useMock: payload.useMock
      });

      if (response.validation.passed) {
        schemaPass += 1;
      }

      const requestCount = Number(response.request_count_for_case ?? 0);
      requestsTotal += requestCount;
      if (requestCount > 1) {
        retriesUsed += 1;
      }

      const normalized =
        response.normalized &&
        ["normalized_query_v2", "normalized_query_v2_0_1", "normalized_query_v2_0_2"].includes(
          String((response.normalized as { schema_version?: string }).schema_version ?? "")
        )
          ? (response.normalized as NormalizedQueryV2 | NormalizedQueryV2_0_1 | NormalizedQueryV2_0_2)
          : null;

      const routeSummary =
        response.route_hint_summary &&
        (response.route_hint_summary as { mode?: string }).mode === "deterministic_v2"
          ? (response.route_hint_summary as {
              decisions: Array<{
                route: string;
                domain_relevance?: string;
                execution_readiness?: string | null;
                route_status?: "routed" | "no_route" | null;
                no_route_reason?: NoRouteReason | null;
              }>;
              fallback?: { type?: string };
            })
          : null;

      if (normalized) {
        if (normalized.message_in_scope) {
          inScopeMessages += 1;
        }
        if (normalized.contains_multiple_tasks) {
          multiIntentMessages += 1;
        }
        if (normalized.global_notes.needs_clarification) {
          clarificationMessages += 1;
        }

        totalFragments += normalized.fragments.length;
        const inScopeList = normalized.fragments.filter((fragment) => fragment.domain_relevance === "in_scope");
        inScopeFragments += inScopeList.length;
        outOfScopeFragments += normalized.fragments.filter((fragment) => fragment.domain_relevance === "out_of_scope").length;
        unclearFragments += normalized.fragments.filter((fragment) => fragment.domain_relevance === "unclear").length;
        for (const fragment of inScopeList as V2FamilyFragment[]) {
          const readiness = executionReadinessOf(fragment);
          if (readiness === "executable_with_soft_assumptions") {
            executableWithSoftAssumptionsFragments += 1;
          }
          if (softAssumptionsOf(fragment).length > 0) {
            softAssumptionFragments += 1;
          }
        }
      }

      const predictedClarification = Boolean(normalized?.global_notes?.needs_clarification);
      const expectedClarification = typeof item.expected?.clarification_required === "boolean" ? item.expected.clarification_required : null;
      if (expectedClarification !== null) {
        clarificationLabeledCases += 1;
        if (predictedClarification && expectedClarification) clarificationTruePositive += 1;
        if (predictedClarification && !expectedClarification) clarificationFalsePositive += 1;
        if (!predictedClarification && expectedClarification) clarificationFalseNegative += 1;
      }

      const predictedScope = normalized ? normalized.message_in_scope : null;
      const expectedScope = expectedScopeInScope(item.expected);
      if (expectedScope !== null && predictedScope !== null) {
        scopeLabeledCases += 1;
        if (predictedScope === expectedScope) {
          scopeCorrectCases += 1;
        }
      }

      const predictedRouteStatus: "routed" | "no_route" | null = routeSummary
        ? routeSummary.decisions.some((decision) => decision.route !== "no_route")
          ? "routed"
          : "no_route"
        : null;
      const predictedNoRouteReason: NoRouteReason | null =
        routeSummary &&
        routeSummary.decisions.length > 0 &&
        routeSummary.decisions.every((decision) => decision.route === "no_route")
          ? (routeSummary.decisions[0]?.no_route_reason ?? null)
          : null;
      const expectedRouteStatus = item.expected?.expected_route_status ?? null;
      const expectedNoRouteReason = item.expected?.expected_no_route_reason ?? null;
      if (expectedRouteStatus) {
        routeLabeledCases += 1;
        if (predictedRouteStatus === expectedRouteStatus) {
          routeCorrectCases += 1;
        }
        if (expectedRouteStatus === "routed") {
          expectedRoutedCases += 1;
        }
      }
      if (predictedRouteStatus === "no_route") {
        if (expectedRouteStatus === "no_route") {
          if (!expectedNoRouteReason || expectedNoRouteReason === predictedNoRouteReason) {
            noRouteTruePositive += 1;
          } else {
            noRouteFalsePositive += 1;
          }
        } else if (expectedRouteStatus === "routed") {
          noRouteFalsePositive += 1;
        }
      }

      if (routeSummary) {
        for (const decision of routeSummary.decisions) {
          stateConsistencyChecks += 1;
          if (isDecisionStateConsistent(decision)) {
            stateConsistencyPass += 1;
          }
          routeCounter[decision.route] = (routeCounter[decision.route] ?? 0) + 1;
          if (decision.route === "no_route") {
            noRouteFragments += 1;
          } else {
            routedFragments += 1;
          }
        }
        const fallbackType = String(routeSummary.fallback?.type ?? "none");
        fallbackCounter[fallbackType] = (fallbackCounter[fallbackType] ?? 0) + 1;
      } else {
        fallbackCounter.none = (fallbackCounter.none ?? 0) + 1;
      }

      results.push({
        case_id: item.case_id,
        raw_question: item.raw_question,
        validation_passed: response.validation.passed,
        message_in_scope: normalized?.message_in_scope ?? null,
        scope_confidence: normalized?.scope_confidence ?? null,
        contains_multiple_tasks: normalized?.contains_multiple_tasks ?? null,
        fragments_total: normalized?.fragments.length ?? 0,
        in_scope_fragments: normalized ? normalized.fragments.filter((fragment) => fragment.domain_relevance === "in_scope").length : 0,
        out_of_scope_fragments: normalized
          ? normalized.fragments.filter((fragment) => fragment.domain_relevance === "out_of_scope").length
          : 0,
        unclear_fragments: normalized ? normalized.fragments.filter((fragment) => fragment.domain_relevance === "unclear").length : 0,
        fallback_type: routeSummary?.fallback?.type ?? "none",
        predicted_route_status: predictedRouteStatus,
        expected_route_status: expectedRouteStatus,
        predicted_no_route_reason: predictedNoRouteReason,
        expected_no_route_reason: expectedNoRouteReason,
        predicted_clarification_required: predictedClarification,
        expected_clarification_required: expectedClarification,
        executable_with_soft_assumptions_fragments: normalized
          ? normalized.fragments.filter((fragment) => executionReadinessOf(fragment as V2FamilyFragment) === "executable_with_soft_assumptions")
              .length
          : 0,
        trace_id: response.trace_id,
        request_count_for_case: requestCount
      });
    }

    const total = Math.max(1, payload.cases.length);
    const totalFragmentsSafe = Math.max(1, totalFragments);
    const totalRoutedDecisions = Math.max(1, routedFragments + noRouteFragments);
    const precisionDenominator = clarificationTruePositive + clarificationFalsePositive;
    const recallDenominator = clarificationTruePositive + clarificationFalseNegative;
    const noRoutePrecisionDenominator = noRouteTruePositive + noRouteFalsePositive;

    const metrics = {
      schema_validation_pass_rate: Number(((schemaPass / total) * 100).toFixed(2)),
      scope_detection_accuracy:
        scopeLabeledCases > 0 ? Number(((scopeCorrectCases / scopeLabeledCases) * 100).toFixed(2)) : null,
      scope_in_scope_rate: Number(((inScopeMessages / total) * 100).toFixed(2)),
      multi_intent_detected_rate: Number(((multiIntentMessages / total) * 100).toFixed(2)),
      clarification_required_rate: Number(((clarificationMessages / total) * 100).toFixed(2)),
      avg_fragments_per_message: Number((totalFragments / total).toFixed(2)),
      out_of_scope_fragment_rate: Number(((outOfScopeFragments / totalFragmentsSafe) * 100).toFixed(2)),
      routed_fragment_rate: Number(((routedFragments / totalRoutedDecisions) * 100).toFixed(2)),
      no_route_fragment_rate: Number(((noRouteFragments / totalRoutedDecisions) * 100).toFixed(2)),
      route_resolution_accuracy:
        routeLabeledCases > 0 ? Number(((routeCorrectCases / routeLabeledCases) * 100).toFixed(2)) : null,
      no_route_precision:
        noRoutePrecisionDenominator > 0 ? Number(((noRouteTruePositive / noRoutePrecisionDenominator) * 100).toFixed(2)) : null,
      false_no_route_rate:
        expectedRoutedCases > 0 ? Number(((noRouteFalsePositive / expectedRoutedCases) * 100).toFixed(2)) : null,
      execution_state_consistency_rate:
        stateConsistencyChecks > 0 ? Number(((stateConsistencyPass / stateConsistencyChecks) * 100).toFixed(2)) : null,
      executable_with_soft_assumptions_rate: Number(((executableWithSoftAssumptionsFragments / Math.max(1, inScopeFragments)) * 100).toFixed(2)),
      soft_assumption_used_fragment_rate: Number(((softAssumptionFragments / Math.max(1, inScopeFragments)) * 100).toFixed(2)),
      clarification_precision:
        precisionDenominator > 0 ? Number(((clarificationTruePositive / precisionDenominator) * 100).toFixed(2)) : null,
      clarification_recall:
        recallDenominator > 0 ? Number(((clarificationTruePositive / recallDenominator) * 100).toFixed(2)) : null,
      false_clarification_rate:
        clarificationLabeledCases > 0 ? Number(((clarificationFalsePositive / clarificationLabeledCases) * 100).toFixed(2)) : null
    };

    const report = {
      run_id: runId,
      timestamp: new Date().toISOString(),
      mode: payload.mode,
      use_mock: Boolean(payload.useMock),
      prompt_version: payload.normalizeConfig.promptVersion ?? null,
      schema_version: String(payload.normalizeConfig.schemaVersion ?? payload.normalizeConfig.promptVersion ?? "")
        .toLowerCase()
        .includes("v2_0_2")
        ? "v2_0_2"
        : String(payload.normalizeConfig.schemaVersion ?? payload.normalizeConfig.promptVersion ?? "")
              .toLowerCase()
              .includes("v2_0_1")
          ? "v2_0_1"
          : "v2",
      dataset: {
        source: payload.rawQuestions ? "inline_raw_questions" : payload.caseSetFile ? "file" : "data/eval_cases/*.json",
        file: payload.caseSetFile ?? null,
        raw_questions_count: payload.rawQuestions ? parseRawQuestions(payload.rawQuestions).length : null
      },
      cases_total: payload.cases.length,
      metrics,
      budget: {
        requests_total: requestsTotal,
        retries_used: retriesUsed
      },
      clarification_eval: {
        labeled_cases: clarificationLabeledCases,
        true_positive: clarificationTruePositive,
        false_positive: clarificationFalsePositive,
        false_negative: clarificationFalseNegative
      },
      route_eval: {
        labeled_cases: routeLabeledCases,
        correct_cases: routeCorrectCases,
        expected_routed_cases: expectedRoutedCases,
        no_route_true_positive: noRouteTruePositive,
        no_route_false_positive: noRouteFalsePositive
      },
      scope_eval: {
        labeled_cases: scopeLabeledCases,
        correct_cases: scopeCorrectCases
      },
      execution_state_eval: {
        checks_total: stateConsistencyChecks,
        checks_passed: stateConsistencyPass
      },
      route_distribution: routeCounter,
      fallback_distribution: fallbackCounter,
      results
    };

    ensureDir(EVAL_CASES_DIR);
    writeJsonFile(path.resolve(EVAL_CASES_DIR, `${runId}.report.json`), report);

    return report;
  }

  /**
   * Condenses the debug payload of an assistant conversation into the flat signal summary used for scoring.
   *
   * Retrieval-level signals (routes, broadness flags, degradation, narrowing strength, confidence,
   * source refs, limitation codes) and the answer-structure fields are read from `finalResponse.debug`
   * only; `turnResponses` is consulted solely to detect whether follow-up state was applied on any turn.
   *
   * @param finalResponse Response whose debug payload is inspected.
   * @param turnResponses Responses checked for `followup_state_usage.applied`.
   * @returns The aggregated signals; missing debug data yields empty lists, `false` flags and `null` values.
   */
  private collectAssistantSignals(finalResponse: AssistantMessageResponsePayload, turnResponses: AssistantMessageResponsePayload[]): AssistantCaseSignalSummary {
    const debug = finalResponse.debug;
    const retrievalResults = Array.isArray(debug?.retrieval_results) ? debug.retrieval_results : [];

    const narrowingRank: Record<"weak" | "medium" | "strong", number> = { weak: 0, medium: 1, strong: 2 };
    const confidencePoints = new Map<unknown, number>([
      ["high", 3],
      ["medium", 2],
      ["low", 1]
    ]);

    const routes = new Set<string>();
    const sourceRefs = new Set<string>();
    const limitationCodes = new Set<string>();

    // Running totals for the mean confidence; unrecognised confidence labels are ignored.
    let confidenceSum = 0;
    let confidenceCount = 0;
    const tallyConfidence = (level: unknown): void => {
      const points = confidencePoints.get(level);
      if (points !== undefined) {
        confidenceSum += points;
        confidenceCount += 1;
      }
    };

    let broadQueryDetected = false;
    let broadResultFlag = false;
    let minimumEvidenceFailed = false;
    let degradedTo: "partial" | "clarification" | null = null;
    let narrowingStrength: "weak" | "medium" | "strong" | null = null;

    for (const retrieval of retrievalResults) {
      routes.add(String(retrieval.route ?? "unknown"));

      const summary = retrieval.summary ?? {};
      broadQueryDetected = broadQueryDetected || summary.broad_query_detected === true;
      broadResultFlag = broadResultFlag || summary.broad_result_flag === true;
      minimumEvidenceFailed = minimumEvidenceFailed || summary.minimum_evidence_failed === true;

      // "clarification" always wins; "partial" is only recorded while nothing has been recorded yet.
      const degradation = toDegradedTo(summary.degraded_to);
      if (degradation === "clarification") {
        degradedTo = "clarification";
      } else if (degradation === "partial" && !degradedTo) {
        degradedTo = "partial";
      }

      // Keep the lowest-ranked narrowing strength reported by any retrieval result.
      const strength = toNarrowingStrength(summary.narrowing_strength);
      if (strength) {
        if (narrowingStrength === null || narrowingRank[strength] < narrowingRank[narrowingStrength]) {
          narrowingStrength = strength;
        }
      }

      tallyConfidence(retrieval.confidence);

      const evidenceItems = Array.isArray(retrieval.evidence) ? retrieval.evidence : [];
      for (const evidence of evidenceItems) {
        const canonicalRef = String(evidence.source_ref?.canonical_ref ?? "").trim();
        if (canonicalRef) {
          sourceRefs.add(canonicalRef);
        }

        const reasonCode = String(evidence.limitation?.reason_code ?? "").trim();
        if (reasonCode) {
          limitationCodes.add(reasonCode);
        }

        tallyConfidence(evidence.confidence);
      }
    }

    // Map the mean score (1..3) back to a label; no scored item at all means no label.
    let evidenceConfidence: "high" | "medium" | "low" | null = null;
    if (confidenceCount > 0) {
      const meanConfidence = confidenceSum / confidenceCount;
      if (meanConfidence >= 2.6) {
        evidenceConfidence = "high";
      } else if (meanConfidence >= 1.8) {
        evidenceConfidence = "medium";
      } else {
        evidenceConfidence = "low";
      }
    }

    // Only the three known mechanism statuses are passed through; anything else becomes null.
    const reportedStatus = debug?.answer_structure_v11?.mechanism_block?.status;
    const mechanismStatus =
      reportedStatus === "grounded" || reportedStatus === "limited" || reportedStatus === "unresolved"
        ? reportedStatus
        : null;

    const uncertaintyLimitationsCount = debug?.answer_structure_v11?.uncertainty_block?.limitations?.length ?? 0;
    const followupStateApplied = turnResponses.some((turn) => turn.debug?.followup_state_usage?.applied === true);

    return {
      broad_query_detected: broadQueryDetected,
      broad_result_flag: broadResultFlag,
      narrowing_strength: narrowingStrength,
      minimum_evidence_failed: minimumEvidenceFailed,
      degraded_to: degradedTo,
      evidence_confidence: evidenceConfidence,
      limitation_reason_codes: Array.from(limitationCodes),
      mechanism_status: mechanismStatus,
      source_refs: Array.from(sourceRefs),
      routes: Array.from(routes),
      followup_state_applied: followupStateApplied,
      uncertainty_limitations_count: uncertaintyLimitationsCount
    };
  }

  /**
   * Aggregates per-case diagnostics into the suite-level metric vector and its rubric bands.
   *
   * Rate metrics divide by the number of cases (at least 1 to avoid division by zero);
   * `broad_answer_rate` and `followup_context_retention_score` only consider the cases for which
   * the corresponding per-case value is not `null`, and are `null` when no such case exists.
   *
   * @param input.diagnostics Per-case diagnostics of the run.
   * @returns Raw metrics, rubric bands per metric, the denominators used, and the hit count per signature.
   */
  private computeAssistantMetrics(input: {
    diagnostics: AssistantCaseDiagnostics[];
  }): {
    raw: AssistantEvalMetricVector;
    rubric_bands: Record<AssistantMetricKey, AccountantMetricRubricBand | null>;
    denominators: Record<string, number>;
    signature_counts: Record<string, number>;
  } {
    const { diagnostics } = input;
    const caseCount = diagnostics.length;
    const rateDenominator = Math.max(1, caseCount);

    const signatureCounts: Record<string, number> = {};
    let genericTotal = 0;
    let falseConfidentTotal = 0;
    let broadEligibleTotal = 0;
    let broadAnswerTotal = 0;
    let followupTotal = 0;
    let followupScoreSum = 0;
    let actionabilitySum = 0;
    let mechanismSum = 0;

    // Single pass over the diagnostics; sums are accumulated in case order.
    for (const entry of diagnostics) {
      signatureCounts[entry.signature] = (signatureCounts[entry.signature] ?? 0) + 1;

      if (entry.is_generic) genericTotal += 1;
      if (entry.is_false_confident) falseConfidentTotal += 1;

      if (entry.is_broad_answer !== null) {
        broadEligibleTotal += 1;
        if (entry.is_broad_answer === true) broadAnswerTotal += 1;
      }

      if (entry.followup_retention_score !== null) {
        followupTotal += 1;
        followupScoreSum += Number(entry.followup_retention_score ?? 0);
      }

      actionabilitySum += entry.accountant_actionability_score;
      mechanismSum += entry.mechanism_specificity_score;
    }

    const distinctSignatures = Object.keys(signatureCounts).length;
    const meanActionability = caseCount > 0 ? actionabilitySum / caseCount : null;
    const meanMechanism = caseCount > 0 ? mechanismSum / caseCount : null;
    const meanFollowup = followupTotal > 0 ? followupScoreSum / followupTotal : null;

    const raw: AssistantEvalMetricVector = {
      retrieval_differentiation_rate: round2(distinctSignatures / rateDenominator),
      generic_explanation_rate: round2(genericTotal / rateDenominator),
      accountant_actionability_score: meanActionability === null ? null : round2(meanActionability),
      false_confidence_rate: round2(falseConfidentTotal / rateDenominator),
      broad_answer_rate: broadEligibleTotal > 0 ? round2(broadAnswerTotal / broadEligibleTotal) : null,
      mechanism_specificity_score: meanMechanism === null ? null : round2(meanMechanism),
      followup_context_retention_score: meanFollowup === null ? null : round2(meanFollowup)
    };

    // Looks up the rubric band for one metric using its freshly computed raw value.
    const bandOf = (metric: AssistantMetricKey): AccountantMetricRubricBand | null =>
      rubricBandForMetric(metric, raw[metric]);

    const rubric_bands: Record<AssistantMetricKey, AccountantMetricRubricBand | null> = {
      retrieval_differentiation_rate: bandOf("retrieval_differentiation_rate"),
      generic_explanation_rate: bandOf("generic_explanation_rate"),
      accountant_actionability_score: bandOf("accountant_actionability_score"),
      false_confidence_rate: bandOf("false_confidence_rate"),
      broad_answer_rate: bandOf("broad_answer_rate"),
      mechanism_specificity_score: bandOf("mechanism_specificity_score"),
      followup_context_retention_score: bandOf("followup_context_retention_score")
    };

    return {
      raw,
      rubric_bands,
      denominators: {
        cases_total: caseCount,
        broad_cases_total: broadEligibleTotal,
        followup_cases_total: followupTotal
      },
      signature_counts: signatureCounts
    };
  }

  /**
   * Compares the current Stage 1 run report against a baseline report read from disk.
   *
   * Produces per-metric deltas with a trend label (a change within 0.01 counts as "unchanged"),
   * per-case usefulness notes (a change within 0.25 counts as "unchanged"), and writes the
   * comparison as JSON and markdown into `REPORTS_DIR`.
   *
   * @param input.currentReport Report of the run that has just finished.
   * @param input.baselineReportFile Path of the baseline report; resolved via `resolveReadablePath`.
   * @returns The comparison report plus an `artifacts` object with the written file paths.
   */
  private buildAssistantComparisonReport(input: {
    currentReport: Record<string, unknown>;
    baselineReportFile: string;
  }): Record<string, unknown> {
    const { currentReport } = input;
    const baselinePath = resolveReadablePath(input.baselineReportFile);
    const baselineReport = JSON.parse(fs.readFileSync(baselinePath, "utf-8")) as Record<string, unknown>;

    const numberOrNull = (value: unknown): number | null => (typeof value === "number" ? value : null);
    const rawMetricsOf = (report: Record<string, unknown>): Record<string, number | null> =>
      ((report.metrics ?? {}) as { raw?: Record<string, number | null> }).raw ?? {};
    const resultRowsOf = (report: Record<string, unknown>): Record<string, unknown>[] =>
      Array.isArray(report.results) ? (report.results as Record<string, unknown>[]) : [];

    const trackedMetrics: AssistantMetricKey[] = [
      "retrieval_differentiation_rate",
      "generic_explanation_rate",
      "accountant_actionability_score",
      "false_confidence_rate",
      "broad_answer_rate",
      "mechanism_specificity_score",
      "followup_context_retention_score"
    ];
    // For these metrics a drop is an improvement.
    const lowerIsBetter = new Set<AssistantMetricKey>(["generic_explanation_rate", "false_confidence_rate", "broad_answer_rate"]);

    const baselineRaw = rawMetricsOf(baselineReport);
    const currentRaw = rawMetricsOf(currentReport);
    const deltas: Record<string, { baseline: number | null; current: number | null; delta: number | null; trend: string }> = {};

    for (const metric of trackedMetrics) {
      const baseline = numberOrNull(baselineRaw[metric]);
      const current = numberOrNull(currentRaw[metric]);

      if (baseline === null || current === null) {
        deltas[metric] = { baseline, current, delta: null, trend: "n/a" };
        continue;
      }

      const rose = current > baseline + 0.01;
      const fell = current < baseline - 0.01;
      const [improved, weakened] = lowerIsBetter.has(metric) ? [fell, rose] : [rose, fell];

      let trend = "unchanged";
      if (improved) {
        trend = "improved";
      } else if (weakened) {
        trend = "weakened";
      }
      deltas[metric] = { baseline, current, delta: round2(current - baseline), trend };
    }

    // Index baseline rows by case id; a later row with the same id replaces an earlier one.
    const baselineByCase = new Map<string, Record<string, unknown>>(
      resultRowsOf(baselineReport).map((row): [string, Record<string, unknown>] => [String(row.case_id ?? ""), row])
    );

    const notes: Record<"improved" | "unchanged" | "weakened", string[]> = {
      improved: [],
      unchanged: [],
      weakened: []
    };

    for (const row of resultRowsOf(currentReport)) {
      const caseId = String(row.case_id ?? "");
      const currentUsefulness = numberOrNull(row.accountant_usefulness_score);
      const baselineRow = baselineByCase.get(caseId);
      const baselineUsefulness = baselineRow ? numberOrNull(baselineRow.accountant_usefulness_score) : null;

      // Cases without a numeric score on both sides cannot be compared.
      if (baselineUsefulness === null || currentUsefulness === null) {
        continue;
      }

      const delta = round2(currentUsefulness - baselineUsefulness);
      const note = `${caseId}: usefulness ${baselineUsefulness} -> ${currentUsefulness} (delta ${delta})`;
      if (delta > 0.25) {
        notes.improved.push(note);
      } else if (delta < -0.25) {
        notes.weakened.push(note);
      } else {
        notes.unchanged.push(note);
      }
    }

    const currentReportFile =
      currentReport.artifacts && typeof currentReport.artifacts === "object"
        ? (currentReport.artifacts as { run_report_json_path?: string }).run_report_json_path ?? null
        : null;

    const comparisonId = `assistant-compare-${nanoid(8)}`;
    const comparisonReport = {
      schema_version: ASSISTANT_STAGE1_COMPARISON_SCHEMA_VERSION,
      comparison_id: comparisonId,
      run_timestamp: new Date().toISOString(),
      baseline_run_id: baselineReport.run_id ?? null,
      current_run_id: currentReport.run_id ?? null,
      eval_target: "assistant_stage1",
      suite_id: currentReport.suite_id ?? baselineReport.suite_id ?? null,
      suite_version: currentReport.suite_version ?? baselineReport.suite_version ?? null,
      baseline_report_file: baselinePath,
      current_report_file: currentReportFile,
      metric_deltas: deltas,
      scenario_notes_summary: {
        improved: notes.improved.length,
        unchanged: notes.unchanged.length,
        weakened: notes.weakened.length
      },
      scenario_notes: {
        improved: notes.improved,
        unchanged: notes.unchanged,
        weakened: notes.weakened
      },
      known_limitations: currentReport.known_limitations ?? [
        "Comparison is run-to-run and depends on stable mock/runtime flags.",
        "Metrics remain Stage 1 heuristic bands, not full product scorecards."
      ],
      report_title: "Assistant Stage 1 Baseline vs Current"
    };

    // Persist both renderings next to the run reports.
    ensureDir(REPORTS_DIR);
    const jsonPath = path.resolve(REPORTS_DIR, `${comparisonId}.json`);
    const mdPath = path.resolve(REPORTS_DIR, `${comparisonId}.md`);
    writeJsonFile(jsonPath, comparisonReport);
    fs.writeFileSync(mdPath, buildAssistantComparisonMarkdownReport(comparisonReport), "utf-8");

    return {
      ...comparisonReport,
      artifacts: {
        comparison_report_json_path: jsonPath,
        comparison_report_md_path: mdPath
      }
    };
  }

  /**
   * Runs the Assistant Stage 1 evaluation suite and persists the run report.
   *
   * Every selected suite case is replayed turn by turn through a fresh `AssistantService`
   * (backed by a new `AssistantSessionStore`), the final response is scored with heuristic
   * genericness / actionability / evidence / mechanism scores, and the aggregated report is
   * written as JSON and markdown into `REPORTS_DIR`.
   *
   * A case whose execution throws — or which defines no turns at all — is recorded as a failed
   * diagnostic (`final_reply_type: "backend_error"`) and does not abort the rest of the run.
   *
   * @param payload.normalizeConfig Model/prompt settings forwarded to every assistant message.
   * @param payload.caseIds Optional allow-list of case ids; when omitted all suite cases run.
   * @param payload.useMock Forwarded to the assistant service and echoed in the report.
   * @param payload.mode Eval run mode, echoed in the report.
   * @param payload.caseSetFile Suite file passed to `parseAssistantSuiteFile`.
   * @param payload.compareWithReportFile Optional baseline report to compare the new run against.
   * @returns The run report, extended with `artifacts` paths and, when a baseline file was given,
   *   a `comparison` section.
   * @throws ApiError `ASSISTANT_STAGE1_EVAL_DISABLED` (409) when `FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1` is off.
   */
  private async runAssistantStage1(payload: {
    normalizeConfig: Omit<NormalizeRequestPayload, "userQuestion" | "context">;
    caseIds?: string[];
    useMock?: boolean;
    mode: EvalRunMode;
    caseSetFile?: string;
    compareWithReportFile?: string;
  }): Promise<Record<string, unknown>> {
    if (!FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1) {
      throw new ApiError(
        "ASSISTANT_STAGE1_EVAL_DISABLED",
        "Assistant Stage 1 eval target is disabled by FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1.",
        409
      );
    }

    const suite = parseAssistantSuiteFile(payload.caseSetFile);
    const suiteCases = suite.cases.filter((item) => !payload.caseIds || payload.caseIds.includes(item.case_id));
    const runId = `assistant-stage1-${nanoid(10)}`;
    const assistantService = new AssistantService(this.normalizerService, new AssistantSessionStore());
    const diagnostics: AssistantCaseDiagnostics[] = [];
    let requestsTotal = 0;

    for (const suiteCase of suiteCases) {
      // One session per case so that multi-turn cases share conversation state.
      const sessionId = `${runId}-${suiteCase.case_id}`;
      const turnResponses: AssistantMessageResponsePayload[] = [];
      const notes: string[] = [];
      const limitations: string[] = [];

      try {
        for (const turn of suiteCase.turns) {
          const response = await assistantService.handleMessage({
            session_id: sessionId,
            user_message: turn.user_message,
            message: turn.user_message,
            mode: "assistant",
            apiKey: payload.normalizeConfig.apiKey,
            model: payload.normalizeConfig.model,
            baseUrl: payload.normalizeConfig.baseUrl,
            temperature: payload.normalizeConfig.temperature,
            maxOutputTokens: payload.normalizeConfig.maxOutputTokens,
            promptVersion: payload.normalizeConfig.promptVersion,
            systemPrompt: payload.normalizeConfig.systemPrompt,
            developerPrompt: payload.normalizeConfig.developerPrompt,
            domainPrompt: payload.normalizeConfig.domainPrompt,
            fewShotExamples: payload.normalizeConfig.fewShotExamples,
            useMock: payload.useMock
          });
          turnResponses.push(response);
          requestsTotal += 1;
        }

        // Without at least one response there is no final response to score. Fail the case here,
        // inside the try, so it is recorded as a failed diagnostic instead of crashing the run.
        if (turnResponses.length === 0) {
          throw new Error(`Suite case "${suiteCase.case_id}" defines no turns.`);
        }
      } catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        // Failed cases receive the worst possible scores so they weigh on the aggregate metrics.
        diagnostics.push({
          suite_case: suiteCase,
          session_id: sessionId,
          trace_id: null,
          final_reply_type: "backend_error",
          turn_count: turnResponses.length,
          narrowing_result: "failed",
          signature: `backend_error|${suiteCase.scenario_tag}`,
          is_generic: true,
          is_false_confident: false,
          is_broad_answer: suiteCase.broadness_level === "low" ? null : false,
          followup_retention_score: suiteCase.question_type === "followup" || suiteCase.turns.length > 1 ? 0 : null,
          evidence_quality_score: 0,
          mechanism_specificity_score: 0,
          genericness_score: 5,
          accountant_actionability_score: 0,
          accountant_usefulness_score: 0,
          signals: {
            broad_query_detected: suiteCase.broadness_level !== "low",
            broad_result_flag: false,
            narrowing_strength: null,
            minimum_evidence_failed: true,
            degraded_to: "clarification",
            evidence_confidence: "low",
            limitation_reason_codes: [],
            mechanism_status: null,
            source_refs: [],
            routes: [],
            followup_state_applied: false,
            uncertainty_limitations_count: 0
          },
          limitations: [errorMessage],
          notes: [`Case execution failed: ${errorMessage}`]
        });
        continue;
      }

      // Scoring is based on the last turn; earlier turns only contribute follow-up state usage.
      const finalResponse = turnResponses[turnResponses.length - 1];
      const signals = this.collectAssistantSignals(finalResponse, turnResponses);
      const structure = finalResponse.debug?.answer_structure_v11 ?? null;
      const recommendedActions = extractTextList(structure?.next_step_block?.recommended_actions);
      const clarificationQuestions = extractTextList(structure?.next_step_block?.clarification_questions);
      const mechanismNotes = extractTextList(structure?.mechanism_block?.mechanism_notes);
      const uncertaintyLimitations = extractTextList(structure?.uncertainty_block?.limitations);
      const directAnswer = String(structure?.direct_answer ?? finalResponse.assistant_reply ?? "");

      // Genericness: higher is worse; points are added for every missing concrete element.
      const hasAnchors = hasDomainAnchors(
        [directAnswer, ...recommendedActions, ...clarificationQuestions, ...signals.source_refs].join(" ")
      );
      let genericnessScore = 0;
      if (!hasAnchors) genericnessScore += 2;
      if (mechanismNotes.length === 0) genericnessScore += 1;
      if (signals.source_refs.length === 0) genericnessScore += 1;
      if (recommendedActions.length === 0) genericnessScore += 1;
      genericnessScore = clampScore(genericnessScore);

      // Actionability: rewards recommended actions, domain-anchored actions, clarification
      // questions when clarification was actually the outcome, and the presence of source refs.
      let actionabilityScore = 0;
      if (recommendedActions.length > 0) actionabilityScore += 2;
      if (recommendedActions.some((item) => hasDomainAnchors(item))) actionabilityScore += 2;
      if (clarificationQuestions.length > 0 && (finalResponse.reply_type === "clarification_required" || signals.degraded_to === "clarification")) {
        actionabilityScore += 1;
      }
      if (signals.source_refs.length > 0 && actionabilityScore < 5) {
        actionabilityScore += 1;
      }
      actionabilityScore = clampScore(actionabilityScore);

      // Evidence quality: source-ref volume and confidence add points, evidence failures subtract.
      let evidenceQualityScore = 0;
      if (signals.source_refs.length >= 3) evidenceQualityScore += 2;
      else if (signals.source_refs.length > 0) evidenceQualityScore += 1;
      if (signals.evidence_confidence === "high") evidenceQualityScore += 2;
      if (signals.evidence_confidence === "medium") evidenceQualityScore += 1;
      if (signals.minimum_evidence_failed) evidenceQualityScore -= 2;
      if (signals.limitation_reason_codes.includes("insufficient_detail")) evidenceQualityScore -= 1;
      if (signals.limitation_reason_codes.includes("missing_mechanism")) evidenceQualityScore -= 1;
      evidenceQualityScore = clampScore(evidenceQualityScore);

      // Mechanism specificity: any score above 0 requires mechanism notes to be present.
      let mechanismSpecificityScore = 0;
      if (signals.mechanism_status === "grounded" && mechanismNotes.length > 0 && !signals.limitation_reason_codes.includes("missing_mechanism")) {
        mechanismSpecificityScore = 5;
      } else if (signals.mechanism_status === "limited" && mechanismNotes.length > 0) {
        mechanismSpecificityScore = 3;
      } else if (mechanismNotes.length > 0) {
        mechanismSpecificityScore = 2;
      } else {
        mechanismSpecificityScore = 0;
      }

      // Usefulness is the mean of the four sub-scores, with genericness inverted (lower is better).
      const usefulnessScore = clampScore((actionabilityScore + (5 - genericnessScore) + evidenceQualityScore + mechanismSpecificityScore) / 4);
      const isGeneric = genericnessScore >= 3;
      const factualReply = finalResponse.reply_type === "factual" || finalResponse.reply_type === "factual_with_explanation";
      // A factual reply is "falsely confident" when the evidence signals say it should have been hedged.
      const isFalseConfident =
        factualReply &&
        (signals.minimum_evidence_failed ||
          signals.degraded_to !== null ||
          signals.evidence_confidence === "low" ||
          (signals.limitation_reason_codes.length > 0 && signals.uncertainty_limitations_count === 0));

      // Broad-answer tracking only applies to broad cases; null keeps others out of the denominator.
      const isBroadCase = suiteCase.broadness_level !== "low" || signals.broad_query_detected;
      const isBroadAnswer = isBroadCase
        ? factualReply && signals.degraded_to === null && !signals.minimum_evidence_failed
        : null;

      const isFollowupCase = suiteCase.question_type === "followup" || suiteCase.turns.length > 1;
      let followupRetentionScore: number | null = null;
      if (isFollowupCase) {
        // The snapshot's turn index must have reached the number of turns in the case.
        const finalTurnIndex = Number(finalResponse.debug?.investigation_state_snapshot?.turn_index ?? 0);
        if (signals.followup_state_applied && finalTurnIndex >= suiteCase.turns.length) {
          followupRetentionScore = 5;
        } else if (finalTurnIndex >= suiteCase.turns.length) {
          followupRetentionScore = 3;
        } else {
          followupRetentionScore = 0;
        }
      }

      let narrowingResult: AssistantEvalNarrowingResult = "not_required";
      if (signals.degraded_to === "clarification" || finalResponse.reply_type === "clarification_required") {
        narrowingResult = "clarification_requested";
      } else if (signals.broad_query_detected || signals.broad_result_flag) {
        narrowingResult = signals.minimum_evidence_failed ? "failed" : "applied";
      }

      if (signals.minimum_evidence_failed) {
        limitations.push("minimum_evidence_failed");
      }
      limitations.push(...signals.limitation_reason_codes.map((item) => `limitation_reason:${item}`));
      if (signals.mechanism_status === "unresolved") {
        limitations.push("mechanism_unresolved");
      }
      limitations.push(...uncertaintyLimitations);

      if (isGeneric) notes.push("genericness_high");
      if (isFalseConfident) notes.push("false_confidence_risk");
      if (isBroadCase && isBroadAnswer) notes.push("broad_answer_without_degradation");
      if (followupRetentionScore !== null && followupRetentionScore < 3) notes.push("followup_context_retention_weak");

      diagnostics.push({
        suite_case: suiteCase,
        session_id: sessionId,
        trace_id: finalResponse.debug?.trace_id ?? null,
        final_reply_type: finalResponse.reply_type,
        turn_count: suiteCase.turns.length,
        narrowing_result: narrowingResult,
        // The signature groups cases that produced the same kind of answer. Routes are sorted on a
        // copy so the signature is order-independent without reordering `signals.routes` itself.
        signature: [
          finalResponse.reply_type,
          [...signals.routes].sort().join(","),
          signals.degraded_to ?? "none",
          signals.mechanism_status ?? "unknown",
          signals.source_refs.slice(0, 2).join(",")
        ].join("|"),
        is_generic: isGeneric,
        is_false_confident: isFalseConfident,
        is_broad_answer: isBroadAnswer,
        followup_retention_score: followupRetentionScore,
        evidence_quality_score: evidenceQualityScore,
        mechanism_specificity_score: mechanismSpecificityScore,
        genericness_score: genericnessScore,
        accountant_actionability_score: actionabilityScore,
        accountant_usefulness_score: round2(usefulnessScore),
        signals,
        limitations: Array.from(new Set(limitations)),
        notes
      });
    }

    const metrics = this.computeAssistantMetrics({ diagnostics });
    const caseRecords: AssistantEvalRecord[] = diagnostics.map((item) => {
      // A case counts as "differentiated" only when no other case shares its signature.
      const signatureHits = metrics.signature_counts[item.signature] ?? 1;
      const caseMetricVector: AssistantEvalMetricVector = {
        retrieval_differentiation_rate: signatureHits === 1 ? 1 : 0,
        generic_explanation_rate: item.is_generic ? 1 : 0,
        accountant_actionability_score: round2(item.accountant_actionability_score),
        false_confidence_rate: item.is_false_confident ? 1 : 0,
        broad_answer_rate: item.is_broad_answer === null ? null : item.is_broad_answer ? 1 : 0,
        mechanism_specificity_score: round2(item.mechanism_specificity_score),
        followup_context_retention_score:
          item.followup_retention_score === null ? null : round2(item.followup_retention_score)
      };
      return {
        schema_version: ASSISTANT_EVAL_RECORD_SCHEMA_VERSION,
        created_at: new Date().toISOString(),
        case_id: item.suite_case.case_id,
        scenario_tag: item.suite_case.scenario_tag,
        session_id: item.session_id,
        trace_id: item.trace_id,
        question_type: item.suite_case.question_type,
        broadness_level: item.suite_case.broadness_level,
        narrowing_result: item.narrowing_result,
        evidence_quality_score: round2(item.evidence_quality_score),
        genericness_score: round2(item.genericness_score),
        accountant_usefulness_score: round2(item.accountant_usefulness_score),
        accountant_metrics: caseMetricVector,
        raw_signals: {
          final_reply_type: item.final_reply_type,
          turn_count: item.turn_count,
          broad_query_detected: item.signals.broad_query_detected,
          broad_result_flag: item.signals.broad_result_flag,
          narrowing_strength: item.signals.narrowing_strength,
          minimum_evidence_failed: item.signals.minimum_evidence_failed,
          degraded_to: item.signals.degraded_to,
          evidence_confidence: item.signals.evidence_confidence,
          limitation_reason_codes: item.signals.limitation_reason_codes,
          mechanism_status: item.signals.mechanism_status,
          source_refs: item.signals.source_refs,
          routes: item.signals.routes,
          followup_state_applied: item.signals.followup_state_applied
        },
        metric_subscores: caseMetricVector,
        limitations: item.limitations,
        notes: item.notes
      };
    });

    // Metrics whose rubric band hit the top (5) or bottom (0) score feed the improvement hints.
    const bandEntries = Object.entries(metrics.rubric_bands) as Array<[AccountantMetricName, AccountantMetricRubricBand | null]>;
    const strongestSignals = bandEntries.filter(([, band]) => band?.score === 5).map(([name]) => name);
    const weakestSignals = bandEntries.filter(([, band]) => band?.score === 0).map(([name]) => name);

    const runTimestamp = new Date().toISOString();
    const report: Record<string, unknown> = {
      schema_version: ASSISTANT_STAGE1_RUN_SCHEMA_VERSION,
      run_id: runId,
      run_timestamp: runTimestamp,
      eval_target: "assistant_stage1",
      mode: payload.mode,
      use_mock: Boolean(payload.useMock),
      prompt_version: payload.normalizeConfig.promptVersion ?? null,
      suite_id: suite.suite_id,
      suite_version: suite.suite_version,
      suite_schema_version: suite.schema_version ?? null,
      scenario_count: suite.scenario_count,
      case_ids: suiteCases.map((item) => item.case_id),
      cases_total: caseRecords.length,
      feature_profile_snapshot: buildFeatureProfileSnapshot(),
      code_version: buildCodeVersionMarker(),
      metrics: {
        raw: metrics.raw,
        denominators: metrics.denominators
      },
      rubric_bands: metrics.rubric_bands,
      subsets: {
        broad_cases_total: metrics.denominators.broad_cases_total,
        followup_cases_total: metrics.denominators.followup_cases_total
      },
      budget: {
        requests_total: requestsTotal
      },
      results: caseRecords,
      scenario_summary: {
        improved_or_strong: caseRecords.filter((item) => Number(item.accountant_usefulness_score ?? 0) >= 4).length,
        unchanged_or_mixed: caseRecords.filter((item) => {
          const value = Number(item.accountant_usefulness_score ?? 0);
          return value >= 2.5 && value < 4;
        }).length,
        weak_or_regressed: caseRecords.filter((item) => Number(item.accountant_usefulness_score ?? 0) < 2.5).length
      },
      improvement_hints: {
        strongest_signals: strongestSignals.length > 0 ? strongestSignals.join(", ") : "none",
        weakest_signals: weakestSignals.length > 0 ? weakestSignals.join(", ") : "none"
      },
      known_limitations: [
        "Snapshot-only retrieval contour remains (no live verification core in Stage 1).",
        "Metric mapping for genericness/false confidence is heuristic by design.",
        "Stage 1 eval excludes Stage 2+ metrics (problem-unit/lifecycle/graph/investigation engine)."
      ],
      report_title: "Assistant Stage 1 Eval Run"
    };

    ensureDir(REPORTS_DIR);
    const runJsonPath = path.resolve(REPORTS_DIR, `${runId}.json`);
    const runMdPath = path.resolve(REPORTS_DIR, `${runId}.md`);
    writeJsonFile(runJsonPath, report);
    fs.writeFileSync(runMdPath, buildAssistantEvalMarkdownReport(report), "utf-8");

    // Artifact paths are attached after writing, so the persisted files do not contain them.
    report.artifacts = {
      run_report_json_path: runJsonPath,
      run_report_md_path: runMdPath
    };

    if (payload.compareWithReportFile) {
      report.comparison = this.buildAssistantComparisonReport({
        currentReport: report,
        baselineReportFile: payload.compareWithReportFile
      });
    }

    return report;
  }

  /**
   * Entry point for an evaluation run.
   *
   * Dispatch order:
   * 1. `evalTarget === "assistant_stage1"` is delegated to `runAssistantStage1`.
   * 2. A v2 prompt/schema version is delegated to `runV2`; inline `rawQuestions`
   *    take precedence over `caseSetFile`, which takes precedence over `listCases()`.
   * 3. Otherwise the v1 evaluation below runs: every selected case is normalized
   *    sequentially, compared with its expectations, and aggregated into a report.
   *
   * The v1 report is always written to `EVAL_CASES_DIR` as `<runId>.report.json`.
   * Fixed-name JSON/Markdown copies are additionally written to `REPORTS_DIR`
   * when the run matches one of the known single-pass-strict configurations.
   *
   * @param payload Run configuration; `caseIds`, when given, restricts the run to those case ids.
   * @returns The report object of the executed run.
   * @throws Error when `rawQuestions` yields inline cases but the configuration is not v2.
   */
  public async run(payload: {
    normalizeConfig: Omit<NormalizeRequestPayload, "userQuestion" | "context">;
    caseIds?: string[];
    useMock?: boolean;
    mode?: EvalRunMode;
    caseSetFile?: string;
    rawQuestions?: string;
    evalTarget?: EvalTarget;
    compareWithReportFile?: string;
  }): Promise<Record<string, unknown>> {
    // Type of `context.expected_route` as declared on NormalizeRequestPayload.
    type ExpectedRoute = NormalizeRequestPayload["context"] extends infer C
      ? C extends { expected_route?: infer R }
        ? R
        : never
      : never;

    const { normalizeConfig, caseIds, useMock, caseSetFile, rawQuestions, compareWithReportFile } = payload;
    const mode = payload.mode ?? "standard";
    const evalTarget = payload.evalTarget ?? "normalizer";

    if (evalTarget === "assistant_stage1") {
      return this.runAssistantStage1({
        normalizeConfig,
        caseIds,
        useMock,
        mode,
        caseSetFile,
        compareWithReportFile
      });
    }

    // A missing caseIds list means "every case is selected".
    const isSelected = (caseId: string): boolean => !caseIds || caseIds.includes(caseId);
    // Stored cases come from the explicit case-set file when given, otherwise from listCases().
    const loadStoredCases = () => (caseSetFile ? parseCaseSetFile(caseSetFile) : this.listCases());

    const promptVersion = String(normalizeConfig.promptVersion ?? "").toLowerCase();
    const schemaVersion = String(normalizeConfig.schemaVersion ?? "").toLowerCase();
    const isV2 = promptVersion.startsWith("normalizer_v2") || ["v2", "v2_0_1", "v2_0_2"].includes(schemaVersion);

    const questionBatch = rawQuestions ? parseRawQuestions(rawQuestions) : [];
    const inlineCases: EvalInputCase[] = questionBatch.map((rawQuestion, position) => ({
      case_id: formatCaseId("BQ", position),
      raw_question: rawQuestion,
      expected: null
    }));

    if (isV2) {
      const candidateCases =
        inlineCases.length > 0
          ? inlineCases
          : loadStoredCases().map(({ case_id, raw_question, expected }) => ({ case_id, raw_question, expected }));
      return this.runV2({
        ...payload,
        mode,
        cases: candidateCases.filter((entry) => isSelected(entry.case_id))
      });
    }

    if (inlineCases.length > 0) {
      throw new Error("rawQuestions batch is supported for normalizer_v2 only.");
    }

    const selectedCases = loadStoredCases().filter((entry) => isSelected(entry.case_id));
    const runId = `eval-${nanoid(10)}`;
    const isStrictSinglePass = mode === "single-pass-strict";

    const results: Array<Record<string, unknown>> = [];
    const mismatches: Array<Record<string, unknown>> = [];
    const badConfidenceCases: Array<Record<string, unknown>> = [];
    const classCounter: Record<string, { total: number; passed: number }> = {};
    const tally = {
      schema: 0,
      intent: 0,
      route: 0,
      causal: 0,
      highConfidenceErrors: 0,
      requests: 0,
      // Number of cases that needed more than one request (not the number of extra requests).
      casesWithRetries: 0
    };

    for (const evalCase of selectedCases) {
      const { expected } = evalCase;
      const expectedRequires = expected.requires;

      const response = await this.normalizerService.normalize({
        ...normalizeConfig,
        userQuestion: evalCase.raw_question,
        context: {
          expected_route: expected.route_hint as ExpectedRoute,
          eval_label: runId,
          case_id: evalCase.case_id,
          eval_mode: mode
        },
        retryPolicy: isStrictSinglePass ? "single-pass-strict" : "default",
        useMock
      });

      // Only a payload tagged as normalized_query_v1 is compared; anything else counts as "no result".
      let normalized: NormalizedQueryV1 | null = null;
      if (
        response.normalized &&
        (response.normalized as { schema_version?: string }).schema_version === "normalized_query_v1"
      ) {
        normalized = response.normalized as NormalizedQueryV1;
      }

      const validationPassed = response.validation.passed;
      const intentMatch = normalized !== null && expected.intent_class === normalized.intent_class;
      const routeMatch = normalized !== null && expected.route_hint === normalized.route_hint;
      const causalMatch = Boolean(
        normalized &&
          expectedRequires &&
          expectedRequires.needs_cross_entity_join === normalized.requires.needs_cross_entity_join &&
          expectedRequires.needs_causal_chain === normalized.requires.needs_causal_chain
      );
      // Cases without expected `requires` flags are treated as passing the causal check.
      const causalOk = causalMatch || !expectedRequires;

      if (validationPassed) tally.schema += 1;
      if (intentMatch) tally.intent += 1;
      if (routeMatch) tally.route += 1;
      if (causalOk) tally.causal += 1;

      const requestCount = Number(response.request_count_for_case ?? 0);
      tally.requests += requestCount;
      if (requestCount > 1) {
        tally.casesWithRetries += 1;
      }

      const classKey = String(expected.intent_class ?? "unknown");
      const bucket = classCounter[classKey] ?? (classCounter[classKey] = { total: 0, passed: 0 });
      bucket.total += 1;
      if (intentMatch) {
        bucket.passed += 1;
      }

      const confidenceOverall = normalized?.confidence.overall ?? null;
      const hasMismatch = !(intentMatch && routeMatch && causalOk);

      // Expected-vs-actual fields shared by the mismatch entry and the per-case result.
      const comparison = {
        expected_intent_class: expected.intent_class ?? null,
        actual_intent_class: normalized?.intent_class ?? null,
        expected_route_hint: expected.route_hint ?? null,
        actual_route_hint: normalized?.route_hint ?? null,
        expected_requires: expectedRequires ?? null,
        actual_requires: normalized?.requires ?? null
      };

      if (confidenceOverall === "high" && hasMismatch) {
        tally.highConfidenceErrors += 1;
        badConfidenceCases.push({
          case_id: evalCase.case_id,
          confidence_overall: confidenceOverall,
          intent_match: intentMatch,
          route_match: routeMatch,
          causal_match: causalOk,
          trace_id: response.trace_id
        });
      }

      if (hasMismatch || !validationPassed) {
        mismatches.push({
          case_id: evalCase.case_id,
          ...comparison,
          comment: shortMismatchComment({
            intentMatch,
            routeMatch,
            causalMatch: causalOk,
            validationPassed
          }),
          trace_id: response.trace_id
        });
      }

      results.push({
        case_id: evalCase.case_id,
        raw_question: evalCase.raw_question,
        validation_passed: validationPassed,
        intent_match: intentMatch,
        route_match: routeMatch,
        causal_flags_match: causalOk,
        ...comparison,
        confidence_overall: confidenceOverall,
        trace_id: response.trace_id,
        request_count_for_case: requestCount
      });
    }

    // Percentage with two decimals; callers clamp the denominator to avoid dividing by zero.
    const toPercent = (count: number, denominator: number): number =>
      Number(((count / denominator) * 100).toFixed(2));

    const total = Math.max(1, selectedCases.length);
    const metrics: BaselineMetrics = {
      schema_validation_pass_rate: toPercent(tally.schema, total),
      intent_class_accuracy: toPercent(tally.intent, total),
      route_hint_accuracy: toPercent(tally.route, total),
      causal_flag_accuracy: toPercent(tally.causal, total),
      high_confidence_error_rate: toPercent(tally.highConfidenceErrors, total)
    };

    const classAccuracy = Object.fromEntries(
      Object.entries(classCounter).map(([intentClass, counts]) => [
        intentClass,
        {
          total: counts.total,
          passed: counts.passed,
          accuracy_percent: toPercent(counts.passed, Math.max(1, counts.total))
        }
      ])
    );

    const baselineByMetric = BASELINE_METRICS as unknown as Record<string, number>;
    const baselineDelta = Object.fromEntries(
      Object.entries(metrics).map(([metricName, current]) => [
        metricName,
        Number((current - baselineByMetric[metricName]).toFixed(2))
      ])
    );

    const report = {
      run_id: runId,
      timestamp: new Date().toISOString(),
      mode,
      use_mock: Boolean(useMock),
      prompt_version: normalizeConfig.promptVersion ?? null,
      dataset: {
        source: caseSetFile ? "file" : "data/eval_cases/*.json",
        file: caseSetFile ?? null
      },
      cases_total: selectedCases.length,
      metrics,
      baseline_metrics: BASELINE_METRICS,
      baseline_delta: baselineDelta,
      class_accuracy: classAccuracy,
      budget: {
        requests_total: tally.requests,
        retries_used: tally.casesWithRetries,
        guidance: {
          forensic_calls_max: 10,
          final_eval_calls_max: 30,
          target_total_calls_max: 40,
          hard_cap_calls_max: 45
        }
      },
      mismatches,
      bad_confidence_cases: badConfidenceCases,
      results
    };

    // NOTE(review): the per-run report lands in EVAL_CASES_DIR — confirm listCases() does not pick up *.report.json.
    ensureDir(EVAL_CASES_DIR);
    writeJsonFile(path.resolve(EVAL_CASES_DIR, `${runId}.report.json`), report);

    // Writes a fixed-name JSON + Markdown copy of the report into REPORTS_DIR.
    const writeArtifactPair = (jsonFileName: string, mdFileName: string, reportTitle: string): void => {
      ensureDir(REPORTS_DIR);
      writeJsonFile(path.resolve(REPORTS_DIR, jsonFileName), report);
      fs.writeFileSync(
        path.resolve(REPORTS_DIR, mdFileName),
        buildMarkdownReport({
          ...report,
          report_title: reportTitle
        }),
        "utf-8"
      );
    };

    // Artifact selection compares the prompt version as given (not lower-cased).
    const promptVersionAsGiven = String(normalizeConfig.promptVersion ?? "");
    const caseSetBaseName = caseSetFile ? path.basename(String(caseSetFile)).toLowerCase() : null;

    if (isStrictSinglePass && caseSetBaseName === "normalizer_eval_v1_1_30cases.json") {
      writeArtifactPair("normalizer_eval_v1_1_run.json", "normalizer_eval_v1_1_run.md", "LLM Normalizer v1.1 Eval Run");
    }

    if (
      isStrictSinglePass &&
      promptVersionAsGiven === "normalizer_v1_1_2_1" &&
      caseSetBaseName === "normalizer_eval_v1_1_2_1_30cases.json"
    ) {
      writeArtifactPair(
        "normalizer_v1_1_2_1_eval.json",
        "normalizer_v1_1_2_1_eval.md",
        "LLM Normalizer v1.1.2.1 Eval Run"
      );
    }

    if (
      isStrictSinglePass &&
      promptVersionAsGiven === "normalizer_v1_1_1" &&
      isSameCaseSet(caseIds, V111_MICRO_CASE_IDS)
    ) {
      writeArtifactPair(
        "normalizer_v1_1_1_micro_eval.json",
        "normalizer_v1_1_1_micro_eval.md",
        "LLM Normalizer v1.1.1 Micro Eval"
      );
    }

    if (
      isStrictSinglePass &&
      promptVersionAsGiven === "normalizer_v1_1_2" &&
      isSameCaseSet(caseIds, V112_MICRO_CASE_IDS)
    ) {
      writeArtifactPair(
        "normalizer_v1_1_2_micro_eval.json",
        "normalizer_v1_1_2_micro_eval.md",
        "LLM Normalizer v1.1.2 Micro Eval"
      );
    }

    return report;
  }
}