// Source path: NODEDC_1C/llm_normalizer/backend/tests/assistantEvalHarness.test.ts
// Repository viewer metadata: 236 lines, 8.4 KiB, TypeScript

import request from "supertest";
import { afterEach, describe, expect, it, vi } from "vitest";
/**
 * Names of the assistant feature-flag environment variables that this test
 * file overwrites and later restores.
 */
const FLAG_KEYS = [
  "FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1",
  "FEATURE_ASSISTANT_ANSWER_POLICY_V11",
  "FEATURE_ASSISTANT_BROAD_GUARD_V1",
  "FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1",
  "FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1",
  "FEATURE_ASSISTANT_INVESTIGATION_STATE_V1",
  "FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1",
  "FEATURE_ASSISTANT_PROBLEM_UNITS_V1",
  "FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1",
  "FEATURE_ASSISTANT_PROBLEM_UNIT_CONTINUITY_V1",
  "FEATURE_ASSISTANT_STAGE2_EVAL_V1"
] as const;

/**
 * Snapshot of every flag in FLAG_KEYS as it was in `process.env` when this
 * module was evaluated. A flag that was not set is recorded as `undefined`
 * (the key is still present in the snapshot).
 */
const ORIGINAL_FLAGS: Record<string, string | undefined> = {};
for (const flagName of FLAG_KEYS) {
  ORIGINAL_FLAGS[flagName] = process.env[flagName];
}
/**
 * Puts every flag listed in FLAG_KEYS back to the value captured in
 * ORIGINAL_FLAGS. Flags that were unset in the snapshot are removed from
 * `process.env` rather than being assigned `undefined`.
 */
function restoreFlags(): void {
  FLAG_KEYS.forEach((flagName) => {
    const snapshotValue = ORIGINAL_FLAGS[flagName];
    if (snapshotValue !== undefined) {
      process.env[flagName] = snapshotValue;
      return;
    }
    // The flag did not exist in the snapshot: drop it entirely.
    delete process.env[flagName];
  });
}
/**
 * Writes a fixed feature-flag profile into `process.env`, resets the Vitest
 * module registry, and returns an app built by a freshly imported
 * `../src/server` module.
 *
 * Only the accountant-eval and answer-policy flags are caller-controlled; the
 * guard/state flags are pinned to "1" and the problem-unit / stage-2 flags to "0".
 *
 * @param flags - "0"/"1" values for the two caller-controlled flags.
 * @returns The app produced by `createApp()` of the re-imported server module.
 */
async function createAppWithFlags(flags: {
  accountantEval: "0" | "1";
  answerPolicy: "0" | "1";
}): Promise<import("express").Express> {
  const pinnedFlags: Record<string, string> = {
    FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1: flags.accountantEval,
    FEATURE_ASSISTANT_ANSWER_POLICY_V11: flags.answerPolicy,
    FEATURE_ASSISTANT_BROAD_GUARD_V1: "1",
    FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1: "1",
    FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1: "1",
    FEATURE_ASSISTANT_INVESTIGATION_STATE_V1: "1",
    FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1: "1",
    FEATURE_ASSISTANT_PROBLEM_UNITS_V1: "0",
    FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1: "0",
    FEATURE_ASSISTANT_PROBLEM_UNIT_CONTINUITY_V1: "0",
    FEATURE_ASSISTANT_STAGE2_EVAL_V1: "0"
  };
  for (const [flagName, flagValue] of Object.entries(pinnedFlags)) {
    process.env[flagName] = flagValue;
  }

  // Reset the module registry before importing — presumably so the server
  // module is re-evaluated under the flag values written above.
  vi.resetModules();
  const { createApp } = await import("../src/server");
  return createApp();
}
/**
 * Integration tests for the `assistant_stage1` eval target of POST /api/eval/run.
 *
 * Every test builds its app through `createAppWithFlags`, which mutates
 * `process.env`; the suite is therefore declared sequential and the flags and
 * module registry are reset after each test.
 */
describe.sequential("assistant Stage 1 eval harness", () => {
  const CANONICAL_CASE_SET = "assistant_stage1_canonical_v0_1.json";
  const NORMALIZE_CONFIG = { promptVersion: "normalizer_v2_0_2" };
  const FLAGS_ALL_ON = { accountantEval: "1", answerPolicy: "1" } as const;

  /**
   * Posts a mocked, single-pass-strict `assistant_stage1` eval run against the
   * canonical case set.
   *
   * @param app - App under test.
   * @param extra - Additional request fields (e.g. `caseIds`,
   *   `compare_with_report_file`). They are placed before `normalizeConfig`
   *   so the payload key order matches the hand-written requests this helper
   *   replaces.
   * @returns The pending supertest request; await it to get the response.
   */
  function runStage1Eval(app: import("express").Express, extra: Record<string, unknown> = {}) {
    return request(app).post("/api/eval/run").send({
      eval_target: "assistant_stage1",
      useMock: true,
      mode: "single-pass-strict",
      caseSetFile: CANONICAL_CASE_SET,
      ...extra,
      normalizeConfig: NORMALIZE_CONFIG
    });
  }

  afterEach(() => {
    restoreFlags();
    vi.resetModules();
  });

  it("runs assistant_stage1 harness and returns raw metrics + rubric bands", async () => {
    const app = await createAppWithFlags(FLAGS_ALL_ON);
    const response = await runStage1Eval(app);

    expect(response.status).toBe(200);
    expect(response.body.ok).toBe(true);

    const report = response.body.report;
    expect(report?.eval_target).toBe("assistant_stage1");
    expect(report?.metrics?.raw).toBeTruthy();

    // The comparison is order-sensitive: it pins both the set of raw metrics
    // and the order in which the report lists them.
    const rawMetricKeys = Object.keys(report?.metrics?.raw ?? {});
    expect(rawMetricKeys).toEqual([
      "retrieval_differentiation_rate",
      "generic_explanation_rate",
      "accountant_actionability_score",
      "false_confidence_rate",
      "broad_answer_rate",
      "mechanism_specificity_score",
      "followup_context_retention_score"
    ]);

    expect(report?.rubric_bands?.generic_explanation_rate).toBeTruthy();
    expect(report?.feature_profile_snapshot).toBeTruthy();
    expect(report?.code_version).toBeTruthy();
    expect(typeof report?.run_timestamp).toBe("string");
    expect(Array.isArray(report?.results)).toBe(true);
    expect(report?.results?.length).toBeGreaterThan(0);
  });

  it("loads canonical suite metadata and keeps it stable", async () => {
    const app = await createAppWithFlags(FLAGS_ALL_ON);
    const response = await runStage1Eval(app);

    expect(response.status).toBe(200);

    const report = response.body.report;
    expect(report?.suite_id).toBe("assistant_stage1_canonical");
    expect(report?.suite_version).toBe("0.1.0");
    expect(report?.scenario_count).toBe(9);
    expect(Array.isArray(report?.case_ids)).toBe(true);
    expect(report?.case_ids?.length).toBe(9);
  });

  it("handles follow-up cases as dedicated subset", async () => {
    const app = await createAppWithFlags(FLAGS_ALL_ON);
    const response = await runStage1Eval(app, {
      caseIds: ["S1-FOLLOWUP-INVESTIGATION", "S1-60-SUPPLIER-TAILS"]
    });

    expect(response.status).toBe(200);
    expect(response.body.report?.subsets?.followup_cases_total).toBeGreaterThan(0);

    const retentionScore = response.body.report?.metrics?.raw?.followup_context_retention_score;
    // `.not.toBeNull()` on its own also passes for `undefined`, which is what
    // the optional chain yields when the metric is missing from the report.
    // Require the metric to be present as well as non-null.
    expect(retentionScore).toBeDefined();
    expect(retentionScore).not.toBeNull();
  });

  it("builds comparison artifact from baseline and current runs", async () => {
    // Baseline run: answer policy OFF.
    const baselineApp = await createAppWithFlags({
      accountantEval: "1",
      answerPolicy: "0"
    });
    const baseline = await runStage1Eval(baselineApp);
    expect(baseline.status).toBe(200);
    const baselinePath = String(baseline.body.report?.artifacts?.run_report_json_path ?? "");
    expect(baselinePath.length).toBeGreaterThan(0);

    // Current run: answer policy ON, compared against the baseline report file.
    const currentApp = await createAppWithFlags(FLAGS_ALL_ON);
    const current = await runStage1Eval(currentApp, {
      compare_with_report_file: baselinePath
    });
    expect(current.status).toBe(200);

    const comparison = current.body.report?.comparison;
    expect(comparison).toBeTruthy();
    expect(comparison?.metric_deltas).toBeTruthy();
    expect(comparison?.artifacts?.comparison_report_json_path).toBeTruthy();
  });

  it("keeps legacy eval path unchanged by default", async () => {
    const app = await createAppWithFlags(FLAGS_ALL_ON);
    // No `eval_target` and no case set: this request is built by hand instead
    // of going through runStage1Eval.
    const response = await request(app).post("/api/eval/run").send({
      useMock: true,
      mode: "single-pass-strict",
      rawQuestions: "Проверь счет 60 за июнь 2020; Покажи риски по счету 97",
      normalizeConfig: NORMALIZE_CONFIG
    });

    expect(response.status).toBe(200);
    expect(response.body.report?.eval_target).toBeUndefined();
    expect(response.body.report?.metrics?.schema_validation_pass_rate).not.toBeUndefined();
    expect(response.body.report?.metrics?.route_resolution_accuracy).not.toBeUndefined();
  });

  it("respects accountant eval feature flag OFF/ON", async () => {
    const appOff = await createAppWithFlags({
      accountantEval: "0",
      answerPolicy: "1"
    });
    const offResponse = await runStage1Eval(appOff);
    expect(offResponse.status).toBe(409);
    expect(offResponse.body?.error?.code).toBe("ASSISTANT_STAGE1_EVAL_DISABLED");

    const appOn = await createAppWithFlags(FLAGS_ALL_ON);
    const onResponse = await runStage1Eval(appOn);
    expect(onResponse.status).toBe(200);
    expect(onResponse.body.report?.eval_target).toBe("assistant_stage1");
  });
});