// Source listing metadata (from the code viewer): 236 lines, 8.4 KiB, TypeScript.
import request from "supertest";
import { afterEach, describe, expect, it, vi } from "vitest";
/**
 * Names of the assistant feature-flag environment variables this suite
 * overwrites. Every key listed here is snapshotted in `ORIGINAL_FLAGS` and put
 * back by `restoreFlags`, so a flag that `createAppWithFlags` sets must also
 * appear in this list or it will leak into later tests.
 */
const FLAG_KEYS = [
  "FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1",
  "FEATURE_ASSISTANT_ANSWER_POLICY_V11",
  "FEATURE_ASSISTANT_BROAD_GUARD_V1",
  "FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1",
  "FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1",
  "FEATURE_ASSISTANT_INVESTIGATION_STATE_V1",
  "FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1",
  "FEATURE_ASSISTANT_PROBLEM_UNITS_V1",
  "FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1",
  "FEATURE_ASSISTANT_PROBLEM_UNIT_CONTINUITY_V1",
  "FEATURE_ASSISTANT_STAGE2_EVAL_V1"
] as const;
/**
 * Snapshot of every flag in `FLAG_KEYS`, captured once when this module is
 * evaluated (before any test mutates `process.env`). A value of `undefined`
 * records that the variable was not set at that point.
 */
const ORIGINAL_FLAGS: Record<string, string | undefined> = Object.fromEntries(
  FLAG_KEYS.map((key) => [key, process.env[key]])
);
/**
 * Puts every flag in `FLAG_KEYS` back to the value recorded in
 * `ORIGINAL_FLAGS`.
 *
 * Flags that were unset in the snapshot are deleted rather than assigned:
 * writing `undefined` to `process.env` would store the string "undefined".
 */
function restoreFlags(): void {
  for (const key of FLAG_KEYS) {
    const original = ORIGINAL_FLAGS[key];
    if (original === undefined) {
      delete process.env[key];
    } else {
      process.env[key] = original;
    }
  }
}
/**
 * Builds a fresh Express app with a fixed feature-flag profile.
 *
 * Only the accountant-eval and answer-policy flags vary per call; the guard,
 * evidence-gate, investigation-state and follow-up-binding flags are forced
 * to "1", and the problem-unit and stage-2 eval flags are forced to "0".
 *
 * @param flags - "0"/"1" values for the two flags the tests toggle.
 * @returns The app returned by `createApp()` from a newly imported
 *   `../src/server` module.
 */
async function createAppWithFlags(flags: {
  accountantEval: "0" | "1";
  answerPolicy: "0" | "1";
}): Promise<import("express").Express> {
  // Flags toggled by individual tests.
  process.env.FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1 = flags.accountantEval;
  process.env.FEATURE_ASSISTANT_ANSWER_POLICY_V11 = flags.answerPolicy;

  // Flags pinned ON for every test in this file.
  process.env.FEATURE_ASSISTANT_BROAD_GUARD_V1 = "1";
  process.env.FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1 = "1";
  process.env.FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1 = "1";
  process.env.FEATURE_ASSISTANT_INVESTIGATION_STATE_V1 = "1";
  process.env.FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1 = "1";

  // Flags pinned OFF for every test in this file.
  process.env.FEATURE_ASSISTANT_PROBLEM_UNITS_V1 = "0";
  process.env.FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1 = "0";
  process.env.FEATURE_ASSISTANT_PROBLEM_UNIT_CONTINUITY_V1 = "0";
  process.env.FEATURE_ASSISTANT_STAGE2_EVAL_V1 = "0";

  // Clear the module cache AFTER setting the env and BEFORE importing, so the
  // server module is evaluated again instead of being served from cache.
  // NOTE(review): presumably the server reads these flags at import time —
  // confirm against ../src/server.
  vi.resetModules();
  const { createApp } = await import("../src/server");
  return createApp();
}
/**
 * Integration tests for the `assistant_stage1` eval target of
 * `POST /api/eval/run`.
 *
 * The suite is sequential because every test mutates `process.env` through
 * `createAppWithFlags`; `afterEach` restores the flag snapshot and clears the
 * module cache so one test's flag profile does not carry into the next.
 */
describe.sequential("assistant Stage 1 eval harness", () => {
  const EVAL_RUN_ROUTE = "/api/eval/run";

  /** Flag profile used by most tests: both toggled flags ON. */
  const BOTH_ON = { accountantEval: "1", answerPolicy: "1" } as const;

  /**
   * Request body for a Stage 1 run over the canonical case set.
   * `extra` fields are placed between `caseSetFile` and `normalizeConfig`,
   * which is where the per-test fields sit in the serialized payload.
   */
  function stage1Body(extra: Record<string, unknown> = {}): Record<string, unknown> {
    return {
      eval_target: "assistant_stage1",
      useMock: true,
      mode: "single-pass-strict",
      caseSetFile: "assistant_stage1_canonical_v0_1.json",
      ...extra,
      normalizeConfig: {
        promptVersion: "normalizer_v2_0_2"
      }
    };
  }

  afterEach(() => {
    restoreFlags();
    vi.resetModules();
  });

  it("runs assistant_stage1 harness and returns raw metrics + rubric bands", async () => {
    const app = await createAppWithFlags(BOTH_ON);

    const response = await request(app).post(EVAL_RUN_ROUTE).send(stage1Body());

    expect(response.status).toBe(200);
    expect(response.body.ok).toBe(true);
    expect(response.body.report?.eval_target).toBe("assistant_stage1");
    expect(response.body.report?.metrics?.raw).toBeTruthy();
    // The key ORDER is part of the assertion: toEqual on arrays is order-sensitive.
    const rawMetricKeys = Object.keys(response.body.report?.metrics?.raw ?? {});
    expect(rawMetricKeys).toEqual([
      "retrieval_differentiation_rate",
      "generic_explanation_rate",
      "accountant_actionability_score",
      "false_confidence_rate",
      "broad_answer_rate",
      "mechanism_specificity_score",
      "followup_context_retention_score"
    ]);
    expect(response.body.report?.rubric_bands?.generic_explanation_rate).toBeTruthy();
    expect(response.body.report?.feature_profile_snapshot).toBeTruthy();
    expect(response.body.report?.code_version).toBeTruthy();
    expect(typeof response.body.report?.run_timestamp).toBe("string");
    expect(Array.isArray(response.body.report?.results)).toBe(true);
    expect(response.body.report?.results?.length).toBeGreaterThan(0);
  });

  it("loads canonical suite metadata and keeps it stable", async () => {
    const app = await createAppWithFlags(BOTH_ON);

    const response = await request(app).post(EVAL_RUN_ROUTE).send(stage1Body());

    expect(response.status).toBe(200);
    expect(response.body.report?.suite_id).toBe("assistant_stage1_canonical");
    expect(response.body.report?.suite_version).toBe("0.1.0");
    expect(response.body.report?.scenario_count).toBe(9);
    expect(Array.isArray(response.body.report?.case_ids)).toBe(true);
    expect(response.body.report?.case_ids?.length).toBe(9);
  });

  it("handles follow-up cases as dedicated subset", async () => {
    const app = await createAppWithFlags(BOTH_ON);

    const response = await request(app)
      .post(EVAL_RUN_ROUTE)
      .send(
        stage1Body({
          caseIds: ["S1-FOLLOWUP-INVESTIGATION", "S1-60-SUPPLIER-TAILS"]
        })
      );

    expect(response.status).toBe(200);
    expect(response.body.report?.subsets?.followup_cases_total).toBeGreaterThan(0);
    // `.not.toBeNull()` alone also passes for `undefined`, i.e. when the metric
    // is missing from the report entirely, so require it to be defined as well.
    const followupScore = response.body.report?.metrics?.raw?.followup_context_retention_score;
    expect(followupScore).toBeDefined();
    expect(followupScore).not.toBeNull();
  });

  it("builds comparison artifact from baseline and current runs", async () => {
    // Baseline run: answer policy OFF.
    const baselineApp = await createAppWithFlags({
      accountantEval: "1",
      answerPolicy: "0"
    });
    const baseline = await request(baselineApp).post(EVAL_RUN_ROUTE).send(stage1Body());
    expect(baseline.status).toBe(200);
    const baselinePath = String(baseline.body.report?.artifacts?.run_report_json_path ?? "");
    expect(baselinePath.length).toBeGreaterThan(0);

    // Current run: answer policy ON, compared against the baseline report file.
    const currentApp = await createAppWithFlags(BOTH_ON);
    const current = await request(currentApp)
      .post(EVAL_RUN_ROUTE)
      .send(stage1Body({ compare_with_report_file: baselinePath }));
    expect(current.status).toBe(200);
    expect(current.body.report?.comparison).toBeTruthy();
    expect(current.body.report?.comparison?.metric_deltas).toBeTruthy();
    expect(current.body.report?.comparison?.artifacts?.comparison_report_json_path).toBeTruthy();
  });

  it("keeps legacy eval path unchanged by default", async () => {
    const app = await createAppWithFlags(BOTH_ON);

    // No eval_target / caseSetFile: this request exercises the legacy path.
    const response = await request(app).post(EVAL_RUN_ROUTE).send({
      useMock: true,
      mode: "single-pass-strict",
      rawQuestions: "Проверь счет 60 за июнь 2020; Покажи риски по счету 97",
      normalizeConfig: {
        promptVersion: "normalizer_v2_0_2"
      }
    });

    expect(response.status).toBe(200);
    expect(response.body.report?.eval_target).toBeUndefined();
    expect(response.body.report?.metrics?.schema_validation_pass_rate).not.toBeUndefined();
    expect(response.body.report?.metrics?.route_resolution_accuracy).not.toBeUndefined();
  });

  it("respects accountant eval feature flag OFF/ON", async () => {
    const appOff = await createAppWithFlags({
      accountantEval: "0",
      answerPolicy: "1"
    });
    const offResponse = await request(appOff).post(EVAL_RUN_ROUTE).send(stage1Body());
    expect(offResponse.status).toBe(409);
    expect(offResponse.body?.error?.code).toBe("ASSISTANT_STAGE1_EVAL_DISABLED");

    const appOn = await createAppWithFlags(BOTH_ON);
    const onResponse = await request(appOn).post(EVAL_RUN_ROUTE).send(stage1Body());
    expect(onResponse.status).toBe(200);
    expect(onResponse.body.report?.eval_target).toBe("assistant_stage1");
  });
});