import request from "supertest";
import { afterEach, describe, expect, it, vi } from "vitest";

/** Every feature-flag environment variable this suite overrides and later restores. */
const FLAG_KEYS = [
  "FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1",
  "FEATURE_ASSISTANT_ANSWER_POLICY_V11",
  "FEATURE_ASSISTANT_BROAD_GUARD_V1",
  "FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1",
  "FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1",
  "FEATURE_ASSISTANT_INVESTIGATION_STATE_V1",
  "FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1",
  "FEATURE_ASSISTANT_PROBLEM_UNITS_V1",
  "FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1",
  "FEATURE_ASSISTANT_PROBLEM_UNIT_CONTINUITY_V1",
  "FEATURE_ASSISTANT_STAGE2_EVAL_V1"
] as const;

type FlagKey = (typeof FLAG_KEYS)[number];
type FlagValue = "0" | "1";

/**
 * Whatever `createApp` from the server module resolves to. Derived from the
 * module itself so this file does not need to know the concrete app type.
 */
type ServerApp = Awaited<ReturnType<(typeof import("../src/server"))["createApp"]>>;

/** Snapshot of the flag values present when this file was loaded (undefined = unset). */
const ORIGINAL_FLAGS: Record<string, string | undefined> = Object.fromEntries(
  // The tuple annotation keeps Object.fromEntries on its typed overload.
  FLAG_KEYS.map((key): [FlagKey, string | undefined] => [key, process.env[key]])
);

/** Canonical Stage 1 case set used by every assistant_stage1 request below. */
const STAGE1_CASE_SET_FILE = "assistant_stage1_canonical_v0_1.json";

/** Puts every flag in FLAG_KEYS back to its snapshot value, deleting those that were unset. */
function restoreFlags(): void {
  for (const key of FLAG_KEYS) {
    const original = ORIGINAL_FLAGS[key];
    if (original === undefined) {
      // Assigning undefined to process.env would store the string "undefined".
      delete process.env[key];
    } else {
      process.env[key] = original;
    }
  }
}

/**
 * Sets the feature-flag environment, resets the Vitest module registry and
 * then imports a fresh copy of the server module to build an app.
 *
 * Only the accountant-eval and answer-policy flags are configurable; the other
 * flags are pinned to the fixed values listed below.
 *
 * @param flags - "0"/"1" values for the two configurable flags.
 * @returns The value produced by the server module's `createApp()`.
 */
async function createAppWithFlags(flags: {
  accountantEval: FlagValue;
  answerPolicy: FlagValue;
}): Promise<ServerApp> {
  // Typed against FlagKey so a flag added to FLAG_KEYS must also be set here.
  const overrides: Record<FlagKey, FlagValue> = {
    FEATURE_ASSISTANT_ACCOUNTANT_EVAL_V1: flags.accountantEval,
    FEATURE_ASSISTANT_ANSWER_POLICY_V11: flags.answerPolicy,
    FEATURE_ASSISTANT_BROAD_GUARD_V1: "1",
    FEATURE_ASSISTANT_MIN_EVIDENCE_GATE_V1: "1",
    FEATURE_ASSISTANT_ANTI_GENERIC_RANKING_GUARD_V1: "1",
    FEATURE_ASSISTANT_INVESTIGATION_STATE_V1: "1",
    FEATURE_ASSISTANT_STATE_FOLLOWUP_BINDING_V1: "1",
    FEATURE_ASSISTANT_PROBLEM_UNITS_V1: "0",
    FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1: "0",
    FEATURE_ASSISTANT_PROBLEM_UNIT_CONTINUITY_V1: "0",
    FEATURE_ASSISTANT_STAGE2_EVAL_V1: "0"
  };
  for (const key of FLAG_KEYS) {
    process.env[key] = overrides[key];
  }

  // Reset before the dynamic import so the server module is evaluated again.
  // NOTE(review): presumably the server reads these flags at import time - confirm.
  vi.resetModules();
  const { createApp } = await import("../src/server");
  return createApp();
}

/** POSTs `payload` to the eval endpoint of `app` and resolves with the response. */
function postEvalRun(app: ServerApp, payload: Record<string, unknown>) {
  return request(app).post("/api/eval/run").send(payload);
}

/**
 * Runs the assistant_stage1 eval against the canonical case set.
 *
 * @param app - App returned by `createAppWithFlags`.
 * @param extra - Additional request fields (e.g. `caseIds`,
 *   `compare_with_report_file`); placed before `normalizeConfig`.
 */
function runStage1Eval(app: ServerApp, extra: Record<string, unknown> = {}) {
  return postEvalRun(app, {
    eval_target: "assistant_stage1",
    useMock: true,
    mode: "single-pass-strict",
    caseSetFile: STAGE1_CASE_SET_FILE,
    ...extra,
    normalizeConfig: { promptVersion: "normalizer_v2_0_2" }
  });
}

describe.sequential("assistant Stage 1 eval harness", () => {
  // Undo the process.env changes and drop cached modules after every test.
  afterEach(() => {
    restoreFlags();
    vi.resetModules();
  });

  it("runs assistant_stage1 harness and returns raw metrics + rubric bands", async () => {
    const app = await createAppWithFlags({ accountantEval: "1", answerPolicy: "1" });
    const response = await runStage1Eval(app);

    expect(response.status).toBe(200);
    expect(response.body.ok).toBe(true);

    const report = response.body.report;
    expect(report?.eval_target).toBe("assistant_stage1");
    expect(report?.metrics?.raw).toBeTruthy();

    // toEqual on the key array pins the key order as well as the key set.
    const rawMetricKeys = Object.keys(report?.metrics?.raw ?? {});
    expect(rawMetricKeys).toEqual([
      "retrieval_differentiation_rate",
      "generic_explanation_rate",
      "accountant_actionability_score",
      "false_confidence_rate",
      "broad_answer_rate",
      "mechanism_specificity_score",
      "followup_context_retention_score",
      "stage4_contract_compliance_rate"
    ]);

    expect(report?.rubric_bands?.generic_explanation_rate).toBeTruthy();
    expect(report?.feature_profile_snapshot).toBeTruthy();
    expect(report?.code_version).toBeTruthy();
    expect(typeof report?.run_timestamp).toBe("string");
    expect(Array.isArray(report?.results)).toBe(true);
    expect(report?.results?.length).toBeGreaterThan(0);
  });

  it("loads canonical suite metadata and keeps it stable", async () => {
    const app = await createAppWithFlags({ accountantEval: "1", answerPolicy: "1" });
    const response = await runStage1Eval(app);

    expect(response.status).toBe(200);

    const report = response.body.report;
    expect(report?.suite_id).toBe("assistant_stage1_canonical");
    expect(report?.suite_version).toBe("0.1.0");
    expect(report?.scenario_count).toBe(9);
    expect(Array.isArray(report?.case_ids)).toBe(true);
    expect(report?.case_ids?.length).toBe(9);
  });

  it("handles follow-up cases as dedicated subset", async () => {
    const app = await createAppWithFlags({ accountantEval: "1", answerPolicy: "1" });
    const response = await runStage1Eval(app, {
      caseIds: ["S1-FOLLOWUP-INVESTIGATION", "S1-60-SUPPLIER-TAILS"]
    });

    expect(response.status).toBe(200);

    const report = response.body.report;
    expect(report?.subsets?.followup_cases_total).toBeGreaterThan(0);
    expect(report?.metrics?.raw?.followup_context_retention_score).not.toBeNull();
  });

  it("builds comparison artifact from baseline and current runs", async () => {
    // Baseline run: answer policy flag off.
    const baselineApp = await createAppWithFlags({ accountantEval: "1", answerPolicy: "0" });
    const baseline = await runStage1Eval(baselineApp);
    expect(baseline.status).toBe(200);
    const baselinePath = String(baseline.body.report?.artifacts?.run_report_json_path ?? "");
    expect(baselinePath.length).toBeGreaterThan(0);

    // Current run: answer policy flag on, compared against the baseline report file.
    const currentApp = await createAppWithFlags({ accountantEval: "1", answerPolicy: "1" });
    const current = await runStage1Eval(currentApp, {
      compare_with_report_file: baselinePath
    });
    expect(current.status).toBe(200);

    const comparison = current.body.report?.comparison;
    expect(comparison).toBeTruthy();
    expect(comparison?.metric_deltas).toBeTruthy();
    expect(comparison?.artifacts?.comparison_report_json_path).toBeTruthy();
  });

  it("keeps legacy eval path unchanged by default", async () => {
    const app = await createAppWithFlags({ accountantEval: "1", answerPolicy: "1" });
    // No eval_target in the payload: this request goes through the legacy path.
    const response = await postEvalRun(app, {
      useMock: true,
      mode: "single-pass-strict",
      rawQuestions: "Проверь счет 60 за июнь 2020; Покажи риски по счету 97",
      normalizeConfig: { promptVersion: "normalizer_v2_0_2" }
    });

    expect(response.status).toBe(200);

    const report = response.body.report;
    expect(report?.eval_target).toBeUndefined();
    expect(report?.metrics?.schema_validation_pass_rate).not.toBeUndefined();
    expect(report?.metrics?.route_resolution_accuracy).not.toBeUndefined();
  });

  it("respects accountant eval feature flag OFF/ON", async () => {
    const appOff = await createAppWithFlags({ accountantEval: "0", answerPolicy: "1" });
    const offResponse = await runStage1Eval(appOff);
    expect(offResponse.status).toBe(409);
    expect(offResponse.body?.error?.code).toBe("ASSISTANT_STAGE1_EVAL_DISABLED");

    const appOn = await createAppWithFlags({ accountantEval: "1", answerPolicy: "1" });
    const onResponse = await runStage1Eval(appOn);
    expect(onResponse.status).toBe(200);
    expect(onResponse.body.report?.eval_target).toBe("assistant_stage1");
  });
});