import request from "supertest";
import { afterEach, describe, expect, it, vi } from "vitest";

/** Every feature-flag environment variable this suite writes; all are restored after each test. */
const FLAG_KEYS = [
  "FEATURE_ASSISTANT_ANSWER_POLICY_V11",
  "FEATURE_ASSISTANT_BROAD_GUARD_V1",
  "FEATURE_ASSISTANT_PROBLEM_UNITS_V1",
  "FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1",
  "FEATURE_ASSISTANT_LIFECYCLE_RUNTIME_V1",
  "FEATURE_ASSISTANT_LIFECYCLE_ANSWER_V1",
  "FEATURE_ASSISTANT_GRAPH_RUNTIME_V1",
  "FEATURE_ASSISTANT_STAGE2_EVAL_V1"
] as const;

/** Snapshot of each flag taken when this file is loaded; `undefined` means the variable was unset. */
const ORIGINAL_FLAGS: Record<string, string | undefined> = Object.fromEntries(
  FLAG_KEYS.map((key): [string, string | undefined] => [key, process.env[key]])
);

/** Puts every variable in FLAG_KEYS back to its snapshotted value, deleting those that were unset. */
function restoreFlags(): void {
  for (const key of FLAG_KEYS) {
    const original = ORIGINAL_FLAGS[key];
    if (original === undefined) {
      // Assigning undefined would store the string "undefined", so remove the variable instead.
      delete process.env[key];
    } else {
      process.env[key] = original;
    }
  }
}

type FlagValue = "0" | "1";

/** The four flags that individual tests toggle; the remaining flags are always forced to "1". */
interface AssistantFlags {
  answerPolicy: FlagValue;
  stage2Eval: FlagValue;
  problemUnits: FlagValue;
  problemCentric: FlagValue;
}

/** Whatever `createApp()` from the server module resolves to. */
type AssistantApp = Awaited<ReturnType<typeof import("../src/server").createApp>>;

/** Flag set with all four toggleable flags enabled. */
const ALL_FLAGS_ON: AssistantFlags = {
  answerPolicy: "1",
  stage2Eval: "1",
  problemUnits: "1",
  problemCentric: "1"
};

/**
 * Writes the requested flag values into `process.env`, resets the module registry,
 * and returns a fresh app from a new import of the server module.
 *
 * @param flags Values for the four toggleable flags.
 * @returns The result of `createApp()` from the re-imported server module.
 */
async function createAppWithFlags(flags: AssistantFlags): Promise<AssistantApp> {
  process.env.FEATURE_ASSISTANT_ANSWER_POLICY_V11 = flags.answerPolicy;
  process.env.FEATURE_ASSISTANT_STAGE2_EVAL_V1 = flags.stage2Eval;
  process.env.FEATURE_ASSISTANT_PROBLEM_UNITS_V1 = flags.problemUnits;
  process.env.FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1 = flags.problemCentric;
  process.env.FEATURE_ASSISTANT_BROAD_GUARD_V1 = "1";
  process.env.FEATURE_ASSISTANT_LIFECYCLE_RUNTIME_V1 = "1";
  process.env.FEATURE_ASSISTANT_LIFECYCLE_ANSWER_V1 = "1";
  process.env.FEATURE_ASSISTANT_GRAPH_RUNTIME_V1 = "1";

  // NOTE(review): presumably the server reads these flags while its modules load,
  // which is why the registry is reset before importing - confirm against ../src/server.
  vi.resetModules();
  const { createApp } = await import("../src/server");
  return createApp();
}

/** Per-request options for the eval endpoint; everything else in the payload is fixed. */
interface P0EvalOptions {
  caseSetFile: string;
  caseIds?: readonly string[];
  compareWithReportFile?: string;
}

/**
 * POSTs an `assistant_p0` eval run to `/api/eval/run` using the mock backend and
 * the fixed mode / normalizer prompt version shared by every test in this file.
 * Optional fields are only added to the payload when supplied.
 */
function runP0Eval(app: AssistantApp, options: P0EvalOptions) {
  const payload = {
    eval_target: "assistant_p0",
    useMock: true,
    mode: "single-pass-strict",
    caseSetFile: options.caseSetFile,
    ...(options.caseIds !== undefined ? { caseIds: options.caseIds } : {}),
    ...(options.compareWithReportFile !== undefined
      ? { compare_with_report_file: options.compareWithReportFile }
      : {}),
    normalizeConfig: { promptVersion: "normalizer_v2_0_2" }
  };
  return request(app).post("/api/eval/run").send(payload);
}

describe.sequential("assistant P0 eval harness (Wave 7)", () => {
  afterEach(() => {
    // Undo the env mutations and drop cached modules so the next test starts clean.
    restoreFlags();
    vi.resetModules();
  });

  it("runs assistant_p0 eval and returns formal product metrics + verdict", async () => {
    const app = await createAppWithFlags(ALL_FLAGS_ON);
    const response = await runP0Eval(app, { caseSetFile: "p0_eval_corpus_v0_1.json" });

    expect(response.status).toBe(200);
    const report = response.body.report;
    expect(report?.eval_target).toBe("assistant_p0");
    expect(report?.suite_id).toBe("assistant_p0_eval_corpus");
    expect(report?.scenario_count).toBe(36);
    expect(report?.cases_total).toBe(36);
    expect(report?.metrics?.raw).toBeTruthy();
    // toEqual on the key arrays also pins the order in which the metrics are reported.
    expect(Object.keys(report?.metrics?.raw ?? {})).toEqual([
      "problem_first_answer_rate",
      "mechanism_coherence_score",
      "entity_leakage_rate",
      "accountant_actionability_score",
      "route_correctness_rate",
      "domain_purity_rate",
      "limitation_honesty_rate",
      "top_problem_unit_match_rate"
    ]);
    expect(Object.keys(report?.quality_gap_metrics?.raw ?? {})).toEqual([
      "generic_explanation_rate",
      "false_confidence_rate",
      "mechanism_specificity_score",
      "followup_context_retention_score",
      "stage4_contract_compliance_rate"
    ]);
    expect(
      Number(report?.quality_gap_metrics?.denominators?.stage4_contract_audited_cases_total ?? 0)
    ).toBeGreaterThan(0);

    const qualityGapChecks = Array.isArray(report?.baseline_stability_gate?.quality_gap_checks)
      ? report.baseline_stability_gate.quality_gap_checks
      : [];
    expect(
      qualityGapChecks.some(
        (item: { metric?: string }) => item.metric === "stage4_contract_compliance_rate"
      )
    ).toBe(true);

    expect(["P0_ACCEPTED", "P0_ACCEPTED_WITH_LIMITATIONS", "P0_NOT_ACCEPTED"]).toContain(
      String(report?.acceptance_gate?.verdict ?? "")
    );
    expect(["P0_BASELINE_STABLE", "P0_BASELINE_STABLE_WITH_OPEN_QUALITY_GAPS"]).toContain(
      String(report?.baseline_stability_gate?.verdict ?? "")
    );
  });

  it("loads formal P0 corpus split for 3 domains", async () => {
    const app = await createAppWithFlags(ALL_FLAGS_ON);
    const response = await runP0Eval(app, {
      caseSetFile: "p0_eval_corpus_v0_1.json",
      caseIds: ["P0-SET-01", "P0-VAT-01", "P0-CLOSE-01"]
    });

    expect(response.status).toBe(200);
    const distribution = response.body.report?.domain_distribution;
    expect(distribution?.settlements_60_62).toBe(1);
    expect(distribution?.vat_document_register_book).toBe(1);
    expect(distribution?.month_close_costs_20_44).toBe(1);
  });

  it("supports Wave 9 expanded corpus classes and follow-up context metrics", async () => {
    const app = await createAppWithFlags(ALL_FLAGS_ON);
    const response = await runP0Eval(app, {
      caseSetFile: "p0_eval_corpus_v0_2.json",
      caseIds: ["P0-W9-25", "P0-W9-30", "P0-W9-35", "P0-W9-40"]
    });

    expect(response.status).toBe(200);
    const report = response.body.report;
    expect(report?.cases_total).toBe(4);
    expect(report?.query_class_distribution?.followup_investigation).toBe(1);
    expect(report?.query_class_distribution?.noisy_input).toBe(1);
    expect(report?.query_class_distribution?.translit_noisy).toBe(1);
    expect(report?.query_class_distribution?.multi_intent).toBe(1);
    expect(report?.quality_gap_metrics?.denominators?.followup_cases_total).toBe(1);
    expect(Number(report?.budget?.requests_total ?? 0)).toBeGreaterThanOrEqual(5);

    const followupCase = Array.isArray(report?.results)
      ? report.results.find((item: { case_id?: string }) => item.case_id === "P0-W9-25")
      : null;
    expect(followupCase?.followup_seed_query).toBeTruthy();
    expect(followupCase?.actual?.followup_context_match_ratio).not.toBeNull();
  });

  it("builds before/after comparison and returns formal verdict delta", async () => {
    const caseSubset = [
      "P0-SET-01",
      "P0-SET-02",
      "P0-SET-09",
      "P0-VAT-01",
      "P0-VAT-02",
      "P0-VAT-09",
      "P0-CLOSE-01",
      "P0-CLOSE-02",
      "P0-CLOSE-09"
    ];

    // Baseline run: answer policy, problem units and problem-centric answers switched off.
    const baselineApp = await createAppWithFlags({
      answerPolicy: "0",
      stage2Eval: "1",
      problemUnits: "0",
      problemCentric: "0"
    });
    const baseline = await runP0Eval(baselineApp, {
      caseSetFile: "p0_eval_corpus_v0_1.json",
      caseIds: caseSubset
    });
    expect(baseline.status).toBe(200);
    const baselinePath = String(baseline.body.report?.artifacts?.run_report_json_path ?? "");
    expect(baselinePath.length).toBeGreaterThan(0);

    // Current run: everything on, compared against the baseline report written above.
    const currentApp = await createAppWithFlags(ALL_FLAGS_ON);
    const current = await runP0Eval(currentApp, {
      caseSetFile: "p0_eval_corpus_v0_1.json",
      caseIds: caseSubset,
      compareWithReportFile: baselinePath
    });
    expect(current.status).toBe(200);
    const comparison = current.body.report?.comparison;
    expect(comparison).toBeTruthy();
    expect(comparison?.metric_deltas).toBeTruthy();
    expect(comparison?.verdict_delta).toBeTruthy();
    expect(comparison?.artifacts?.comparison_report_json_path).toBeTruthy();
  });

  it("respects P0 eval feature gate via Stage2 eval flag OFF/ON", async () => {
    const appOff = await createAppWithFlags({ ...ALL_FLAGS_ON, stage2Eval: "0" });
    const offResponse = await runP0Eval(appOff, {
      caseSetFile: "p0_eval_corpus_v0_1.json",
      caseIds: ["P0-SET-01"]
    });
    expect(offResponse.status).toBe(409);
    expect(offResponse.body?.error?.code).toBe("ASSISTANT_P0_EVAL_DISABLED");

    const appOn = await createAppWithFlags(ALL_FLAGS_ON);
    const onResponse = await runP0Eval(appOn, {
      caseSetFile: "p0_eval_corpus_v0_1.json",
      caseIds: ["P0-SET-01"]
    });
    expect(onResponse.status).toBe(200);
    expect(onResponse.body.report?.eval_target).toBe("assistant_p0");
  });
});