// NODEDC_1C/llm_normalizer/backend/tests/assistantP0EvalHarness.test.ts
//
// 264 lines
// 9.4 KiB
// TypeScript

import request from "supertest";
import { afterEach, describe, expect, it, vi } from "vitest";
/**
 * Names of the assistant feature-flag environment variables this suite
 * overrides. `restoreFlags` walks this list to undo those overrides.
 */
const FLAG_KEYS = [
  "FEATURE_ASSISTANT_ANSWER_POLICY_V11",
  "FEATURE_ASSISTANT_BROAD_GUARD_V1",
  "FEATURE_ASSISTANT_PROBLEM_UNITS_V1",
  "FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1",
  "FEATURE_ASSISTANT_LIFECYCLE_RUNTIME_V1",
  "FEATURE_ASSISTANT_LIFECYCLE_ANSWER_V1",
  "FEATURE_ASSISTANT_GRAPH_RUNTIME_V1",
  "FEATURE_ASSISTANT_STAGE2_EVAL_V1"
] as const;

/**
 * Snapshot of each flag's value in `process.env`, taken once when this module
 * is evaluated. A flag that was not set is recorded as `undefined`.
 */
const ORIGINAL_FLAGS: Record<string, string | undefined> = {};
for (const flagName of FLAG_KEYS) {
  ORIGINAL_FLAGS[flagName] = process.env[flagName];
}
/**
 * Resets every flag listed in `FLAG_KEYS` to the value recorded in
 * `ORIGINAL_FLAGS`, removing flags that had no value when the snapshot was taken.
 */
function restoreFlags(): void {
  FLAG_KEYS.forEach((flagName) => {
    const saved = ORIGINAL_FLAGS[flagName];
    if (saved !== undefined) {
      process.env[flagName] = saved;
      return;
    }
    // Node stringifies values assigned to process.env, so an originally unset
    // flag has to be deleted rather than assigned `undefined`.
    delete process.env[flagName];
  });
}
/**
 * Applies the requested feature flags to `process.env` and returns an app
 * created from a freshly imported `../src/server` module.
 *
 * The four caller-controlled flags are taken from `flags`; the broad-guard,
 * lifecycle runtime, lifecycle answer and graph runtime flags are always "1".
 *
 * @param flags - "0"/"1" values for the answer-policy, stage-2 eval,
 *   problem-units and problem-centric-answer flags.
 * @returns The value produced by the server module's `createApp()`.
 */
async function createAppWithFlags(flags: {
  answerPolicy: "0" | "1";
  stage2Eval: "0" | "1";
  problemUnits: "0" | "1";
  problemCentric: "0" | "1";
}): Promise<import("express").Express> {
  const { answerPolicy, stage2Eval, problemUnits, problemCentric } = flags;

  Object.assign(process.env, {
    FEATURE_ASSISTANT_ANSWER_POLICY_V11: answerPolicy,
    FEATURE_ASSISTANT_STAGE2_EVAL_V1: stage2Eval,
    FEATURE_ASSISTANT_PROBLEM_UNITS_V1: problemUnits,
    FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1: problemCentric,
    FEATURE_ASSISTANT_BROAD_GUARD_V1: "1",
    FEATURE_ASSISTANT_LIFECYCLE_RUNTIME_V1: "1",
    FEATURE_ASSISTANT_LIFECYCLE_ANSWER_V1: "1",
    FEATURE_ASSISTANT_GRAPH_RUNTIME_V1: "1"
  });

  // Reset Vitest's module registry so the import below is evaluated afresh
  // instead of being served from cache. NOTE(review): presumably the server
  // reads these flags at module load time; confirm against ../src/server.
  vi.resetModules();
  const serverModule = await import("../src/server");
  return serverModule.createApp();
}
describe.sequential("assistant P0 eval harness (Wave 7)", () => {
  type FlagSet = Parameters<typeof createAppWithFlags>[0];

  /** Flag set with every caller-controlled assistant feature switched on. */
  const ALL_FLAGS_ON: FlagSet = {
    answerPolicy: "1",
    stage2Eval: "1",
    problemUnits: "1",
    problemCentric: "1"
  };

  const CORPUS_V0_1 = "p0_eval_corpus_v0_1.json";
  const CORPUS_V0_2 = "p0_eval_corpus_v0_2.json";

  /**
   * Posts an `assistant_p0` eval run to `/api/eval/run` in mock,
   * single-pass-strict mode. Fields in `extra` (for example `caseIds`) are
   * placed between `caseSetFile` and `normalizeConfig` in the request body.
   */
  function postP0Eval(
    app: import("express").Express,
    caseSetFile: string,
    extra: Record<string, unknown> = {}
  ) {
    return request(app)
      .post("/api/eval/run")
      .send({
        eval_target: "assistant_p0",
        useMock: true,
        mode: "single-pass-strict",
        caseSetFile,
        ...extra,
        normalizeConfig: {
          promptVersion: "normalizer_v2_0_2"
        }
      });
  }

  // Undo the env overrides and drop cached modules so tests do not leak
  // flag state into one another.
  afterEach(() => {
    restoreFlags();
    vi.resetModules();
  });

  // Full v0.1 corpus: checks report identity, case counts, the exact metric
  // key lists, and that both gate verdicts are among the accepted values.
  it("runs assistant_p0 eval and returns formal product metrics + verdict", async () => {
    const expectedRawMetricKeys = [
      "problem_first_answer_rate",
      "mechanism_coherence_score",
      "entity_leakage_rate",
      "accountant_actionability_score",
      "route_correctness_rate",
      "domain_purity_rate",
      "limitation_honesty_rate",
      "top_problem_unit_match_rate"
    ];
    const expectedQualityGapKeys = [
      "generic_explanation_rate",
      "false_confidence_rate",
      "mechanism_specificity_score",
      "followup_context_retention_score",
      "stage4_contract_compliance_rate"
    ];
    const acceptanceVerdicts = ["P0_ACCEPTED", "P0_ACCEPTED_WITH_LIMITATIONS", "P0_NOT_ACCEPTED"];
    const stabilityVerdicts = ["P0_BASELINE_STABLE", "P0_BASELINE_STABLE_WITH_OPEN_QUALITY_GAPS"];

    const app = await createAppWithFlags(ALL_FLAGS_ON);
    const response = await postP0Eval(app, CORPUS_V0_1);

    expect(response.status).toBe(200);
    const report = response.body.report;
    expect(report?.eval_target).toBe("assistant_p0");
    expect(report?.suite_id).toBe("assistant_p0_eval_corpus");
    expect(report?.scenario_count).toBe(36);
    expect(report?.cases_total).toBe(36);
    expect(report?.metrics?.raw).toBeTruthy();
    expect(Object.keys(report?.metrics?.raw ?? {})).toEqual(expectedRawMetricKeys);
    expect(Object.keys(report?.quality_gap_metrics?.raw ?? {})).toEqual(expectedQualityGapKeys);
    expect(
      Number(report?.quality_gap_metrics?.denominators?.stage4_contract_audited_cases_total ?? 0)
    ).toBeGreaterThan(0);

    const rawChecks = report?.baseline_stability_gate?.quality_gap_checks;
    const qualityGapChecks: Array<{ metric?: string }> = Array.isArray(rawChecks) ? rawChecks : [];
    expect(
      qualityGapChecks.some((check) => check.metric === "stage4_contract_compliance_rate")
    ).toBe(true);

    expect(acceptanceVerdicts).toContain(String(report?.acceptance_gate?.verdict ?? ""));
    expect(stabilityVerdicts).toContain(String(report?.baseline_stability_gate?.verdict ?? ""));
  });

  // One case per domain: each domain bucket in the report must count exactly one.
  it("loads formal P0 corpus split for 3 domains", async () => {
    const app = await createAppWithFlags(ALL_FLAGS_ON);
    const response = await postP0Eval(app, CORPUS_V0_1, {
      caseIds: ["P0-SET-01", "P0-VAT-01", "P0-CLOSE-01"]
    });

    expect(response.status).toBe(200);
    const domains = response.body.report?.domain_distribution;
    expect(domains?.settlements_60_62).toBe(1);
    expect(domains?.vat_document_register_book).toBe(1);
    expect(domains?.month_close_costs_20_44).toBe(1);
  });

  // Four v0.2 cases, one per query class, plus follow-up context fields on P0-W9-25.
  it("supports Wave 9 expanded corpus classes and follow-up context metrics", async () => {
    const app = await createAppWithFlags(ALL_FLAGS_ON);
    const response = await postP0Eval(app, CORPUS_V0_2, {
      caseIds: ["P0-W9-25", "P0-W9-30", "P0-W9-35", "P0-W9-40"]
    });

    expect(response.status).toBe(200);
    const report = response.body.report;
    expect(report?.cases_total).toBe(4);

    const queryClasses = report?.query_class_distribution;
    expect(queryClasses?.followup_investigation).toBe(1);
    expect(queryClasses?.noisy_input).toBe(1);
    expect(queryClasses?.translit_noisy).toBe(1);
    expect(queryClasses?.multi_intent).toBe(1);

    expect(report?.quality_gap_metrics?.denominators?.followup_cases_total).toBe(1);
    expect(Number(report?.budget?.requests_total ?? 0)).toBeGreaterThanOrEqual(5);

    const caseResults = report?.results;
    const followupCase = Array.isArray(caseResults)
      ? caseResults.find((entry: { case_id?: string }) => entry.case_id === "P0-W9-25")
      : null;
    expect(followupCase?.followup_seed_query).toBeTruthy();
    expect(followupCase?.actual?.followup_context_match_ratio).not.toBeNull();
  });

  // Runs a baseline with three features off, then reruns with all on, passing
  // the baseline report path so the second report carries a comparison section.
  it("builds before/after comparison and returns formal verdict delta", async () => {
    const caseSubset = [
      "P0-SET-01",
      "P0-SET-02",
      "P0-SET-09",
      "P0-VAT-01",
      "P0-VAT-02",
      "P0-VAT-09",
      "P0-CLOSE-01",
      "P0-CLOSE-02",
      "P0-CLOSE-09"
    ];

    const baselineApp = await createAppWithFlags({
      answerPolicy: "0",
      stage2Eval: "1",
      problemUnits: "0",
      problemCentric: "0"
    });
    const baseline = await postP0Eval(baselineApp, CORPUS_V0_1, { caseIds: caseSubset });
    expect(baseline.status).toBe(200);
    const baselinePath = String(baseline.body.report?.artifacts?.run_report_json_path ?? "");
    expect(baselinePath.length).toBeGreaterThan(0);

    const currentApp = await createAppWithFlags(ALL_FLAGS_ON);
    const current = await postP0Eval(currentApp, CORPUS_V0_1, {
      caseIds: caseSubset,
      compare_with_report_file: baselinePath
    });
    expect(current.status).toBe(200);
    const comparison = current.body.report?.comparison;
    expect(comparison).toBeTruthy();
    expect(comparison?.metric_deltas).toBeTruthy();
    expect(comparison?.verdict_delta).toBeTruthy();
    expect(comparison?.artifacts?.comparison_report_json_path).toBeTruthy();
  });

  // With the stage-2 eval flag at "0" the endpoint is expected to answer 409 with
  // ASSISTANT_P0_EVAL_DISABLED; with it at "1" the same request succeeds.
  it("respects P0 eval feature gate via Stage2 eval flag OFF/ON", async () => {
    const singleCase = { caseIds: ["P0-SET-01"] };

    const appOff = await createAppWithFlags({ ...ALL_FLAGS_ON, stage2Eval: "0" });
    const offResponse = await postP0Eval(appOff, CORPUS_V0_1, singleCase);
    expect(offResponse.status).toBe(409);
    expect(offResponse.body?.error?.code).toBe("ASSISTANT_P0_EVAL_DISABLED");

    const appOn = await createAppWithFlags(ALL_FLAGS_ON);
    const onResponse = await postP0Eval(appOn, CORPUS_V0_1, singleCase);
    expect(onResponse.status).toBe(200);
    expect(onResponse.body.report?.eval_target).toBe("assistant_p0");
  });
});