// Vitest + supertest integration tests for the assistant P0 eval harness (POST /api/eval/run).
// Source listing metadata: 264 lines, 9.4 KiB, TypeScript.
import request from "supertest";
import { afterEach, describe, expect, it, vi } from "vitest";

/**
 * Names of the assistant feature-flag environment variables this suite
 * overrides. Only the variables listed here are snapshotted below and put
 * back by `restoreFlags`.
 */
const FLAG_KEYS = [
  "FEATURE_ASSISTANT_ANSWER_POLICY_V11",
  "FEATURE_ASSISTANT_BROAD_GUARD_V1",
  "FEATURE_ASSISTANT_PROBLEM_UNITS_V1",
  "FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1",
  "FEATURE_ASSISTANT_LIFECYCLE_RUNTIME_V1",
  "FEATURE_ASSISTANT_LIFECYCLE_ANSWER_V1",
  "FEATURE_ASSISTANT_GRAPH_RUNTIME_V1",
  "FEATURE_ASSISTANT_STAGE2_EVAL_V1"
] as const;

/**
 * Value of every flag in `FLAG_KEYS` at the moment this module is evaluated,
 * i.e. before any test has touched `process.env`. A variable that was not set
 * is recorded as `undefined`.
 */
const ORIGINAL_FLAGS: Record<string, string | undefined> = Object.fromEntries(FLAG_KEYS.map((key) => [key, process.env[key]]));

/**
 * Puts every environment variable listed in `FLAG_KEYS` back to the value
 * recorded in `ORIGINAL_FLAGS`.
 *
 * Variables that were unset in the snapshot are removed from `process.env`;
 * all others are reassigned their recorded string.
 */
function restoreFlags(): void {
  for (const key of FLAG_KEYS) {
    const original = ORIGINAL_FLAGS[key];
    if (original === undefined) {
      // Must delete rather than assign: Node converts values written to
      // process.env to strings, so assigning undefined would leave the
      // variable set to the text "undefined".
      delete process.env[key];
    } else {
      process.env[key] = original;
    }
  }
}

/**
 * Creates an app instance with the assistant feature flags set for one scenario.
 *
 * The four flags in `flags` are written to `process.env` exactly as given; the
 * broad-guard, lifecycle-runtime, lifecycle-answer and graph-runtime flags are
 * always forced to "1". The Vitest module registry is then reset and
 * `../src/server` is imported dynamically, so a new copy of the server module
 * is evaluated after the environment has been changed.
 * NOTE(review): presumably the server reads these flags at import time, which
 * would be why the reset is needed — confirm against ../src/server.
 *
 * The environment changes are NOT undone here; callers rely on `restoreFlags`.
 *
 * @param flags "0"/"1" values for the scenario-controlled flags.
 * @returns The app returned by the freshly imported `createApp()`.
 */
async function createAppWithFlags(flags: {
  answerPolicy: "0" | "1";
  stage2Eval: "0" | "1";
  problemUnits: "0" | "1";
  problemCentric: "0" | "1";
}): Promise<import("express").Express> {
  // Flags the individual test controls.
  Object.assign(process.env, {
    FEATURE_ASSISTANT_ANSWER_POLICY_V11: flags.answerPolicy,
    FEATURE_ASSISTANT_STAGE2_EVAL_V1: flags.stage2Eval,
    FEATURE_ASSISTANT_PROBLEM_UNITS_V1: flags.problemUnits,
    FEATURE_ASSISTANT_PROBLEM_CENTRIC_ANSWER_V1: flags.problemCentric
  });

  // Flags that are enabled for every scenario in this suite.
  Object.assign(process.env, {
    FEATURE_ASSISTANT_BROAD_GUARD_V1: "1",
    FEATURE_ASSISTANT_LIFECYCLE_RUNTIME_V1: "1",
    FEATURE_ASSISTANT_LIFECYCLE_ANSWER_V1: "1",
    FEATURE_ASSISTANT_GRAPH_RUNTIME_V1: "1"
  });

  // Drop cached modules so the import below evaluates the server afresh.
  vi.resetModules();
  const { createApp } = await import("../src/server");
  return createApp();
}

/**
 * Integration checks of the `assistant_p0` eval target served at
 * POST /api/eval/run.
 *
 * Each test builds its own app through `createAppWithFlags`, which rewrites
 * `process.env`; the flags are restored and the module registry reset after
 * every test. The suite is declared sequential — presumably because of that
 * shared, process-wide state.
 */
describe.sequential("assistant P0 eval harness (Wave 7)", () => {
  /** Scenario with all four test-controlled assistant flags switched on. */
  const ALL_FLAGS_ON = {
    answerPolicy: "1",
    stage2Eval: "1",
    problemUnits: "1",
    problemCentric: "1"
  } as const;

  /**
   * Starts an `assistant_p0` eval run against `app` in mock, single-pass-strict
   * mode with the `normalizer_v2_0_2` prompt version.
   *
   * `caseIds` and `compare_with_report_file` are only included in the request
   * body when the corresponding option is supplied.
   *
   * @param app Application under test.
   * @param options Corpus file plus optional case filter / baseline report path.
   * @returns The pending supertest request; await it to obtain the response.
   */
  function runP0Eval(
    app: import("express").Express,
    options: {
      caseSetFile: string;
      caseIds?: readonly string[];
      compareWithReportFile?: string;
    }
  ) {
    return request(app)
      .post("/api/eval/run")
      .send({
        eval_target: "assistant_p0",
        useMock: true,
        mode: "single-pass-strict",
        caseSetFile: options.caseSetFile,
        ...(options.caseIds === undefined ? {} : { caseIds: options.caseIds }),
        ...(options.compareWithReportFile === undefined
          ? {}
          : { compare_with_report_file: options.compareWithReportFile }),
        normalizeConfig: {
          promptVersion: "normalizer_v2_0_2"
        }
      });
  }

  afterEach(() => {
    restoreFlags();
    vi.resetModules();
  });

  // Full v0.1 corpus: report identity, metric key sets and both gate verdicts.
  it("runs assistant_p0 eval and returns formal product metrics + verdict", async () => {
    const app = await createAppWithFlags(ALL_FLAGS_ON);

    const response = await runP0Eval(app, { caseSetFile: "p0_eval_corpus_v0_1.json" });
    const report = response.body.report;

    expect(response.status).toBe(200);
    expect(report?.eval_target).toBe("assistant_p0");
    expect(report?.suite_id).toBe("assistant_p0_eval_corpus");
    expect(report?.scenario_count).toBe(36);
    expect(report?.cases_total).toBe(36);
    expect(report?.metrics?.raw).toBeTruthy();
    // Note: comparing Object.keys() output also pins the order of the keys.
    expect(Object.keys(report?.metrics?.raw ?? {})).toEqual([
      "problem_first_answer_rate",
      "mechanism_coherence_score",
      "entity_leakage_rate",
      "accountant_actionability_score",
      "route_correctness_rate",
      "domain_purity_rate",
      "limitation_honesty_rate",
      "top_problem_unit_match_rate"
    ]);
    expect(Object.keys(report?.quality_gap_metrics?.raw ?? {})).toEqual([
      "generic_explanation_rate",
      "false_confidence_rate",
      "mechanism_specificity_score",
      "followup_context_retention_score",
      "stage4_contract_compliance_rate"
    ]);
    expect(Number(report?.quality_gap_metrics?.denominators?.stage4_contract_audited_cases_total ?? 0)).toBeGreaterThan(0);

    const reportedChecks = report?.baseline_stability_gate?.quality_gap_checks;
    const qualityGapChecks: Array<{ metric?: string }> = Array.isArray(reportedChecks) ? reportedChecks : [];
    expect(qualityGapChecks.some((item) => item.metric === "stage4_contract_compliance_rate")).toBe(true);

    expect(["P0_ACCEPTED", "P0_ACCEPTED_WITH_LIMITATIONS", "P0_NOT_ACCEPTED"]).toContain(
      String(report?.acceptance_gate?.verdict ?? "")
    );
    expect(["P0_BASELINE_STABLE", "P0_BASELINE_STABLE_WITH_OPEN_QUALITY_GAPS"]).toContain(
      String(report?.baseline_stability_gate?.verdict ?? "")
    );
  });

  // One case per domain: each must be counted once in domain_distribution.
  it("loads formal P0 corpus split for 3 domains", async () => {
    const app = await createAppWithFlags(ALL_FLAGS_ON);

    const response = await runP0Eval(app, {
      caseSetFile: "p0_eval_corpus_v0_1.json",
      caseIds: ["P0-SET-01", "P0-VAT-01", "P0-CLOSE-01"]
    });
    const distribution = response.body.report?.domain_distribution;

    expect(response.status).toBe(200);
    expect(distribution?.settlements_60_62).toBe(1);
    expect(distribution?.vat_document_register_book).toBe(1);
    expect(distribution?.month_close_costs_20_44).toBe(1);
  });

  // Four v0.2 cases, one per query class, including a follow-up case.
  it("supports Wave 9 expanded corpus classes and follow-up context metrics", async () => {
    const app = await createAppWithFlags(ALL_FLAGS_ON);

    const response = await runP0Eval(app, {
      caseSetFile: "p0_eval_corpus_v0_2.json",
      caseIds: ["P0-W9-25", "P0-W9-30", "P0-W9-35", "P0-W9-40"]
    });
    const report = response.body.report;

    expect(response.status).toBe(200);
    expect(report?.cases_total).toBe(4);
    expect(report?.query_class_distribution?.followup_investigation).toBe(1);
    expect(report?.query_class_distribution?.noisy_input).toBe(1);
    expect(report?.query_class_distribution?.translit_noisy).toBe(1);
    expect(report?.query_class_distribution?.multi_intent).toBe(1);
    expect(report?.quality_gap_metrics?.denominators?.followup_cases_total).toBe(1);
    expect(Number(report?.budget?.requests_total ?? 0)).toBeGreaterThanOrEqual(5);

    const results = report?.results;
    const followupCase = Array.isArray(results)
      ? results.find((item: { case_id?: string }) => item.case_id === "P0-W9-25")
      : null;
    // Fail with a clear message when the case is missing from the results,
    // instead of surfacing it indirectly through the property checks below.
    expect(followupCase).toBeTruthy();
    expect(followupCase?.followup_seed_query).toBeTruthy();
    // NOTE(review): not.toBeNull() also passes for undefined, so this does not
    // prove the ratio is present — consider tightening once the contract is confirmed.
    expect(followupCase?.actual?.followup_context_match_ratio).not.toBeNull();
  });

  // Baseline run (policy / problem-unit flags off) is saved, then a run with the
  // flags on is compared against the baseline's report file.
  it("builds before/after comparison and returns formal verdict delta", async () => {
    const caseSubset = [
      "P0-SET-01",
      "P0-SET-02",
      "P0-SET-09",
      "P0-VAT-01",
      "P0-VAT-02",
      "P0-VAT-09",
      "P0-CLOSE-01",
      "P0-CLOSE-02",
      "P0-CLOSE-09"
    ];

    const baselineApp = await createAppWithFlags({
      answerPolicy: "0",
      stage2Eval: "1",
      problemUnits: "0",
      problemCentric: "0"
    });
    const baseline = await runP0Eval(baselineApp, {
      caseSetFile: "p0_eval_corpus_v0_1.json",
      caseIds: caseSubset
    });
    expect(baseline.status).toBe(200);
    const baselinePath = String(baseline.body.report?.artifacts?.run_report_json_path ?? "");
    expect(baselinePath.length).toBeGreaterThan(0);

    const currentApp = await createAppWithFlags(ALL_FLAGS_ON);
    const current = await runP0Eval(currentApp, {
      caseSetFile: "p0_eval_corpus_v0_1.json",
      caseIds: caseSubset,
      compareWithReportFile: baselinePath
    });
    const comparison = current.body.report?.comparison;

    expect(current.status).toBe(200);
    expect(comparison).toBeTruthy();
    expect(comparison?.metric_deltas).toBeTruthy();
    expect(comparison?.verdict_delta).toBeTruthy();
    expect(comparison?.artifacts?.comparison_report_json_path).toBeTruthy();
  });

  // Same request twice: rejected with 409 while the Stage2 eval flag is "0",
  // accepted once it is "1".
  it("respects P0 eval feature gate via Stage2 eval flag OFF/ON", async () => {
    const singleCase = {
      caseSetFile: "p0_eval_corpus_v0_1.json",
      caseIds: ["P0-SET-01"]
    };

    const appOff = await createAppWithFlags({ ...ALL_FLAGS_ON, stage2Eval: "0" });
    const offResponse = await runP0Eval(appOff, singleCase);
    expect(offResponse.status).toBe(409);
    expect(offResponse.body?.error?.code).toBe("ASSISTANT_P0_EVAL_DISABLED");

    const appOn = await createAppWithFlags(ALL_FLAGS_ON);
    const onResponse = await runP0Eval(appOn, singleCase);
    expect(onResponse.status).toBe(200);
    expect(onResponse.body.report?.eval_target).toBe("assistant_p0");
  });
});