const fs = require("node:fs");
const path = require("node:path");
const request = require("supertest");

/**
 * Fixed probe cases sent to the assistant endpoint by every suite.
 * `expected_mode` is only consulted by computeMetrics (to select the
 * "grounded_positive" subset) and echoed into the report artifacts.
 */
const CASES = [
  {
    case_id: "L1",
    label: "vat_chain_furniture_july2020",
    expected_mode: "grounded_positive",
    user_message:
      "VAT chain july 2020 for furniture purchase and realization: prove document -> invoice -> register -> book linkage and show where chain is complete."
  },
  {
    case_id: "L2",
    label: "rbp_writeoff_31_july",
    expected_mode: "limited",
    user_message:
      "RBP writeoff at 31 july 2020: confirm whether residual tail on account 97 is normal residual or unresolved writeoff gap."
  },
  {
    case_id: "L3",
    label: "fa_amortization_three_amounts",
    expected_mode: "limited",
    user_message:
      "Fixed asset amortization in july 2020 by three amounts 12000.00, 8000.00, 233.33: detect if any object missed depreciation posting."
  },
  {
    case_id: "L4",
    label: "settlement_supplier_60_closure",
    expected_mode: "grounded_positive",
    user_message:
      "Supplier settlement on account 60 in july 2020: payment exists but tail remains open. prove contract/object/closure mechanism."
  },
  {
    case_id: "L5",
    label: "month_close_20_44_july",
    expected_mode: "grounded_positive",
    user_message:
      "Month close july 2020 on accounts 20 and 44: prove close operation and distribution chain, separate normal residual from contradiction."
  }
];

/**
 * Divide `num` by `den` and round to four decimal places.
 *
 * @param {number} num - Numerator.
 * @param {number} den - Denominator.
 * @returns {number} The rounded quotient, or 0 when either operand is not a
 *   finite number or the denominator is not strictly positive.
 */
function ratio(num, den) {
  if (!Number.isFinite(num) || !Number.isFinite(den) || den <= 0) {
    return 0;
  }
  return Number((num / den).toFixed(4));
}

/**
 * Create `dirPath` (and any missing parents); a no-op if it already exists.
 *
 * @param {string} dirPath - Directory to create.
 */
function ensureDir(dirPath) {
  fs.mkdirSync(dirPath, { recursive: true });
}

/**
 * Write `payload` as 2-space-indented JSON followed by a newline, creating
 * the parent directory first.
 *
 * @param {string} filePath - Destination file.
 * @param {unknown} payload - Value passed to JSON.stringify.
 */
function writeJson(filePath, payload) {
  ensureDir(path.dirname(filePath));
  fs.writeFileSync(filePath, `${JSON.stringify(payload, null, 2)}\n`, "utf8");
}

/**
 * Write `text` verbatim as UTF-8, creating the parent directory first.
 *
 * @param {string} filePath - Destination file.
 * @param {string} text - File contents.
 */
function writeText(filePath, text) {
  ensureDir(path.dirname(filePath));
  fs.writeFileSync(filePath, text, "utf8");
}

/**
 * Drop every entry under a `backend/dist/` path from the CommonJS module
 * cache so that the next `require` re-evaluates those modules.
 */
function clearBackendDistCache() {
  const marker = `${path.sep}backend${path.sep}dist${path.sep}`;
  for (const key of Object.keys(require.cache)) {
    if (key.includes(marker)) {
      delete require.cache[key];
    }
  }
}

/**
 * Build one call record per retrieval result that carries a
 * `summary.live_mcp` object; results without one are skipped.
 *
 * @param {object} debug - Debug payload from the assistant response.
 * @returns {object[]} Call records; empty when `debug.retrieval_results` is
 *   not an array.
 */
function extractLiveCallsFromDebug(debug) {
  const calls = [];
  const retrievalResults = Array.isArray(debug?.retrieval_results) ? debug.retrieval_results : [];
  for (const result of retrievalResults) {
    const live = result?.summary?.live_mcp;
    if (!live || typeof live !== "object") {
      continue;
    }
    calls.push({
      fragment_id: result?.fragment_id ?? null,
      route: result?.route ?? null,
      method: "execute_query",
      args_summary: {
        account_scope: Array.isArray(live.account_scope) ? live.account_scope : [],
        route: String(live.route ?? result?.route ?? ""),
        channel: String(live.channel ?? ""),
        proxy: String(live.proxy ?? "")
      },
      fetched_rows: Number(live.fetched_rows ?? 0),
      matched_rows: Number(live.matched_rows ?? 0),
      returned_rows: Number(live.returned_rows ?? 0),
      status: String(live.status ?? "unknown"),
      error: live.error ? String(live.error) : null
    });
  }
  return calls;
}

/**
 * Flatten one assistant response into the row shape consumed by
 * computeMetrics, computeParity and the artifact writers in main.
 * Missing fields fall back to null, "", 0, false or [] as coded below.
 *
 * @param {object} caseInput - Entry from CASES.
 * @param {object} responseBody - Parsed response body of the assistant call.
 * @param {string} suiteMode - Name of the suite that produced the response.
 * @returns {object} Summary row; the raw debug payload is kept under `debug`.
 */
function summarizeCase(caseInput, responseBody, suiteMode) {
  const debug = responseBody?.debug ?? {};
  const temporal = debug?.temporal_guard ?? {};
  const eligibility = debug?.grounded_answer_eligibility_guard ?? {};
  const evidenceGate = debug?.evidence_admissibility_gate ?? {};
  const liveCalls = extractLiveCallsFromDebug(debug);
  const classified = Array.isArray(debug?.classified_numeric_tokens)
    ? debug.classified_numeric_tokens
    : [];
  const resolvedAccounts = Array.isArray(debug?.resolved_account_anchors)
    ? debug.resolved_account_anchors
    : [];

  // Pollution: a resolved account anchor whose token was classified as
  // anything other than "account_token".
  const polluted = resolvedAccounts.some((token) =>
    classified.some(
      (entry) =>
        String(entry?.token ?? "").trim() === String(token ?? "").trim() &&
        String(entry?.classification ?? "").trim() !== "account_token"
    )
  );

  // True when the prompt itself mentions July 2020 ("2020-07", "july", "июл").
  const julySignal = /(?:2020[-/.]0?7|july|июл)/i.test(String(caseInput.user_message ?? ""));

  return {
    case_id: caseInput.case_id,
    label: caseInput.label,
    expected_mode: caseInput.expected_mode,
    suite_mode: suiteMode,
    trace_id: String(debug?.trace_id ?? ""),
    reply_type: String(responseBody?.reply_type ?? ""),
    assistant_reply: String(responseBody?.assistant_reply ?? ""),
    temporal: {
      raw_time_scope: temporal?.raw_time_scope ?? null,
      resolved_primary_period: temporal?.resolved_primary_period ?? null,
      temporal_alignment_status: temporal?.temporal_alignment_status ?? null,
      temporal_guard_basis: temporal?.temporal_guard_basis ?? null,
      temporal_guard_outcome: temporal?.temporal_guard_outcome ?? null
    },
    anchor_pollution: {
      raw_numeric_tokens: Array.isArray(debug?.raw_numeric_tokens) ? debug.raw_numeric_tokens : [],
      classified_numeric_tokens: classified,
      rejected_as_non_accounts: Array.isArray(debug?.rejected_as_non_accounts)
        ? debug.rejected_as_non_accounts
        : [],
      resolved_account_anchors: resolvedAccounts,
      pollution_detected: polluted
    },
    business_scope: {
      business_scope_raw: Array.isArray(debug?.business_scope_raw) ? debug.business_scope_raw : [],
      business_scope_resolved: Array.isArray(debug?.business_scope_resolved)
        ? debug.business_scope_resolved
        : [],
      company_grounding_applied: Boolean(debug?.company_grounding_applied),
      scope_resolution_reason: Array.isArray(debug?.scope_resolution_reason)
        ? debug.scope_resolution_reason
        : [],
      july_snapshot_signal: julySignal
    },
    evidence: {
      candidate_evidence_total: Number(evidenceGate?.candidate_evidence_total ?? 0),
      admissible_evidence_count: Number(evidenceGate?.admissible_evidence_count ?? 0),
      rejected_evidence_count: Number(evidenceGate?.rejected_evidence_count ?? 0)
    },
    eligibility: {
      eligible: Boolean(eligibility?.eligible),
      grounding_mode: String(eligibility?.grounding_mode ?? ""),
      outcome: String(eligibility?.outcome ?? ""),
      reason_codes: Array.isArray(eligibility?.reason_codes) ? eligibility.reason_codes : [],
      temporal_passed: Boolean(eligibility?.temporal_passed),
      eligibility_time_basis: String(eligibility?.eligibility_time_basis ?? ""),
      business_scope_passed: Boolean(eligibility?.business_scope_passed)
    },
    live_calls: liveCalls,
    debug
  };
}

/**
 * Aggregate suite-level rates from summarized rows. Every rate is rounded to
 * four decimals by ratio(); denominators are clamped to at least 1, so an
 * empty population yields 0.
 *
 * @param {object[]} rows - Rows produced by summarizeCase.
 * @returns {object} `case_count` plus the six rate metrics.
 */
function computeMetrics(rows) {
  const positiveCases = rows.filter((row) => row.expected_mode === "grounded_positive");
  const temporalChecked = rows.filter(
    (row) => row.temporal.temporal_guard_basis === "resolved_primary_period"
  );
  const alignmentGood = temporalChecked.filter((row) =>
    ["aligned", "corrected"].includes(String(row.temporal.temporal_alignment_status))
  );
  const anchorPollutionFree = rows.filter((row) => !row.anchor_pollution.pollution_detected);
  // Rows without a July signal count as resolved.
  const companyScopeResolved = rows.filter(
    (row) =>
      !row.business_scope.july_snapshot_signal ||
      row.business_scope.business_scope_resolved.includes("company_specific_accounting")
  );
  const livePositive = positiveCases.filter(
    (row) => row.eligibility.grounding_mode === "grounded_positive"
  );
  // A grounded-positive answer with no admissible evidence is a false grounding.
  const falseGrounded = rows.filter(
    (row) =>
      row.eligibility.grounding_mode === "grounded_positive" &&
      row.evidence.admissible_evidence_count <= 0
  );
  const liveInventoryCovered = rows.filter((row) => row.live_calls.length > 0);

  return {
    case_count: rows.length,
    temporal_alignment_correctness_rate: ratio(
      alignmentGood.length,
      Math.max(1, temporalChecked.length)
    ),
    anchor_pollution_free_rate: ratio(anchorPollutionFree.length, Math.max(1, rows.length)),
    company_scope_resolution_rate: ratio(companyScopeResolved.length, Math.max(1, rows.length)),
    live_positive_grounding_rate: ratio(livePositive.length, Math.max(1, positiveCases.length)),
    false_grounded_answer_rate: ratio(falseGrounded.length, Math.max(1, rows.length)),
    real_live_inventory_coverage_rate: ratio(
      liveInventoryCovered.length,
      Math.max(1, rows.length)
    )
  };
}

/**
 * Compare each live row with the mock row that has the same `case_id` across
 * four checks and score the fraction that pass.
 *
 * @param {object[]} mockRows - Rows from the baseline suite.
 * @param {object[]} liveRows - Rows from the live-alignment suite.
 * @returns {{rows: object[], mock_live_parity_rate: number}} Per-case parity
 *   rows and the mean parity score over the live rows.
 */
function computeParity(mockRows, liveRows) {
  const byMock = new Map(mockRows.map((row) => [row.case_id, row]));
  const rows = liveRows.map((live) => {
    const mock = byMock.get(live.case_id);
    if (!mock) {
      return {
        case_id: live.case_id,
        label: live.label,
        parity_score: 0,
        parity_status: "missing_mock_case",
        checks: []
      };
    }
    const checks = [
      {
        key: "temporal_basis",
        passed:
          String(mock.temporal.temporal_guard_basis) === String(live.temporal.temporal_guard_basis)
      },
      {
        key: "anchor_pollution",
        passed:
          Boolean(mock.anchor_pollution.pollution_detected) ===
          Boolean(live.anchor_pollution.pollution_detected)
      },
      {
        key: "business_scope",
        passed:
          mock.business_scope.business_scope_resolved.includes("company_specific_accounting") ===
          live.business_scope.business_scope_resolved.includes("company_specific_accounting")
      },
      {
        key: "eligibility_outcome",
        // Same outcome, or an upgrade from limited (mock) to grounded positive (live).
        passed:
          String(live.eligibility.outcome) === String(mock.eligibility.outcome) ||
          (String(mock.eligibility.grounding_mode) === "limited_or_insufficient_evidence" &&
            String(live.eligibility.grounding_mode) === "grounded_positive")
      }
    ];
    const parityScore = ratio(
      checks.filter((item) => item.passed).length,
      Math.max(1, checks.length)
    );
    return {
      case_id: live.case_id,
      label: live.label,
      parity_score: parityScore,
      parity_status: parityScore >= 0.75 ? "match_or_improved" : "diverged",
      checks
    };
  });
  return {
    rows,
    mock_live_parity_rate: ratio(
      rows.reduce((acc, item) => acc + Number(item.parity_score ?? 0), 0),
      Math.max(1, rows.length)
    )
  };
}

/**
 * Run every case in CASES against a freshly loaded backend app.
 *
 * Sets FEATURE_ASSISTANT_MCP_RUNTIME_V1 to "1" or "0" and clears the cached
 * backend modules before requiring the server, so the app is re-created after
 * the flag changes. The flag is left set after the suite finishes.
 *
 * @param {{suiteName: string, mcpEnabled: boolean}} input - Suite settings.
 * @returns {Promise<object[]>} One summarized row per case, in CASES order.
 * @throws {Error} If any request returns a status other than 200.
 */
async function runSuite(input) {
  process.env.FEATURE_ASSISTANT_MCP_RUNTIME_V1 = input.mcpEnabled ? "1" : "0";
  clearBackendDistCache();
  const { createApp } = require("../dist/server.js");
  const app = createApp();
  const results = [];
  // Cases are sent one at a time, in order.
  for (const testCase of CASES) {
    const res = await request(app).post("/api/assistant/message").send({
      useMock: true,
      promptVersion: "normalizer_v2_0_2",
      user_message: testCase.user_message
    });
    if (res.status !== 200) {
      throw new Error(
        `Suite ${input.suiteName}, case ${testCase.case_id} failed with status=${res.status}`
      );
    }
    results.push(summarizeCase(testCase, res.body, input.suiteName));
  }
  return results;
}

/**
 * Join a markdown header and its row lines with newlines.
 *
 * @param {string} header - Header text (title, column row and separator row).
 * @param {string[]} rows - One markdown table row per entry.
 * @returns {string} The assembled table, without a trailing newline.
 */
function toMarkdownTable(header, rows) {
  return [header, ...rows].join("\n");
}

/**
 * Entry point: run the mock-baseline and live-alignment suites, compute
 * metrics and parity, and write all report artifacts under the run directory
 * given as the first CLI argument.
 *
 * @returns {Promise<void>}
 * @throws {Error} If the run directory argument is missing or a suite fails.
 */
async function main() {
  const runDir = process.argv[2];
  if (!runDir) {
    throw new Error("Usage: node wave19_1LiveAlignmentPack.js <run_dir>");
  }

  const mockRows = await runSuite({ suiteName: "mock_baseline_mcp_off", mcpEnabled: false });
  const liveRows = await runSuite({ suiteName: "live_alignment_mcp_on", mcpEnabled: true });
  const mockMetrics = computeMetrics(mockRows);
  const liveMetrics = computeMetrics(liveRows);
  const parity = computeParity(mockRows, liveRows);

  const beforeAfter = {
    baseline: "mock_baseline_mcp_off",
    after: "live_alignment_mcp_on",
    metrics_before: {
      temporal_alignment_correctness_rate: mockMetrics.temporal_alignment_correctness_rate,
      anchor_pollution_free_rate: mockMetrics.anchor_pollution_free_rate,
      company_scope_resolution_rate: mockMetrics.company_scope_resolution_rate,
      live_positive_grounding_rate: mockMetrics.live_positive_grounding_rate,
      // The baseline is reported with a fixed parity of 1.
      mock_live_parity_rate: 1,
      real_live_inventory_coverage_rate: mockMetrics.real_live_inventory_coverage_rate,
      false_grounded_answer_rate: mockMetrics.false_grounded_answer_rate
    },
    metrics_after: {
      temporal_alignment_correctness_rate: liveMetrics.temporal_alignment_correctness_rate,
      anchor_pollution_free_rate: liveMetrics.anchor_pollution_free_rate,
      company_scope_resolution_rate: liveMetrics.company_scope_resolution_rate,
      live_positive_grounding_rate: liveMetrics.live_positive_grounding_rate,
      mock_live_parity_rate: parity.mock_live_parity_rate,
      real_live_inventory_coverage_rate: liveMetrics.real_live_inventory_coverage_rate,
      false_grounded_answer_rate: liveMetrics.false_grounded_answer_rate
    }
  };
  writeJson(path.join(runDir, "before_after_metrics.json"), beforeAfter);

  // `debug: undefined` is dropped by JSON.stringify, which keeps the probe
  // files small; the live debug payloads are written per case below.
  writeJson(path.join(runDir, "artifacts", "mock_probe_live5.json"), {
    generated_at: new Date().toISOString(),
    suite: "mock_baseline_mcp_off",
    cases: mockRows.map((row) => ({ ...row, debug: undefined }))
  });
  writeJson(path.join(runDir, "artifacts", "live_probe_live5.json"), {
    generated_at: new Date().toISOString(),
    suite: "live_alignment_mcp_on",
    cases: liveRows.map((row) => ({ ...row, debug: undefined }))
  });

  for (const row of liveRows) {
    writeJson(path.join(runDir, "debug_payloads", `${row.case_id}_${row.label}.json`), {
      case_id: row.case_id,
      label: row.label,
      suite_mode: row.suite_mode,
      debug: row.debug
    });
  }

  const temporalAudit = {
    generated_at: new Date().toISOString(),
    cases: liveRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      raw_time_scope: row.temporal.raw_time_scope,
      resolved_primary_period: row.temporal.resolved_primary_period,
      temporal_alignment_status: row.temporal.temporal_alignment_status,
      temporal_guard_basis: row.temporal.temporal_guard_basis,
      eligibility_time_basis: row.eligibility.eligibility_time_basis,
      temporal_guard_outcome: row.temporal.temporal_guard_outcome
    })),
    metric: {
      temporal_alignment_correctness_rate: liveMetrics.temporal_alignment_correctness_rate
    }
  };
  writeJson(path.join(runDir, "temporal_alignment_audit.json"), temporalAudit);

  const anchorAudit = {
    generated_at: new Date().toISOString(),
    cases: liveRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      raw_numeric_tokens: row.anchor_pollution.raw_numeric_tokens,
      classified_numeric_tokens: row.anchor_pollution.classified_numeric_tokens,
      rejected_as_non_accounts: row.anchor_pollution.rejected_as_non_accounts,
      resolved_account_anchors: row.anchor_pollution.resolved_account_anchors,
      pollution_detected: row.anchor_pollution.pollution_detected
    })),
    metric: {
      anchor_pollution_free_rate: liveMetrics.anchor_pollution_free_rate
    }
  };
  writeJson(path.join(runDir, "anchor_pollution_audit.json"), anchorAudit);

  const scopeAudit = {
    generated_at: new Date().toISOString(),
    cases: liveRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      business_scope_raw: row.business_scope.business_scope_raw,
      business_scope_resolved: row.business_scope.business_scope_resolved,
      company_grounding_applied: row.business_scope.company_grounding_applied,
      scope_resolution_reason: row.business_scope.scope_resolution_reason,
      july_snapshot_signal: row.business_scope.july_snapshot_signal
    })),
    metric: {
      company_scope_resolution_rate: liveMetrics.company_scope_resolution_rate
    }
  };
  writeJson(path.join(runDir, "business_scope_resolution_audit.json"), scopeAudit);

  const liveInventory = {
    generated_at: new Date().toISOString(),
    mcp_runtime_enabled: true,
    suite_mode: "live_alignment_mcp_on",
    cases: liveRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      expected_mode: row.expected_mode,
      // Usage and rejection reason are decided per case, then copied onto
      // each of that case's calls.
      live_calls: row.live_calls.map((call) => ({
        ...call,
        used_for_admissible_evidence: row.evidence.admissible_evidence_count > 0,
        rejected_reason:
          row.evidence.admissible_evidence_count > 0
            ? null
            : row.eligibility.reason_codes.length > 0
              ? row.eligibility.reason_codes
              : ["insufficient_admissible_evidence"]
      }))
    }))
  };
  writeJson(path.join(runDir, "real_live_call_inventory.json"), liveInventory);

  const parityHeader =
    "# Mock vs Live Parity Matrix\n\n| Case | Label | Parity Score | Status | Temporal Basis | Anchor Pollution | Business Scope | Eligibility |\n| --- | --- | ---: | --- | --- | --- | --- | --- |";
  const parityRows = parity.rows.map((row) => {
    const lookup = new Map(row.checks.map((item) => [item.key, item.passed ? "pass" : "fail"]));
    return `| ${row.case_id} | ${row.label} | ${row.parity_score} | ${row.parity_status} | ${lookup.get("temporal_basis") ?? "n/a"} | ${lookup.get("anchor_pollution") ?? "n/a"} | ${lookup.get("business_scope") ?? "n/a"} | ${lookup.get("eligibility_outcome") ?? "n/a"} |`;
  });
  writeText(
    path.join(runDir, "mock_vs_live_parity_matrix.md"),
    toMarkdownTable(parityHeader, parityRows)
  );

  const chatLines = ["# Chat Export Live-5", ""];
  for (const row of liveRows) {
    // Collapse all whitespace so each reply stays on a single line.
    const trimmed = row.assistant_reply.replace(/\s+/g, " ").trim();
    chatLines.push(`## ${row.case_id} | ${row.label}`);
    chatLines.push(
      `user: ${CASES.find((item) => item.case_id === row.case_id)?.user_message ?? ""}`
    );
    chatLines.push(`assistant(reply_type=${row.reply_type}): ${trimmed}`);
    chatLines.push("");
  }
  writeText(path.join(runDir, "chat_export_live5.md"), chatLines.join("\n"));

  const groundedHeader =
    "# Grounded Positive vs Limited (Live)\n\n| Case | Label | Expected | Grounding Mode | Admissible Evidence | Eligibility | Reply Type |\n| --- | --- | --- | --- | ---: | --- | --- |";
  const groundedRows = liveRows.map(
    (row) =>
      `| ${row.case_id} | ${row.label} | ${row.expected_mode} | ${row.eligibility.grounding_mode} | ${row.evidence.admissible_evidence_count} | ${row.eligibility.outcome} | ${row.reply_type} |`
  );
  writeText(
    path.join(runDir, "grounded_positive_vs_limited_live.md"),
    toMarkdownTable(groundedHeader, groundedRows)
  );

  // Built line by line: markdown headings and list items only render when
  // each one starts on its own line.
  const liveAlignmentReport = [
    "# Live Alignment Report (Wave 19.1)",
    "",
    "## Scope",
    "- Temporal alignment sync: raw_time_scope -> resolved_primary_period -> guard/eligibility basis.",
    "- Anchor pollution cleanup: date/amount/percent numeric tokens excluded from account anchors.",
    "- Business scope resolution: generic -> company-specific for July 2020 P0 signals.",
    "- Live parity check: mock baseline (MCP OFF) vs live-alignment (MCP ON).",
    "",
    "## Constraints",
    "- Normalizer was executed in `useMock=true` because OPENAI API key is unavailable in this environment.",
    "- MCP runtime was toggled ON for live-alignment suite; inventory contains actual MCP overlay summaries from runtime.",
    "",
    "## Key Metrics (Live)",
    `- temporal_alignment_correctness_rate: ${liveMetrics.temporal_alignment_correctness_rate}`,
    `- anchor_pollution_free_rate: ${liveMetrics.anchor_pollution_free_rate}`,
    `- company_scope_resolution_rate: ${liveMetrics.company_scope_resolution_rate}`,
    `- live_positive_grounding_rate: ${liveMetrics.live_positive_grounding_rate}`,
    `- mock_live_parity_rate: ${parity.mock_live_parity_rate}`,
    `- real_live_inventory_coverage_rate: ${liveMetrics.real_live_inventory_coverage_rate}`,
    `- false_grounded_answer_rate: ${liveMetrics.false_grounded_answer_rate}`,
    ""
  ].join("\n");
  writeText(path.join(runDir, "live_alignment_report.md"), liveAlignmentReport);

  const thresholds = {
    temporal_alignment_correctness_rate: 0.95,
    anchor_pollution_free_rate: 0.95,
    company_scope_resolution_rate: 0.95,
    mock_live_parity_rate: 0.85,
    false_grounded_answer_rate: 0
  };
  const temporalFixed =
    liveMetrics.temporal_alignment_correctness_rate >=
    thresholds.temporal_alignment_correctness_rate;
  const anchorFixed =
    liveMetrics.anchor_pollution_free_rate >= thresholds.anchor_pollution_free_rate;
  const companyScopeFixed =
    liveMetrics.company_scope_resolution_rate >= thresholds.company_scope_resolution_rate;
  const parityReached =
    parity.mock_live_parity_rate >= thresholds.mock_live_parity_rate &&
    liveMetrics.false_grounded_answer_rate <= thresholds.false_grounded_answer_rate;
  // Accepted only if every gate passes; otherwise accepted with limitations
  // as long as no false grounded answers were produced.
  const overallStatus =
    temporalFixed && anchorFixed && companyScopeFixed && parityReached
      ? "WAVE19_1_ACCEPTED"
      : liveMetrics.false_grounded_answer_rate <= 0
        ? "WAVE19_1_ACCEPTED_WITH_LIMITATIONS"
        : "WAVE19_1_NOT_ACCEPTED";

  const runSummary = {
    run_id: path.basename(runDir),
    stage: "Stage_04",
    wave: "Wave_19_1",
    scope: "live_alignment_fix_claim_bound_runtime",
    execution: {
      mock_baseline_suite: "MCP runtime OFF, useMock=true",
      live_alignment_suite: "MCP runtime ON, useMock=true"
    },
    thresholds,
    metrics_live: liveMetrics,
    metrics_parity: {
      mock_live_parity_rate: parity.mock_live_parity_rate
    },
    verdicts: {
      TEMPORAL_ALIGNMENT_FIXED: temporalFixed ? "FIXED" : "NOT_FIXED",
      ANCHOR_POLLUTION_FIXED: anchorFixed ? "FIXED" : "NOT_FIXED",
      COMPANY_SCOPE_FIXED: companyScopeFixed ? "FIXED" : "NOT_FIXED",
      LIVE_PARITY_REACHED: parityReached ? "REACHED" : "NOT_REACHED",
      overall_status: overallStatus
    }
  };
  writeJson(path.join(runDir, "run_summary.json"), runSummary);

  // Same line-by-line construction as the report above.
  const readme = [
    "# Stage 4 / Wave 19.1 - Live Alignment Fix (Claim-Bound Runtime)",
    "",
    "## What was executed",
    "- Backend build + full tests.",
    "- Two control suites on same 5 cases:",
    "  - `mock_baseline_mcp_off`: MCP runtime disabled.",
    "  - `live_alignment_mcp_on`: MCP runtime enabled.",
    "- Normalizer used `useMock=true` due missing OPENAI API key in environment.",
    "",
    "## Output artifacts",
    "- run_summary.json",
    "- before_after_metrics.json",
    "- live_alignment_report.md",
    "- mock_vs_live_parity_matrix.md",
    "- chat_export_live5.md",
    "- debug_payloads/",
    "- real_live_call_inventory.json",
    "- temporal_alignment_audit.json",
    "- anchor_pollution_audit.json",
    "- business_scope_resolution_audit.json",
    "- grounded_positive_vs_limited_live.md",
    "",
    "## Final verdict",
    `- TEMPORAL_ALIGNMENT_FIXED: ${temporalFixed ? "FIXED" : "NOT_FIXED"}`,
    `- ANCHOR_POLLUTION_FIXED: ${anchorFixed ? "FIXED" : "NOT_FIXED"}`,
    `- COMPANY_SCOPE_FIXED: ${companyScopeFixed ? "FIXED" : "NOT_FIXED"}`,
    `- LIVE_PARITY_REACHED: ${parityReached ? "REACHED" : "NOT_REACHED"}`,
    `- Overall: ${overallStatus}`,
    ""
  ].join("\n");
  writeText(path.join(runDir, "README.md"), readme);
}

// Print the stack (or message) of any failure to stderr and exit non-zero.
main().catch((error) => {
  process.stderr.write(
    `${error instanceof Error ? error.stack || error.message : String(error)}\n`
  );
  process.exit(1);
});