// NODEDC_1C/llm_normalizer/backend/scripts/wave19_1LiveAlignmentPack.js
//
// 540 lines
// 22 KiB
// JavaScript

const fs = require("node:fs");
const path = require("node:path");
const request = require("supertest");
/**
 * Fixed set of five probe cases (L1-L5) that runSuite posts to the assistant endpoint.
 * Every message refers to July 2020.
 *
 * Fields:
 * - case_id / label: identifiers carried into every report row; main() also uses them
 *   to build the per-case debug payload file names.
 * - expected_mode: "grounded_positive" or "limited"; computeMetrics counts only the
 *   "grounded_positive" cases in live_positive_grounding_rate.
 * - user_message: the text sent as the request's user_message.
 */
const CASES = [
{
case_id: "L1",
label: "vat_chain_furniture_july2020",
expected_mode: "grounded_positive",
user_message:
"VAT chain july 2020 for furniture purchase and realization: prove document -> invoice -> register -> book linkage and show where chain is complete."
},
{
case_id: "L2",
label: "rbp_writeoff_31_july",
expected_mode: "limited",
user_message:
"RBP writeoff at 31 july 2020: confirm whether residual tail on account 97 is normal residual or unresolved writeoff gap."
},
{
case_id: "L3",
label: "fa_amortization_three_amounts",
expected_mode: "limited",
user_message:
"Fixed asset amortization in july 2020 by three amounts 12000.00, 8000.00, 233.33: detect if any object missed depreciation posting."
},
{
case_id: "L4",
label: "settlement_supplier_60_closure",
expected_mode: "grounded_positive",
user_message:
"Supplier settlement on account 60 in july 2020: payment exists but tail remains open. prove contract/object/closure mechanism."
},
{
case_id: "L5",
label: "month_close_20_44_july",
expected_mode: "grounded_positive",
user_message:
"Month close july 2020 on accounts 20 and 44: prove close operation and distribution chain, separate normal residual from contradiction."
}
];
/**
 * Divides two numbers and rounds the quotient to four decimal places.
 *
 * @param {number} num - Numerator.
 * @param {number} den - Denominator.
 * @returns {number} `num / den` rounded through `toFixed(4)`, or 0 when either
 *   operand is not a finite number or the denominator is not positive.
 */
function ratio(num, den) {
  const operandsUsable = Number.isFinite(num) && Number.isFinite(den) && den > 0;
  if (operandsUsable) {
    const quotient = num / den;
    return Number(quotient.toFixed(4));
  }
  return 0;
}
/**
 * Creates a directory, including any missing parent directories.
 *
 * @param {string} dirPath - Directory to create.
 * @returns {void}
 */
function ensureDir(dirPath) {
  const mkdirOptions = { recursive: true };
  fs.mkdirSync(dirPath, mkdirOptions);
}
/**
 * Writes a value to disk as 2-space-indented JSON followed by a newline.
 * The parent directory is created first when it does not exist.
 *
 * @param {string} filePath - Destination file path.
 * @param {*} payload - Value passed to JSON.stringify.
 * @returns {void}
 */
function writeJson(filePath, payload) {
  const parentDir = path.dirname(filePath);
  ensureDir(parentDir);
  const serialized = JSON.stringify(payload, null, 2);
  fs.writeFileSync(filePath, `${serialized}\n`, "utf8");
}
/**
 * Writes a string to disk as UTF-8, creating the parent directory when needed.
 *
 * @param {string} filePath - Destination file path.
 * @param {string} text - Content written as-is.
 * @returns {void}
 */
function writeText(filePath, text) {
  const parentDir = path.dirname(filePath);
  ensureDir(parentDir);
  fs.writeFileSync(filePath, text, "utf8");
}
/**
 * Removes every entry from `require.cache` whose key contains the path segment
 * `<sep>backend<sep>dist<sep>`, so that the next `require` of those modules
 * loads them again.
 *
 * @returns {void}
 */
function clearBackendDistCache() {
  const distSegment = ["", "backend", "dist", ""].join(path.sep);
  Object.keys(require.cache)
    .filter((modulePath) => modulePath.includes(distSegment))
    .forEach((modulePath) => {
      delete require.cache[modulePath];
    });
}
/**
 * Builds one call record per retrieval result that carries a `summary.live_mcp` object.
 *
 * @param {object} [debug] - Debug payload; only `retrieval_results` is read.
 * @returns {Array<object>} Call records in input order. Results without an
 *   object-valued `summary.live_mcp` are skipped; missing fields fall back to
 *   null, "", 0, [] or "unknown" as coded below.
 */
function extractLiveCallsFromDebug(debug) {
  const retrievalResults = Array.isArray(debug?.retrieval_results) ? debug.retrieval_results : [];

  return retrievalResults.flatMap((entry) => {
    const overlay = entry?.summary?.live_mcp;
    if (!overlay || typeof overlay !== "object") {
      return [];
    }

    const argsSummary = {
      account_scope: Array.isArray(overlay.account_scope) ? overlay.account_scope : [],
      // The overlay's own route wins over the route of the enclosing result.
      route: String(overlay.route ?? entry?.route ?? ""),
      channel: String(overlay.channel ?? ""),
      proxy: String(overlay.proxy ?? "")
    };

    return [
      {
        fragment_id: entry?.fragment_id ?? null,
        route: entry?.route ?? null,
        method: "execute_query",
        args_summary: argsSummary,
        fetched_rows: Number(overlay.fetched_rows ?? 0),
        matched_rows: Number(overlay.matched_rows ?? 0),
        returned_rows: Number(overlay.returned_rows ?? 0),
        status: String(overlay.status ?? "unknown"),
        error: overlay.error ? String(overlay.error) : null
      }
    ];
  });
}
/**
 * Flattens one assistant response into the row shape consumed by the metric,
 * parity and audit steps of this script.
 *
 * @param {{case_id: string, label: string, expected_mode: string, user_message: string}} caseInput
 *   Probe case that produced the response.
 * @param {object} responseBody - Response body; `debug`, `reply_type` and
 *   `assistant_reply` are read, each with a fallback when absent.
 * @param {string} suiteMode - Name of the suite the case was run under.
 * @returns {object} Row with `temporal`, `anchor_pollution`, `business_scope`,
 *   `evidence`, `eligibility`, `live_calls` sections plus the raw `debug` payload.
 */
function summarizeCase(caseInput, responseBody, suiteMode) {
  const listOf = (value) => (Array.isArray(value) ? value : []);
  const textOf = (value) => String(value ?? "");
  const trimmedText = (value) => textOf(value).trim();

  const debug = responseBody?.debug ?? {};
  const temporalGuard = debug?.temporal_guard ?? {};
  const eligibilityGuard = debug?.grounded_answer_eligibility_guard ?? {};
  const admissibilityGate = debug?.evidence_admissibility_gate ?? {};
  const liveCalls = extractLiveCallsFromDebug(debug);
  const classifiedTokens = listOf(debug?.classified_numeric_tokens);
  const accountAnchors = listOf(debug?.resolved_account_anchors);

  // An anchor is "polluted" when the same token text was classified as anything
  // other than an account token.
  const classifiedAsNonAccount = (anchor) => (entry) =>
    trimmedText(entry?.token) === trimmedText(anchor) &&
    trimmedText(entry?.classification) !== "account_token";
  const pollutionDetected = accountAnchors.some((anchor) =>
    classifiedTokens.some(classifiedAsNonAccount(anchor))
  );

  const mentionsJuly = /(?:2020[-/.]0?7|july|июл)/i.test(textOf(caseInput.user_message));

  const temporal = {
    raw_time_scope: temporalGuard?.raw_time_scope ?? null,
    resolved_primary_period: temporalGuard?.resolved_primary_period ?? null,
    temporal_alignment_status: temporalGuard?.temporal_alignment_status ?? null,
    temporal_guard_basis: temporalGuard?.temporal_guard_basis ?? null,
    temporal_guard_outcome: temporalGuard?.temporal_guard_outcome ?? null
  };

  const anchorPollution = {
    raw_numeric_tokens: listOf(debug?.raw_numeric_tokens),
    classified_numeric_tokens: classifiedTokens,
    rejected_as_non_accounts: listOf(debug?.rejected_as_non_accounts),
    resolved_account_anchors: accountAnchors,
    pollution_detected: pollutionDetected
  };

  const businessScope = {
    business_scope_raw: listOf(debug?.business_scope_raw),
    business_scope_resolved: listOf(debug?.business_scope_resolved),
    company_grounding_applied: Boolean(debug?.company_grounding_applied),
    scope_resolution_reason: listOf(debug?.scope_resolution_reason),
    july_snapshot_signal: mentionsJuly
  };

  const evidence = {
    candidate_evidence_total: Number(admissibilityGate?.candidate_evidence_total ?? 0),
    admissible_evidence_count: Number(admissibilityGate?.admissible_evidence_count ?? 0),
    rejected_evidence_count: Number(admissibilityGate?.rejected_evidence_count ?? 0)
  };

  const eligibility = {
    eligible: Boolean(eligibilityGuard?.eligible),
    grounding_mode: textOf(eligibilityGuard?.grounding_mode),
    outcome: textOf(eligibilityGuard?.outcome),
    reason_codes: listOf(eligibilityGuard?.reason_codes),
    temporal_passed: Boolean(eligibilityGuard?.temporal_passed),
    eligibility_time_basis: textOf(eligibilityGuard?.eligibility_time_basis),
    business_scope_passed: Boolean(eligibilityGuard?.business_scope_passed)
  };

  return {
    case_id: caseInput.case_id,
    label: caseInput.label,
    expected_mode: caseInput.expected_mode,
    suite_mode: suiteMode,
    trace_id: textOf(debug?.trace_id),
    reply_type: textOf(responseBody?.reply_type),
    assistant_reply: textOf(responseBody?.assistant_reply),
    temporal,
    anchor_pollution: anchorPollution,
    business_scope: businessScope,
    evidence,
    eligibility,
    live_calls: liveCalls,
    debug
  };
}
/**
 * Aggregates summarized case rows into the suite-level rate metrics.
 *
 * @param {Array<object>} rows - Rows in the shape returned by summarizeCase.
 * @returns {object} `case_count` plus six rates, each produced by `ratio`:
 *   - temporal_alignment_correctness_rate: among rows whose guard basis is
 *     "resolved_primary_period", the share with status "aligned" or "corrected";
 *   - anchor_pollution_free_rate: share of rows without detected pollution;
 *   - company_scope_resolution_rate: share of rows that either carry no July
 *     signal or resolved to "company_specific_accounting";
 *   - live_positive_grounding_rate: among rows expected to be grounded_positive,
 *     the share whose grounding mode is grounded_positive;
 *   - false_grounded_answer_rate: share of rows grounded_positive with no
 *     admissible evidence;
 *   - real_live_inventory_coverage_rate: share of rows with at least one live call.
 */
function computeMetrics(rows) {
  const GROUNDED = "grounded_positive";
  // Denominator is clamped to 1 so an empty population yields 0.
  const share = (subset, population) => ratio(subset.length, Math.max(1, population.length));

  const expectedGrounded = rows.filter(({ expected_mode }) => expected_mode === GROUNDED);

  const periodChecked = rows.filter(
    ({ temporal }) => temporal.temporal_guard_basis === "resolved_primary_period"
  );
  const periodAligned = periodChecked.filter(({ temporal }) => {
    const status = String(temporal.temporal_alignment_status);
    return status === "aligned" || status === "corrected";
  });

  const pollutionFree = rows.filter(({ anchor_pollution }) => !anchor_pollution.pollution_detected);

  const scopeResolved = rows.filter(({ business_scope }) => {
    if (!business_scope.july_snapshot_signal) {
      return true;
    }
    return business_scope.business_scope_resolved.includes("company_specific_accounting");
  });

  const actuallyGrounded = expectedGrounded.filter(
    ({ eligibility }) => eligibility.grounding_mode === GROUNDED
  );

  const groundedWithoutEvidence = rows.filter(
    ({ eligibility, evidence }) =>
      eligibility.grounding_mode === GROUNDED && evidence.admissible_evidence_count <= 0
  );

  const withLiveCalls = rows.filter(({ live_calls }) => live_calls.length > 0);

  return {
    case_count: rows.length,
    temporal_alignment_correctness_rate: share(periodAligned, periodChecked),
    anchor_pollution_free_rate: share(pollutionFree, rows),
    company_scope_resolution_rate: share(scopeResolved, rows),
    live_positive_grounding_rate: share(actuallyGrounded, expectedGrounded),
    false_grounded_answer_rate: share(groundedWithoutEvidence, rows),
    real_live_inventory_coverage_rate: share(withLiveCalls, rows)
  };
}
/**
 * Compares each live row with the mock row of the same case_id.
 *
 * Four checks are evaluated per case: temporal guard basis, anchor pollution
 * flag, company-scope resolution, and eligibility outcome. The eligibility check
 * also passes when the mock was "limited_or_insufficient_evidence" and the live
 * run reached "grounded_positive".
 *
 * @param {Array<object>} mockRows - Rows from the mock baseline suite.
 * @param {Array<object>} liveRows - Rows from the live suite.
 * @returns {{rows: Array<object>, mock_live_parity_rate: number}} Per-case
 *   results (score = passed checks / total checks; status "match_or_improved"
 *   at score >= 0.75, otherwise "diverged"; "missing_mock_case" with score 0
 *   when no mock row exists) and the mean score across live rows.
 */
function computeParity(mockRows, liveRows) {
  const COMPANY_SCOPE = "company_specific_accounting";
  const mockByCaseId = new Map(mockRows.map((row) => [row.case_id, row]));
  const hasCompanyScope = (row) => row.business_scope.business_scope_resolved.includes(COMPANY_SCOPE);

  const compareCase = (live) => {
    const { case_id, label } = live;
    const mock = mockByCaseId.get(case_id);
    if (!mock) {
      return {
        case_id,
        label,
        parity_score: 0,
        parity_status: "missing_mock_case",
        checks: []
      };
    }

    const sameBasis =
      String(mock.temporal.temporal_guard_basis) === String(live.temporal.temporal_guard_basis);
    const samePollution =
      Boolean(mock.anchor_pollution.pollution_detected) === Boolean(live.anchor_pollution.pollution_detected);
    const sameScope = hasCompanyScope(mock) === hasCompanyScope(live);
    const sameOutcome = String(live.eligibility.outcome) === String(mock.eligibility.outcome);
    const upgradedToGrounded =
      String(mock.eligibility.grounding_mode) === "limited_or_insufficient_evidence" &&
      String(live.eligibility.grounding_mode) === "grounded_positive";

    const checks = [
      { key: "temporal_basis", passed: sameBasis },
      { key: "anchor_pollution", passed: samePollution },
      { key: "business_scope", passed: sameScope },
      { key: "eligibility_outcome", passed: sameOutcome || upgradedToGrounded }
    ];

    const passedCount = checks.reduce((count, check) => (check.passed ? count + 1 : count), 0);
    const parityScore = ratio(passedCount, Math.max(1, checks.length));

    return {
      case_id,
      label,
      parity_score: parityScore,
      parity_status: parityScore >= 0.75 ? "match_or_improved" : "diverged",
      checks
    };
  };

  const rows = liveRows.map((live) => compareCase(live));

  let scoreSum = 0;
  for (const item of rows) {
    scoreSum += Number(item.parity_score ?? 0);
  }

  return {
    rows,
    mock_live_parity_rate: ratio(scoreSum, Math.max(1, rows.length))
  };
}
/**
 * Runs every case in CASES against a freshly loaded backend app and returns the
 * summarized rows.
 *
 * Side effects: sets `process.env.FEATURE_ASSISTANT_MCP_RUNTIME_V1` to "1" or
 * "0" and clears the backend dist entries from the require cache before
 * requiring `../dist/server.js`.
 *
 * @param {{suiteName: string, mcpEnabled: boolean}} input - Suite name (stored
 *   on each row and used in error messages) and the MCP runtime toggle.
 * @returns {Promise<Array<object>>} One summarizeCase row per case, in CASES order.
 * @throws {Error} When any request answers with a status other than 200.
 */
async function runSuite(input) {
  const { suiteName, mcpEnabled } = input;

  process.env.FEATURE_ASSISTANT_MCP_RUNTIME_V1 = mcpEnabled ? "1" : "0";
  clearBackendDistCache();
  const { createApp } = require("../dist/server.js");
  const app = createApp();

  const summaries = [];
  // Requests are issued one at a time, in CASES order.
  for (const probe of CASES) {
    const requestBody = {
      useMock: true,
      promptVersion: "normalizer_v2_0_2",
      user_message: probe.user_message
    };
    const response = await request(app).post("/api/assistant/message").send(requestBody);
    if (response.status !== 200) {
      throw new Error(`Suite ${suiteName}, case ${probe.case_id} failed with status=${response.status}`);
    }
    summaries.push(summarizeCase(probe, response.body, suiteName));
  }
  return summaries;
}
/**
 * Joins a header and its rows into one newline-separated string.
 *
 * @param {string} header - Text placed first (may itself contain newlines).
 * @param {Iterable<string>} rows - Row lines appended after the header.
 * @returns {string} Header and rows joined with "\n", without a trailing newline.
 */
function toMarkdownTable(header, rows) {
  const lines = [header];
  for (const row of rows) {
    lines.push(row);
  }
  return lines.join("\n");
}
/**
 * Script entry point: runs the mock baseline suite (MCP off) and the live
 * alignment suite (MCP on), computes metrics and parity, and writes every
 * report artifact under the run directory given as the first CLI argument.
 *
 * Files written, in order: before_after_metrics.json, artifacts/mock_probe_live5.json,
 * artifacts/live_probe_live5.json, debug_payloads/<case>_<label>.json,
 * temporal_alignment_audit.json, anchor_pollution_audit.json,
 * business_scope_resolution_audit.json, real_live_call_inventory.json,
 * mock_vs_live_parity_matrix.md, chat_export_live5.md,
 * grounded_positive_vs_limited_live.md, live_alignment_report.md,
 * run_summary.json, README.md.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the run directory argument is missing; errors thrown by
 *   runSuite or the file writers propagate unchanged.
 */
async function main() {
  const runDir = process.argv[2];
  if (!runDir) {
    throw new Error("Usage: node wave19_1LiveAlignmentPack.js <run-dir>");
  }

  const MOCK_SUITE = "mock_baseline_mcp_off";
  const LIVE_SUITE = "live_alignment_mcp_on";
  const outputPath = (...segments) => path.join(runDir, ...segments);
  const nowIso = () => new Date().toISOString();
  // Renders cells exactly as template interpolation would, then wraps them in pipes.
  const tableRow = (cells) => `| ${cells.map((cell) => `${cell}`).join(" | ")} |`;

  // The suites run one after the other: runSuite sets a process-wide env flag
  // before loading the app.
  const mockRows = await runSuite({ suiteName: MOCK_SUITE, mcpEnabled: false });
  const liveRows = await runSuite({ suiteName: LIVE_SUITE, mcpEnabled: true });

  const mockMetrics = computeMetrics(mockRows);
  const liveMetrics = computeMetrics(liveRows);
  const parity = computeParity(mockRows, liveRows);

  // --- before/after metrics -------------------------------------------------
  const rateSnapshot = (metrics, parityRate) => ({
    temporal_alignment_correctness_rate: metrics.temporal_alignment_correctness_rate,
    anchor_pollution_free_rate: metrics.anchor_pollution_free_rate,
    company_scope_resolution_rate: metrics.company_scope_resolution_rate,
    live_positive_grounding_rate: metrics.live_positive_grounding_rate,
    mock_live_parity_rate: parityRate,
    real_live_inventory_coverage_rate: metrics.real_live_inventory_coverage_rate,
    false_grounded_answer_rate: metrics.false_grounded_answer_rate
  });
  writeJson(outputPath("before_after_metrics.json"), {
    baseline: MOCK_SUITE,
    after: LIVE_SUITE,
    // The baseline's parity with itself is recorded as the constant 1.
    metrics_before: rateSnapshot(mockMetrics, 1),
    metrics_after: rateSnapshot(liveMetrics, parity.mock_live_parity_rate)
  });

  // --- per-suite probe dumps (debug payload omitted from the JSON) ----------
  const probeDumps = [
    ["mock_probe_live5.json", MOCK_SUITE, mockRows],
    ["live_probe_live5.json", LIVE_SUITE, liveRows]
  ];
  for (const [fileName, suite, suiteRows] of probeDumps) {
    writeJson(outputPath("artifacts", fileName), {
      generated_at: nowIso(),
      suite,
      cases: suiteRows.map((row) => ({ ...row, debug: undefined }))
    });
  }

  // --- full debug payload per live case -------------------------------------
  liveRows.forEach(({ case_id, label, suite_mode, debug }) => {
    writeJson(outputPath("debug_payloads", `${case_id}_${label}.json`), {
      case_id,
      label,
      suite_mode,
      debug
    });
  });

  // --- audits ----------------------------------------------------------------
  writeJson(outputPath("temporal_alignment_audit.json"), {
    generated_at: nowIso(),
    cases: liveRows.map(({ case_id, label, temporal, eligibility }) => ({
      case_id,
      label,
      raw_time_scope: temporal.raw_time_scope,
      resolved_primary_period: temporal.resolved_primary_period,
      temporal_alignment_status: temporal.temporal_alignment_status,
      temporal_guard_basis: temporal.temporal_guard_basis,
      eligibility_time_basis: eligibility.eligibility_time_basis,
      temporal_guard_outcome: temporal.temporal_guard_outcome
    })),
    metric: {
      temporal_alignment_correctness_rate: liveMetrics.temporal_alignment_correctness_rate
    }
  });

  writeJson(outputPath("anchor_pollution_audit.json"), {
    generated_at: nowIso(),
    cases: liveRows.map(({ case_id, label, anchor_pollution: anchors }) => ({
      case_id,
      label,
      raw_numeric_tokens: anchors.raw_numeric_tokens,
      classified_numeric_tokens: anchors.classified_numeric_tokens,
      rejected_as_non_accounts: anchors.rejected_as_non_accounts,
      resolved_account_anchors: anchors.resolved_account_anchors,
      pollution_detected: anchors.pollution_detected
    })),
    metric: {
      anchor_pollution_free_rate: liveMetrics.anchor_pollution_free_rate
    }
  });

  writeJson(outputPath("business_scope_resolution_audit.json"), {
    generated_at: nowIso(),
    cases: liveRows.map(({ case_id, label, business_scope: scope }) => ({
      case_id,
      label,
      business_scope_raw: scope.business_scope_raw,
      business_scope_resolved: scope.business_scope_resolved,
      company_grounding_applied: scope.company_grounding_applied,
      scope_resolution_reason: scope.scope_resolution_reason,
      july_snapshot_signal: scope.july_snapshot_signal
    })),
    metric: {
      company_scope_resolution_rate: liveMetrics.company_scope_resolution_rate
    }
  });

  // --- live call inventory ---------------------------------------------------
  // Each call is tagged with whether its case ended up with admissible evidence;
  // when it did not, the case's reason codes (or a default code) are attached.
  const annotateCall = (row) => (call) => {
    const admitted = row.evidence.admissible_evidence_count > 0;
    let rejectedReason = null;
    if (!admitted) {
      const codes = row.eligibility.reason_codes;
      rejectedReason = codes.length > 0 ? codes : ["insufficient_admissible_evidence"];
    }
    return {
      ...call,
      used_for_admissible_evidence: admitted,
      rejected_reason: rejectedReason
    };
  };
  writeJson(outputPath("real_live_call_inventory.json"), {
    generated_at: nowIso(),
    mcp_runtime_enabled: true,
    suite_mode: LIVE_SUITE,
    cases: liveRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      expected_mode: row.expected_mode,
      live_calls: row.live_calls.map(annotateCall(row))
    }))
  });

  // --- parity matrix (markdown) ---------------------------------------------
  const parityHeader =
    "# Mock vs Live Parity Matrix\n\n| Case | Label | Parity Score | Status | Temporal Basis | Anchor Pollution | Business Scope | Eligibility |\n| --- | --- | ---: | --- | --- | --- | --- | --- |";
  const parityCheckKeys = ["temporal_basis", "anchor_pollution", "business_scope", "eligibility_outcome"];
  const parityLines = parity.rows.map(({ case_id, label, parity_score, parity_status, checks }) => {
    const outcomeByKey = new Map(checks.map(({ key, passed }) => [key, passed ? "pass" : "fail"]));
    const checkCells = parityCheckKeys.map((key) => outcomeByKey.get(key) ?? "n/a");
    return tableRow([case_id, label, parity_score, parity_status, ...checkCells]);
  });
  writeText(outputPath("mock_vs_live_parity_matrix.md"), toMarkdownTable(parityHeader, parityLines));

  // --- chat export (markdown) -----------------------------------------------
  const chatSections = liveRows.flatMap((row) => {
    const question = CASES.find((item) => item.case_id === row.case_id)?.user_message ?? "";
    // Collapse all whitespace runs so each reply stays on one line.
    const answer = row.assistant_reply.replace(/\s+/g, " ").trim();
    return [
      `## ${row.case_id} | ${row.label}`,
      `user: ${question}`,
      `assistant(reply_type=${row.reply_type}): ${answer}`,
      ""
    ];
  });
  const chatLines = ["# Chat Export Live-5", "", ...chatSections];
  writeText(outputPath("chat_export_live5.md"), chatLines.join("\n"));

  // --- grounded vs limited table (markdown) ---------------------------------
  const groundedHeader =
    "# Grounded Positive vs Limited (Live)\n\n| Case | Label | Expected | Grounding Mode | Admissible Evidence | Eligibility | Reply Type |\n| --- | --- | --- | --- | ---: | --- | --- |";
  const groundedLines = liveRows.map((row) =>
    tableRow([
      row.case_id,
      row.label,
      row.expected_mode,
      row.eligibility.grounding_mode,
      row.evidence.admissible_evidence_count,
      row.eligibility.outcome,
      row.reply_type
    ])
  );
  writeText(outputPath("grounded_positive_vs_limited_live.md"), toMarkdownTable(groundedHeader, groundedLines));

  // --- live alignment report (markdown) -------------------------------------
  // The trailing empty element produces the final newline.
  const liveAlignmentReport = [
    "# Live Alignment Report (Wave 19.1)",
    "## Scope",
    "- Temporal alignment sync: raw_time_scope -> resolved_primary_period -> guard/eligibility basis.",
    "- Anchor pollution cleanup: date/amount/percent numeric tokens excluded from account anchors.",
    "- Business scope resolution: generic -> company-specific for July 2020 P0 signals.",
    "- Live parity check: mock baseline (MCP OFF) vs live-alignment (MCP ON).",
    "## Constraints",
    "- Normalizer was executed in `useMock=true` because OPENAI API key is unavailable in this environment.",
    "- MCP runtime was toggled ON for live-alignment suite; inventory contains actual MCP overlay summaries from runtime.",
    "## Key Metrics (Live)",
    `- temporal_alignment_correctness_rate: ${liveMetrics.temporal_alignment_correctness_rate}`,
    `- anchor_pollution_free_rate: ${liveMetrics.anchor_pollution_free_rate}`,
    `- company_scope_resolution_rate: ${liveMetrics.company_scope_resolution_rate}`,
    `- live_positive_grounding_rate: ${liveMetrics.live_positive_grounding_rate}`,
    `- mock_live_parity_rate: ${parity.mock_live_parity_rate}`,
    `- real_live_inventory_coverage_rate: ${liveMetrics.real_live_inventory_coverage_rate}`,
    `- false_grounded_answer_rate: ${liveMetrics.false_grounded_answer_rate}`,
    ""
  ].join("\n");
  writeText(outputPath("live_alignment_report.md"), liveAlignmentReport);

  // --- verdicts --------------------------------------------------------------
  const thresholds = {
    temporal_alignment_correctness_rate: 0.95,
    anchor_pollution_free_rate: 0.95,
    company_scope_resolution_rate: 0.95,
    mock_live_parity_rate: 0.85,
    false_grounded_answer_rate: 0
  };
  const temporalFixed =
    liveMetrics.temporal_alignment_correctness_rate >= thresholds.temporal_alignment_correctness_rate;
  const anchorFixed = liveMetrics.anchor_pollution_free_rate >= thresholds.anchor_pollution_free_rate;
  const companyScopeFixed =
    liveMetrics.company_scope_resolution_rate >= thresholds.company_scope_resolution_rate;
  const parityReached =
    parity.mock_live_parity_rate >= thresholds.mock_live_parity_rate &&
    liveMetrics.false_grounded_answer_rate <= thresholds.false_grounded_answer_rate;

  // Full acceptance needs every gate; with no false grounded answers the run is
  // still accepted with limitations; otherwise it is rejected.
  let overallStatus = "WAVE19_1_NOT_ACCEPTED";
  if (temporalFixed && anchorFixed && companyScopeFixed && parityReached) {
    overallStatus = "WAVE19_1_ACCEPTED";
  } else if (liveMetrics.false_grounded_answer_rate <= 0) {
    overallStatus = "WAVE19_1_ACCEPTED_WITH_LIMITATIONS";
  }

  const fixedLabel = (ok) => (ok ? "FIXED" : "NOT_FIXED");
  const temporalVerdict = fixedLabel(temporalFixed);
  const anchorVerdict = fixedLabel(anchorFixed);
  const companyScopeVerdict = fixedLabel(companyScopeFixed);
  const parityVerdict = parityReached ? "REACHED" : "NOT_REACHED";

  writeJson(outputPath("run_summary.json"), {
    run_id: path.basename(runDir),
    stage: "Stage_04",
    wave: "Wave_19_1",
    scope: "live_alignment_fix_claim_bound_runtime",
    execution: {
      mock_baseline_suite: "MCP runtime OFF, useMock=true",
      live_alignment_suite: "MCP runtime ON, useMock=true"
    },
    thresholds,
    metrics_live: liveMetrics,
    metrics_parity: {
      mock_live_parity_rate: parity.mock_live_parity_rate
    },
    verdicts: {
      TEMPORAL_ALIGNMENT_FIXED: temporalVerdict,
      ANCHOR_POLLUTION_FIXED: anchorVerdict,
      COMPANY_SCOPE_FIXED: companyScopeVerdict,
      LIVE_PARITY_REACHED: parityVerdict,
      overall_status: overallStatus
    }
  });

  // --- README ----------------------------------------------------------------
  // The trailing empty element produces the final newline.
  const readme = [
    "# Stage 4 / Wave 19.1 - Live Alignment Fix (Claim-Bound Runtime)",
    "## What was executed",
    "- Backend build + full tests.",
    "- Two control suites on same 5 cases:",
    "- `mock_baseline_mcp_off`: MCP runtime disabled.",
    "- `live_alignment_mcp_on`: MCP runtime enabled.",
    "- Normalizer used `useMock=true` due missing OPENAI API key in environment.",
    "## Output artifacts",
    "- run_summary.json",
    "- before_after_metrics.json",
    "- live_alignment_report.md",
    "- mock_vs_live_parity_matrix.md",
    "- chat_export_live5.md",
    "- debug_payloads/",
    "- real_live_call_inventory.json",
    "- temporal_alignment_audit.json",
    "- anchor_pollution_audit.json",
    "- business_scope_resolution_audit.json",
    "- grounded_positive_vs_limited_live.md",
    "## Final verdict",
    `- TEMPORAL_ALIGNMENT_FIXED: ${temporalVerdict}`,
    `- ANCHOR_POLLUTION_FIXED: ${anchorVerdict}`,
    `- COMPANY_SCOPE_FIXED: ${companyScopeVerdict}`,
    `- LIVE_PARITY_REACHED: ${parityVerdict}`,
    `- Overall: ${overallStatus}`,
    ""
  ].join("\n");
  writeText(outputPath("README.md"), readme);
}
// Run the script; on failure print the stack (or message) to stderr and exit with status 1.
main().catch((failure) => {
  const details = failure instanceof Error ? failure.stack || failure.message : String(failure);
  process.stderr.write(`${details}\n`);
  process.exit(1);
});