// NODEDC_1C/llm_normalizer/backend/scripts/wave19ExportArtifacts.js
//
// 279 lines
// 13 KiB
// JavaScript

const fs = require("node:fs");
const path = require("node:path");
/**
 * Load a UTF-8 JSON document from disk and return the parsed value.
 *
 * @param {string} filePath - Location of the JSON file to read.
 * @returns {*} The value the document parses to.
 * @throws {Error} When the file cannot be read or does not contain valid JSON.
 */
function readJson(filePath) {
  const rawText = fs.readFileSync(filePath, { encoding: "utf8" });
  const parsed = JSON.parse(rawText);
  return parsed;
}
/**
 * Serialise a value as 2-space-indented JSON followed by a newline and write it
 * to disk as UTF-8, creating any missing parent directories first.
 *
 * @param {string} filePath - Destination file path.
 * @param {*} payload - Value handed to `JSON.stringify`.
 * @returns {void}
 */
function writeJson(filePath, payload) {
  const parentDir = path.dirname(filePath);
  fs.mkdirSync(parentDir, { recursive: true });

  const serialized = JSON.stringify(payload, null, 2);
  fs.writeFileSync(filePath, `${serialized}\n`, { encoding: "utf8" });
}
/**
 * Write a string to disk as UTF-8, creating any missing parent directories first.
 *
 * @param {string} filePath - Destination file path.
 * @param {string} text - Content written verbatim.
 * @returns {void}
 */
function writeText(filePath, text) {
  const targetDir = path.dirname(filePath);
  fs.mkdirSync(targetDir, { recursive: true });
  fs.writeFileSync(filePath, text, { encoding: "utf8" });
}
/**
 * Render one Markdown table row from a list of cell values.
 *
 * Cell text is sanitised so that data cannot break the table layout:
 * literal `|` characters are backslash-escaped and line breaks are replaced
 * by a single space. `null` / `undefined` cells render as empty, matching
 * what `Array.prototype.join` produces.
 *
 * @param {Array<*>} values - Cell values, in column order.
 * @returns {string} A row of the form `| a | b | c |`.
 */
function toTableRow(values) {
  const cells = values.map((value) => {
    const text = value == null ? "" : String(value);
    // A raw pipe would start a new column; a raw newline would end the row.
    return text.replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, " ");
  });
  return `| ${cells.join(" | ")} |`;
}
/**
 * Flatten every `evidence` entry found under `debugPayload.retrieval_results`
 * into a single list, preserving order.
 *
 * Results whose `evidence` is not an array contribute nothing; a missing or
 * non-array `retrieval_results` yields an empty list.
 *
 * @param {object|null|undefined} debugPayload - Debug payload of one probe case.
 * @returns {Array<*>} The evidence items, in result order then evidence order.
 */
function collectEvidence(debugPayload) {
  const results = debugPayload?.retrieval_results;
  if (!Array.isArray(results)) {
    return [];
  }
  return results.flatMap((entry) => (Array.isArray(entry?.evidence) ? entry.evidence : []));
}
/**
 * Build one summary record per retrieval result that carries a truthy
 * `summary.live_mcp` object; results without one are skipped.
 *
 * @param {object|null|undefined} debugPayload - Debug payload of one probe case.
 * @returns {Array<{fragment_id: *, route: *, matched_rows: number, account_scope: Array<*>, method: *, args_summary: *}>}
 *   Records in result order. `matched_rows` is coerced with `Number` (0 when
 *   absent), `account_scope` falls back to `[]` when it is not an array, and the
 *   remaining fields fall back to `null`.
 */
function collectLiveSummaries(debugPayload) {
  const results = Array.isArray(debugPayload?.retrieval_results) ? debugPayload.retrieval_results : [];

  return results.flatMap((entry) => {
    const liveCall = entry?.summary?.live_mcp;
    if (!liveCall) {
      return [];
    }

    const scope = liveCall.account_scope;
    return [
      {
        fragment_id: entry.fragment_id ?? null,
        route: entry.route ?? null,
        matched_rows: Number(liveCall.matched_rows ?? 0),
        account_scope: Array.isArray(scope) ? scope : [],
        method: liveCall.method ?? null,
        args_summary: liveCall.args ?? null
      }
    ];
  });
}
/**
 * Script entry point: reads `<run-dir>/artifacts/final_probe_core8.json` and writes
 * the derived Wave 19 report files (JSON audits, Markdown matrices, debug payload
 * dumps, evidence examples, chat export, run summary and README) into the run
 * directory.
 *
 * The run directory is taken from `process.argv[2]`.
 *
 * @returns {void}
 * @throws {Error} When no run directory argument is given. File-system and JSON
 *   parse errors raised by the helper functions are not caught here.
 */
function main() {
const runDir = process.argv[2];
if (!runDir) {
throw new Error("Usage: node wave19ExportArtifacts.js <run-dir>");
}
// Load the probe output; non-array `cases` / `full_payloads` fall back to empty lists.
const probePath = path.join(runDir, "artifacts", "final_probe_core8.json");
const probe = readJson(probePath);
const caseSummary = Array.isArray(probe?.cases) ? probe.cases : [];
const fullPayloads = Array.isArray(probe?.full_payloads) ? probe.full_payloads : [];
// Index the full payloads by stringified case_id so summary rows can look them up.
const byCase = new Map(fullPayloads.map((item) => [String(item.case_id), item]));
// claim_anchor_audit.json: per-case claim anchor fields; missing numeric fields default to 0.
const claimAnchorAudit = {
generated_at: new Date().toISOString(),
source: "artifacts/final_probe_core8.json",
cases: caseSummary.map((row) => ({
case_id: row.case_id,
label: row.label,
claim_type: row.claim_anchor_audit?.claim_type ?? null,
required_anchors: row.claim_anchor_audit?.required_anchors ?? 0,
missing_anchors: row.claim_anchor_audit?.missing_anchors ?? 0,
claim_anchor_resolution_rate: row.claim_anchor_audit?.claim_anchor_resolution_rate ?? 0,
claim_anchors_passed: Boolean(row.grounded_eligibility?.claim_anchors_passed)
}))
};
writeJson(path.join(runDir, "claim_anchor_audit.json"), claimAnchorAudit);
// targeted_evidence_acquisition_report.json: per-case targeted evidence counters, defaulting to 0.
const targetedEvidenceReport = {
generated_at: new Date().toISOString(),
source: "artifacts/final_probe_core8.json",
cases: caseSummary.map((row) => ({
case_id: row.case_id,
label: row.label,
required_checks: row.targeted_evidence_audit?.required_checks ?? 0,
targeted_item_hits: row.targeted_evidence_audit?.targeted_item_hits ?? 0,
targeted_evidence_hits: row.targeted_evidence_audit?.targeted_evidence_hits ?? 0,
targeted_evidence_hit_rate: row.targeted_evidence_audit?.targeted_evidence_hit_rate ?? 0
}))
};
writeJson(path.join(runDir, "targeted_evidence_acquisition_report.json"), targetedEvidenceReport);
// temporal_expansion_audit.json: counts evidence items whose payload has a truthy
// `context_expansion_reason` and lists the distinct non-empty reasons per case.
const temporalExpansionAudit = {
generated_at: new Date().toISOString(),
source: "artifacts/final_probe_core8.json",
cases: caseSummary.map((row) => {
const full = byCase.get(String(row.case_id));
const evidence = collectEvidence(full?.debug ?? {});
const expanded = evidence.filter((item) => Boolean(item?.payload?.context_expansion_reason));
return {
case_id: row.case_id,
label: row.label,
temporal_guard: row.temporal_guard,
controlled_temporal_expansion_hits: expanded.length,
expansion_reasons: Array.from(new Set(expanded.map((item) => String(item?.payload?.context_expansion_reason || "")))).filter(
Boolean
)
};
})
};
writeJson(path.join(runDir, "temporal_expansion_audit.json"), temporalExpansionAudit);
// live_call_inventory.json: per-case list of live_mcp call summaries taken from the debug payload.
const liveCallInventory = {
generated_at: new Date().toISOString(),
mode: String(probe?.mode ?? ""),
cases: caseSummary.map((row) => {
const full = byCase.get(String(row.case_id));
return {
case_id: row.case_id,
label: row.label,
live_calls: collectLiveSummaries(full?.debug ?? {})
};
})
};
writeJson(path.join(runDir, "live_call_inventory.json"), liveCallInventory);
// Fixed selection of cases whose full debug payload is dumped to debug_payloads/<name>.json.
const debugMap = {
supplier60_case: "Q1",
customer62_case: "Q3",
vat_case: "Q4",
month_close_tail_case: "Q6",
month_close_rbp_case: "Q7"
};
for (const [name, caseId] of Object.entries(debugMap)) {
const full = byCase.get(caseId);
// Cases missing from the probe's full payloads are skipped silently.
if (!full) continue;
writeJson(path.join(runDir, "debug_payloads", `${name}.json`), {
case_id: caseId,
label: full.label,
reply_type: full.reply_type,
assistant_reply: full.assistant_reply,
debug: full.debug
});
}
// Evidence pack examples: one file per listed case, holding at most the first 8 evidence items.
const evidenceExamples = [
{ case_id: "Q1", target: path.join(runDir, "evidence_pack_examples", "settlement", "Q1_supplier60_example.json") },
{ case_id: "Q4", target: path.join(runDir, "evidence_pack_examples", "VAT", "Q4_vat_chain_example.json") },
{ case_id: "Q7", target: path.join(runDir, "evidence_pack_examples", "month-close", "Q7_rbp_example.json") }
];
for (const item of evidenceExamples) {
const full = byCase.get(item.case_id);
if (!full) continue;
const evidence = collectEvidence(full.debug).slice(0, 8);
writeJson(item.target, {
case_id: item.case_id,
label: full.label,
reply_type: full.reply_type,
evidence_count: evidence.length,
evidence
});
}
// grounded_positive_vs_limited_matrix.md: one Markdown table row per case.
const matrixHeader =
"# Grounded Positive vs Limited Matrix\n\n| Case | Label | Claim Type | Admissible Evidence | Grounding Mode | Reply Type |\n| --- | --- | --- | ---: | --- | --- |";
const matrixRows = caseSummary.map((row) =>
toTableRow([
String(row.case_id),
String(row.label),
String(row.claim_anchor_audit?.claim_type ?? ""),
String(row.admissibility_audit?.admissible_evidence_count ?? 0),
String(row.grounded_eligibility?.grounding_mode ?? ""),
String(row.reply_type ?? "")
])
);
writeText(path.join(runDir, "grounded_positive_vs_limited_matrix.md"), `${matrixHeader}\n${matrixRows.join("\n")}\n`);
// control_case_matrix.md: one Markdown table row per case.
// NOTE(review): the header text states `useMock=true` as a literal; it is not derived from probe.mode.
const controlHeader =
"# Control Case Matrix (Wave 19)\n\nSource: `artifacts/final_probe_core8.json` (`useMock=true`)\n\n| Case | Domain | Temporal | Claim Anchors | Targeted Hit Rate | Admissible Evidence | Eligibility |\n| --- | --- | --- | --- | ---: | ---: | --- |";
const controlRows = caseSummary.map((row) =>
toTableRow([
String(row.case_id),
String(row.domain_hint),
`${row.temporal_guard?.applied ? "applied" : "off"}, ${row.temporal_guard?.outcome ?? "n/a"}, ${
row.temporal_guard?.resolved_time_anchor ?? "n/a"
}`,
`${row.claim_anchor_audit?.claim_type ?? "n/a"} (${row.claim_anchor_audit?.claim_anchor_resolution_rate ?? 0})`,
String(row.targeted_evidence_audit?.targeted_evidence_hit_rate ?? 0),
String(row.admissibility_audit?.admissible_evidence_count ?? 0),
`${row.grounded_eligibility?.grounding_mode ?? "n/a"}`
])
);
writeText(path.join(runDir, "control_case_matrix.md"), `${controlHeader}\n${controlRows.join("\n")}\n`);
// chat_export_core8.md: user prompt (from CORE8_USER_MAP, empty when the case id is unknown)
// followed by the trimmed assistant reply for each case. The title also hard-codes `useMock=true`.
const chatLines = ["# Core-8 Chat Export (Wave 19, useMock=true)", ""];
for (const row of caseSummary) {
const full = byCase.get(String(row.case_id));
chatLines.push(`## ${row.case_id} | ${row.label}`);
chatLines.push(`user: ${CORE8_USER_MAP[row.case_id] ?? ""}`);
chatLines.push(`assistant(reply_type=${row.reply_type}, trace_id=${row.trace_id}): ${String(full?.assistant_reply ?? "").trim()}`);
chatLines.push("");
}
writeText(path.join(runDir, "chat_export_core8.md"), chatLines.join("\n"));
// before_after_metrics.json: the "before" values are fixed literals; the "after" values are
// read from probe.metrics and default to 0 when a metric is absent.
// NOTE(review): from here on `probe.` is dereferenced without `?.`, unlike the earlier
// `probe?.` accesses, so a probe file containing JSON `null` would throw at this point.
const beforeAfter = {
baseline_reference: "2026-03-28_Stage_04_Wave_18_Blocker_Pack_GAP01_GAP02_GAP03/artifacts/final_probe_core8.json",
after_reference: "artifacts/final_probe_core8.json",
after_note: "After values are from Wave 19 core-8 useMock probe.",
metrics_before: {
claim_anchor_resolution_rate: 0.0,
targeted_evidence_hit_rate: 0.0,
admissible_positive_evidence_rate: 0.0,
grounded_positive_answer_rate: 0.0,
limited_mode_correct_retention_rate: 1.0,
controlled_temporal_expansion_correctness_rate: 0.0,
false_grounded_answer_rate: 0.0
},
metrics_after: {
claim_anchor_resolution_rate: probe.metrics?.claim_anchor_resolution_rate ?? 0,
targeted_evidence_hit_rate: probe.metrics?.targeted_evidence_hit_rate ?? 0,
admissible_positive_evidence_rate: probe.metrics?.admissible_positive_evidence_rate ?? 0,
grounded_positive_answer_rate: probe.metrics?.grounded_positive_answer_rate ?? 0,
limited_mode_correct_retention_rate: probe.metrics?.limited_mode_correct_retention_rate ?? 0,
// NOTE(review): this output key is filled from a differently named probe metric
// (`temporal_anchor_correctness_rate`) — confirm the mapping is intentional.
controlled_temporal_expansion_correctness_rate: probe.metrics?.temporal_anchor_correctness_rate ?? 0,
false_grounded_answer_rate: probe.metrics?.false_grounded_answer_rate ?? 0
}
};
writeJson(path.join(runDir, "before_after_metrics.json"), beforeAfter);
// run_summary.json: run id is the run directory's base name; the verdict strings and the
// artifact index are fixed literals, only the metric values come from the probe.
const runSummary = {
run_id: path.basename(runDir),
stage: "Stage_04",
wave: "Wave_19",
scope: "claim_bound_evidence_acquisition_p0_only",
mode: String(probe.mode ?? "useMock=true"),
metrics_after: beforeAfter.metrics_after,
verdicts: {
CLAIM_BOUND_EVIDENCE_ACQUISITION_READY: "READY_WITH_LIMITATIONS",
POSITIVE_GROUNDING_PATH_READY: "READY_WITH_LIMITATIONS",
overall_status: "WAVE19_ACCEPTED_WITH_LIMITATIONS"
},
acceptance: {
false_grounded_answer_rate: probe.metrics?.false_grounded_answer_rate ?? 0,
grounded_positive_answer_rate: probe.metrics?.grounded_positive_answer_rate ?? 0,
targeted_evidence_hit_rate: probe.metrics?.targeted_evidence_hit_rate ?? 0
},
artifacts: {
readme: "README.md",
run_summary: "run_summary.json",
before_after_metrics: "before_after_metrics.json",
control_case_matrix: "control_case_matrix.md",
claim_anchor_audit: "claim_anchor_audit.json",
targeted_evidence_acquisition_report: "targeted_evidence_acquisition_report.json",
grounded_positive_vs_limited_matrix: "grounded_positive_vs_limited_matrix.md",
chat_export_core8: "chat_export_core8.md",
debug_payloads: "debug_payloads/",
live_call_inventory: "live_call_inventory.json",
temporal_expansion_audit: "temporal_expansion_audit.json",
evidence_pack_examples: "evidence_pack_examples/"
}
};
writeJson(path.join(runDir, "run_summary.json"), runSummary);
// README.md: static text except for the probe path, whose backslashes are replaced with
// forward slashes before it is embedded in the command line example.
const readme = `# Stage 4 Wave 19 - Claim-Bound Evidence Acquisition (P0)\n\n## Scope\n- P0 domains only: settlements_60_62, vat_document_register_book, month_close_costs_20_44\n- Added claim-bound anchors, targeted evidence acquisition, controlled temporal expansion handoff, positive grounding eligibility path.\n- No new orchestration layer, no new domains, no Stage 5 expansion.\n\n## Execution\n- Build: \`npm.cmd --prefix llm_normalizer/backend run build\`\n- Tests: \`npm.cmd --prefix llm_normalizer/backend test\`\n- Core-8 probe: \`node llm_normalizer/backend/scripts/wave19Core8Probe.js ${path
.join(runDir, "artifacts", "final_probe_core8.json")
.replace(/\\/g, "/")}\`\n\n## Final verdict\n- CLAIM_BOUND_EVIDENCE_ACQUISITION_READY: READY_WITH_LIMITATIONS\n- POSITIVE_GROUNDING_PATH_READY: READY_WITH_LIMITATIONS\n- Overall: WAVE19_ACCEPTED_WITH_LIMITATIONS\n\n## Notes\n- Probe mode is \`useMock=true\`; live rerun is still required for final production acceptance.\n- Positive grounding appears on a subset of core cases; limited mode remains on hard/under-anchored cases.\n`;
writeText(path.join(runDir, "README.md"), readme);
}
/**
 * User prompt text for each core-8 probe case, keyed by case id (`Q1`..`Q8`).
 *
 * The table is frozen and has no prototype, so a lookup with an arbitrary
 * case id (for example `"toString"` or `"constructor"`) yields `undefined`
 * rather than an inherited `Object.prototype` member, and the entries cannot
 * be modified at runtime.
 *
 * NOTE(review): the Q8 prompt ends in a run of question marks, reproduced here
 * exactly as found; it may be non-ASCII text lost in an encoding step — confirm.
 */
const CORE8_USER_MAP = Object.freeze({
  __proto__: null,
  Q1: "supplier account 60: payment 55200 on 2020-07-06 by contract 01/19-PT. why payable tail is still open in july 2020?",
  Q2: "supplier account 60: receipt 276873.60 in july 2020. was advance from 2020-07-15 offset correctly?",
  Q3: "customer account 62: payments 40860 and 20000 in july 2020. is this advance or receivable closure?",
  Q4: "VAT chain july 2020: communication services, VAT 233.33, invoice. is chain document -> invoice -> register -> book complete?",
  Q5: "VAT july 2020: show purchases with incomplete VAT contour.",
  Q6: "month close july 2020 on accounts 20 and 44: any residual tails after 2020-07-31 closure?",
  Q7: "RBP account 97 writeoff in july 2020: does part of deferred expense live longer than expected?",
  Q8: "after full month-end july 2020, what is real problem and what is normal ????????"
});
// Run the export as soon as this file is loaded; main() reads the run directory from process.argv[2].
main();