279 lines
13 KiB
JavaScript
279 lines
13 KiB
JavaScript
const fs = require("node:fs");
|
|
const path = require("node:path");
|
|
|
|
/**
 * Reads a UTF-8 text file and parses it as JSON.
 *
 * A leading byte-order mark (U+FEFF) is removed before parsing, because
 * `fs.readFileSync(..., "utf8")` keeps it in the string and `JSON.parse`
 * rejects it.
 *
 * @param {string} filePath - Path of the JSON file to read.
 * @returns {*} The parsed JSON value.
 * @throws {Error} Whatever `fs.readFileSync` throws when the file cannot be read.
 * @throws {SyntaxError} When the content is not valid JSON; the message names
 *   the file and the original parser error is attached as `cause`.
 */
function readJson(filePath) {
  const raw = fs.readFileSync(filePath, "utf8");
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
  try {
    return JSON.parse(text);
  } catch (err) {
    // Keep the SyntaxError type so existing handlers still match, but say which file is broken.
    throw new SyntaxError(`Invalid JSON in ${filePath}: ${err.message}`, { cause: err });
  }
}
|
|
|
|
/**
 * Serialises `payload` as 2-space-indented JSON followed by a newline and
 * writes it to `filePath`, creating missing parent directories first.
 *
 * @param {string} filePath - Destination file path.
 * @param {*} payload - Value passed to `JSON.stringify`.
 * @returns {void}
 */
function writeJson(filePath, payload) {
  const parentDir = path.dirname(filePath);
  fs.mkdirSync(parentDir, { recursive: true });

  const body = `${JSON.stringify(payload, null, 2)}\n`;
  fs.writeFileSync(filePath, body, "utf8");
}
|
|
|
|
/**
 * Writes `text` to `filePath` as UTF-8, creating missing parent directories
 * first.
 *
 * @param {string} filePath - Destination file path.
 * @param {string} text - Content to write verbatim.
 * @returns {void}
 */
function writeText(filePath, text) {
  const parentDir = path.dirname(filePath);
  fs.mkdirSync(parentDir, { recursive: true });
  fs.writeFileSync(filePath, text, "utf8");
}
|
|
|
|
/**
 * Formats a list of cell values as one Markdown table row.
 *
 * Cell text is made safe for a table: line breaks are replaced by a space and
 * `|` is escaped as `\|`, so a value can no longer split or add columns.
 * `null` and `undefined` render as empty cells, which is what `Array#join`
 * produced before.
 *
 * @param {Array<*>} values - Cell values, left to right.
 * @returns {string} The row, e.g. `| a | b |`.
 */
function toTableRow(values) {
  const cells = values.map((value) =>
    (value == null ? "" : String(value)).replace(/\r?\n|\r/g, " ").replace(/\|/g, "\\|")
  );
  return `| ${cells.join(" | ")} |`;
}
|
|
|
|
/**
 * Flattens the `evidence` arrays of every entry in
 * `debugPayload.retrieval_results` into one list, in encounter order.
 *
 * Anything that is not an array (a missing payload, a missing
 * `retrieval_results`, a result without an `evidence` array) contributes
 * nothing.
 *
 * @param {object|null|undefined} debugPayload - Debug payload of one case.
 * @returns {Array<*>} All evidence items found; empty when there are none.
 */
function collectEvidence(debugPayload) {
  const results = debugPayload?.retrieval_results;
  if (!Array.isArray(results)) {
    return [];
  }

  const collected = [];
  for (const entry of results) {
    const items = entry?.evidence;
    if (!Array.isArray(items)) {
      continue;
    }
    for (const item of items) {
      collected.push(item);
    }
  }
  return collected;
}
|
|
|
|
/**
 * Builds one inventory row for every retrieval result whose
 * `summary.live_mcp` value is truthy.
 *
 * Each row carries the result's `fragment_id` and `route` (or `null`),
 * `matched_rows` converted with `Number` (0 when absent), `account_scope`
 * (empty array unless it is an array), `method` (or `null`) and the live
 * call's `args` under `args_summary` (or `null`).
 *
 * @param {object|null|undefined} debugPayload - Debug payload of one case.
 * @returns {Array<object>} Inventory rows in result order; empty when none.
 */
function collectLiveSummaries(debugPayload) {
  const results = debugPayload?.retrieval_results;
  if (!Array.isArray(results)) {
    return [];
  }

  const inventory = [];
  for (const entry of results) {
    const liveCall = entry?.summary?.live_mcp;
    if (!liveCall) {
      continue;
    }

    const scope = liveCall.account_scope;
    inventory.push({
      fragment_id: entry.fragment_id ?? null,
      route: entry.route ?? null,
      matched_rows: Number(liveCall.matched_rows ?? 0),
      account_scope: Array.isArray(scope) ? scope : [],
      method: liveCall.method ?? null,
      args_summary: liveCall.args ?? null
    });
  }
  return inventory;
}
|
|
|
|
/** Returns the non-null entries of `value`, or an empty list when it is not an array. */
function presentEntries(value) {
  return Array.isArray(value) ? value.filter((entry) => entry != null) : [];
}

/** Builds the `claim_anchor_audit.json` payload from the per-case summary rows. */
function buildClaimAnchorAudit(caseSummary, generatedAt) {
  return {
    generated_at: generatedAt,
    source: "artifacts/final_probe_core8.json",
    cases: caseSummary.map((row) => {
      const audit = row.claim_anchor_audit;
      return {
        case_id: row.case_id,
        label: row.label,
        claim_type: audit?.claim_type ?? null,
        required_anchors: audit?.required_anchors ?? 0,
        missing_anchors: audit?.missing_anchors ?? 0,
        claim_anchor_resolution_rate: audit?.claim_anchor_resolution_rate ?? 0,
        claim_anchors_passed: Boolean(row.grounded_eligibility?.claim_anchors_passed)
      };
    })
  };
}

/** Builds the `targeted_evidence_acquisition_report.json` payload. */
function buildTargetedEvidenceReport(caseSummary, generatedAt) {
  return {
    generated_at: generatedAt,
    source: "artifacts/final_probe_core8.json",
    cases: caseSummary.map((row) => {
      const audit = row.targeted_evidence_audit;
      return {
        case_id: row.case_id,
        label: row.label,
        required_checks: audit?.required_checks ?? 0,
        targeted_item_hits: audit?.targeted_item_hits ?? 0,
        targeted_evidence_hits: audit?.targeted_evidence_hits ?? 0,
        targeted_evidence_hit_rate: audit?.targeted_evidence_hit_rate ?? 0
      };
    })
  };
}

/** Builds the `temporal_expansion_audit.json` payload: evidence items carrying a `context_expansion_reason`. */
function buildTemporalExpansionAudit(caseSummary, byCase, generatedAt) {
  return {
    generated_at: generatedAt,
    source: "artifacts/final_probe_core8.json",
    cases: caseSummary.map((row) => {
      const full = byCase.get(String(row.case_id));
      const expanded = collectEvidence(full?.debug ?? {}).filter((item) =>
        Boolean(item?.payload?.context_expansion_reason)
      );
      const reasons = new Set(expanded.map((item) => String(item?.payload?.context_expansion_reason || "")));
      return {
        case_id: row.case_id,
        label: row.label,
        temporal_guard: row.temporal_guard,
        controlled_temporal_expansion_hits: expanded.length,
        expansion_reasons: Array.from(reasons).filter(Boolean)
      };
    })
  };
}

/** Builds the `live_call_inventory.json` payload from each case's debug payload. */
function buildLiveCallInventory(caseSummary, byCase, mode, generatedAt) {
  return {
    generated_at: generatedAt,
    mode,
    cases: caseSummary.map((row) => {
      const full = byCase.get(String(row.case_id));
      return {
        case_id: row.case_id,
        label: row.label,
        live_calls: collectLiveSummaries(full?.debug ?? {})
      };
    })
  };
}

/** Writes the named debug payload files for the cases that are present in `byCase`. */
function exportDebugPayloads(runDir, byCase) {
  const debugMap = {
    supplier60_case: "Q1",
    customer62_case: "Q3",
    vat_case: "Q4",
    month_close_tail_case: "Q6",
    month_close_rbp_case: "Q7"
  };
  for (const [name, caseId] of Object.entries(debugMap)) {
    const full = byCase.get(caseId);
    if (!full) continue;
    writeJson(path.join(runDir, "debug_payloads", `${name}.json`), {
      case_id: caseId,
      label: full.label,
      reply_type: full.reply_type,
      assistant_reply: full.assistant_reply,
      debug: full.debug
    });
  }
}

/** Writes the evidence-pack example files (first 8 evidence items) for the cases present in `byCase`. */
function exportEvidenceExamples(runDir, byCase) {
  const examplesDir = path.join(runDir, "evidence_pack_examples");
  const evidenceExamples = [
    { case_id: "Q1", target: path.join(examplesDir, "settlement", "Q1_supplier60_example.json") },
    { case_id: "Q4", target: path.join(examplesDir, "VAT", "Q4_vat_chain_example.json") },
    { case_id: "Q7", target: path.join(examplesDir, "month-close", "Q7_rbp_example.json") }
  ];
  for (const item of evidenceExamples) {
    const full = byCase.get(item.case_id);
    if (!full) continue;
    const evidence = collectEvidence(full.debug).slice(0, 8);
    writeJson(item.target, {
      case_id: item.case_id,
      label: full.label,
      reply_type: full.reply_type,
      evidence_count: evidence.length,
      evidence
    });
  }
}

/** Renders the Markdown text of `grounded_positive_vs_limited_matrix.md`. */
function buildGroundedMatrix(caseSummary) {
  const header =
    "# Grounded Positive vs Limited Matrix\n\n| Case | Label | Claim Type | Admissible Evidence | Grounding Mode | Reply Type |\n| --- | --- | --- | ---: | --- | --- |";
  const rows = caseSummary.map((row) =>
    toTableRow([
      String(row.case_id),
      String(row.label),
      String(row.claim_anchor_audit?.claim_type ?? ""),
      String(row.admissibility_audit?.admissible_evidence_count ?? 0),
      String(row.grounded_eligibility?.grounding_mode ?? ""),
      String(row.reply_type ?? "")
    ])
  );
  return `${header}\n${rows.join("\n")}\n`;
}

/** Renders the Markdown text of `control_case_matrix.md`. */
function buildControlMatrix(caseSummary) {
  const header =
    "# Control Case Matrix (Wave 19)\n\nSource: `artifacts/final_probe_core8.json` (`useMock=true`)\n\n| Case | Domain | Temporal | Claim Anchors | Targeted Hit Rate | Admissible Evidence | Eligibility |\n| --- | --- | --- | --- | ---: | ---: | --- |";
  const rows = caseSummary.map((row) => {
    const guard = row.temporal_guard;
    const anchors = row.claim_anchor_audit;
    return toTableRow([
      String(row.case_id),
      String(row.domain_hint),
      `${guard?.applied ? "applied" : "off"}, ${guard?.outcome ?? "n/a"}, ${guard?.resolved_time_anchor ?? "n/a"}`,
      `${anchors?.claim_type ?? "n/a"} (${anchors?.claim_anchor_resolution_rate ?? 0})`,
      String(row.targeted_evidence_audit?.targeted_evidence_hit_rate ?? 0),
      String(row.admissibility_audit?.admissible_evidence_count ?? 0),
      `${row.grounded_eligibility?.grounding_mode ?? "n/a"}`
    ]);
  });
  return `${header}\n${rows.join("\n")}\n`;
}

/** Renders the Markdown text of `chat_export_core8.md` (user prompt from CORE8_USER_MAP plus the assistant reply). */
function buildChatExport(caseSummary, byCase) {
  const chatLines = ["# Core-8 Chat Export (Wave 19, useMock=true)", ""];
  for (const row of caseSummary) {
    const full = byCase.get(String(row.case_id));
    const reply = String(full?.assistant_reply ?? "").trim();
    chatLines.push(
      `## ${row.case_id} | ${row.label}`,
      `user: ${CORE8_USER_MAP[row.case_id] ?? ""}`,
      `assistant(reply_type=${row.reply_type}, trace_id=${row.trace_id}): ${reply}`,
      ""
    );
  }
  return chatLines.join("\n");
}

/** Builds the `before_after_metrics.json` payload; missing "after" metrics default to 0. */
function buildBeforeAfterMetrics(metrics) {
  return {
    baseline_reference: "2026-03-28_Stage_04_Wave_18_Blocker_Pack_GAP01_GAP02_GAP03/artifacts/final_probe_core8.json",
    after_reference: "artifacts/final_probe_core8.json",
    after_note: "After values are from Wave 19 core-8 useMock probe.",
    metrics_before: {
      claim_anchor_resolution_rate: 0.0,
      targeted_evidence_hit_rate: 0.0,
      admissible_positive_evidence_rate: 0.0,
      grounded_positive_answer_rate: 0.0,
      limited_mode_correct_retention_rate: 1.0,
      controlled_temporal_expansion_correctness_rate: 0.0,
      false_grounded_answer_rate: 0.0
    },
    metrics_after: {
      claim_anchor_resolution_rate: metrics?.claim_anchor_resolution_rate ?? 0,
      targeted_evidence_hit_rate: metrics?.targeted_evidence_hit_rate ?? 0,
      admissible_positive_evidence_rate: metrics?.admissible_positive_evidence_rate ?? 0,
      grounded_positive_answer_rate: metrics?.grounded_positive_answer_rate ?? 0,
      limited_mode_correct_retention_rate: metrics?.limited_mode_correct_retention_rate ?? 0,
      // The "after" value for this key is read from the probe's temporal_anchor_correctness_rate metric.
      controlled_temporal_expansion_correctness_rate: metrics?.temporal_anchor_correctness_rate ?? 0,
      false_grounded_answer_rate: metrics?.false_grounded_answer_rate ?? 0
    }
  };
}

/** Builds the `run_summary.json` payload. */
function buildRunSummary(runDir, mode, metrics, metricsAfter) {
  return {
    run_id: path.basename(runDir),
    stage: "Stage_04",
    wave: "Wave_19",
    scope: "claim_bound_evidence_acquisition_p0_only",
    mode,
    metrics_after: metricsAfter,
    verdicts: {
      CLAIM_BOUND_EVIDENCE_ACQUISITION_READY: "READY_WITH_LIMITATIONS",
      POSITIVE_GROUNDING_PATH_READY: "READY_WITH_LIMITATIONS",
      overall_status: "WAVE19_ACCEPTED_WITH_LIMITATIONS"
    },
    acceptance: {
      false_grounded_answer_rate: metrics?.false_grounded_answer_rate ?? 0,
      grounded_positive_answer_rate: metrics?.grounded_positive_answer_rate ?? 0,
      targeted_evidence_hit_rate: metrics?.targeted_evidence_hit_rate ?? 0
    },
    artifacts: {
      readme: "README.md",
      run_summary: "run_summary.json",
      before_after_metrics: "before_after_metrics.json",
      control_case_matrix: "control_case_matrix.md",
      claim_anchor_audit: "claim_anchor_audit.json",
      targeted_evidence_acquisition_report: "targeted_evidence_acquisition_report.json",
      grounded_positive_vs_limited_matrix: "grounded_positive_vs_limited_matrix.md",
      chat_export_core8: "chat_export_core8.md",
      debug_payloads: "debug_payloads/",
      live_call_inventory: "live_call_inventory.json",
      temporal_expansion_audit: "temporal_expansion_audit.json",
      evidence_pack_examples: "evidence_pack_examples/"
    }
  };
}

/** Renders the README text; the probe path is shown with forward slashes. */
function buildReadme(runDir) {
  const probePathPosix = path.join(runDir, "artifacts", "final_probe_core8.json").replace(/\\/g, "/");
  return [
    "# Stage 4 Wave 19 - Claim-Bound Evidence Acquisition (P0)",
    "",
    "## Scope",
    "- P0 domains only: settlements_60_62, vat_document_register_book, month_close_costs_20_44",
    "- Added claim-bound anchors, targeted evidence acquisition, controlled temporal expansion handoff, positive grounding eligibility path.",
    "- No new orchestration layer, no new domains, no Stage 5 expansion.",
    "",
    "## Execution",
    "- Build: `npm.cmd --prefix llm_normalizer/backend run build`",
    "- Tests: `npm.cmd --prefix llm_normalizer/backend test`",
    `- Core-8 probe: \`node llm_normalizer/backend/scripts/wave19Core8Probe.js ${probePathPosix}\``,
    "",
    "## Final verdict",
    "- CLAIM_BOUND_EVIDENCE_ACQUISITION_READY: READY_WITH_LIMITATIONS",
    "- POSITIVE_GROUNDING_PATH_READY: READY_WITH_LIMITATIONS",
    "- Overall: WAVE19_ACCEPTED_WITH_LIMITATIONS",
    "",
    "## Notes",
    "- Probe mode is `useMock=true`; live rerun is still required for final production acceptance.",
    "- Positive grounding appears on a subset of core cases; limited mode remains on hard/under-anchored cases.",
    ""
  ].join("\n");
}

/**
 * Script entry point: reads `<run-dir>/artifacts/final_probe_core8.json` and
 * writes the review artifacts (JSON audits, Markdown matrices, chat export,
 * debug payloads, evidence examples, run summary and README) into `<run-dir>`.
 *
 * The run directory is taken from `process.argv[2]`.
 *
 * Robustness notes:
 * - every read of the probe is null-safe, so a probe file that parses to
 *   `null` produces empty artifacts instead of failing half-way through;
 * - null entries in `cases` / `full_payloads` are skipped;
 * - all artifacts of one run share a single `generated_at` timestamp.
 *
 * @returns {void}
 * @throws {Error} When no run directory argument was given, or when reading
 *   the probe or writing an artifact fails.
 */
function main() {
  const runDir = process.argv[2];
  if (!runDir) {
    throw new Error("Usage: node wave19ExportArtifacts.js <run-dir>");
  }

  const probe = readJson(path.join(runDir, "artifacts", "final_probe_core8.json"));
  const caseSummary = presentEntries(probe?.cases);
  const byCase = new Map(presentEntries(probe?.full_payloads).map((item) => [String(item.case_id), item]));
  const metrics = probe?.metrics;
  const generatedAt = new Date().toISOString();

  writeJson(path.join(runDir, "claim_anchor_audit.json"), buildClaimAnchorAudit(caseSummary, generatedAt));
  writeJson(
    path.join(runDir, "targeted_evidence_acquisition_report.json"),
    buildTargetedEvidenceReport(caseSummary, generatedAt)
  );
  writeJson(
    path.join(runDir, "temporal_expansion_audit.json"),
    buildTemporalExpansionAudit(caseSummary, byCase, generatedAt)
  );
  writeJson(
    path.join(runDir, "live_call_inventory.json"),
    buildLiveCallInventory(caseSummary, byCase, String(probe?.mode ?? ""), generatedAt)
  );

  exportDebugPayloads(runDir, byCase);
  exportEvidenceExamples(runDir, byCase);

  writeText(path.join(runDir, "grounded_positive_vs_limited_matrix.md"), buildGroundedMatrix(caseSummary));
  writeText(path.join(runDir, "control_case_matrix.md"), buildControlMatrix(caseSummary));
  writeText(path.join(runDir, "chat_export_core8.md"), buildChatExport(caseSummary, byCase));

  const beforeAfter = buildBeforeAfterMetrics(metrics);
  writeJson(path.join(runDir, "before_after_metrics.json"), beforeAfter);

  const runSummary = buildRunSummary(
    runDir,
    String(probe?.mode ?? "useMock=true"),
    metrics,
    beforeAfter.metrics_after
  );
  writeJson(path.join(runDir, "run_summary.json"), runSummary);

  writeText(path.join(runDir, "README.md"), buildReadme(runDir));
}
|
|
|
|
/**
 * User prompt text for each core-8 case, keyed by case id (Q1..Q8).
 * Frozen so that the shared lookup table cannot be modified at runtime.
 *
 * NOTE(review): the run of question marks at the end of Q8 looks like
 * non-ASCII text lost in an encoding conversion — confirm against the
 * original prompt before changing it.
 */
const CORE8_USER_MAP = Object.freeze({
  Q1: "supplier account 60: payment 55200 on 2020-07-06 by contract 01/19-PT. why payable tail is still open in july 2020?",
  Q2: "supplier account 60: receipt 276873.60 in july 2020. was advance from 2020-07-15 offset correctly?",
  Q3: "customer account 62: payments 40860 and 20000 in july 2020. is this advance or receivable closure?",
  Q4: "VAT chain july 2020: communication services, VAT 233.33, invoice. is chain document -> invoice -> register -> book complete?",
  Q5: "VAT july 2020: show purchases with incomplete VAT contour.",
  Q6: "month close july 2020 on accounts 20 and 44: any residual tails after 2020-07-31 closure?",
  Q7: "RBP account 97 writeoff in july 2020: does part of deferred expense live longer than expected?",
  Q8: "after full month-end july 2020, what is real problem and what is normal ????????"
});
|
|
|
|
// Entry point: run the export as soon as this file is loaded.
main();
|