NODEDC_1C/llm_normalizer/backend/scripts/wave19_2LiveReplayPack.js

535 lines
22 KiB
JavaScript

const fs = require("node:fs");
const path = require("node:path");
const request = require("supertest");
// Replay case descriptors. The list is positional: main() spreads the entry at
// index i into the row built for the i-th replayed user message.
const CASE_LABELS = [
  {
    case_id: "L1",
    label: "vat_chain_furniture_13_15_july",
    expected_mode: "grounded_or_stronger"
  },
  {
    case_id: "L2",
    label: "rbp_tail_31_july_5000",
    expected_mode: "limited_or_grounded"
  },
  {
    case_id: "L3",
    label: "fa_amortization_2471_2465_849",
    expected_mode: "limited_or_grounded"
  }
];
/**
 * Creates the directory at `dirPath`, including any missing parent directories.
 * An already existing directory is not an error (recursive mkdir).
 *
 * @param {string} dirPath - Directory to create.
 * @returns {void}
 */
function ensureDir(dirPath) {
  const mkdirOptions = { recursive: true };
  fs.mkdirSync(dirPath, mkdirOptions);
}
/**
 * Writes `payload` to `filePath` as 2-space indented JSON followed by a newline,
 * creating the parent directory first.
 *
 * @param {string} filePath - Destination file.
 * @param {*} payload - Value passed to JSON.stringify.
 * @returns {void}
 */
function writeJson(filePath, payload) {
  const parentDir = path.dirname(filePath);
  ensureDir(parentDir);
  const serialized = JSON.stringify(payload, null, 2);
  fs.writeFileSync(filePath, `${serialized}\n`, "utf8");
}
/**
 * Writes `text` verbatim to `filePath` as UTF-8, creating the parent directory first.
 *
 * @param {string} filePath - Destination file.
 * @param {string} text - Content to write; no trailing newline is added here.
 * @returns {void}
 */
function writeText(filePath, text) {
  const parentDir = path.dirname(filePath);
  ensureDir(parentDir);
  fs.writeFileSync(filePath, text, "utf8");
}
/**
 * Divides `numerator` by `denominator` and rounds the result to 4 decimal places.
 *
 * @param {number} numerator
 * @param {number} denominator
 * @returns {number} The rounded quotient, or 0 when either argument is not a
 *   finite number or the denominator is zero or negative.
 */
function ratio(numerator, denominator) {
  const isComputable =
    Number.isFinite(numerator) && Number.isFinite(denominator) && denominator > 0;
  return isComputable ? Number((numerator / denominator).toFixed(4)) : 0;
}
/**
 * Splits a conversation export into its user/assistant sections.
 *
 * A section starts at a heading of the form `## <n>. user` or `## <n>. assistant`
 * (case-insensitive). The lines directly after the heading are read as
 * `key: value` metadata until the first blank line; header lines that do not
 * match that shape are dropped. Everything after the blank line is body text,
 * up to the next heading or a `### technical_debug_payload_json` line. After
 * that debug marker, lines are ignored until the next section heading.
 *
 * @param {*} markdown - Export text; null/undefined is treated as empty.
 * @returns {Array<{role: string, index: number, metadata: Object, body: string}>}
 *   Sections in document order, with `role` lower-cased and `body` trimmed.
 */
function parseConversationSections(markdown) {
  const HEADING_RE = /^##\s+(\d+)\.\s+(user|assistant)\s*$/i;
  const META_RE = /^([a-zA-Z0-9_]+):\s*(.*)$/;
  const DEBUG_MARKER_RE = /^###\s+technical_debug_payload_json\s*$/i;

  const parsed = [];
  // Section currently being collected, or null while between sections.
  let open = null;

  const flush = () => {
    if (open === null) {
      return;
    }
    parsed.push({
      role: open.role,
      index: open.index,
      metadata: open.metadata,
      body: open.bodyLines.join("\n").trim()
    });
    open = null;
  };

  String(markdown ?? "")
    .split(/\r?\n/)
    .forEach((rawLine) => {
      const headingMatch = HEADING_RE.exec(rawLine);
      if (headingMatch !== null) {
        flush();
        open = {
          index: Number(headingMatch[1]),
          role: String(headingMatch[2]).toLowerCase(),
          metadata: {},
          bodyLines: [],
          readingMeta: true
        };
        return;
      }
      if (open === null) {
        return;
      }
      if (open.readingMeta) {
        if (rawLine.trim() === "") {
          // First blank line ends the metadata header.
          open.readingMeta = false;
          return;
        }
        const metaMatch = META_RE.exec(rawLine);
        if (metaMatch !== null) {
          open.metadata[metaMatch[1]] = metaMatch[2];
        }
        return;
      }
      if (DEBUG_MARKER_RE.test(rawLine)) {
        // The debug payload is not part of the message body; close the section here.
        flush();
        return;
      }
      open.bodyLines.push(rawLine);
    });

  flush();
  return parsed;
}
/**
 * Tells whether both bounds of `period` are `2020-07-DD` date strings.
 *
 * Only the textual shape is checked (after trimming); the day digits are not
 * validated and the order of the bounds is not compared.
 *
 * @param {*} period - Expected to be an object with `from` and `to`.
 * @returns {boolean} False for non-objects, null, or when either bound does not match.
 */
function isJuly2020Period(period) {
  const JULY_2020_DAY_RE = /^2020-07-\d{2}$/;
  if (typeof period !== "object" || !period) {
    return false;
  }
  const bounds = [period.from, period.to].map((bound) => String(bound ?? "").trim());
  return bounds.every((bound) => JULY_2020_DAY_RE.test(bound));
}
/**
 * Collects one flat row per retrieval result that carries a `summary.live_mcp` object.
 *
 * Results without an object-valued `live_mcp` are skipped. Missing fields get
 * defaults: method "execute_query", status "unknown", row counts 0, empty
 * account scope, and null for ids/route/args. `query_subject` falls back to
 * the one on the summary.
 *
 * @param {*} debug - Debug payload; `retrieval_results` is read when it is an array.
 * @returns {Array<Object>} Rows in the order of `retrieval_results`.
 */
function extractLiveCalls(debug) {
  const results = debug?.retrieval_results;
  if (!Array.isArray(results)) {
    return [];
  }
  return results.flatMap((entry) => {
    const summary = entry?.summary ?? {};
    const liveCall = summary?.live_mcp;
    if (typeof liveCall !== "object" || !liveCall) {
      return [];
    }
    const row = {
      fragment_id: entry?.fragment_id ?? null,
      route: entry?.route ?? null,
      method: String(liveCall.method ?? "execute_query"),
      args_summary: liveCall.args ?? null,
      query_subject: String(liveCall.query_subject ?? summary.query_subject ?? ""),
      account_scope: Array.isArray(liveCall.account_scope) ? liveCall.account_scope : [],
      fetched_rows: Number(liveCall.fetched_rows ?? 0),
      returned_rows: Number(liveCall.returned_rows ?? 0),
      matched_rows: Number(liveCall.matched_rows ?? 0),
      status: String(liveCall.status ?? "unknown")
    };
    return [row];
  });
}
/**
 * Derives temporal-contradiction flags from a debug payload.
 *
 * Two signals are computed and OR-ed into `contradiction`:
 *  - the temporal guard outcome is "failed_out_of_snapshot_window" although the
 *    effective primary period lies in July 2020;
 *  - the temporal guard basis and the eligibility time basis differ as strings
 *    (a missing value counts as the empty string).
 *
 * The effective period is taken from `temporal_guard.effective_primary_period`,
 * falling back to `debug.effective_primary_period`.
 *
 * @param {*} debug - Debug payload of one assistant reply.
 * @returns {{has_july_effective_primary_period: boolean, temporal_guard_outcome: string,
 *   temporal_basis_mismatch: boolean, failed_under_july_effective_period: boolean,
 *   contradiction: boolean}}
 */
function contradictionFlags(debug) {
  const temporalGuard = debug?.temporal_guard ?? {};
  const eligibilityGuard = debug?.grounded_answer_eligibility_guard ?? {};

  const effectivePeriod =
    temporalGuard.effective_primary_period ?? debug?.effective_primary_period ?? null;
  const outcome = String(temporalGuard.temporal_guard_outcome ?? "");
  const julyEffective = isJuly2020Period(effectivePeriod);

  const guardBasis = String(temporalGuard.temporal_guard_basis ?? "");
  const eligibilityBasis = String(eligibilityGuard.eligibility_time_basis ?? "");
  const basesDiffer = guardBasis !== eligibilityBasis;

  const failedDespiteJuly = julyEffective && outcome === "failed_out_of_snapshot_window";

  return {
    has_july_effective_primary_period: julyEffective,
    temporal_guard_outcome: outcome,
    temporal_basis_mismatch: basesDiffer,
    failed_under_july_effective_period: failedDespiteJuly,
    contradiction: failedDespiteJuly || basesDiffer
  };
}
/**
 * Tells whether targeted evidence acquisition produced a hit for this reply.
 *
 * @param {*} debug - Debug payload; reads `targeted_evidence_acquisition`.
 * @returns {boolean} True only when `targeted_evidence_hit_rate` is above 0 and
 *   at least one value in `check_status` stringifies to "found".
 */
function claimPathCompleted(debug) {
  const acquisition = debug?.targeted_evidence_acquisition ?? {};
  const rate = Number(acquisition.targeted_evidence_hit_rate ?? 0);

  const rawStatuses = acquisition.check_status;
  const statuses = rawStatuses && typeof rawStatuses === "object" ? rawStatuses : {};

  let anyFound = false;
  for (const status of Object.values(statuses)) {
    if (String(status) === "found") {
      anyFound = true;
      break;
    }
  }
  return rate > 0 && anyFound;
}
/**
 * Renders replayed cases as a technical conversation export.
 *
 * The output starts with a header (session id, export mode, timestamp) and then
 * emits, per case row, a numbered `user` section followed by a numbered
 * `assistant` section. Each assistant section ends with its debug payload as a
 * fenced JSON block under `### technical_debug_payload_json`. Section numbers
 * run continuously from 1 across all rows.
 *
 * @param {{session_id: *}} session - Source of the `session_id` header line.
 * @param {Iterable<{userItem: Object, assistantItem: Object}>} caseRows - Rows to render.
 * @param {string} generatedAtIso - Value for the `exported_at` header line.
 * @returns {string} Export text with lines joined by "\n".
 */
function composeLiveReplayExport(session, caseRows, generatedAtIso) {
  // Message text is kept as a raw array element (not interpolated) so that
  // join() renders a null/undefined text as an empty line.
  const renderUser = (position, item) => [
    `## ${position}. user`,
    `message_id: ${item.message_id}`,
    `created_at: ${item.created_at}`,
    "reply_type: n/a",
    "",
    item.text,
    ""
  ];

  const renderAssistant = (position, item) => [
    `## ${position}. assistant`,
    `message_id: ${item.message_id}`,
    `created_at: ${item.created_at}`,
    `reply_type: ${item.reply_type}`,
    `trace_id: ${item.trace_id}`,
    "",
    item.text,
    "",
    "### technical_debug_payload_json",
    "```json",
    JSON.stringify(item.debug ?? {}, null, 2),
    "```",
    ""
  ];

  const output = [
    "# Assistant conversation export",
    `session_id: ${session.session_id}`,
    "export_mode: technical",
    `exported_at: ${generatedAtIso}`,
    ""
  ];

  let position = 0;
  for (const { userItem, assistantItem } of caseRows) {
    position += 1;
    output.push(...renderUser(position, userItem));
    position += 1;
    output.push(...renderAssistant(position, assistantItem));
  }
  return output.join("\n");
}
/**
 * Replays the leading user turns of a conversation export against the assistant
 * API and writes the Wave 19.2 audit pack into the run directory.
 *
 * CLI arguments: `<run-dir> <source-1.txt>`.
 *
 * Steps:
 *  1. Parse the source export and take one non-empty user message per entry in
 *     CASE_LABELS; compute the baseline partial-coverage rate from the
 *     source's assistant sections.
 *  2. POST each message to `/api/assistant/message` (mock normalizer, MCP
 *     runtime flag on) within one fresh session and record a row per reply.
 *  3. Load the session back, compute metrics and verdicts, and write the JSON
 *     and Markdown artifacts listed in the generated README.
 *
 * @returns {Promise<void>}
 * @throws {Error} When CLI arguments are missing, the source has too few
 *   non-empty user messages, or any HTTP call returns a non-200 status.
 */
async function main() {
  const runDir = process.argv[2];
  const sourceFile = process.argv[3];
  if (!runDir || !sourceFile) {
    throw new Error("Usage: node wave19_2LiveReplayPack.js <run-dir> <source-1.txt>");
  }

  // One replayed turn per labelled case; rows are labelled positionally from
  // CASE_LABELS, so the replay count must follow that list.
  const caseCount = CASE_LABELS.length;

  // --- Baseline from the source export -------------------------------------
  const sourceText = fs.readFileSync(sourceFile, "utf8");
  const sections = parseConversationSections(sourceText);
  const userSections = sections.filter((item) => item.role === "user");
  const assistantSections = sections.filter((item) => item.role === "assistant");
  const userMessages = userSections
    .map((item) => item.body)
    .filter((item) => item.length > 0)
    .slice(0, caseCount);
  if (userMessages.length < caseCount) {
    throw new Error(`Expected at least ${caseCount} user messages in source file, got ${userMessages.length}`);
  }
  const baselinePartialCoverage = assistantSections
    .slice(0, caseCount)
    .filter((item) => String(item.metadata.reply_type ?? "") === "partial_coverage").length;
  // Denominator is clamped to at least 1 so a source without assistant sections yields 0.
  const baselinePartialCoverageRate = ratio(
    baselinePartialCoverage,
    Math.max(1, Math.min(caseCount, assistantSections.length))
  );

  // --- Live replay ----------------------------------------------------------
  // The flag is set before the server module is loaded.
  // NOTE(review): presumably the server reads it at load/creation time — confirm.
  process.env.FEATURE_ASSISTANT_MCP_RUNTIME_V1 = "1";
  const { createApp } = require("../dist/server.js");
  const app = createApp();
  const sessionId = `asst-wave19_2-${Date.now()}`;
  const replayRows = [];
  // Sequential on purpose: all turns are posted to the same session in order.
  for (let i = 0; i < caseCount; i += 1) {
    const message = userMessages[i];
    const response = await request(app).post("/api/assistant/message").send({
      session_id: sessionId,
      useMock: true,
      promptVersion: "normalizer_v2_0_2",
      user_message: message
    });
    if (response.status !== 200) {
      throw new Error(`Replay case ${i + 1} failed with status=${response.status}`);
    }
    const debug = response.body?.debug ?? {};
    const eligibility = debug?.grounded_answer_eligibility_guard ?? {};
    const admissibility = debug?.evidence_admissibility_gate ?? {};
    const temporal = debug?.temporal_guard ?? {};
    const targeted = debug?.targeted_evidence_acquisition ?? {};
    const calls = extractLiveCalls(debug);
    const contradiction = contradictionFlags(debug);
    replayRows.push({
      ...CASE_LABELS[i],
      user_message: message,
      reply_type: String(response.body?.reply_type ?? ""),
      assistant_reply: String(response.body?.assistant_reply ?? ""),
      trace_id: String(response.body?.trace_id ?? debug?.trace_id ?? ""),
      business_scope_raw: Array.isArray(debug?.business_scope_raw) ? debug.business_scope_raw : [],
      business_scope_resolved: Array.isArray(debug?.business_scope_resolved) ? debug.business_scope_resolved : [],
      company_scope_resolution_reason: Array.isArray(debug?.company_scope_resolution_reason)
        ? debug.company_scope_resolution_reason
        : Array.isArray(debug?.scope_resolution_reason)
          ? debug.scope_resolution_reason
          : [],
      raw_time_scope: temporal?.raw_time_scope ?? null,
      resolved_time_anchor: temporal?.resolved_time_anchor ?? null,
      // Same fallback as contradictionFlags(), so the audit shows the period
      // the contradiction flags were actually computed from.
      effective_primary_period: temporal?.effective_primary_period ?? debug?.effective_primary_period ?? null,
      temporal_guard_input: temporal?.temporal_guard_input ?? null,
      temporal_guard_outcome: temporal?.temporal_guard_outcome ?? null,
      eligibility_time_basis: eligibility?.eligibility_time_basis ?? null,
      temporal_guard_basis: temporal?.temporal_guard_basis ?? null,
      contradiction,
      claim_type: debug?.claim_anchor_audit?.claim_type ?? null,
      claim_anchor_resolution_rate: Number(debug?.claim_anchor_audit?.claim_anchor_resolution_rate ?? 0),
      targeted_evidence_hit_rate: Number(targeted?.targeted_evidence_hit_rate ?? 0),
      admissible_evidence_count: Number(admissibility?.admissible_evidence_count ?? 0),
      reject_breakdown: admissibility?.reject_breakdown ?? null,
      eligibility: {
        eligible: Boolean(eligibility?.eligible),
        grounding_mode: String(eligibility?.grounding_mode ?? ""),
        outcome: String(eligibility?.outcome ?? ""),
        reason_codes: Array.isArray(eligibility?.reason_codes) ? eligibility.reason_codes : []
      },
      live_calls: calls,
      debug
    });
  }

  // --- Session read-back ----------------------------------------------------
  const sessionResponse = await request(app).get(`/api/assistant/session/${sessionId}`);
  if (sessionResponse.status !== 200) {
    throw new Error(`Failed to load replay session: status=${sessionResponse.status}`);
  }
  // A 200 response without a session object must not crash the final export
  // (which reads session.session_id) after every other artifact was written;
  // fall back to the id this run generated.
  const session = sessionResponse.body?.session ?? { session_id: sessionId };
  const sessionItems = Array.isArray(session?.items) ? session.items : [];
  const userItems = sessionItems.filter((item) => item?.role === "user");
  const assistantItems = sessionItems.filter((item) => item?.role === "assistant");
  // Pair each row with the stored session items by position; when an item is
  // missing, synthesize one from the data captured during the replay.
  const caseRowsWithItems = replayRows.map((row, index) => ({
    ...row,
    userItem: userItems[index] ?? {
      message_id: `user-${index + 1}`,
      created_at: new Date().toISOString(),
      text: row.user_message
    },
    assistantItem: assistantItems[index] ?? {
      message_id: `assistant-${index + 1}`,
      created_at: new Date().toISOString(),
      text: row.assistant_reply,
      reply_type: row.reply_type,
      trace_id: row.trace_id,
      debug: row.debug
    }
  }));

  // --- Metrics --------------------------------------------------------------
  const contradictionCount = replayRows.filter((row) => row.contradiction.contradiction).length;
  const scopeResolvedCount = replayRows.filter((row) => row.business_scope_resolved.includes("company_specific_accounting")).length;
  const admissibleNonZeroCount = replayRows.filter((row) => row.admissible_evidence_count > 0).length;
  const partialCoverageCount = replayRows.filter((row) => row.reply_type === "partial_coverage").length;
  const claimPathCompletedCount = replayRows.filter((row) => claimPathCompleted(row.debug)).length;
  // "False grounded": a grounded_positive answer without any admissible evidence.
  const falseGroundedCount = replayRows.filter(
    (row) => row.eligibility.grounding_mode === "grounded_positive" && row.admissible_evidence_count <= 0
  ).length;
  const metrics = {
    case_count: replayRows.length,
    baseline_partial_coverage_default_rate: baselinePartialCoverageRate,
    live_temporal_contradiction_rate: ratio(contradictionCount, replayRows.length),
    live_company_scope_resolution_rate: ratio(scopeResolvedCount, replayRows.length),
    live_admissible_evidence_nonzero_rate: ratio(admissibleNonZeroCount, replayRows.length),
    live_partial_coverage_default_rate: ratio(partialCoverageCount, replayRows.length),
    live_claim_path_completion_rate: ratio(claimPathCompletedCount, replayRows.length),
    live_false_grounded_answer_rate: ratio(falseGroundedCount, replayRows.length)
  };
  const thresholds = {
    live_temporal_contradiction_rate: 0,
    live_company_scope_resolution_rate: 1,
    live_false_grounded_answer_rate: 0,
    live_admissible_evidence_nonzero_min_cases: 2
  };

  // --- Verdicts -------------------------------------------------------------
  const temporalFixed = metrics.live_temporal_contradiction_rate <= thresholds.live_temporal_contradiction_rate;
  const companyScopeFixed = metrics.live_company_scope_resolution_rate >= thresholds.live_company_scope_resolution_rate;
  const falseGroundedWithinThreshold =
    metrics.live_false_grounded_answer_rate <= thresholds.live_false_grounded_answer_rate;
  const evidencePathFixed =
    admissibleNonZeroCount >= thresholds.live_admissible_evidence_nonzero_min_cases &&
    falseGroundedWithinThreshold;
  const partialReduced = metrics.live_partial_coverage_default_rate < metrics.baseline_partial_coverage_default_rate;
  let overallStatus = "WAVE19_2_NOT_ACCEPTED";
  if (temporalFixed && companyScopeFixed && evidencePathFixed && partialReduced) {
    overallStatus = "WAVE19_2_ACCEPTED";
  } else if (temporalFixed && companyScopeFixed && falseGroundedWithinThreshold) {
    overallStatus = "WAVE19_2_ACCEPTED_WITH_LIMITATIONS";
  }
  // Built once and reused by run_summary.json, the report and the README.
  const verdicts = {
    LIVE_TEMPORAL_ALIGNMENT_FIXED: temporalFixed ? "FIXED" : "NOT_FIXED",
    LIVE_COMPANY_SCOPE_FIXED: companyScopeFixed ? "FIXED" : "NOT_FIXED",
    LIVE_EVIDENCE_PATH_FIXED: evidencePathFixed ? "FIXED" : "NOT_FIXED",
    LIVE_PARTIAL_COVERAGE_DEFAULT_REDUCED: partialReduced ? "REDUCED" : "NOT_REDUCED",
    overall_status: overallStatus
  };
  const verdictLines = [
    `- LIVE_TEMPORAL_ALIGNMENT_FIXED: ${verdicts.LIVE_TEMPORAL_ALIGNMENT_FIXED}`,
    `- LIVE_COMPANY_SCOPE_FIXED: ${verdicts.LIVE_COMPANY_SCOPE_FIXED}`,
    `- LIVE_EVIDENCE_PATH_FIXED: ${verdicts.LIVE_EVIDENCE_PATH_FIXED}`,
    `- LIVE_PARTIAL_COVERAGE_DEFAULT_REDUCED: ${verdicts.LIVE_PARTIAL_COVERAGE_DEFAULT_REDUCED}`,
    `- Overall: ${overallStatus}`
  ];

  // --- Artifact: run_summary.json --------------------------------------------
  const runSummary = {
    run_id: path.basename(runDir),
    stage: "Stage_04",
    wave: "Wave_19_2",
    scope: "live_runtime_fix_by_replay_1txt",
    source_of_truth: sourceFile,
    execution: {
      replay_mode: "exact_questions_from_1_txt",
      runtime_path: "assistant_message_with_mcp_runtime_on",
      normalizer_mode: "useMock=true",
      session_id: sessionId
    },
    thresholds,
    metrics,
    verdicts
  };
  writeJson(path.join(runDir, "run_summary.json"), runSummary);

  // --- Artifact: temporal_contradiction_audit.json ----------------------------
  const temporalAudit = {
    generated_at: new Date().toISOString(),
    cases: replayRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      raw_time_scope: row.raw_time_scope,
      resolved_time_anchor: row.resolved_time_anchor,
      effective_primary_period: row.effective_primary_period,
      temporal_guard_input: row.temporal_guard_input,
      temporal_guard_basis: row.temporal_guard_basis,
      eligibility_time_basis: row.eligibility_time_basis,
      temporal_guard_outcome: row.temporal_guard_outcome,
      contradiction: row.contradiction
    })),
    metric: {
      live_temporal_contradiction_rate: metrics.live_temporal_contradiction_rate
    }
  };
  writeJson(path.join(runDir, "temporal_contradiction_audit.json"), temporalAudit);

  // --- Artifact: business_scope_resolution_audit.json -------------------------
  const scopeAudit = {
    generated_at: new Date().toISOString(),
    cases: replayRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      business_scope_raw: row.business_scope_raw,
      business_scope_resolved: row.business_scope_resolved,
      company_scope_resolution_reason: row.company_scope_resolution_reason
    })),
    metric: {
      live_company_scope_resolution_rate: metrics.live_company_scope_resolution_rate
    }
  };
  writeJson(path.join(runDir, "business_scope_resolution_audit.json"), scopeAudit);

  // --- Artifact: live_mcp_to_evidence_handoff.json ----------------------------
  const mcpToEvidence = {
    generated_at: new Date().toISOString(),
    cases: replayRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      claim_type: row.claim_type,
      admissible_evidence_count: row.admissible_evidence_count,
      live_calls: row.live_calls,
      claim_targeted_hit_rate: row.targeted_evidence_hit_rate,
      eligibility: row.eligibility
    }))
  };
  writeJson(path.join(runDir, "live_mcp_to_evidence_handoff.json"), mcpToEvidence);

  // --- Artifact: admissibility_reject_breakdown_live.json ---------------------
  const rejectBreakdown = {
    generated_at: new Date().toISOString(),
    // Only the reject reasons listed in the seed object are summed; any other
    // key present in a row's breakdown is ignored by the aggregate.
    aggregate: replayRows.reduce(
      (acc, row) => {
        const breakdown = row.reject_breakdown && typeof row.reject_breakdown === "object" ? row.reject_breakdown : {};
        for (const key of Object.keys(acc)) {
          acc[key] += Number(breakdown[key] ?? 0);
        }
        return acc;
      },
      {
        wrong_period: 0,
        wrong_domain: 0,
        wrong_account_scope: 0,
        weak_source_mapping: 0,
        zero_live_match: 0,
        future_dated_or_out_of_window: 0
      }
    ),
    cases: replayRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      reject_breakdown: row.reject_breakdown
    }))
  };
  writeJson(path.join(runDir, "admissibility_reject_breakdown_live.json"), rejectBreakdown);

  // --- Artifact: debug_payloads/<case>.json -----------------------------------
  for (const row of replayRows) {
    writeJson(path.join(runDir, "debug_payloads", `${row.case_id}_${row.label}.json`), {
      case_id: row.case_id,
      label: row.label,
      trace_id: row.trace_id,
      reply_type: row.reply_type,
      debug: row.debug
    });
  }

  // --- Artifact: live_case_matrix.md ------------------------------------------
  const caseMatrixLines = [];
  caseMatrixLines.push("# Live Case Matrix");
  caseMatrixLines.push("");
  caseMatrixLines.push("| Case | Label | Reply | Claim Type | Admissible Evidence | Grounding Mode | Scope | Temporal |");
  caseMatrixLines.push("| --- | --- | --- | --- | ---: | --- | --- | --- |");
  for (const row of replayRows) {
    // Missing claim type and missing temporal outcome both render as "n/a".
    caseMatrixLines.push(
      `| ${row.case_id} | ${row.label} | ${row.reply_type} | ${row.claim_type ?? "n/a"} | ${row.admissible_evidence_count} | ${row.eligibility.grounding_mode} | ${row.business_scope_resolved.join(", ") || "n/a"} | ${row.temporal_guard_outcome ?? "n/a"} |`
    );
  }
  writeText(path.join(runDir, "live_case_matrix.md"), `${caseMatrixLines.join("\n")}\n`);

  // --- Artifact: live_replay_report.md ----------------------------------------
  const replayReportLines = [];
  replayReportLines.push("# Live Replay Report (Wave 19.2)");
  replayReportLines.push("");
  replayReportLines.push("## Source");
  replayReportLines.push(`- Source of truth replayed from: \`${sourceFile}\``);
  replayReportLines.push(`- Replayed exactly ${caseCount} user turns from the original export.`);
  replayReportLines.push("- Runtime path: MCP ON, useMock=true.");
  replayReportLines.push("");
  replayReportLines.push("## Metrics");
  replayReportLines.push(`- live_temporal_contradiction_rate: ${metrics.live_temporal_contradiction_rate}`);
  replayReportLines.push(`- live_company_scope_resolution_rate: ${metrics.live_company_scope_resolution_rate}`);
  replayReportLines.push(`- live_admissible_evidence_nonzero_rate: ${metrics.live_admissible_evidence_nonzero_rate}`);
  replayReportLines.push(`- live_partial_coverage_default_rate: ${metrics.live_partial_coverage_default_rate}`);
  replayReportLines.push(`- baseline_partial_coverage_default_rate: ${metrics.baseline_partial_coverage_default_rate}`);
  replayReportLines.push(`- live_claim_path_completion_rate: ${metrics.live_claim_path_completion_rate}`);
  replayReportLines.push(`- live_false_grounded_answer_rate: ${metrics.live_false_grounded_answer_rate}`);
  replayReportLines.push("");
  replayReportLines.push("## Verdict");
  replayReportLines.push(...verdictLines);
  replayReportLines.push("");
  writeText(path.join(runDir, "live_replay_report.md"), `${replayReportLines.join("\n")}\n`);

  // --- Artifact: chat_export_live_replay.md -----------------------------------
  const chatExportLines = [];
  chatExportLines.push("# Chat Export Live Replay");
  chatExportLines.push("");
  for (const row of replayRows) {
    chatExportLines.push(`## ${row.case_id} | ${row.label}`);
    chatExportLines.push(`user: ${row.user_message}`);
    // The reply is collapsed to a single line for this compact view.
    chatExportLines.push(`assistant(reply_type=${row.reply_type}, trace_id=${row.trace_id}): ${row.assistant_reply.replace(/\s+/g, " ").trim()}`);
    chatExportLines.push("");
  }
  writeText(path.join(runDir, "chat_export_live_replay.md"), `${chatExportLines.join("\n")}\n`);

  // --- Artifact: 1_live_replay.txt --------------------------------------------
  const generatedAtIso = new Date().toISOString();
  const liveReplayTxt = composeLiveReplayExport(session, caseRowsWithItems, generatedAtIso);
  writeText(path.join(runDir, "1_live_replay.txt"), liveReplayTxt);

  // --- Artifact: README.md ----------------------------------------------------
  const readme = [
    "# Stage 4 / Wave 19.2 - Live Runtime Fix by Replay 1.txt",
    "",
    "## What was run",
    "- Source-of-truth replay from original `1.txt` user turns.",
    "- MCP runtime ON (`FEATURE_ASSISTANT_MCP_RUNTIME_V1=1`).",
    "- Normalizer in `useMock=true` mode.",
    "",
    "## Produced artifacts",
    "- run_summary.json",
    "- live_replay_report.md",
    "- live_case_matrix.md",
    "- business_scope_resolution_audit.json",
    "- temporal_contradiction_audit.json",
    "- live_mcp_to_evidence_handoff.json",
    "- admissibility_reject_breakdown_live.json",
    "- chat_export_live_replay.md",
    "- debug_payloads/",
    "- 1_live_replay.txt",
    "",
    "## Final verdict",
    ...verdictLines
  ].join("\n");
  writeText(path.join(runDir, "README.md"), `${readme}\n`);
}
// Script entry point: run main() and, on any failure, print the error to stderr
// (stack when available, otherwise the message or stringified value) and exit
// with status 1.
main().catch((error) => {
  const detail = error instanceof Error ? error.stack || error.message : String(error);
  process.stderr.write(`${detail}\n`);
  process.exit(1);
});