const fs = require("node:fs");
const path = require("node:path");
const request = require("supertest");

/**
 * Identity and expected grounding mode of each replayed case. The i-th entry
 * is attached to the i-th non-empty user message taken from the source export.
 */
const CASE_LABELS = [
  { case_id: "L1", label: "vat_chain_furniture_13_15_july", expected_mode: "grounded_or_stronger" },
  { case_id: "L2", label: "rbp_tail_31_july_5000", expected_mode: "limited_or_grounded" },
  { case_id: "L3", label: "fa_amortization_2471_2465_849", expected_mode: "limited_or_grounded" }
];

/** Number of user turns replayed: one per CASE_LABELS entry. */
const REPLAY_CASE_COUNT = CASE_LABELS.length;

/**
 * Creates a directory (and any missing parents); a no-op if it already exists.
 * @param {string} dirPath
 */
function ensureDir(dirPath) {
  fs.mkdirSync(dirPath, { recursive: true });
}

/**
 * Writes `payload` as 2-space-indented JSON followed by a newline, creating
 * the parent directory first.
 * @param {string} filePath
 * @param {unknown} payload
 */
function writeJson(filePath, payload) {
  ensureDir(path.dirname(filePath));
  fs.writeFileSync(filePath, `${JSON.stringify(payload, null, 2)}\n`, "utf8");
}

/**
 * Writes `text` verbatim as UTF-8, creating the parent directory first.
 * @param {string} filePath
 * @param {string} text
 */
function writeText(filePath, text) {
  ensureDir(path.dirname(filePath));
  fs.writeFileSync(filePath, text, "utf8");
}

/**
 * Divides and rounds to 4 decimal places.
 * @param {number} numerator
 * @param {number} denominator
 * @returns {number} 0 when either operand is not finite or the denominator is <= 0.
 */
function ratio(numerator, denominator) {
  if (!Number.isFinite(numerator) || !Number.isFinite(denominator) || denominator <= 0) {
    return 0;
  }
  return Number((numerator / denominator).toFixed(4));
}

/**
 * Splits a conversation export into its user/assistant sections.
 *
 * A section starts at a `## <n>. user|assistant` heading. The lines right
 * after the heading are read as `key: value` metadata until the first blank
 * line; everything after that is body text. A section ends at the next
 * heading, at a `### technical_debug_payload_json` line (the payload that
 * follows is skipped), or at end of input.
 *
 * @param {unknown} markdown Export text; null/undefined is treated as "".
 * @returns {Array<{role: string, index: number, metadata: Object<string, string>, body: string}>}
 */
function parseConversationSections(markdown) {
  const lines = String(markdown ?? "").split(/\r?\n/);
  const sections = [];
  let current = null;
  let mode = "idle";

  // Finalises the section being collected (if any) and resets parser state.
  function pushCurrent() {
    if (!current) {
      return;
    }
    sections.push({
      role: current.role,
      index: current.index,
      metadata: current.metadata,
      body: current.body.join("\n").trim()
    });
    current = null;
    mode = "idle";
  }

  for (const line of lines) {
    const heading = line.match(/^##\s+(\d+)\.\s+(user|assistant)\s*$/i);
    if (heading) {
      pushCurrent();
      current = {
        index: Number(heading[1]),
        role: String(heading[2]).toLowerCase(),
        metadata: {},
        body: []
      };
      mode = "meta";
      continue;
    }
    if (!current) {
      // Outside any section (file preamble or a skipped debug payload).
      continue;
    }
    if (mode === "meta") {
      if (line.trim() === "") {
        mode = "body";
        continue;
      }
      const meta = line.match(/^([a-zA-Z0-9_]+):\s*(.*)$/);
      if (meta) {
        current.metadata[meta[1]] = meta[2];
      }
      continue;
    }
    if (/^###\s+technical_debug_payload_json\s*$/i.test(line)) {
      // Closing the section here leaves `current` null, so the payload lines
      // that follow are ignored until the next heading.
      pushCurrent();
      continue;
    }
    current.body.push(line);
  }
  pushCurrent();
  return sections;
}

/**
 * @param {unknown} period Expected shape: `{ from, to }`.
 * @returns {boolean} True when both `from` and `to` (trimmed) look like `2020-07-DD`.
 */
function isJuly2020Period(period) {
  if (!period || typeof period !== "object") {
    return false;
  }
  const from = String(period.from ?? "").trim();
  const to = String(period.to ?? "").trim();
  return /^2020-07-\d{2}$/.test(from) && /^2020-07-\d{2}$/.test(to);
}

/**
 * Collects one row per retrieval result that carries a `summary.live_mcp`
 * object; results without one are skipped.
 * @param {object} debug Debug payload of an assistant response.
 * @returns {Array<object>} Normalised call rows with defaults for missing fields.
 */
function extractLiveCalls(debug) {
  const rows = [];
  const retrievalResults = Array.isArray(debug?.retrieval_results) ? debug.retrieval_results : [];
  for (const result of retrievalResults) {
    const summary = result?.summary ?? {};
    const live = summary?.live_mcp;
    if (!live || typeof live !== "object") {
      continue;
    }
    rows.push({
      fragment_id: result?.fragment_id ?? null,
      route: result?.route ?? null,
      method: String(live.method ?? "execute_query"),
      args_summary: live.args ?? null,
      query_subject: String(live.query_subject ?? summary.query_subject ?? ""),
      account_scope: Array.isArray(live.account_scope) ? live.account_scope : [],
      fetched_rows: Number(live.fetched_rows ?? 0),
      returned_rows: Number(live.returned_rows ?? 0),
      matched_rows: Number(live.matched_rows ?? 0),
      status: String(live.status ?? "unknown")
    });
  }
  return rows;
}

/**
 * Derives temporal-contradiction flags from a debug payload.
 *
 * A case counts as contradictory when the temporal guard reports
 * `failed_out_of_snapshot_window` although the effective primary period lies
 * in July 2020, or when the temporal guard basis differs from the eligibility
 * guard's time basis (missing values compare as "").
 *
 * @param {object} debug
 * @returns {{has_july_effective_primary_period: boolean, temporal_guard_outcome: string,
 *   temporal_basis_mismatch: boolean, failed_under_july_effective_period: boolean,
 *   contradiction: boolean}}
 */
function contradictionFlags(debug) {
  const temporal = debug?.temporal_guard ?? {};
  const eligibility = debug?.grounded_answer_eligibility_guard ?? {};
  const effective = temporal.effective_primary_period ?? debug?.effective_primary_period ?? null;
  const temporalOutcome = String(temporal.temporal_guard_outcome ?? "");
  const hasJulyEffective = isJuly2020Period(effective);
  const basisMismatch =
    String(temporal.temporal_guard_basis ?? "") !== String(eligibility.eligibility_time_basis ?? "");
  const failedUnderJuly = hasJulyEffective && temporalOutcome === "failed_out_of_snapshot_window";
  return {
    has_july_effective_primary_period: hasJulyEffective,
    temporal_guard_outcome: temporalOutcome,
    temporal_basis_mismatch: basisMismatch,
    failed_under_july_effective_period: failedUnderJuly,
    contradiction: failedUnderJuly || basisMismatch
  };
}

/**
 * @param {object} debug
 * @returns {boolean} True when the targeted-evidence hit rate is positive and
 *   at least one `check_status` value equals "found".
 */
function claimPathCompleted(debug) {
  const targeted = debug?.targeted_evidence_acquisition ?? {};
  const hitRate = Number(targeted.targeted_evidence_hit_rate ?? 0);
  const checkStatus =
    targeted.check_status && typeof targeted.check_status === "object" ? targeted.check_status : {};
  const hasFound = Object.values(checkStatus).some((value) => String(value) === "found");
  return hitRate > 0 && hasFound;
}

/**
 * Renders the replayed conversation in the same export layout that
 * parseConversationSections reads: a header, then alternating
 * `## <n>. user` / `## <n>. assistant` sections, each assistant section
 * followed by its debug payload as fenced JSON.
 *
 * @param {{session_id?: string} | null | undefined} session Session record;
 *   when absent the id is written as "unknown" instead of throwing.
 * @param {Array<{userItem: object, assistantItem: object}>} caseRows
 * @param {string} generatedAtIso Timestamp written to `exported_at`.
 * @returns {string}
 */
function composeLiveReplayExport(session, caseRows, generatedAtIso) {
  const lines = [];
  lines.push("# Assistant conversation export");
  lines.push(`session_id: ${session?.session_id ?? "unknown"}`);
  lines.push("export_mode: technical");
  lines.push(`exported_at: ${generatedAtIso}`);
  lines.push("");
  let sectionIndex = 1;
  for (const row of caseRows) {
    const user = row.userItem;
    const assistant = row.assistantItem;
    lines.push(`## ${sectionIndex}. user`);
    lines.push(`message_id: ${user.message_id}`);
    lines.push(`created_at: ${user.created_at}`);
    lines.push("reply_type: n/a");
    lines.push("");
    lines.push(user.text);
    lines.push("");
    sectionIndex += 1;
    // The heading must stay on a single line (like the user heading above),
    // otherwise the section-heading regex of the parser cannot match it.
    lines.push(`## ${sectionIndex}. assistant`);
    lines.push(`message_id: ${assistant.message_id}`);
    lines.push(`created_at: ${assistant.created_at}`);
    lines.push(`reply_type: ${assistant.reply_type}`);
    lines.push(`trace_id: ${assistant.trace_id}`);
    lines.push("");
    lines.push(assistant.text);
    lines.push("");
    lines.push("### technical_debug_payload_json");
    lines.push("```json");
    lines.push(JSON.stringify(assistant.debug ?? {}, null, 2));
    lines.push("```");
    lines.push("");
    sectionIndex += 1;
  }
  return lines.join("\n");
}

/**
 * Replays the first REPLAY_CASE_COUNT non-empty user messages of a
 * conversation export against the assistant API, computes the Wave 19.2
 * metrics and verdicts, and writes the report artifacts into the run
 * directory.
 *
 * CLI arguments: `process.argv[2]` = run (output) directory,
 * `process.argv[3]` = source export file.
 *
 * @returns {Promise<void>}
 * @throws {Error} On missing arguments, too few user messages in the source,
 *   or a non-200 response from the assistant API.
 */
async function main() {
  const runDir = process.argv[2];
  const sourceFile = process.argv[3];
  if (!runDir || !sourceFile) {
    throw new Error("Usage: node wave19_2LiveReplayPack.js <runDir> <sourceFile>");
  }

  // --- Baseline from the original export -----------------------------------
  const sourceText = fs.readFileSync(sourceFile, "utf8");
  const sections = parseConversationSections(sourceText);
  const userSections = sections.filter((item) => item.role === "user");
  const assistantSections = sections.filter((item) => item.role === "assistant");
  const userMessages = userSections
    .map((item) => item.body)
    .filter((item) => item.length > 0)
    .slice(0, REPLAY_CASE_COUNT);
  if (userMessages.length < REPLAY_CASE_COUNT) {
    throw new Error(
      `Expected at least ${REPLAY_CASE_COUNT} user messages in source file, got ${userMessages.length}`
    );
  }
  const baselinePartialCoverage = assistantSections
    .slice(0, REPLAY_CASE_COUNT)
    .filter((item) => String(item.metadata.reply_type ?? "") === "partial_coverage").length;
  const baselinePartialCoverageRate = ratio(
    baselinePartialCoverage,
    Math.max(1, Math.min(REPLAY_CASE_COUNT, assistantSections.length))
  );

  // --- Live replay ----------------------------------------------------------
  // The feature flag is set before the server module is loaded.
  // NOTE(review): presumably the server reads it at load/app-creation time — confirm.
  process.env.FEATURE_ASSISTANT_MCP_RUNTIME_V1 = "1";
  const { createApp } = require("../dist/server.js");
  const app = createApp();
  const sessionId = `asst-wave19_2-${Date.now()}`;
  const replayRows = [];
  // Sequential on purpose: all messages go into the same session, in order.
  for (let i = 0; i < REPLAY_CASE_COUNT; i += 1) {
    const message = userMessages[i];
    const response = await request(app).post("/api/assistant/message").send({
      session_id: sessionId,
      useMock: true,
      promptVersion: "normalizer_v2_0_2",
      user_message: message
    });
    if (response.status !== 200) {
      throw new Error(`Replay case ${i + 1} failed with status=${response.status}`);
    }
    const debug = response.body?.debug ?? {};
    const eligibility = debug?.grounded_answer_eligibility_guard ?? {};
    const admissibility = debug?.evidence_admissibility_gate ?? {};
    const temporal = debug?.temporal_guard ?? {};
    const targeted = debug?.targeted_evidence_acquisition ?? {};
    const calls = extractLiveCalls(debug);
    const contradiction = contradictionFlags(debug);
    replayRows.push({
      ...CASE_LABELS[i],
      user_message: message,
      reply_type: String(response.body?.reply_type ?? ""),
      assistant_reply: String(response.body?.assistant_reply ?? ""),
      trace_id: String(response.body?.trace_id ?? debug?.trace_id ?? ""),
      business_scope_raw: Array.isArray(debug?.business_scope_raw) ? debug.business_scope_raw : [],
      business_scope_resolved: Array.isArray(debug?.business_scope_resolved)
        ? debug.business_scope_resolved
        : [],
      company_scope_resolution_reason: Array.isArray(debug?.company_scope_resolution_reason)
        ? debug.company_scope_resolution_reason
        : Array.isArray(debug?.scope_resolution_reason)
          ? debug.scope_resolution_reason
          : [],
      raw_time_scope: temporal?.raw_time_scope ?? null,
      resolved_time_anchor: temporal?.resolved_time_anchor ?? null,
      effective_primary_period: temporal?.effective_primary_period ?? null,
      temporal_guard_input: temporal?.temporal_guard_input ?? null,
      temporal_guard_outcome: temporal?.temporal_guard_outcome ?? null,
      eligibility_time_basis: eligibility?.eligibility_time_basis ?? null,
      temporal_guard_basis: temporal?.temporal_guard_basis ?? null,
      contradiction,
      claim_type: debug?.claim_anchor_audit?.claim_type ?? null,
      claim_anchor_resolution_rate: Number(debug?.claim_anchor_audit?.claim_anchor_resolution_rate ?? 0),
      targeted_evidence_hit_rate: Number(targeted?.targeted_evidence_hit_rate ?? 0),
      admissible_evidence_count: Number(admissibility?.admissible_evidence_count ?? 0),
      reject_breakdown: admissibility?.reject_breakdown ?? null,
      eligibility: {
        eligible: Boolean(eligibility?.eligible),
        grounding_mode: String(eligibility?.grounding_mode ?? ""),
        outcome: String(eligibility?.outcome ?? ""),
        reason_codes: Array.isArray(eligibility?.reason_codes) ? eligibility.reason_codes : []
      },
      live_calls: calls,
      debug
    });
  }

  // --- Stored session items (used for the full-text export) ----------------
  const sessionResponse = await request(app).get(`/api/assistant/session/${sessionId}`);
  if (sessionResponse.status !== 200) {
    throw new Error(`Failed to load replay session: status=${sessionResponse.status}`);
  }
  const session = sessionResponse.body?.session;
  const sessionItems = Array.isArray(session?.items) ? session.items : [];
  const userItems = sessionItems.filter((item) => item?.role === "user");
  const assistantItems = sessionItems.filter((item) => item?.role === "assistant");
  // Fall back to the data captured during replay when the session lacks an item.
  const caseRowsWithItems = replayRows.map((row, index) => ({
    ...row,
    userItem: userItems[index] ?? {
      message_id: `user-${index + 1}`,
      created_at: new Date().toISOString(),
      text: row.user_message
    },
    assistantItem: assistantItems[index] ?? {
      message_id: `assistant-${index + 1}`,
      created_at: new Date().toISOString(),
      text: row.assistant_reply,
      reply_type: row.reply_type,
      trace_id: row.trace_id,
      debug: row.debug
    }
  }));

  // --- Metrics --------------------------------------------------------------
  const contradictionCount = replayRows.filter((row) => row.contradiction.contradiction).length;
  const scopeResolvedCount = replayRows.filter((row) =>
    row.business_scope_resolved.includes("company_specific_accounting")
  ).length;
  const admissibleNonZeroCount = replayRows.filter((row) => row.admissible_evidence_count > 0).length;
  const partialCoverageCount = replayRows.filter((row) => row.reply_type === "partial_coverage").length;
  const claimPathCompletedCount = replayRows.filter((row) => claimPathCompleted(row.debug)).length;
  // "False grounded": answered as grounded_positive without any admissible evidence.
  const falseGroundedCount = replayRows.filter(
    (row) => row.eligibility.grounding_mode === "grounded_positive" && row.admissible_evidence_count <= 0
  ).length;
  const metrics = {
    case_count: replayRows.length,
    baseline_partial_coverage_default_rate: baselinePartialCoverageRate,
    live_temporal_contradiction_rate: ratio(contradictionCount, replayRows.length),
    live_company_scope_resolution_rate: ratio(scopeResolvedCount, replayRows.length),
    live_admissible_evidence_nonzero_rate: ratio(admissibleNonZeroCount, replayRows.length),
    live_partial_coverage_default_rate: ratio(partialCoverageCount, replayRows.length),
    live_claim_path_completion_rate: ratio(claimPathCompletedCount, replayRows.length),
    live_false_grounded_answer_rate: ratio(falseGroundedCount, replayRows.length)
  };
  const thresholds = {
    live_temporal_contradiction_rate: 0,
    live_company_scope_resolution_rate: 1,
    live_false_grounded_answer_rate: 0,
    live_admissible_evidence_nonzero_min_cases: 2
  };

  // --- Verdicts -------------------------------------------------------------
  const temporalFixed = metrics.live_temporal_contradiction_rate <= thresholds.live_temporal_contradiction_rate;
  const companyScopeFixed =
    metrics.live_company_scope_resolution_rate >= thresholds.live_company_scope_resolution_rate;
  const evidencePathFixed =
    admissibleNonZeroCount >= thresholds.live_admissible_evidence_nonzero_min_cases &&
    metrics.live_false_grounded_answer_rate <= thresholds.live_false_grounded_answer_rate;
  const partialReduced =
    metrics.live_partial_coverage_default_rate < metrics.baseline_partial_coverage_default_rate;
  let overallStatus = "WAVE19_2_NOT_ACCEPTED";
  if (temporalFixed && companyScopeFixed && evidencePathFixed && partialReduced) {
    overallStatus = "WAVE19_2_ACCEPTED";
  } else if (temporalFixed && companyScopeFixed && metrics.live_false_grounded_answer_rate <= 0) {
    overallStatus = "WAVE19_2_ACCEPTED_WITH_LIMITATIONS";
  }
  // Computed once so run_summary.json, the report and the README always agree.
  const verdicts = {
    LIVE_TEMPORAL_ALIGNMENT_FIXED: temporalFixed ? "FIXED" : "NOT_FIXED",
    LIVE_COMPANY_SCOPE_FIXED: companyScopeFixed ? "FIXED" : "NOT_FIXED",
    LIVE_EVIDENCE_PATH_FIXED: evidencePathFixed ? "FIXED" : "NOT_FIXED",
    LIVE_PARTIAL_COVERAGE_DEFAULT_REDUCED: partialReduced ? "REDUCED" : "NOT_REDUCED",
    overall_status: overallStatus
  };
  const verdictLines = [
    `- LIVE_TEMPORAL_ALIGNMENT_FIXED: ${verdicts.LIVE_TEMPORAL_ALIGNMENT_FIXED}`,
    `- LIVE_COMPANY_SCOPE_FIXED: ${verdicts.LIVE_COMPANY_SCOPE_FIXED}`,
    `- LIVE_EVIDENCE_PATH_FIXED: ${verdicts.LIVE_EVIDENCE_PATH_FIXED}`,
    `- LIVE_PARTIAL_COVERAGE_DEFAULT_REDUCED: ${verdicts.LIVE_PARTIAL_COVERAGE_DEFAULT_REDUCED}`,
    `- Overall: ${overallStatus}`
  ];

  // --- Artifact: run_summary.json -------------------------------------------
  const runSummary = {
    run_id: path.basename(runDir),
    stage: "Stage_04",
    wave: "Wave_19_2",
    scope: "live_runtime_fix_by_replay_1txt",
    source_of_truth: sourceFile,
    execution: {
      replay_mode: "exact_questions_from_1_txt",
      runtime_path: "assistant_message_with_mcp_runtime_on",
      normalizer_mode: "useMock=true",
      session_id: sessionId
    },
    thresholds,
    metrics,
    verdicts
  };
  writeJson(path.join(runDir, "run_summary.json"), runSummary);

  // --- Artifact: temporal_contradiction_audit.json --------------------------
  const temporalAudit = {
    generated_at: new Date().toISOString(),
    cases: replayRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      raw_time_scope: row.raw_time_scope,
      resolved_time_anchor: row.resolved_time_anchor,
      effective_primary_period: row.effective_primary_period,
      temporal_guard_input: row.temporal_guard_input,
      temporal_guard_basis: row.temporal_guard_basis,
      eligibility_time_basis: row.eligibility_time_basis,
      temporal_guard_outcome: row.temporal_guard_outcome,
      contradiction: row.contradiction
    })),
    metric: {
      live_temporal_contradiction_rate: metrics.live_temporal_contradiction_rate
    }
  };
  writeJson(path.join(runDir, "temporal_contradiction_audit.json"), temporalAudit);

  // --- Artifact: business_scope_resolution_audit.json -----------------------
  const scopeAudit = {
    generated_at: new Date().toISOString(),
    cases: replayRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      business_scope_raw: row.business_scope_raw,
      business_scope_resolved: row.business_scope_resolved,
      company_scope_resolution_reason: row.company_scope_resolution_reason
    })),
    metric: {
      live_company_scope_resolution_rate: metrics.live_company_scope_resolution_rate
    }
  };
  writeJson(path.join(runDir, "business_scope_resolution_audit.json"), scopeAudit);

  // --- Artifact: live_mcp_to_evidence_handoff.json --------------------------
  const mcpToEvidence = {
    generated_at: new Date().toISOString(),
    cases: replayRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      claim_type: row.claim_type,
      admissible_evidence_count: row.admissible_evidence_count,
      live_calls: row.live_calls,
      claim_targeted_hit_rate: row.targeted_evidence_hit_rate,
      eligibility: row.eligibility
    }))
  };
  writeJson(path.join(runDir, "live_mcp_to_evidence_handoff.json"), mcpToEvidence);

  // --- Artifact: admissibility_reject_breakdown_live.json -------------------
  const rejectBreakdown = {
    generated_at: new Date().toISOString(),
    // Only the reject reasons listed in the seed object are summed.
    aggregate: replayRows.reduce(
      (acc, row) => {
        const breakdown =
          row.reject_breakdown && typeof row.reject_breakdown === "object" ? row.reject_breakdown : {};
        for (const key of Object.keys(acc)) {
          acc[key] += Number(breakdown[key] ?? 0);
        }
        return acc;
      },
      {
        wrong_period: 0,
        wrong_domain: 0,
        wrong_account_scope: 0,
        weak_source_mapping: 0,
        zero_live_match: 0,
        future_dated_or_out_of_window: 0
      }
    ),
    cases: replayRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      reject_breakdown: row.reject_breakdown
    }))
  };
  writeJson(path.join(runDir, "admissibility_reject_breakdown_live.json"), rejectBreakdown);

  // --- Artifact: debug_payloads/<case>.json ---------------------------------
  for (const row of replayRows) {
    writeJson(path.join(runDir, "debug_payloads", `${row.case_id}_${row.label}.json`), {
      case_id: row.case_id,
      label: row.label,
      trace_id: row.trace_id,
      reply_type: row.reply_type,
      debug: row.debug
    });
  }

  // --- Artifact: live_case_matrix.md ----------------------------------------
  const caseMatrixLines = [];
  caseMatrixLines.push("# Live Case Matrix");
  caseMatrixLines.push("");
  caseMatrixLines.push(
    "| Case | Label | Reply | Claim Type | Admissible Evidence | Grounding Mode | Scope | Temporal |"
  );
  caseMatrixLines.push("| --- | --- | --- | --- | ---: | --- | --- | --- |");
  for (const row of replayRows) {
    // Missing claim type / temporal outcome are shown as "n/a" rather than "null".
    caseMatrixLines.push(
      `| ${row.case_id} | ${row.label} | ${row.reply_type} | ${row.claim_type ?? "n/a"} | ${row.admissible_evidence_count} | ${row.eligibility.grounding_mode} | ${row.business_scope_resolved.join(", ") || "n/a"} | ${row.temporal_guard_outcome ?? "n/a"} |`
    );
  }
  writeText(path.join(runDir, "live_case_matrix.md"), `${caseMatrixLines.join("\n")}\n`);

  // --- Artifact: live_replay_report.md --------------------------------------
  const replayReportLines = [];
  replayReportLines.push("# Live Replay Report (Wave 19.2)");
  replayReportLines.push("");
  replayReportLines.push("## Source");
  replayReportLines.push(`- Source of truth replayed from: \`${sourceFile}\``);
  replayReportLines.push(`- Replayed exactly ${REPLAY_CASE_COUNT} user turns from the original export.`);
  replayReportLines.push("- Runtime path: MCP ON, useMock=true.");
  replayReportLines.push("");
  replayReportLines.push("## Metrics");
  replayReportLines.push(`- live_temporal_contradiction_rate: ${metrics.live_temporal_contradiction_rate}`);
  replayReportLines.push(`- live_company_scope_resolution_rate: ${metrics.live_company_scope_resolution_rate}`);
  replayReportLines.push(
    `- live_admissible_evidence_nonzero_rate: ${metrics.live_admissible_evidence_nonzero_rate}`
  );
  replayReportLines.push(`- live_partial_coverage_default_rate: ${metrics.live_partial_coverage_default_rate}`);
  replayReportLines.push(
    `- baseline_partial_coverage_default_rate: ${metrics.baseline_partial_coverage_default_rate}`
  );
  replayReportLines.push(`- live_claim_path_completion_rate: ${metrics.live_claim_path_completion_rate}`);
  replayReportLines.push(`- live_false_grounded_answer_rate: ${metrics.live_false_grounded_answer_rate}`);
  replayReportLines.push("");
  replayReportLines.push("## Verdict");
  replayReportLines.push(...verdictLines);
  replayReportLines.push("");
  writeText(path.join(runDir, "live_replay_report.md"), `${replayReportLines.join("\n")}\n`);

  // --- Artifact: chat_export_live_replay.md ---------------------------------
  const chatExportLines = [];
  chatExportLines.push("# Chat Export Live Replay");
  chatExportLines.push("");
  for (const row of replayRows) {
    chatExportLines.push(`## ${row.case_id} | ${row.label}`);
    chatExportLines.push(`user: ${row.user_message}`);
    // The reply is flattened to one line by collapsing all whitespace runs.
    chatExportLines.push(
      `assistant(reply_type=${row.reply_type}, trace_id=${row.trace_id}): ${row.assistant_reply.replace(/\s+/g, " ").trim()}`
    );
    chatExportLines.push("");
  }
  writeText(path.join(runDir, "chat_export_live_replay.md"), `${chatExportLines.join("\n")}\n`);

  // --- Artifact: 1_live_replay.txt ------------------------------------------
  const generatedAtIso = new Date().toISOString();
  // If the endpoint returned no session record, still export under the id we used.
  const liveReplayTxt = composeLiveReplayExport(
    session ?? { session_id: sessionId },
    caseRowsWithItems,
    generatedAtIso
  );
  writeText(path.join(runDir, "1_live_replay.txt"), liveReplayTxt);

  // --- Artifact: README.md --------------------------------------------------
  const readme = [
    "# Stage 4 / Wave 19.2 - Live Runtime Fix by Replay 1.txt",
    "",
    "## What was run",
    "- Source-of-truth replay from original `1.txt` user turns.",
    "- MCP runtime ON (`FEATURE_ASSISTANT_MCP_RUNTIME_V1=1`).",
    "- Normalizer in `useMock=true` mode.",
    "",
    "## Produced artifacts",
    "- run_summary.json",
    "- live_replay_report.md",
    "- live_case_matrix.md",
    "- business_scope_resolution_audit.json",
    "- temporal_contradiction_audit.json",
    "- live_mcp_to_evidence_handoff.json",
    "- admissibility_reject_breakdown_live.json",
    "- chat_export_live_replay.md",
    "- debug_payloads/",
    "- 1_live_replay.txt",
    "",
    "## Final verdict",
    ...verdictLines
  ].join("\n");
  writeText(path.join(runDir, "README.md"), `${readme}\n`);
}

// Script entry point: print the failure (stack if available) and exit non-zero.
main().catch((error) => {
  process.stderr.write(`${error instanceof Error ? error.stack || error.message : String(error)}\n`);
  process.exit(1);
});