// File-viewer header (not program code): 535 lines, 22 KiB, JavaScript.
const fs = require("node:fs");
const path = require("node:path");
const request = require("supertest");
/**
 * Labels for the replayed cases, in replay order: entry `i` is merged into the
 * row built for the i-th replayed user message.
 *
 * `case_id` and `label` are used in audit rows and debug-payload file names.
 * `expected_mode` is carried on each row but is not evaluated by any code
 * visible in this file.
 */
const CASE_LABELS = [
  { case_id: "L1", label: "vat_chain_furniture_13_15_july", expected_mode: "grounded_or_stronger" },
  { case_id: "L2", label: "rbp_tail_31_july_5000", expected_mode: "limited_or_grounded" },
  { case_id: "L3", label: "fa_amortization_2471_2465_849", expected_mode: "limited_or_grounded" }
];
/**
 * Create a directory together with any missing parent directories.
 *
 * Uses a recursive mkdir, so calling it for a directory that already exists
 * is not an error.
 *
 * @param {string} dirPath - Directory to create.
 * @returns {void}
 */
function ensureDir(dirPath) {
  fs.mkdirSync(dirPath, { recursive: true });
}
/**
 * Serialize `payload` as pretty-printed JSON (2-space indent, trailing
 * newline) and write it to `filePath` as UTF-8, creating the parent
 * directory first if needed.
 *
 * @param {string} filePath - Destination file path.
 * @param {*} payload - Value passed to JSON.stringify.
 * @returns {void}
 */
function writeJson(filePath, payload) {
  ensureDir(path.dirname(filePath));
  const serialized = JSON.stringify(payload, null, 2);
  fs.writeFileSync(filePath, `${serialized}\n`, "utf8");
}
/**
 * Write `text` to `filePath` as UTF-8, creating the parent directory first
 * if needed. The text is written verbatim; no trailing newline is added.
 *
 * @param {string} filePath - Destination file path.
 * @param {string} text - Content to write.
 * @returns {void}
 */
function writeText(filePath, text) {
  ensureDir(path.dirname(filePath));
  fs.writeFileSync(filePath, text, "utf8");
}
/**
 * Divide `numerator` by `denominator` and round the result to 4 decimals.
 *
 * Returns 0 instead of NaN/Infinity when either argument is not a finite
 * number (no coercion is applied, so numeric strings count as invalid) or
 * when the denominator is zero or negative.
 *
 * @param {number} numerator
 * @param {number} denominator
 * @returns {number} Rounded quotient, or 0 for invalid input.
 */
function ratio(numerator, denominator) {
  const inputsAreFinite = Number.isFinite(numerator) && Number.isFinite(denominator);
  if (!inputsAreFinite || denominator <= 0) {
    return 0;
  }
  return Number((numerator / denominator).toFixed(4));
}
/**
 * Parse a conversation export in markdown form into per-message sections.
 *
 * Recognized structure:
 *   - `## <n>. user` / `## <n>. assistant` (case-insensitive) starts a section.
 *   - The lines right after the heading are `key: value` metadata, up to the
 *     first blank line. Non-matching lines in that region are ignored.
 *   - Everything after that blank line is body text.
 *   - A `### technical_debug_payload_json` heading inside the body closes the
 *     section; following lines are skipped until the next section heading.
 *
 * Lines before the first section heading are ignored.
 *
 * @param {string|null|undefined} markdown - Export text; nullish is treated as empty.
 * @returns {Array<{role: string, index: number, metadata: Object<string, string>, body: string}>}
 *   Sections in document order. `role` is lower-cased, `body` is trimmed.
 */
function parseConversationSections(markdown) {
  const headingPattern = /^##\s+(\d+)\.\s+(user|assistant)\s*$/i;
  const metadataPattern = /^([a-zA-Z0-9_]+):\s*(.*)$/;
  const debugHeadingPattern = /^###\s+technical_debug_payload_json\s*$/i;

  const sections = [];
  let current = null;
  // "idle": outside any section, "meta": reading metadata, "body": reading body.
  let mode = "idle";

  // Finalize the section being built (if any) and reset the parser state.
  function pushCurrent() {
    if (!current) {
      return;
    }
    sections.push({
      role: current.role,
      index: current.index,
      metadata: current.metadata,
      body: current.body.join("\n").trim()
    });
    current = null;
    mode = "idle";
  }

  for (const line of String(markdown ?? "").split(/\r?\n/)) {
    const heading = line.match(headingPattern);
    if (heading) {
      pushCurrent();
      current = {
        index: Number(heading[1]),
        role: String(heading[2]).toLowerCase(),
        metadata: {},
        body: []
      };
      mode = "meta";
      continue;
    }

    if (!current) {
      continue;
    }

    if (mode === "meta") {
      if (line.trim() === "") {
        // The first blank line ends the metadata region.
        mode = "body";
        continue;
      }
      const meta = line.match(metadataPattern);
      if (meta) {
        current.metadata[meta[1]] = meta[2];
      }
      continue;
    }

    if (debugHeadingPattern.test(line)) {
      // The debug payload is not part of the message body: close the section here.
      pushCurrent();
      continue;
    }

    current.body.push(line);
  }

  pushCurrent();
  return sections;
}
/**
 * Check whether both ends of a period are written as July 2020 dates.
 *
 * `from` and `to` are trimmed and must each match `2020-07-DD`. This is a
 * format check only: the day is any two digits, and `from <= to` is not
 * verified.
 *
 * @param {{from?: *, to?: *}|null|undefined} period
 * @returns {boolean} False for non-objects or when either bound does not match.
 */
function isJuly2020Period(period) {
  if (!period || typeof period !== "object") {
    return false;
  }
  const julyDatePattern = /^2020-07-\d{2}$/;
  const from = String(period.from ?? "").trim();
  const to = String(period.to ?? "").trim();
  return julyDatePattern.test(from) && julyDatePattern.test(to);
}
/**
 * Collect one normalized row per retrieval result that carries a
 * `summary.live_mcp` object.
 *
 * Results without an object-valued `summary.live_mcp` are skipped. Missing
 * fields get defaults: method "execute_query", status "unknown", row counts 0,
 * empty account scope, and null for ids/args.
 *
 * @param {object|null|undefined} debug - Debug payload; `retrieval_results`
 *   is read from it and treated as empty when it is not an array.
 * @returns {Array<object>} Rows in the order of `debug.retrieval_results`.
 */
function extractLiveCalls(debug) {
  const retrievalResults = Array.isArray(debug?.retrieval_results) ? debug.retrieval_results : [];
  const rows = [];

  for (const result of retrievalResults) {
    const summary = result?.summary ?? {};
    const live = summary?.live_mcp;
    if (!live || typeof live !== "object") {
      continue;
    }

    rows.push({
      fragment_id: result?.fragment_id ?? null,
      route: result?.route ?? null,
      method: String(live.method ?? "execute_query"),
      args_summary: live.args ?? null,
      // Prefer the subject recorded on the live call, then the summary-level one.
      query_subject: String(live.query_subject ?? summary.query_subject ?? ""),
      account_scope: Array.isArray(live.account_scope) ? live.account_scope : [],
      fetched_rows: Number(live.fetched_rows ?? 0),
      returned_rows: Number(live.returned_rows ?? 0),
      matched_rows: Number(live.matched_rows ?? 0),
      status: String(live.status ?? "unknown")
    });
  }

  return rows;
}
/**
 * Derive temporal-contradiction flags from a debug payload.
 *
 * A case is flagged as a contradiction when either:
 *   - the effective primary period is a July 2020 period (see
 *     isJuly2020Period) and the temporal guard outcome is
 *     "failed_out_of_snapshot_window", or
 *   - `temporal_guard.temporal_guard_basis` differs from
 *     `grounded_answer_eligibility_guard.eligibility_time_basis`
 *     (both compared as strings, missing values as "").
 *
 * The effective period is read from `temporal_guard` first and falls back to
 * the top-level `debug.effective_primary_period`.
 *
 * @param {object|null|undefined} debug - Debug payload of one assistant reply.
 * @returns {{
 *   has_july_effective_primary_period: boolean,
 *   temporal_guard_outcome: string,
 *   temporal_basis_mismatch: boolean,
 *   failed_under_july_effective_period: boolean,
 *   contradiction: boolean
 * }}
 */
function contradictionFlags(debug) {
  const temporal = debug?.temporal_guard ?? {};
  const eligibility = debug?.grounded_answer_eligibility_guard ?? {};

  const effective = temporal.effective_primary_period ?? debug?.effective_primary_period ?? null;
  const temporalOutcome = String(temporal.temporal_guard_outcome ?? "");
  const hasJulyEffective = isJuly2020Period(effective);

  const temporalBasis = String(temporal.temporal_guard_basis ?? "");
  const eligibilityBasis = String(eligibility.eligibility_time_basis ?? "");
  const basisMismatch = temporalBasis !== eligibilityBasis;

  const failedUnderJuly = hasJulyEffective && temporalOutcome === "failed_out_of_snapshot_window";

  return {
    has_july_effective_primary_period: hasJulyEffective,
    temporal_guard_outcome: temporalOutcome,
    temporal_basis_mismatch: basisMismatch,
    failed_under_july_effective_period: failedUnderJuly,
    contradiction: failedUnderJuly || basisMismatch
  };
}
/**
 * Decide whether the targeted-evidence path produced a hit for a case.
 *
 * True only when `targeted_evidence_acquisition.targeted_evidence_hit_rate`
 * is greater than 0 AND at least one value in its `check_status` object is
 * the string "found".
 *
 * @param {object|null|undefined} debug - Debug payload of one assistant reply.
 * @returns {boolean}
 */
function claimPathCompleted(debug) {
  const targeted = debug?.targeted_evidence_acquisition ?? {};
  const hitRate = Number(targeted.targeted_evidence_hit_rate ?? 0);

  const rawStatus = targeted.check_status;
  const checkStatus = rawStatus && typeof rawStatus === "object" ? rawStatus : {};
  const hasFound = Object.values(checkStatus).some((value) => String(value) === "found");

  return hitRate > 0 && hasFound;
}
/**
 * Render replayed cases as a technical conversation export (markdown text).
 *
 * Layout: a header block, then for every case a `## <n>. user` section
 * followed by a `## <n+1>. assistant` section. Each assistant section ends
 * with a `### technical_debug_payload_json` heading and the debug payload in
 * a fenced JSON block. Section numbers run continuously across cases.
 *
 * @param {{session_id?: string}|null|undefined} session - Session record; when
 *   it or its `session_id` is missing, the header shows "n/a" instead of
 *   throwing.
 * @param {Array<{userItem: object, assistantItem: object}>} caseRows - Rows
 *   whose `userItem` provides message_id/created_at/text and whose
 *   `assistantItem` additionally provides reply_type/trace_id/debug.
 * @param {string} generatedAtIso - Timestamp written as `exported_at`.
 * @returns {string} Export text, lines joined with "\n".
 */
function composeLiveReplayExport(session, caseRows, generatedAtIso) {
  const lines = [
    "# Assistant conversation export",
    `session_id: ${session?.session_id ?? "n/a"}`,
    "export_mode: technical",
    `exported_at: ${generatedAtIso}`,
    ""
  ];

  let sectionIndex = 1;
  for (const row of caseRows) {
    const user = row.userItem;
    const assistant = row.assistantItem;

    lines.push(
      `## ${sectionIndex}. user`,
      `message_id: ${user.message_id}`,
      `created_at: ${user.created_at}`,
      "reply_type: n/a",
      "",
      user.text,
      ""
    );
    sectionIndex += 1;

    lines.push(
      `## ${sectionIndex}. assistant`,
      `message_id: ${assistant.message_id}`,
      `created_at: ${assistant.created_at}`,
      `reply_type: ${assistant.reply_type}`,
      `trace_id: ${assistant.trace_id}`,
      "",
      assistant.text,
      "",
      "### technical_debug_payload_json",
      "```json",
      JSON.stringify(assistant.debug ?? {}, null, 2),
      "```",
      ""
    );
    sectionIndex += 1;
  }

  return lines.join("\n");
}
/**
 * Replay the first user turns of a conversation export against the assistant
 * API and write the Wave 19.2 audit artifacts into a run directory.
 *
 * Command line: `node wave19_2LiveReplayPack.js <run-dir> <source-1.txt>`.
 *
 * Steps:
 *   1. Parse the source export and take the first non-empty user messages
 *      (one per entry in CASE_LABELS); compute the baseline partial-coverage
 *      rate from the matching assistant sections of the source.
 *   2. Enable FEATURE_ASSISTANT_MCP_RUNTIME_V1, load the app from
 *      `../dist/server.js`, and POST each message to `/api/assistant/message`
 *      in a fresh session (sequentially, same session id).
 *   3. Fetch the session, compute metrics and verdicts, and write
 *      run_summary.json, the audit JSON files, per-case debug payloads, the
 *      markdown reports, 1_live_replay.txt and README.md.
 *
 * @returns {Promise<void>}
 * @throws {Error} On missing arguments, too few user messages in the source,
 *   or a non-200 response from the replay or session endpoints.
 */
async function main() {
  const runDir = process.argv[2];
  const sourceFile = process.argv[3];
  if (!runDir || !sourceFile) {
    throw new Error("Usage: node wave19_2LiveReplayPack.js <run-dir> <source-1.txt>");
  }

  // One replayed turn per labelled case.
  const caseCount = CASE_LABELS.length;

  // --- 1. Source export and baseline -------------------------------------
  const sourceText = fs.readFileSync(sourceFile, "utf8");
  const sections = parseConversationSections(sourceText);
  const userSections = sections.filter((item) => item.role === "user");
  const assistantSections = sections.filter((item) => item.role === "assistant");
  const userMessages = userSections
    .map((item) => item.body)
    .filter((item) => item.length > 0)
    .slice(0, caseCount);
  if (userMessages.length < caseCount) {
    throw new Error(`Expected at least ${caseCount} user messages in source file, got ${userMessages.length}`);
  }

  const baselineAssistantSections = assistantSections.slice(0, caseCount);
  const baselinePartialCoverage = baselineAssistantSections.filter(
    (item) => String(item.metadata.reply_type ?? "") === "partial_coverage"
  ).length;
  // Math.max(1, ...) keeps the denominator valid when the source has no assistant sections.
  const baselinePartialCoverageRate = ratio(baselinePartialCoverage, Math.max(1, baselineAssistantSections.length));

  // --- 2. Live replay ----------------------------------------------------
  // The flag is set before the server module is loaded.
  // NOTE(review): presumably the server reads it at load/creation time — confirm.
  process.env.FEATURE_ASSISTANT_MCP_RUNTIME_V1 = "1";
  const { createApp } = require("../dist/server.js");
  const app = createApp();
  const sessionId = `asst-wave19_2-${Date.now()}`;

  const replayRows = [];
  // Sequential on purpose: all turns share one session id.
  for (const [i, message] of userMessages.entries()) {
    const response = await request(app).post("/api/assistant/message").send({
      session_id: sessionId,
      useMock: true,
      promptVersion: "normalizer_v2_0_2",
      user_message: message
    });
    if (response.status !== 200) {
      throw new Error(`Replay case ${i + 1} failed with status=${response.status}`);
    }

    const debug = response.body?.debug ?? {};
    const eligibility = debug?.grounded_answer_eligibility_guard ?? {};
    const admissibility = debug?.evidence_admissibility_gate ?? {};
    const temporal = debug?.temporal_guard ?? {};
    const targeted = debug?.targeted_evidence_acquisition ?? {};
    const calls = extractLiveCalls(debug);
    const contradiction = contradictionFlags(debug);

    const scopeResolutionReason = Array.isArray(debug?.company_scope_resolution_reason)
      ? debug.company_scope_resolution_reason
      : Array.isArray(debug?.scope_resolution_reason)
        ? debug.scope_resolution_reason
        : [];

    replayRows.push({
      ...CASE_LABELS[i],
      user_message: message,
      reply_type: String(response.body?.reply_type ?? ""),
      assistant_reply: String(response.body?.assistant_reply ?? ""),
      trace_id: String(response.body?.trace_id ?? debug?.trace_id ?? ""),
      business_scope_raw: Array.isArray(debug?.business_scope_raw) ? debug.business_scope_raw : [],
      business_scope_resolved: Array.isArray(debug?.business_scope_resolved) ? debug.business_scope_resolved : [],
      company_scope_resolution_reason: scopeResolutionReason,
      raw_time_scope: temporal?.raw_time_scope ?? null,
      resolved_time_anchor: temporal?.resolved_time_anchor ?? null,
      // Same fallback as contradictionFlags, so the audit shows the period the flags were computed from.
      effective_primary_period: temporal?.effective_primary_period ?? debug?.effective_primary_period ?? null,
      temporal_guard_input: temporal?.temporal_guard_input ?? null,
      temporal_guard_outcome: temporal?.temporal_guard_outcome ?? null,
      eligibility_time_basis: eligibility?.eligibility_time_basis ?? null,
      temporal_guard_basis: temporal?.temporal_guard_basis ?? null,
      contradiction,
      claim_type: debug?.claim_anchor_audit?.claim_type ?? null,
      claim_anchor_resolution_rate: Number(debug?.claim_anchor_audit?.claim_anchor_resolution_rate ?? 0),
      targeted_evidence_hit_rate: Number(targeted?.targeted_evidence_hit_rate ?? 0),
      admissible_evidence_count: Number(admissibility?.admissible_evidence_count ?? 0),
      reject_breakdown: admissibility?.reject_breakdown ?? null,
      eligibility: {
        eligible: Boolean(eligibility?.eligible),
        grounding_mode: String(eligibility?.grounding_mode ?? ""),
        outcome: String(eligibility?.outcome ?? ""),
        reason_codes: Array.isArray(eligibility?.reason_codes) ? eligibility.reason_codes : []
      },
      live_calls: calls,
      debug
    });
  }

  // --- 3. Session items for the export -----------------------------------
  const sessionResponse = await request(app).get(`/api/assistant/session/${sessionId}`);
  if (sessionResponse.status !== 200) {
    throw new Error(`Failed to load replay session: status=${sessionResponse.status}`);
  }
  const session = sessionResponse.body?.session;
  const sessionItems = Array.isArray(session?.items) ? session.items : [];

  const userItems = sessionItems.filter((item) => item?.role === "user");
  const assistantItems = sessionItems.filter((item) => item?.role === "assistant");
  // Pair session items with replay rows by position; synthesize an item from
  // the replay row when the session does not provide one.
  const caseRowsWithItems = replayRows.map((row, index) => ({
    ...row,
    userItem: userItems[index] ?? {
      message_id: `user-${index + 1}`,
      created_at: new Date().toISOString(),
      text: row.user_message
    },
    assistantItem: assistantItems[index] ?? {
      message_id: `assistant-${index + 1}`,
      created_at: new Date().toISOString(),
      text: row.assistant_reply,
      reply_type: row.reply_type,
      trace_id: row.trace_id,
      debug: row.debug
    }
  }));

  // --- 4. Metrics and verdicts -------------------------------------------
  const contradictionCount = replayRows.filter((row) => row.contradiction.contradiction).length;
  const scopeResolvedCount = replayRows.filter((row) =>
    row.business_scope_resolved.includes("company_specific_accounting")
  ).length;
  const admissibleNonZeroCount = replayRows.filter((row) => row.admissible_evidence_count > 0).length;
  const partialCoverageCount = replayRows.filter((row) => row.reply_type === "partial_coverage").length;
  const claimPathCompletedCount = replayRows.filter((row) => claimPathCompleted(row.debug)).length;
  // "False grounded": reported as grounded_positive without any admissible evidence.
  const falseGroundedCount = replayRows.filter(
    (row) => row.eligibility.grounding_mode === "grounded_positive" && row.admissible_evidence_count <= 0
  ).length;

  const metrics = {
    case_count: replayRows.length,
    baseline_partial_coverage_default_rate: baselinePartialCoverageRate,
    live_temporal_contradiction_rate: ratio(contradictionCount, replayRows.length),
    live_company_scope_resolution_rate: ratio(scopeResolvedCount, replayRows.length),
    live_admissible_evidence_nonzero_rate: ratio(admissibleNonZeroCount, replayRows.length),
    live_partial_coverage_default_rate: ratio(partialCoverageCount, replayRows.length),
    live_claim_path_completion_rate: ratio(claimPathCompletedCount, replayRows.length),
    live_false_grounded_answer_rate: ratio(falseGroundedCount, replayRows.length)
  };

  const thresholds = {
    live_temporal_contradiction_rate: 0,
    live_company_scope_resolution_rate: 1,
    live_false_grounded_answer_rate: 0,
    live_admissible_evidence_nonzero_min_cases: 2
  };

  const temporalFixed = metrics.live_temporal_contradiction_rate <= thresholds.live_temporal_contradiction_rate;
  const companyScopeFixed =
    metrics.live_company_scope_resolution_rate >= thresholds.live_company_scope_resolution_rate;
  const evidencePathFixed =
    admissibleNonZeroCount >= thresholds.live_admissible_evidence_nonzero_min_cases &&
    metrics.live_false_grounded_answer_rate <= thresholds.live_false_grounded_answer_rate;
  const partialReduced =
    metrics.live_partial_coverage_default_rate < metrics.baseline_partial_coverage_default_rate;

  let overallStatus = "WAVE19_2_NOT_ACCEPTED";
  if (temporalFixed && companyScopeFixed && evidencePathFixed && partialReduced) {
    overallStatus = "WAVE19_2_ACCEPTED";
  } else if (temporalFixed && companyScopeFixed && metrics.live_false_grounded_answer_rate <= 0) {
    overallStatus = "WAVE19_2_ACCEPTED_WITH_LIMITATIONS";
  }

  // Computed once and reused by run_summary.json, the report and the README.
  const verdicts = {
    LIVE_TEMPORAL_ALIGNMENT_FIXED: temporalFixed ? "FIXED" : "NOT_FIXED",
    LIVE_COMPANY_SCOPE_FIXED: companyScopeFixed ? "FIXED" : "NOT_FIXED",
    LIVE_EVIDENCE_PATH_FIXED: evidencePathFixed ? "FIXED" : "NOT_FIXED",
    LIVE_PARTIAL_COVERAGE_DEFAULT_REDUCED: partialReduced ? "REDUCED" : "NOT_REDUCED",
    overall_status: overallStatus
  };
  const verdictLines = [
    `- LIVE_TEMPORAL_ALIGNMENT_FIXED: ${verdicts.LIVE_TEMPORAL_ALIGNMENT_FIXED}`,
    `- LIVE_COMPANY_SCOPE_FIXED: ${verdicts.LIVE_COMPANY_SCOPE_FIXED}`,
    `- LIVE_EVIDENCE_PATH_FIXED: ${verdicts.LIVE_EVIDENCE_PATH_FIXED}`,
    `- LIVE_PARTIAL_COVERAGE_DEFAULT_REDUCED: ${verdicts.LIVE_PARTIAL_COVERAGE_DEFAULT_REDUCED}`,
    `- Overall: ${overallStatus}`
  ];

  // --- 5. Artifacts ------------------------------------------------------
  const runSummary = {
    run_id: path.basename(runDir),
    stage: "Stage_04",
    wave: "Wave_19_2",
    scope: "live_runtime_fix_by_replay_1txt",
    source_of_truth: sourceFile,
    execution: {
      replay_mode: "exact_questions_from_1_txt",
      runtime_path: "assistant_message_with_mcp_runtime_on",
      normalizer_mode: "useMock=true",
      session_id: sessionId
    },
    thresholds,
    metrics,
    verdicts
  };
  writeJson(path.join(runDir, "run_summary.json"), runSummary);

  const temporalAudit = {
    generated_at: new Date().toISOString(),
    cases: replayRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      raw_time_scope: row.raw_time_scope,
      resolved_time_anchor: row.resolved_time_anchor,
      effective_primary_period: row.effective_primary_period,
      temporal_guard_input: row.temporal_guard_input,
      temporal_guard_basis: row.temporal_guard_basis,
      eligibility_time_basis: row.eligibility_time_basis,
      temporal_guard_outcome: row.temporal_guard_outcome,
      contradiction: row.contradiction
    })),
    metric: {
      live_temporal_contradiction_rate: metrics.live_temporal_contradiction_rate
    }
  };
  writeJson(path.join(runDir, "temporal_contradiction_audit.json"), temporalAudit);

  const scopeAudit = {
    generated_at: new Date().toISOString(),
    cases: replayRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      business_scope_raw: row.business_scope_raw,
      business_scope_resolved: row.business_scope_resolved,
      company_scope_resolution_reason: row.company_scope_resolution_reason
    })),
    metric: {
      live_company_scope_resolution_rate: metrics.live_company_scope_resolution_rate
    }
  };
  writeJson(path.join(runDir, "business_scope_resolution_audit.json"), scopeAudit);

  const mcpToEvidence = {
    generated_at: new Date().toISOString(),
    cases: replayRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      claim_type: row.claim_type,
      admissible_evidence_count: row.admissible_evidence_count,
      live_calls: row.live_calls,
      claim_targeted_hit_rate: row.targeted_evidence_hit_rate,
      eligibility: row.eligibility
    }))
  };
  writeJson(path.join(runDir, "live_mcp_to_evidence_handoff.json"), mcpToEvidence);

  // Only these reject reasons are aggregated; other keys in a case's breakdown
  // are kept per case but not summed.
  const rejectTotals = {
    wrong_period: 0,
    wrong_domain: 0,
    wrong_account_scope: 0,
    weak_source_mapping: 0,
    zero_live_match: 0,
    future_dated_or_out_of_window: 0
  };
  for (const row of replayRows) {
    const breakdown = row.reject_breakdown && typeof row.reject_breakdown === "object" ? row.reject_breakdown : {};
    for (const key of Object.keys(rejectTotals)) {
      rejectTotals[key] += Number(breakdown[key] ?? 0);
    }
  }
  const rejectBreakdown = {
    generated_at: new Date().toISOString(),
    aggregate: rejectTotals,
    cases: replayRows.map((row) => ({
      case_id: row.case_id,
      label: row.label,
      reject_breakdown: row.reject_breakdown
    }))
  };
  writeJson(path.join(runDir, "admissibility_reject_breakdown_live.json"), rejectBreakdown);

  for (const row of replayRows) {
    writeJson(path.join(runDir, "debug_payloads", `${row.case_id}_${row.label}.json`), {
      case_id: row.case_id,
      label: row.label,
      trace_id: row.trace_id,
      reply_type: row.reply_type,
      debug: row.debug
    });
  }

  const caseMatrixLines = [
    "# Live Case Matrix",
    "",
    "| Case | Label | Reply | Claim Type | Admissible Evidence | Grounding Mode | Scope | Temporal |",
    "| --- | --- | --- | --- | ---: | --- | --- | --- |"
  ];
  for (const row of replayRows) {
    caseMatrixLines.push(
      `| ${row.case_id} | ${row.label} | ${row.reply_type} | ${row.claim_type ?? "n/a"} | ${row.admissible_evidence_count} | ${row.eligibility.grounding_mode} | ${row.business_scope_resolved.join(", ") || "n/a"} | ${row.temporal_guard_outcome ?? "n/a"} |`
    );
  }
  writeText(path.join(runDir, "live_case_matrix.md"), `${caseMatrixLines.join("\n")}\n`);

  const replayReportLines = [
    "# Live Replay Report (Wave 19.2)",
    "",
    "## Source",
    `- Source of truth replayed from: \`${sourceFile}\``,
    `- Replayed exactly ${replayRows.length} user turns from the original export.`,
    "- Runtime path: MCP ON, useMock=true.",
    "",
    "## Metrics",
    `- live_temporal_contradiction_rate: ${metrics.live_temporal_contradiction_rate}`,
    `- live_company_scope_resolution_rate: ${metrics.live_company_scope_resolution_rate}`,
    `- live_admissible_evidence_nonzero_rate: ${metrics.live_admissible_evidence_nonzero_rate}`,
    `- live_partial_coverage_default_rate: ${metrics.live_partial_coverage_default_rate}`,
    `- baseline_partial_coverage_default_rate: ${metrics.baseline_partial_coverage_default_rate}`,
    `- live_claim_path_completion_rate: ${metrics.live_claim_path_completion_rate}`,
    `- live_false_grounded_answer_rate: ${metrics.live_false_grounded_answer_rate}`,
    "",
    "## Verdict",
    ...verdictLines,
    ""
  ];
  writeText(path.join(runDir, "live_replay_report.md"), `${replayReportLines.join("\n")}\n`);

  const chatExportLines = ["# Chat Export Live Replay", ""];
  for (const row of replayRows) {
    // The reply is collapsed to a single line for this compact view.
    const flatReply = row.assistant_reply.replace(/\s+/g, " ").trim();
    chatExportLines.push(`## ${row.case_id} | ${row.label}`);
    chatExportLines.push(`user: ${row.user_message}`);
    chatExportLines.push(`assistant(reply_type=${row.reply_type}, trace_id=${row.trace_id}): ${flatReply}`);
    chatExportLines.push("");
  }
  writeText(path.join(runDir, "chat_export_live_replay.md"), `${chatExportLines.join("\n")}\n`);

  const generatedAtIso = new Date().toISOString();
  // The session endpoint may answer 200 without a session record; fall back to
  // the id this run generated so the export header stays meaningful.
  const exportSession = { ...session, session_id: session?.session_id ?? sessionId };
  const liveReplayTxt = composeLiveReplayExport(exportSession, caseRowsWithItems, generatedAtIso);
  writeText(path.join(runDir, "1_live_replay.txt"), liveReplayTxt);

  const readme = [
    "# Stage 4 / Wave 19.2 - Live Runtime Fix by Replay 1.txt",
    "",
    "## What was run",
    "- Source-of-truth replay from original `1.txt` user turns.",
    "- MCP runtime ON (`FEATURE_ASSISTANT_MCP_RUNTIME_V1=1`).",
    "- Normalizer in `useMock=true` mode.",
    "",
    "## Produced artifacts",
    "- run_summary.json",
    "- live_replay_report.md",
    "- live_case_matrix.md",
    "- business_scope_resolution_audit.json",
    "- temporal_contradiction_audit.json",
    "- live_mcp_to_evidence_handoff.json",
    "- admissibility_reject_breakdown_live.json",
    "- chat_export_live_replay.md",
    "- debug_payloads/",
    "- 1_live_replay.txt",
    "",
    "## Final verdict",
    ...verdictLines
  ].join("\n");
  writeText(path.join(runDir, "README.md"), `${readme}\n`);
}
// Script entry: run main() and, on any failure, print the stack (or message)
// to stderr and exit with a non-zero status.
main().catch((error) => {
  const details = error instanceof Error ? error.stack || error.message : String(error);
  process.stderr.write(`${details}\n`);
  process.exit(1);
});