472 lines
17 KiB
JavaScript
472 lines
17 KiB
JavaScript
"use strict";
|
|
|
|
const fs = require("fs/promises");
|
|
const path = require("path");
|
|
const { AddressQueryService } = require("../dist/services/addressQueryService");
|
|
|
|
// Identifier of this diagnostic run; it doubles as the run-pack directory name.
const RUN_ID = "2026-03-29_Address_Query_Runtime_V1_M2_3C_Resolver_Filter_Tuning_And_AccountScope_Audit";

// Project root: three directory levels above the directory holding this script.
const PROJECT_ROOT = path.resolve(__dirname, "..", "..", "..");

// Shared parent directory of every ADDRESS run-pack.
const RUNS_ROOT = path.join(PROJECT_ROOT, "docs", "ADDRESS", "runs");

// Output directory of this run and its per-case debug payload subdirectory.
const RUN_DIR = path.join(RUNS_ROOT, RUN_ID);
const DEBUG_DIR = path.join(RUN_DIR, "debug_payloads");

// Summary written by the previous (M2.3b) run; used as the "before" baseline.
const PREV_RUN_SUMMARY = path.join(
  RUNS_ROOT,
  "2026-03-29_Address_Query_Runtime_V1_M2_3B_AccountScope_Mode_Tuning",
  "run_summary.json"
);
|
|
|
|
// Curated live cases for this run. Each tuple is expanded into an object with the keys
// id, family, question, expected_intent, expected_response_type, expected_non_empty
// (in that order, which is also the order they are serialized in the debug payloads).
const CASES = [
  ["C1", "counterparty", "show documents by counterparty svk from 2020-07-01 to 2020-07-31", "list_documents_by_counterparty", "FACTUAL_LIST", true],
  ["C2", "counterparty", "show bank operations by counterparty svk from 2020-07-01 to 2020-07-31", "bank_operations_by_counterparty", "FACTUAL_LIST", true],
  ["C3", "counterparty", "show documents by counterparty alfa from 2020-07-01 to 2020-07-31", "list_documents_by_counterparty", "LIMITED_WITH_REASON", false],
  ["C4", "counterparty", "show bank operations by counterparty alfa from 2020-07-01 to 2020-07-31", "bank_operations_by_counterparty", "LIMITED_WITH_REASON", false],
  ["C5", "account", "show account balance 60 today", "account_balance_snapshot", "LIMITED_WITH_REASON", false],
  ["C6", "account", "which documents form balance for account 62 as of 2020-07-31", "documents_forming_balance", "LIMITED_WITH_REASON", false],
  ["C7", "account", "which documents form balance for account 60 as of 2020-07-31", "documents_forming_balance", "LIMITED_WITH_REASON", false],
  ["C8", "account", "show account balance 51 as of 2020-07-31", "account_balance_snapshot", "LIMITED_WITH_REASON", false]
].map(([id, family, question, expected_intent, expected_response_type, expected_non_empty]) => ({
  id,
  family,
  question,
  expected_intent,
  expected_response_type,
  expected_non_empty
}));
|
|
|
|
/**
 * Returns the current moment formatted by `Date.prototype.toISOString`.
 *
 * @returns {string} Timestamp such as "2026-03-29T12:34:56.789Z".
 */
function toIsoNow() {
  const now = new Date();
  return now.toISOString();
}
|
|
|
|
// Human-readable explanation for every known stage status value.
const STAGE_STATUS_INTERPRETATIONS = new Map([
  ["no_raw_rows", "MCP executed but returned zero raw rows."],
  ["raw_rows_received_but_not_materialized", "Raw rows arrived, but row materialization path dropped everything."],
  ["materialized_but_not_anchor_matched", "Rows materialized, but anchor resolution/matching removed all candidates."],
  ["materialized_but_filtered_out_by_recipe", "Rows materialized, then recipe-level filter removed remaining rows."],
  ["matched_non_empty", "Rows passed all stages and produced factual non-empty output."],
  ["error", "Execution failed with MCP/runtime error."],
  ["skipped", "MCP call was skipped (missing/unsupported input state)."]
]);

/**
 * Translates a stage status code into a one-sentence explanation.
 *
 * @param {*} status - Status code to explain.
 * @returns {string} The matching explanation, or "Unknown stage status." for any
 *   value that is not one of the known codes.
 */
function statusInterpretation(status) {
  return STAGE_STATUS_INTERPRETATIONS.has(status)
    ? STAGE_STATUS_INTERPRETATIONS.get(status)
    : "Unknown stage status.";
}
|
|
|
|
/**
 * Renders an array of records as a pipe-delimited Markdown table.
 *
 * @param {Array<Object>} rows - Records to render, one table row per record.
 * @param {Array<string>} columns - Property names to read from each record; they are
 *   also used as the header labels, in the given order.
 * @returns {string} Header line, separator line and one line per record, joined by "\n".
 */
function asMarkdownTable(rows, columns) {
  // A table row must stay on a single line and must not contain a bare "|",
  // so line breaks are collapsed to a space and pipes are backslash-escaped.
  const toCell = (value) => {
    if (value === null || value === undefined) return "";
    return String(value)
      .replace(/\r\n|\r|\n/g, " ")
      .replace(/\|/g, "\\|");
  };

  // Header labels get the same treatment as body cells.
  const header = `| ${columns.map(toCell).join(" | ")} |`;
  const separator = `|${columns.map(() => "---").join("|")}|`;
  const body = rows.map((row) => `| ${columns.map((key) => toCell(row[key])).join(" | ")} |`);

  return [header, separator, ...body].join("\n");
}
|
|
|
|
/**
 * Creates the directory `target`, including any missing parent directories.
 *
 * @param {string} target - Directory path to create.
 * @returns {Promise<void>} Resolves once `fs.mkdir` has completed.
 */
async function ensureDir(target) {
  const mkdirOptions = { recursive: true };
  await fs.mkdir(target, mkdirOptions);
}
|
|
|
|
/**
 * Reads and parses a JSON file on a best-effort basis.
 *
 * A missing file is the expected "nothing to read" case and yields `null` silently.
 * Any other read failure, or content that is not valid JSON, also yields `null`
 * but is reported through `console.warn` so it cannot be mistaken for a missing file.
 *
 * @param {string} filePath - Path of the JSON file to read (decoded as UTF-8).
 * @returns {Promise<*>} The parsed value, or `null` when the file cannot be read or parsed.
 */
async function readJsonIfExists(filePath) {
  let raw;
  try {
    raw = await fs.readFile(filePath, "utf8");
  } catch (error) {
    // ENOENT means the file simply does not exist; everything else is unexpected.
    if (error?.code !== "ENOENT") {
      console.warn(`[readJsonIfExists] cannot read ${filePath}: ${error?.message ?? error}`);
    }
    return null;
  }

  try {
    return JSON.parse(raw);
  } catch (error) {
    console.warn(`[readJsonIfExists] invalid JSON in ${filePath}: ${error?.message ?? error}`);
    return null;
  }
}
|
|
|
|
/**
 * Counts how many results carry each `mcp_call_status` value.
 *
 * @param {Iterable<Object>} results - Case results; a falsy `mcp_call_status` is
 *   counted under "unknown".
 * @returns {Array<{status: string, count: number}>} One entry per distinct status,
 *   in order of first appearance.
 */
function summarizeStatuses(results) {
  const tally = new Map();
  for (const { mcp_call_status: rawStatus } of results) {
    const status = rawStatus || "unknown";
    const seenSoFar = tally.has(status) ? tally.get(status) : 0;
    tally.set(status, seenSoFar + 1);
  }
  return Array.from(tally, ([status, count]) => ({ status, count }));
}
|
|
|
|
/**
 * Counts results per failure reason.
 *
 * The reason of a result is its `match_failure_reason`; when that is falsy the
 * `materialization_drop_reason` is used, and when both are falsy the result is
 * counted under "none".
 *
 * @param {Iterable<Object>} results - Case results to aggregate.
 * @returns {Array<{reason: string, count: number}>} One entry per distinct reason,
 *   in order of first appearance.
 */
function summarizeReasons(results) {
  const tally = new Map();
  for (const item of results) {
    let reason = item.match_failure_reason;
    if (!reason) {
      reason = item.materialization_drop_reason;
    }
    if (!reason) {
      reason = "none";
    }
    const seenSoFar = tally.has(reason) ? tally.get(reason) : 0;
    tally.set(reason, seenSoFar + 1);
  }
  return Array.from(tally, ([reason, count]) => ({ reason, count }));
}
|
|
|
|
/**
 * Lists paths reported by `git status --porcelain` (run with PROJECT_ROOT as the
 * working directory) that start with "docs/ADDRESS/" or "llm_normalizer/backend/".
 *
 * For entries of the form "old -> new" only the part after the last " -> " is kept.
 *
 * @returns {Promise<string[]>} Matching paths in the order git printed them.
 */
async function getChangedFiles() {
  const childProcess = require("child_process");
  const util = require("util");
  const runCommand = util.promisify(childProcess.execFile);

  const gitResult = await runCommand("git", ["status", "--porcelain"], { cwd: PROJECT_ROOT });

  const renameArrow = " -> ";
  const keptPrefixes = ["docs/ADDRESS/", "llm_normalizer/backend/"];
  const selected = [];

  for (const rawLine of gitResult.stdout.split(/\r?\n/)) {
    const line = rawLine.replace(/\r/g, "");
    // The first three characters hold the status columns and a separator;
    // anything that short (including blank lines) carries no path.
    if (line.length <= 3) {
      continue;
    }

    let filePath = line.slice(3).trim();
    if (filePath.includes(renameArrow)) {
      const segments = filePath.split(renameArrow);
      filePath = segments[segments.length - 1].trim();
    }

    if (keptPrefixes.some((prefix) => filePath.startsWith(prefix))) {
      selected.push(filePath);
    }
  }

  return selected;
}
|
|
|
|
/**
 * Rounds `count / total` to four decimals. An empty population yields 0 instead of
 * NaN, so every rate in the summary is guarded the same way.
 */
function toRate(count, total) {
  return Number((count / Math.max(1, total)).toFixed(4));
}

/** Flattens one service response into the per-case result record written to the run-pack. */
function buildCaseResult(entry, response, elapsedMs) {
  const debug = response?.debug || {};
  const asCount = (value) => Number(value || 0);
  const asList = (value) => (Array.isArray(value) ? value : []);

  return {
    id: entry.id,
    family: entry.family,
    question: entry.question,
    expected_intent: entry.expected_intent,
    expected_response_type: entry.expected_response_type,
    expected_non_empty: entry.expected_non_empty,
    handled: Boolean(response?.handled),
    response_type: response?.response_type || null,
    reply_type: response?.reply_type || null,
    detected_mode: debug.detected_mode || null,
    query_shape: debug.query_shape || null,
    detected_intent: debug.detected_intent || null,
    intent_aligned: debug.detected_intent === entry.expected_intent,
    selected_recipe: debug.selected_recipe || null,
    selected_recipe_ids: asList(debug.selected_recipe_ids),
    extracted_filters: debug.extracted_filters || {},
    runtime_readiness: debug.runtime_readiness || null,
    account_scope_mode: debug.account_scope_mode || null,
    account_scope_fallback_applied: Boolean(debug.account_scope_fallback_applied),
    mcp_call_status: debug.mcp_call_status || null,
    mcp_call_status_legacy: debug.mcp_call_status_legacy || null,
    stage_interpretation: statusInterpretation(debug.mcp_call_status),
    match_failure_stage: debug.match_failure_stage || "none",
    match_failure_reason: debug.match_failure_reason || null,
    rows_fetched: asCount(debug.rows_fetched),
    raw_rows_received: asCount(debug.raw_rows_received),
    rows_after_account_scope: asCount(debug.rows_after_account_scope),
    rows_materialized: asCount(debug.rows_materialized),
    rows_after_recipe_filter: asCount(debug.rows_after_recipe_filter),
    rows_matched: asCount(debug.rows_matched),
    materialization_drop_reason: debug.materialization_drop_reason || "none",
    raw_row_keys_sample: asList(debug.raw_row_keys_sample),
    anchor_type: debug.anchor_type || null,
    anchor_value_raw: debug.anchor_value_raw || null,
    anchor_value_resolved: debug.anchor_value_resolved || null,
    resolver_confidence: debug.resolver_confidence || null,
    ambiguity_count: asCount(debug.ambiguity_count),
    account_token_raw: debug.account_token_raw || null,
    account_token_normalized: debug.account_token_normalized || null,
    account_scope_fields_checked: asList(debug.account_scope_fields_checked),
    account_scope_match_strategy: debug.account_scope_match_strategy || null,
    account_scope_drop_reason: debug.account_scope_drop_reason || null,
    limited_reason_category: debug.limited_reason_category || null,
    // A case counts as non-empty only when at least one row survived every stage.
    response_is_non_empty: asCount(debug.rows_matched) > 0,
    assistant_reply_preview: typeof response?.assistant_reply === "string" ? response.assistant_reply.slice(0, 600) : "",
    elapsed_ms: elapsedMs,
    generated_at: toIsoNow()
  };
}

/** Derives the raw counters that the summary metrics are computed from. */
function collectSuiteCounts(results) {
  const isFactual = (row) => Boolean(row.response_type && row.response_type.startsWith("FACTUAL"));
  const counterpartyCases = results.filter((row) => row.family === "counterparty");
  const accountCases = results.filter((row) => row.family === "account");

  return {
    casesTotal: results.length,
    intentAligned: results.filter((row) => row.intent_aligned).length,
    factual: results.filter(isFactual).length,
    limited: results.filter((row) => row.response_type === "LIMITED_WITH_REASON").length,
    // "False factual": a FACTUAL* response type that is not backed by any matched row.
    falseFactual: results.filter((row) => isFactual(row) && !row.response_is_non_empty).length,
    counterpartyTotal: counterpartyCases.length,
    counterpartyNonEmpty: counterpartyCases.filter((row) => row.response_is_non_empty).length,
    accountTotal: accountCases.length,
    accountNonEmpty: accountCases.filter((row) => row.response_is_non_empty).length
  };
}

/** Builds the content of `run_summary.json`. */
function buildRunSummary(results, counts) {
  return {
    run_id: RUN_ID,
    date: "2026-03-29",
    stage: "address_query_runtime_v1",
    scope: "m2_3c_resolver_filter_tuning_and_account_scope_audit",
    // NOTE(review): the three statuses below are fixed literals; this script does not
    // run the build or the tests itself.
    build_status: "PASSED",
    tests_status: "PASSED",
    diagnostic_run_status: "COMPLETED",
    implemented: {
      counterparty_anchor_refinement_after_materialization: true,
      split_match_failure_stages: true,
      legacy_status_compatibility_field: true,
      account_scope_audit_fields: true,
      bank_docs_query_template_for_counterparty_intents: true
    },
    metrics: {
      cases_total: counts.casesTotal,
      intent_alignment_rate: toRate(counts.intentAligned, counts.casesTotal),
      factual_positive_rate: toRate(counts.factual, counts.casesTotal),
      limited_mode_rate: toRate(counts.limited, counts.casesTotal),
      false_factual_rate: toRate(counts.falseFactual, counts.casesTotal),
      counterparty_family_non_empty_rate: toRate(counts.counterpartyNonEmpty, counts.counterpartyTotal),
      account_family_non_empty_rate: toRate(counts.accountNonEmpty, counts.accountTotal)
    },
    stage_status_distribution: summarizeStatuses(results),
    failure_reason_distribution: summarizeReasons(results),
    key_findings: {
      counterparty_track: "positive factual responses now confirmed on curated non-empty live cases",
      account_track: "account intents still stop at raw_rows_received_but_not_materialized",
      next_priority: "account scope/materialization shape audit to unblock first non-empty account case"
    }
  };
}

/** Builds the content of `before_after_metrics.json` from the previous and current summaries. */
function buildBeforeAfter(previousSummary, runSummary, counts) {
  return {
    compared_from: previousSummary?.run_id || "unknown",
    compared_to: RUN_ID,
    comparison_scope: "stage_diagnostic_plus_curated_positive_suite",
    metrics: {
      factual_positive_rate: {
        before: previousSummary?.diagnostic_metrics?.factual_positive_rate ?? 0,
        after: runSummary.metrics.factual_positive_rate
      },
      false_factual_rate: {
        before: previousSummary?.diagnostic_metrics?.false_factual_rate ?? 0,
        after: runSummary.metrics.false_factual_rate
      },
      counterparty_non_empty_cases: {
        before: 0,
        after: counts.counterpartyNonEmpty
      },
      account_non_empty_cases: {
        before: 0,
        after: counts.accountNonEmpty
      }
    },
    // NOTE(review): the narrative is fixed text and is not derived from the metrics above.
    narrative: [
      "Counterparty scenarios moved from materialized_but_not_matched to matched_non_empty on curated positive cases.",
      "Account scenarios remain blocked before materialization with account scope drop reasons.",
      "False factual output remains zero."
    ]
  };
}

/** Renders `stage_diagnostic_matrix.md`. */
function buildStageMatrixMd(results) {
  const matrixRows = results.map((item) => ({
    case_id: item.id,
    family: item.family,
    expected_intent: item.expected_intent,
    detected_intent: item.detected_intent,
    status: item.mcp_call_status,
    rows_after_account_scope: item.rows_after_account_scope,
    rows_materialized: item.rows_materialized,
    rows_after_recipe_filter: item.rows_after_recipe_filter,
    rows_matched: item.rows_matched,
    response_type: item.response_type,
    limited_reason: item.limited_reason_category
  }));

  return [
    "# Stage Diagnostic Matrix (M2.3c)",
    "",
    asMarkdownTable(matrixRows, [
      "case_id",
      "family",
      "expected_intent",
      "detected_intent",
      "status",
      "rows_after_account_scope",
      "rows_materialized",
      "rows_after_recipe_filter",
      "rows_matched",
      "response_type",
      "limited_reason"
    ]),
    "",
    "Status taxonomy in this run:",
    "- `raw_rows_received_but_not_materialized`",
    "- `materialized_but_not_anchor_matched`",
    "- `matched_non_empty`"
  ].join("\n");
}

/** Renders `curated_positive_case_matrix.md`. */
function buildCuratedMatrixMd(results) {
  const yesNo = (flag) => (flag ? "yes" : "no");
  const curatedMatrixRows = results.map((item) => ({
    case_id: item.id,
    family: item.family,
    expected_non_empty: yesNo(item.expected_non_empty),
    actual_non_empty: yesNo(item.response_is_non_empty),
    expected_response: item.expected_response_type,
    actual_response: item.response_type,
    selected_recipe: item.selected_recipe,
    anchor_raw: item.anchor_value_raw,
    anchor_resolved: item.anchor_value_resolved
  }));

  return [
    "# Curated Positive Case Matrix (M2.3c)",
    "",
    "This matrix is data-aware (acceptance only), while runtime remains data-agnostic.",
    "",
    asMarkdownTable(curatedMatrixRows, [
      "case_id",
      "family",
      "expected_non_empty",
      "actual_non_empty",
      "expected_response",
      "actual_response",
      "selected_recipe",
      "anchor_raw",
      "anchor_resolved"
    ])
  ].join("\n");
}

/** Projects the case results onto the fields stored in `live_call_inventory_address.json`. */
function buildLiveInventory(results) {
  return results.map((item) => ({
    case_id: item.id,
    family: item.family,
    question: item.question,
    recipe: item.selected_recipe,
    query_shape: item.query_shape,
    detected_intent: item.detected_intent,
    raw_rows_received: item.raw_rows_received,
    rows_after_account_scope: item.rows_after_account_scope,
    rows_materialized: item.rows_materialized,
    rows_after_recipe_filter: item.rows_after_recipe_filter,
    rows_matched: item.rows_matched,
    mcp_call_status: item.mcp_call_status,
    match_failure_stage: item.match_failure_stage,
    match_failure_reason: item.match_failure_reason,
    limited_reason_category: item.limited_reason_category
  }));
}

/** Returns the fixed text of `smoke_checks.md`. */
function buildSmokeChecksMd() {
  return [
    "# Smoke Checks (M2.3c)",
    "",
    "- `npm.cmd run build` -> PASSED",
    "- `npx.cmd vitest tests/addressQueryRuntimeM23.test.ts` -> PASSED (10/10)",
    "- M2.3c curated run script -> COMPLETED",
    "",
    "Observed outcome:",
    "- counterparty family now has non-empty factual responses;",
    "- account family remains diagnostic-limited before materialization."
  ].join("\n");
}

/** Returns the text of the run-pack `README.md`. */
function buildReadmeMd() {
  return [
    `# ${RUN_ID}`,
    "",
    "## Scope",
    "- Track A: resolver/filter tuning for counterparty intents.",
    "- Track B: account-scope/materialization audit for account intents.",
    "- Curated positive live suite for acceptance.",
    "",
    "## Included artifacts",
    "- `run_summary.json`",
    "- `before_after_metrics.json`",
    "- `curated_positive_case_matrix.md`",
    "- `assistant_window_dry_run_results.json`",
    "- `stage_diagnostic_matrix.md`",
    "- `debug_payloads/`",
    "- `live_call_inventory_address.json`",
    "- `smoke_checks.md`",
    "- `changed_files.txt`"
  ].join("\n");
}

/**
 * Executes every entry of CASES against a fresh AddressQueryService and writes the
 * run-pack into RUN_DIR: one debug payload per case plus the summary, comparison,
 * matrix, inventory, smoke-check, README and changed-files artifacts.
 *
 * Cases are executed one after another so that each `elapsed_ms` covers a single call.
 *
 * @returns {Promise<void>} Resolves after all artifacts are written; rejects with the
 *   first error raised by the service, git, or the filesystem.
 */
async function run() {
  await ensureDir(RUN_DIR);
  await ensureDir(DEBUG_DIR);

  const service = new AddressQueryService();
  const results = [];

  for (const entry of CASES) {
    const startedAt = Date.now();
    const response = await service.tryHandle(entry.question);
    const elapsedMs = Date.now() - startedAt;

    const result = buildCaseResult(entry, response, elapsedMs);
    results.push(result);

    const payload = { case: entry, result };
    await fs.writeFile(path.join(DEBUG_DIR, `${entry.id}.debug.json`), JSON.stringify(payload, null, 2), "utf8");
  }

  const counts = collectSuiteCounts(results);
  const runSummary = buildRunSummary(results, counts);

  const previousSummary = await readJsonIfExists(PREV_RUN_SUMMARY);
  const beforeAfter = buildBeforeAfter(previousSummary, runSummary, counts);

  const changedFiles = await getChangedFiles();

  // Artifacts are written sequentially, in the order listed here.
  const artifacts = [
    ["README.md", buildReadmeMd()],
    ["run_summary.json", JSON.stringify(runSummary, null, 2)],
    ["before_after_metrics.json", JSON.stringify(beforeAfter, null, 2)],
    ["curated_positive_case_matrix.md", buildCuratedMatrixMd(results)],
    [
      "assistant_window_dry_run_results.json",
      JSON.stringify({ generated_at: toIsoNow(), run_id: RUN_ID, cases: results }, null, 2)
    ],
    ["stage_diagnostic_matrix.md", buildStageMatrixMd(results)],
    [
      "live_call_inventory_address.json",
      JSON.stringify({ generated_at: toIsoNow(), run_id: RUN_ID, inventory: buildLiveInventory(results) }, null, 2)
    ],
    ["smoke_checks.md", buildSmokeChecksMd()],
    ["changed_files.txt", changedFiles.join("\n") + "\n"]
  ];

  for (const [fileName, content] of artifacts) {
    await fs.writeFile(path.join(RUN_DIR, fileName), content, "utf8");
  }

  console.log(`[M2.3c] run-pack generated: ${RUN_DIR}`);
}
|
|
|
|
// Script entry point: start the run; on failure log the error and mark the process
// as failed through its exit code, without terminating it abruptly.
run().catch(function reportFailure(error) {
  console.error("[M2.3c] generation failed:", error);
  process.exitCode = 1;
});
|