#!/usr/bin/env node
"use strict";

const fs = require("node:fs");
const path = require("node:path");

/**
 * Expected question-type label per case, indexed by zero-based row position.
 * Rows past the end of this list are compared against "unknown".
 */
const EXPECTED_QUESTION_TYPES = Object.freeze([
  "why_breaks",
  "prove_or_guess",
  "prove_or_guess",
  "why_breaks",
  "where_break_is",
  "prove_or_guess",
  "why_breaks",
  "which_chains_are_complete_vs_incomplete",
  "which_chains_are_complete_vs_incomplete",
  "prove_or_guess",
  "why_breaks",
  "prove_or_guess",
  "why_breaks",
  "what_is_it_grounded_on",
  "why_breaks",
  "which_chains_are_complete_vs_incomplete",
  "prove_or_guess",
  "what_is_it_grounded_on",
  "why_breaks",
  "prove_or_guess"
]);

/** Rate metrics that are copied from, and diffed against, a baseline metrics file. */
const COMPARABLE_METRIC_KEYS = Object.freeze([
  "domain_correctness_rate",
  "question_type_fit_rate",
  "company_anchor_usage_rate",
  "generic_answer_rate",
  "first_check_relevance_rate"
]);

/**
 * CLI flag -> target property on the parsed args object.
 * `keepCurrent` means a flag given without a value leaves the current value in
 * place; otherwise a missing value becomes the empty string.
 */
const CLI_OPTIONS = new Map([
  ["--raw-file", { key: "rawFile", keepCurrent: false }],
  ["--output-dir", { key: "outputDir", keepCurrent: false }],
  ["--case-matrix-file", { key: "caseMatrixFile", keepCurrent: true }],
  ["--metrics-file", { key: "metricsFile", keepCurrent: true }],
  ["--report-file", { key: "reportFile", keepCurrent: true }],
  ["--baseline-metrics-file", { key: "baselineMetricsFile", keepCurrent: false }]
]);

/** Substrings that map a raw (lower-cased) domain name onto each internal domain, in match order. */
const DOMAIN_NAME_HINTS = Object.freeze([
  ["settlements_60_62", ["settlement", "supplier", "customer", "bank"]],
  ["vat_document_register_book", ["vat", "nds"]],
  [
    "month_close_costs_20_44",
    ["period_close", "month_close", "deferred_expense", "fixed_asset", "close"]
  ]
]);

/**
 * Substrings whose presence in the reply marks the "first check" as relevant
 * for the expected domain.
 * NOTE(review): the bare numeric needles ("19", "20", "25", ...) match inside
 * any longer digit run (e.g. a year); kept as-is so results stay comparable
 * with previously produced metrics.
 */
const FIRST_CHECK_NEEDLES = new Map([
  [
    "settlements_60_62",
    ["договор", "объект расчет", "регистр расчет", "зачет аванс", "взаимозачет", "60/62/76"]
  ],
  [
    "vat_document_register_book",
    ["ндс", "счет-фактур", "книга покуп", "книга продаж", "регистр", "19"]
  ],
  [
    "month_close_costs_20_44",
    ["закрыти", "рбп", "амортизац", "косвен", "20", "25", "26", "44"]
  ]
]);

/** Boilerplate phrases; a reply containing two or more of them is treated as generic. */
const GENERIC_ANSWER_PATTERNS = Object.freeze([
  "коротко: проблема с закрытием расчета подтверждается частично",
  "сигнал проблемы есть, но механизм подтвержден не полностью",
  "вывод сделан по snapshot",
  "проверьте договор, объект расчетов, регистр расчетов",
  "проверьте договор и объект расчетов"
]);

/** Case-matrix columns, in output order; each name is also the property read from a case row. */
const CASE_MATRIX_COLUMNS = Object.freeze([
  "case_id",
  "question_short",
  "expected_domain",
  "actual_domain",
  "expected_question_type",
  "actual_question_type",
  "company_anchors_present",
  "company_anchors_used_in_answer",
  "evidence_strength",
  "answer_confidence_style",
  "first_check_relevance",
  "verdict",
  "failure_reason_short"
]);

/**
 * Parses command-line tokens into an options object.
 * Each recognised flag consumes the following token as its value; unrecognised
 * tokens are skipped.
 *
 * @param {string[]} argv - Arguments after the script path.
 * @returns {{rawFile: string, outputDir: string, caseMatrixFile: string,
 *   metricsFile: string, reportFile: string, baselineMetricsFile: string}}
 */
function parseArgs(argv) {
  const args = {
    rawFile: "",
    outputDir: "",
    caseMatrixFile: "wave13_chat20_case_matrix_updated.md",
    metricsFile: "wave13_chat20_metrics.json",
    reportFile: "wave13_regression_report.md",
    baselineMetricsFile: ""
  };
  for (let i = 0; i < argv.length; i += 1) {
    const option = CLI_OPTIONS.get(argv[i]);
    if (!option) {
      continue;
    }
    const fallback = option.keepCurrent ? args[option.key] : "";
    args[option.key] = String(argv[i + 1] ?? fallback);
    i += 1; // the value token has been consumed
  }
  return args;
}

/**
 * Creates a directory (and any missing parents) if it does not exist yet.
 * @param {string} dirPath
 */
function ensureDir(dirPath) {
  fs.mkdirSync(dirPath, { recursive: true });
}

/**
 * Reads a UTF-8 JSON file, tolerating a leading byte-order mark.
 *
 * @param {string} filePath
 * @returns {*} Parsed JSON value.
 * @throws {Error} When the file cannot be read, or (naming the file) when its
 *   content is not valid JSON.
 */
function readJson(filePath) {
  const raw = fs.readFileSync(filePath, "utf8").replace(/^\uFEFF/, "");
  try {
    return JSON.parse(raw);
  } catch (error) {
    // JSON.parse errors do not say which file was bad; add that context.
    throw new Error(`Failed to parse JSON from ${filePath}: ${error.message}`, { cause: error });
  }
}

/**
 * Writes text as UTF-8 prefixed with a byte-order mark, creating the parent
 * directory first.
 *
 * @param {string} filePath
 * @param {string} content
 */
function writeUtf8Bom(filePath, content) {
  ensureDir(path.dirname(filePath));
  fs.writeFileSync(filePath, `\uFEFF${content}`, "utf8");
}

/**
 * Converts any value to a string; null and undefined become "".
 * @param {*} value
 * @returns {string}
 */
function text(value) {
  return value == null ? "" : String(value);
}

/**
 * Lower-cased string form of a value (null/undefined become "").
 * @param {*} value
 * @returns {string}
 */
function lower(value) {
  return text(value).toLowerCase();
}

/**
 * Coerces a value to a finite number, falling back to 0.
 * @param {*} value
 * @returns {number}
 */
function toFiniteNumber(value) {
  const parsed = Number(value);
  return Number.isFinite(parsed) ? parsed : 0;
}

/**
 * Adds `amount` to the counter stored under `key`, starting from 0.
 * Only own properties count as existing counters, so keys that collide with
 * inherited members (e.g. "constructor") are tallied correctly.
 *
 * @param {Object<string, number>} counts - Counter map, mutated in place.
 * @param {string} key
 * @param {number} [amount=1]
 */
function bumpCount(counts, key, amount = 1) {
  const hasOwn = Object.prototype.hasOwnProperty.call(counts, key);
  const current = hasOwn ? Number(counts[key]) || 0 : 0;
  counts[key] = current + amount;
}

/**
 * Expected domain for a case, derived only from its position:
 * cases 1-8, 9-16, and everything after.
 *
 * @param {number} index - Zero-based row index.
 * @returns {string}
 */
function expectedDomainByIndex(index) {
  const caseNo = index + 1;
  if (caseNo <= 8) {
    return "settlements_60_62";
  }
  if (caseNo <= 16) {
    return "vat_document_register_book";
  }
  return "month_close_costs_20_44";
}

/**
 * Maps a raw domain name onto one of the internal domain ids by substring
 * match; the first matching group wins.
 *
 * @param {*} domainName
 * @returns {string} Internal domain id, or "unknown" when empty or unmatched.
 */
function normalizeInternalDomain(domainName) {
  const name = lower(domainName);
  if (!name) {
    return "unknown";
  }
  for (const [domain, hints] of DOMAIN_NAME_HINTS) {
    if (hints.some((hint) => name.includes(hint))) {
      return domain;
    }
  }
  return "unknown";
}

/**
 * Adds the counts from `source` into `target`. Entries whose value is not a
 * positive number contribute 1. Non-object sources are ignored.
 *
 * @param {Object<string, number>} target - Counter map, mutated in place.
 * @param {*} source - Object of name -> count.
 */
function mergeCountMap(target, source) {
  if (!source || typeof source !== "object") {
    return;
  }
  for (const [key, value] of Object.entries(source)) {
    const name = text(key);
    if (!name) {
      continue;
    }
    const count = Number(value) || 0;
    bumpCount(target, name, count > 0 ? count : 1);
  }
}

/**
 * Tallies a score per raw domain name from a row's debug payload:
 * the two distribution maps of each retrieval result, its domain card id
 * (weight 2), every graph_domain_scope entry of its items (weight 1), and the
 * investigation snapshot's active domain (weight 1).
 *
 * @param {object} row - Raw case row.
 * @returns {Object<string, number>} Null-prototype map of domain name -> score.
 */
function collectDomainScores(row) {
  const scores = Object.create(null);
  const retrieval = Array.isArray(row?.debug?.retrieval_results)
    ? row.debug.retrieval_results
    : [];

  for (const item of retrieval) {
    mergeCountMap(scores, item?.problem_unit_summary?.lifecycle_domain_distribution);
    mergeCountMap(scores, item?.problem_unit_summary?.graph_summary?.domain_distribution);

    const domainCard = text(item?.summary?.domain_purity_guard?.domain_card_id);
    if (domainCard) {
      bumpCount(scores, domainCard, 2);
    }

    const resultItems = Array.isArray(item?.items) ? item.items : [];
    for (const resultItem of resultItems) {
      const scopes = Array.isArray(resultItem?.graph_domain_scope)
        ? resultItem.graph_domain_scope
        : [];
      for (const scope of scopes) {
        const name = text(scope);
        if (name) {
          bumpCount(scores, name, 1);
        }
      }
    }
  }

  const activeDomain = text(row?.debug?.investigation_state_snapshot?.focus?.active_domain);
  if (activeDomain) {
    bumpCount(scores, activeDomain, 1);
  }
  return scores;
}

/**
 * Picks the highest-scoring raw domain (ties broken by name order) and
 * normalizes it to an internal domain id.
 * NOTE(review): the winner is chosen before normalization, so several raw
 * names that normalize to the same domain do not pool their scores — confirm
 * this is intended.
 *
 * @param {object} row - Raw case row.
 * @returns {string} Internal domain id, or "unknown".
 */
function pickActualDomain(row) {
  const ranked = Object.entries(collectDomainScores(row)).sort((a, b) => {
    if (b[1] !== a[1]) {
      return b[1] - a[1];
    }
    return String(a[0]).localeCompare(String(b[0]));
  });
  if (!ranked.length) {
    return "unknown";
  }
  return normalizeInternalDomain(ranked[0][0]);
}

/**
 * Question type recorded in the row's debug payload, or "unknown" when absent.
 * @param {object} row
 * @returns {string}
 */
function pickActualQuestionType(row) {
  return text(row?.debug?.question_type_class) || "unknown";
}

/**
 * Trimmed, non-empty company anchors from `row.debug.company_anchors.all`.
 * @param {object} row
 * @returns {string[]}
 */
function extractCompanyAnchors(row) {
  const all = row?.debug?.company_anchors?.all;
  if (!Array.isArray(all)) {
    return [];
  }
  return all.map((value) => text(value).trim()).filter(Boolean);
}

/**
 * Tells whether the reply uses the anchors: either it contains the fixed
 * anchor-usage marker phrase, or it contains (case-insensitively) any anchor
 * that is at least 3 characters long.
 *
 * @param {object} row
 * @param {string[]} anchors
 * @returns {boolean} Always false when there are no anchors or no reply.
 */
function hasAnchorUsageInAnswer(row, anchors) {
  if (!anchors.length) {
    return false;
  }
  const reply = lower(row?.assistant_reply);
  if (!reply) {
    return false;
  }
  if (reply.includes("в опоре использованы якоря вопроса")) {
    return true;
  }
  return anchors.some((anchor) => {
    const value = lower(anchor);
    return value.length >= 3 && reply.includes(value);
  });
}

/**
 * Translates the grounding-check status into an evidence-strength label.
 * @param {object} row
 * @returns {"strong"|"weak"|"none"|"limited"} "limited" for any other status.
 */
function evaluateEvidenceStrength(row) {
  switch (lower(row?.debug?.answer_grounding_check?.status)) {
    case "grounded":
      return "strong";
    case "partial":
      return "weak";
    case "no_grounded_answer":
      return "none";
    default:
      return "limited";
  }
}

/**
 * Classifies the reply's tone by the presence of limitation and confidence
 * phrases.
 *
 * @param {object} row
 * @returns {"unknown"|"mixed"|"limited"|"confident"|"neutral"} "unknown" when
 *   the reply is empty.
 */
function evaluateConfidenceStyle(row) {
  const reply = lower(row?.assistant_reply);
  if (!reply) {
    return "unknown";
  }
  const hasLimitation = ["ограничени", "частично", "низкая", "не подтвержден"].some((phrase) =>
    reply.includes(phrase)
  );
  const hasConfident = ["подтверждено", "доказ", "подтверждается"].some((phrase) =>
    reply.includes(phrase)
  );
  if (hasLimitation && hasConfident) {
    return "mixed";
  }
  if (hasLimitation) {
    return "limited";
  }
  if (hasConfident) {
    return "confident";
  }
  return "neutral";
}

/**
 * Case-insensitive check whether the text contains any of the needles.
 * @param {*} textValue
 * @param {string[]} needles
 * @returns {boolean}
 */
function containsAny(textValue, needles) {
  const body = lower(textValue);
  return needles.some((needle) => body.includes(lower(needle)));
}

/**
 * Tells whether the reply mentions at least one keyword associated with the
 * expected domain.
 *
 * @param {object} row
 * @param {string} expectedDomain
 * @returns {boolean} False for an empty reply or a domain without keywords.
 */
function evaluateFirstCheckRelevance(row, expectedDomain) {
  const reply = text(row?.assistant_reply);
  if (!reply) {
    return false;
  }
  const needles = FIRST_CHECK_NEEDLES.get(expectedDomain);
  return needles ? containsAny(reply, needles) : false;
}

/**
 * Flags a reply as generic when it is empty or contains at least two of the
 * known boilerplate phrases.
 *
 * @param {object} row
 * @returns {boolean}
 */
function evaluateGenericAnswer(row) {
  const reply = lower(row?.assistant_reply);
  if (!reply) {
    return true;
  }
  const hits = GENERIC_ANSWER_PATTERNS.filter((pattern) => reply.includes(pattern)).length;
  return hits >= 2;
}

/**
 * Collapses whitespace and truncates to `maxLength` characters, ending a
 * truncated string with "...".
 *
 * @param {*} value
 * @param {number} [maxLength=130]
 * @returns {string}
 */
function shortQuestion(value, maxLength = 130) {
  const question = text(value).replace(/\s+/g, " ").trim();
  if (question.length <= maxLength) {
    return question;
  }
  if (maxLength < 3) {
    // No room for the ellipsis; a negative slice end would return too much.
    return question.slice(0, Math.max(0, maxLength));
  }
  return `${question.slice(0, maxLength - 3)}...`;
}

/**
 * Rounds a number to 4 decimal places. Despite the name, the value is not
 * scaled: callers pass and receive fractions (and differences of fractions).
 *
 * @param {number} value
 * @returns {number}
 */
function toPercent(value) {
  return Number(value.toFixed(4));
}

/**
 * Evaluates one raw row into a case-matrix record with a verdict.
 * Verdict: PASS with no failure reasons; FAIL when the domain or the first
 * check is wrong, or when there are three or more reasons; otherwise SOFT_PASS.
 *
 * @param {number} index - Zero-based row index (drives the expectations).
 * @param {object} row - Raw case row.
 * @returns {object} Case record.
 */
function buildCaseRow(index, row) {
  const expectedDomain = expectedDomainByIndex(index);
  const actualDomain = pickActualDomain(row);
  const expectedQuestionType = EXPECTED_QUESTION_TYPES[index] || "unknown";
  const actualQuestionType = pickActualQuestionType(row);
  const anchors = extractCompanyAnchors(row);
  const anchorsPresent = anchors.length > 0;
  const anchorsUsed = hasAnchorUsageInAnswer(row, anchors);
  const evidenceStrength = evaluateEvidenceStrength(row);
  const confidenceStyle = evaluateConfidenceStyle(row);
  const firstCheckRelevant = evaluateFirstCheckRelevance(row, expectedDomain);
  const genericAnswer = evaluateGenericAnswer(row);

  const checks = [
    ["wrong_domain", actualDomain !== expectedDomain],
    ["wrong_question_type", actualQuestionType !== expectedQuestionType],
    ["weak_company_anchor_usage", anchorsPresent && !anchorsUsed],
    ["wrong_first_check", !firstCheckRelevant],
    ["generic_answer", genericAnswer]
  ];
  const reasons = checks.filter(([, failed]) => failed).map(([reason]) => reason);

  let verdict = "PASS";
  if (reasons.length > 0) {
    const hardFail = reasons.includes("wrong_domain") || reasons.includes("wrong_first_check");
    verdict = hardFail || reasons.length >= 3 ? "FAIL" : "SOFT_PASS";
  }

  return {
    case_id: text(row?.case_id) || `q${String(index + 1).padStart(2, "0")}`,
    question_short: shortQuestion(row?.user_message),
    expected_domain: expectedDomain,
    actual_domain: actualDomain,
    expected_question_type: expectedQuestionType,
    actual_question_type: actualQuestionType,
    company_anchors_present: anchorsPresent,
    company_anchors_used_in_answer: anchorsUsed,
    evidence_strength: evidenceStrength,
    answer_confidence_style: confidenceStyle,
    first_check_relevance: firstCheckRelevant,
    verdict,
    failure_reason_short: reasons.length ? reasons.join(", ") : "none",
    is_generic_answer: genericAnswer,
    failure_reasons: reasons
  };
}

/**
 * Makes a value safe for a Markdown table cell: line breaks become spaces
 * (a raw newline would end the table row) and pipes are escaped.
 *
 * @param {*} value
 * @returns {string}
 */
function markdownCell(value) {
  return text(value)
    .replace(/\r\n|\r|\n/g, " ")
    .replace(/\|/g, "\\|");
}

/**
 * Renders the case records as a Markdown document with a single table.
 * @param {object[]} rows - Case records from buildCaseRow.
 * @returns {string}
 */
function buildCaseMatrixMarkdown(rows) {
  const lines = [
    "# Wave 13 Chat20 Case Matrix (Updated)",
    "",
    `| ${CASE_MATRIX_COLUMNS.join(" | ")} |`,
    `|${CASE_MATRIX_COLUMNS.map(() => "---").join("|")}|`
  ];
  for (const row of rows) {
    const cells = CASE_MATRIX_COLUMNS.map((column) => markdownCell(row[column]));
    lines.push(`| ${cells.join(" | ")} |`);
  }
  lines.push("");
  return `${lines.join("\n")}\n`;
}

/**
 * Counts rows per key produced by `selector`.
 *
 * @param {object[]} rows
 * @param {(row: object) => string} selector
 * @returns {Object<string, number>} Null-prototype map of key -> count.
 */
function countBy(rows, selector) {
  const result = Object.create(null);
  for (const row of rows) {
    bumpCount(result, selector(row), 1);
  }
  return result;
}

/**
 * Renders the regression report: verdict totals, metric snapshot, optional
 * delta against a baseline, the five most frequent failure reasons among
 * non-PASS cases, and the list of FAIL cases.
 *
 * @param {object[]} rows - Case records from buildCaseRow.
 * @param {object} metrics - Metrics object for the current run.
 * @param {object|null} baselineMetrics - Baseline metrics, or null to omit the delta section.
 * @returns {string} Markdown text.
 */
function buildRegressionReport(rows, metrics, baselineMetrics) {
  const lines = [
    "# Wave 13 Regression Report",
    "",
    `- Cases: ${rows.length}`,
    `- PASS: ${metrics.totals.pass}`,
    `- SOFT_PASS: ${metrics.totals.soft_pass}`,
    `- FAIL: ${metrics.totals.fail}`,
    "",
    "## Metric Snapshot",
    `- domain_correctness_rate: ${metrics.domain_correctness_rate}`,
    `- question_type_fit_rate: ${metrics.question_type_fit_rate}`,
    `- company_anchor_usage_rate: ${metrics.company_anchor_usage_rate}`,
    `- generic_answer_rate: ${metrics.generic_answer_rate}`,
    `- first_check_relevance_rate: ${metrics.first_check_relevance_rate}`,
    ""
  ];

  if (baselineMetrics) {
    lines.push("## Delta vs Baseline");
    for (const key of COMPARABLE_METRIC_KEYS) {
      const current = toFiniteNumber(metrics[key]);
      const baseline = toFiniteNumber(baselineMetrics[key]);
      const delta = Number((current - baseline).toFixed(4));
      lines.push(`- ${key}: ${baseline} -> ${current} (delta ${delta >= 0 ? "+" : ""}${delta})`);
    }
    lines.push("");
  }

  const reasonCounts = Object.create(null);
  for (const row of rows) {
    if (row.verdict === "PASS") {
      continue;
    }
    for (const reason of row.failure_reasons) {
      bumpCount(reasonCounts, reason, 1);
    }
  }
  const topReasons = Object.entries(reasonCounts)
    .sort((a, b) => b[1] - a[1])
    .slice(0, 5);

  lines.push("## Top Defects");
  if (!topReasons.length) {
    lines.push("- No defects detected.");
  } else {
    for (const [reason, count] of topReasons) {
      lines.push(`- ${reason}: ${count}`);
    }
  }
  lines.push("");

  lines.push("## FAIL Cases");
  const failed = rows.filter((row) => row.verdict === "FAIL");
  if (!failed.length) {
    lines.push("- None.");
  } else {
    for (const row of failed) {
      lines.push(`- ${row.case_id}: ${row.failure_reason_short}`);
    }
  }
  lines.push("");
  return `${lines.join("\n")}\n`;
}

/**
 * Aggregates case records into the metrics object written to disk.
 *
 * @param {object[]} caseRows - Non-empty list of case records.
 * @param {object} raw - Parsed raw file (only `session_id` is read here).
 * @param {string} outputDir - Resolved output directory; its base name becomes run_id.
 * @returns {object}
 */
function computeMetrics(caseRows, raw, outputDir) {
  const total = caseRows.length;
  const countWhere = (predicate) => caseRows.filter(predicate).length;

  const totalsByVerdict = countBy(caseRows, (row) => row.verdict);
  const domainCorrect = countWhere((row) => row.expected_domain === row.actual_domain);
  const qTypeFit = countWhere((row) => row.expected_question_type === row.actual_question_type);
  const anchorsPresentCount = countWhere((row) => row.company_anchors_present);
  const anchorsUsedCount = countWhere(
    (row) => row.company_anchors_present && row.company_anchors_used_in_answer
  );
  const genericCount = countWhere((row) => row.is_generic_answer);
  const firstCheckRelevantCount = countWhere((row) => row.first_check_relevance);

  return {
    schema_version: "wave13_chat20_metrics_v2",
    run_id: path.basename(outputDir),
    source_session_id: text(raw?.session_id),
    totals: {
      cases: total,
      pass: totalsByVerdict.PASS || 0,
      soft_pass: totalsByVerdict.SOFT_PASS || 0,
      fail: totalsByVerdict.FAIL || 0
    },
    domain_correctness_rate: toPercent(domainCorrect / total),
    question_type_fit_rate: toPercent(qTypeFit / total),
    // Relative to the cases that actually had anchors; 0 when none did.
    company_anchor_usage_rate: toPercent(
      anchorsPresentCount > 0 ? anchorsUsedCount / anchorsPresentCount : 0
    ),
    company_anchor_usage_rate_global: toPercent(anchorsUsedCount / total),
    generic_answer_rate: toPercent(genericCount / total),
    first_check_relevance_rate: toPercent(firstCheckRelevantCount / total),
    anchors_present_cases: anchorsPresentCount,
    anchors_used_cases: anchorsUsedCount
  };
}

/**
 * Loads baseline metrics and records the reference, the baseline values and
 * the per-metric deltas on `metrics`. A missing file is reported on stderr
 * and skipped.
 *
 * @param {object} metrics - Current metrics, mutated in place.
 * @param {string} baselineFile - Path as given on the command line.
 * @returns {object|null} Parsed baseline metrics, or null when the file does not exist.
 * @throws {Error} When the baseline file does not contain a JSON object.
 */
function attachBaselineComparison(metrics, baselineFile) {
  const baselinePath = path.resolve(baselineFile);
  if (!fs.existsSync(baselinePath)) {
    process.stderr.write(
      `warning: baseline metrics file not found, skipping comparison: ${baselinePath}\n`
    );
    return null;
  }

  const baselineMetrics = readJson(baselinePath);
  if (!baselineMetrics || typeof baselineMetrics !== "object") {
    throw new Error(`Baseline metrics file does not contain a JSON object: ${baselinePath}`);
  }

  metrics.baseline_reference = path.basename(baselinePath);
  metrics.baseline_metrics = Object.fromEntries(
    COMPARABLE_METRIC_KEYS.map((key) => [key, baselineMetrics[key]])
  );
  metrics.delta_vs_baseline = Object.fromEntries(
    COMPARABLE_METRIC_KEYS.map((key) => [
      `${key}_delta`,
      toPercent(toFiniteNumber(metrics[key]) - toFiniteNumber(baselineMetrics[key]))
    ])
  );
  return baselineMetrics;
}

/**
 * Entry point: reads the raw rows, evaluates each case, and writes the case
 * matrix, the metrics JSON and the regression report into the output
 * directory, then prints the produced paths to stdout.
 *
 * @returns {Promise<void>}
 * @throws {Error} When a required argument is missing or the raw file has no rows.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  if (!args.rawFile) {
    throw new Error("Missing required argument --raw-file");
  }
  if (!args.outputDir) {
    throw new Error("Missing required argument --output-dir");
  }

  const rawPath = path.resolve(args.rawFile);
  const outputDir = path.resolve(args.outputDir);
  const raw = readJson(rawPath);
  const rows = Array.isArray(raw?.rows) ? raw.rows : [];
  if (rows.length === 0) {
    throw new Error("Raw file contains no rows.");
  }
  if (rows.length !== EXPECTED_QUESTION_TYPES.length) {
    // Expectations are positional; a different row count skews the comparison.
    process.stderr.write(
      `warning: expected ${EXPECTED_QUESTION_TYPES.length} rows but got ${rows.length}; ` +
        "positional expectations may not line up\n"
    );
  }

  const caseRows = rows.map((row, index) => buildCaseRow(index, row));
  const metrics = computeMetrics(caseRows, raw, outputDir);
  const baselineMetrics = args.baselineMetricsFile
    ? attachBaselineComparison(metrics, args.baselineMetricsFile)
    : null;

  const matrixPath = path.join(outputDir, args.caseMatrixFile);
  const metricsPath = path.join(outputDir, args.metricsFile);
  const reportPath = path.join(outputDir, args.reportFile);

  writeUtf8Bom(matrixPath, buildCaseMatrixMarkdown(caseRows));
  writeUtf8Bom(metricsPath, `${JSON.stringify(metrics, null, 2)}\n`);
  writeUtf8Bom(reportPath, buildRegressionReport(caseRows, metrics, baselineMetrics));

  process.stdout.write(
    [
      `rows=${caseRows.length}`,
      `matrix=${matrixPath}`,
      `metrics=${metricsPath}`,
      `report=${reportPath}`
    ].join("\n")
  );
}

main().catch((error) => {
  process.stderr.write(`${error?.stack || error}\n`);
  process.exitCode = 1;
});