#!/usr/bin/env node
|
||
|
||
const fs = require("node:fs");
|
||
const path = require("node:path");
|
||
|
||
/**
 * Expected question-type label for each regression case, indexed by the
 * zero-based position of the case in the raw file (entry 0 is the first row).
 * buildCaseRow falls back to "unknown" for rows past the end of this list.
 *
 * Frozen so the shared expectation table cannot be mutated by accident.
 */
const EXPECTED_QUESTION_TYPES = Object.freeze([
  "why_breaks",
  "prove_or_guess",
  "prove_or_guess",
  "why_breaks",
  "where_break_is",
  "prove_or_guess",
  "why_breaks",
  "which_chains_are_complete_vs_incomplete",
  "which_chains_are_complete_vs_incomplete",
  "prove_or_guess",
  "why_breaks",
  "prove_or_guess",
  "why_breaks",
  "what_is_it_grounded_on",
  "why_breaks",
  "which_chains_are_complete_vs_incomplete",
  "prove_or_guess",
  "what_is_it_grounded_on",
  "why_breaks",
  "prove_or_guess"
]);
|
||
|
||
/**
 * Parses the command-line flags of this script into an options object.
 *
 * Recognised flags, each taking the following token as its value:
 * --raw-file, --output-dir, --case-matrix-file, --metrics-file,
 * --report-file and --baseline-metrics-file. Unrecognised tokens are ignored.
 *
 * A flag that is the last token, or that is directly followed by another
 * recognised flag, is treated as having no value: the option keeps its
 * current value and the following flag is still parsed normally instead of
 * being swallowed as a value.
 *
 * @param {string[]} argv - Arguments after the script path.
 * @returns {{rawFile: string, outputDir: string, caseMatrixFile: string,
 *   metricsFile: string, reportFile: string, baselineMetricsFile: string}}
 *   Parsed options; file-name options default to the wave13 names below.
 */
function parseArgs(argv) {
  const args = {
    rawFile: "",
    outputDir: "",
    caseMatrixFile: "wave13_chat20_case_matrix_updated.md",
    metricsFile: "wave13_chat20_metrics.json",
    reportFile: "wave13_regression_report.md",
    baselineMetricsFile: ""
  };

  const fieldByFlag = new Map([
    ["--raw-file", "rawFile"],
    ["--output-dir", "outputDir"],
    ["--case-matrix-file", "caseMatrixFile"],
    ["--metrics-file", "metricsFile"],
    ["--report-file", "reportFile"],
    ["--baseline-metrics-file", "baselineMetricsFile"]
  ]);

  for (let i = 0; i < argv.length; i += 1) {
    const field = fieldByFlag.get(argv[i]);
    if (!field) {
      continue;
    }
    const next = argv[i + 1];
    // No value available: do not consume the next flag as if it were one.
    if (next == null || fieldByFlag.has(next)) {
      continue;
    }
    args[field] = String(next);
    i += 1;
  }

  return args;
}
|
||
|
||
/**
 * Creates a directory together with any missing parent directories.
 * With the recursive option an already existing directory is not an error.
 *
 * @param {string} dirPath - Directory to create.
 * @returns {void}
 */
function ensureDir(dirPath) {
  fs.mkdirSync(dirPath, { recursive: true });
}
|
||
|
||
/**
 * Reads a UTF-8 file and parses it as JSON.
 *
 * @param {string} filePath - File to read.
 * @returns {*} The parsed JSON value.
 * @throws {SyntaxError} When the content is not valid JSON; the message names
 *   the offending file and the original parser error is attached as `cause`.
 * @throws {Error} Whatever fs.readFileSync throws when the file is unreadable.
 */
function readJson(filePath) {
  // Strip a leading byte-order mark: JSON.parse rejects it.
  const raw = fs.readFileSync(filePath, "utf8").replace(/^\uFEFF/, "");
  try {
    return JSON.parse(raw);
  } catch (error) {
    throw new SyntaxError(`Invalid JSON in ${filePath}: ${error.message}`, { cause: error });
  }
}
|
||
|
||
/**
 * Writes text to a file as UTF-8 with a leading byte-order mark, creating the
 * parent directory first when it does not exist.
 *
 * @param {string} filePath - Destination file.
 * @param {string} content - Text to write; coerced to a string.
 * @returns {void}
 */
function writeUtf8Bom(filePath, content) {
  ensureDir(path.dirname(filePath));
  const body = String(content);
  // Content that already carries a BOM must not end up with two of them.
  const payload = body.startsWith("\uFEFF") ? body : `\uFEFF${body}`;
  fs.writeFileSync(filePath, payload, "utf8");
}
|
||
|
||
/**
 * Converts any value to a string, mapping null and undefined to "".
 *
 * @param {*} value - Value to convert.
 * @returns {string} String(value), or "" for null/undefined.
 */
function text(value) {
  if (value === null || value === undefined) {
    return "";
  }
  return String(value);
}
|
||
|
||
/**
 * Lower-cased string form of a value; null and undefined become "".
 *
 * @param {*} value - Value to convert.
 * @returns {string} text(value) in lower case.
 */
function lower(value) {
  const asString = text(value);
  return asString.toLowerCase();
}
|
||
|
||
/**
 * Returns the domain a case is expected to belong to, based purely on its
 * position in the run: cases 1-8 map to settlements, cases 9-16 to VAT and
 * every later case to month close.
 *
 * @param {number} index - Zero-based case position.
 * @returns {string} One of "settlements_60_62",
 *   "vat_document_register_book" or "month_close_costs_20_44".
 */
function expectedDomainByIndex(index) {
  const caseNo = index + 1; // case numbers are one-based
  if (caseNo <= 8) {
    return "settlements_60_62";
  }
  if (caseNo <= 16) {
    return "vat_document_register_book";
  }
  return "month_close_costs_20_44";
}
|
||
|
||
/**
 * Maps a raw domain name onto one of the three evaluation domains by
 * case-insensitive keyword matching.
 *
 * Rules are tried in order and the first rule with a matching keyword wins,
 * so a name containing both a settlement keyword and a VAT keyword resolves
 * to settlements.
 *
 * @param {*} domainName - Raw domain identifier.
 * @returns {string} "settlements_60_62", "vat_document_register_book",
 *   "month_close_costs_20_44", or "unknown" when the name is empty or no
 *   keyword matches.
 */
function normalizeInternalDomain(domainName) {
  const d = lower(domainName);
  if (!d) {
    return "unknown";
  }

  const rules = [
    ["settlements_60_62", ["settlement", "supplier", "customer", "bank"]],
    ["vat_document_register_book", ["vat", "nds"]],
    // "close" also covers the "period_close" and "month_close" spellings.
    ["month_close_costs_20_44", ["deferred_expense", "fixed_asset", "close"]]
  ];

  for (const [domain, keywords] of rules) {
    if (keywords.some((keyword) => d.includes(keyword))) {
      return domain;
    }
  }
  return "unknown";
}
|
||
|
||
/**
 * Adds the counts of `source` into `target`, key by key, mutating `target`.
 *
 * A source value that is a positive number (or numeric string) is added as
 * is; any other value (zero, negative, non-numeric) contributes 1. Entries
 * with an empty key are skipped, and a missing or non-object `source` is a
 * no-op.
 *
 * @param {Object<string, number>} target - Accumulator, modified in place.
 * @param {*} source - Map of name to count to merge in.
 * @returns {void}
 */
function mergeCountMap(target, source) {
  if (!source || typeof source !== "object") {
    return;
  }
  // Object.entries always yields string keys, so no conversion is needed.
  for (const [name, value] of Object.entries(source)) {
    if (!name) {
      continue;
    }
    const parsed = Number(value) || 0;
    const increment = parsed > 0 ? parsed : 1;
    // Own-property check: a key such as "toString" must start from 0 rather
    // than from the function inherited from Object.prototype.
    const current = Object.prototype.hasOwnProperty.call(target, name)
      ? Number(target[name]) || 0
      : 0;
    target[name] = current + increment;
  }
}
|
||
|
||
/**
 * Builds a score per raw domain name from the debug payload of one case.
 *
 * Contributions, per entry of `row.debug.retrieval_results`:
 *  - counts from `problem_unit_summary.lifecycle_domain_distribution` and
 *    `problem_unit_summary.graph_summary.domain_distribution` (merged with
 *    mergeCountMap);
 *  - +2 for `summary.domain_purity_guard.domain_card_id`;
 *  - +1 for every name in each item's `graph_domain_scope`.
 * Finally +1 for `row.debug.investigation_state_snapshot.focus.active_domain`.
 * Missing or malformed parts of the payload are skipped.
 *
 * @param {*} row - Raw case row.
 * @returns {Object<string, number>} Score per raw (non-normalised) name.
 */
function collectDomainScores(row) {
  const scores = {};

  // Adds `weight` to a name; the own-property check keeps names such as
  // "constructor" from picking up values inherited from Object.prototype.
  const addScore = (rawName, weight) => {
    const name = text(rawName);
    if (!name) {
      return;
    }
    const current = Object.prototype.hasOwnProperty.call(scores, name)
      ? Number(scores[name]) || 0
      : 0;
    scores[name] = current + weight;
  };

  const retrieval = Array.isArray(row?.debug?.retrieval_results)
    ? row.debug.retrieval_results
    : [];

  for (const item of retrieval) {
    mergeCountMap(scores, item?.problem_unit_summary?.lifecycle_domain_distribution);
    mergeCountMap(scores, item?.problem_unit_summary?.graph_summary?.domain_distribution);
    addScore(item?.summary?.domain_purity_guard?.domain_card_id, 2);

    const resultItems = Array.isArray(item?.items) ? item.items : [];
    for (const resultItem of resultItems) {
      const scopes = Array.isArray(resultItem?.graph_domain_scope)
        ? resultItem.graph_domain_scope
        : [];
      for (const scope of scopes) {
        addScore(scope, 1);
      }
    }
  }

  addScore(row?.debug?.investigation_state_snapshot?.focus?.active_domain, 1);
  return scores;
}
|
||
|
||
/**
 * Determines the domain a case actually landed in: the highest-scoring raw
 * domain name from collectDomainScores, normalised to an evaluation domain.
 * Ties are broken by name using localeCompare.
 *
 * NOTE(review): names are ranked before normalisation, so several raw names
 * that normalise to the same domain do not pool their scores — confirm this
 * is intended.
 *
 * @param {*} row - Raw case row.
 * @returns {string} Normalised domain, or "unknown" when nothing was scored.
 */
function pickActualDomain(row) {
  const ranked = Object.entries(collectDomainScores(row)).sort(
    ([nameA, scoreA], [nameB, scoreB]) => {
      if (scoreA !== scoreB) {
        return scoreB - scoreA; // higher score first
      }
      return String(nameA).localeCompare(String(nameB));
    }
  );

  if (ranked.length === 0) {
    return "unknown";
  }
  const [topName] = ranked[0];
  return normalizeInternalDomain(topName);
}
|
||
|
||
/**
 * Reads the question type recorded for a case in
 * `row.debug.question_type_class`.
 *
 * @param {*} row - Raw case row.
 * @returns {string} The recorded class, or "unknown" when absent or empty.
 */
function pickActualQuestionType(row) {
  return text(row?.debug?.question_type_class) || "unknown";
}
|
||
|
||
/**
 * Returns the company anchors listed in `row.debug.company_anchors.all` as
 * trimmed, non-empty strings.
 *
 * @param {*} row - Raw case row.
 * @returns {string[]} Anchors; empty when the list is missing or not an array.
 */
function extractCompanyAnchors(row) {
  const all = row?.debug?.company_anchors?.all;
  if (!Array.isArray(all)) {
    return [];
  }
  const anchors = [];
  for (const entry of all) {
    const trimmed = text(entry).trim();
    if (trimmed) {
      anchors.push(trimmed);
    }
  }
  return anchors;
}
|
||
|
||
/**
 * Tells whether the assistant reply makes use of the company anchors.
 *
 * True when the lower-cased reply contains the fixed marker phrase below, or
 * contains any anchor (case-insensitive) that is at least 3 characters long.
 *
 * @param {*} row - Raw case row; `row.assistant_reply` is inspected.
 * @param {string[]} anchors - Anchors extracted for the case.
 * @returns {boolean} False when there are no anchors or no reply.
 */
function hasAnchorUsageInAnswer(row, anchors) {
  if (!Array.isArray(anchors) || anchors.length === 0) {
    return false;
  }
  const reply = lower(row?.assistant_reply);
  if (!reply) {
    return false;
  }
  // Marker phrase: "the question's anchors were used as support".
  if (reply.includes("в опоре использованы якоря вопроса")) {
    return true;
  }
  return anchors.some((anchor) => {
    const value = lower(anchor);
    // Anchors shorter than 3 characters are too ambiguous to count.
    return value.length >= 3 && reply.includes(value);
  });
}
|
||
|
||
/**
 * Translates `row.debug.answer_grounding_check.status` (case-insensitive)
 * into an evidence-strength label.
 *
 * @param {*} row - Raw case row.
 * @returns {string} "strong" for "grounded", "weak" for "partial", "none"
 *   for "no_grounded_answer", and "limited" for anything else, including a
 *   missing status.
 */
function evaluateEvidenceStrength(row) {
  switch (lower(row?.debug?.answer_grounding_check?.status)) {
    case "grounded":
      return "strong";
    case "partial":
      return "weak";
    case "no_grounded_answer":
      return "none";
    default:
      return "limited";
  }
}
|
||
|
||
/**
 * Classifies the tone of the assistant reply from Russian marker words.
 *
 * Limitation markers: "ограничени" (limitation), "частично" (partially),
 * "низкая" (low), "не подтвержден" (not confirmed), plus any negated
 * confirmation or proof ("не подтвержд…", "не доказ…").
 * Confident markers: "подтверждено" (confirmed), "доказ" (prove/proof),
 * "подтверждается" (is confirmed).
 *
 * Negated forms are removed before looking for confident markers. Without
 * that step "не подтверждается" would be classified as "confident" and
 * "не подтверждено" as "mixed", because the confident markers are substrings
 * of the negated phrases.
 *
 * @param {*} row - Raw case row; `row.assistant_reply` is inspected.
 * @returns {string} "unknown" for an empty reply, otherwise "mixed",
 *   "limited", "confident" or "neutral".
 */
function evaluateConfidenceStyle(row) {
  const reply = lower(row?.assistant_reply);
  if (!reply) {
    return "unknown";
  }

  // "не" as a separate word followed by a confirmation/proof stem.
  const negatedClaim = /(?<![а-яё])не\s+(?:подтвержд|доказ)[а-яё]*/g;
  const affirmativeOnly = reply.replace(negatedClaim, " ");
  const hasNegatedClaim = affirmativeOnly !== reply;

  const limitationMarkers = ["ограничени", "частично", "низкая", "не подтвержден"];
  const confidentMarkers = ["подтверждено", "доказ", "подтверждается"];

  const hasLimitation =
    hasNegatedClaim || limitationMarkers.some((marker) => reply.includes(marker));
  const hasConfident = confidentMarkers.some((marker) => affirmativeOnly.includes(marker));

  if (hasLimitation) {
    return hasConfident ? "mixed" : "limited";
  }
  return hasConfident ? "confident" : "neutral";
}
|
||
|
||
/**
 * Case-insensitive check whether a text contains at least one of the needles.
 * Empty needles (including null/undefined) are ignored; they would otherwise
 * match every text.
 *
 * @param {*} textValue - Text to search in.
 * @param {Array<*>} needles - Substrings to look for.
 * @returns {boolean} True when any non-empty needle occurs in the text.
 */
function containsAny(textValue, needles) {
  const body = lower(textValue);
  return needles.some((needle) => {
    const wanted = lower(needle);
    return wanted !== "" && body.includes(wanted);
  });
}
|
||
|
||
/**
 * Checks whether the assistant reply mentions something relevant to the
 * expected domain: one of the domain's marker phrases (case-insensitive
 * substring) or one of its ledger account numbers.
 *
 * Account numbers must appear as standalone numbers, i.e. not adjacent to
 * another digit. A plain substring test would let "20" match inside a year
 * such as "2024" and make the month-close check pass on any dated reply.
 *
 * @param {*} row - Raw case row; `row.assistant_reply` is inspected.
 * @param {string} expectedDomain - Evaluation domain of the case.
 * @returns {boolean} False for an empty reply or an unrecognised domain.
 */
function evaluateFirstCheckRelevance(row, expectedDomain) {
  const reply = text(row?.assistant_reply);
  if (!reply) {
    return false;
  }

  const markersByDomain = {
    settlements_60_62: {
      phrases: [
        "договор",
        "объект расчет",
        "регистр расчет",
        "зачет аванс",
        "взаимозачет",
        "60/62/76"
      ],
      accounts: []
    },
    vat_document_register_book: {
      phrases: ["ндс", "счет-фактур", "книга покуп", "книга продаж", "регистр"],
      accounts: ["19"]
    },
    month_close_costs_20_44: {
      phrases: ["закрыти", "рбп", "амортизац", "косвен"],
      accounts: ["20", "25", "26", "44"]
    }
  };

  if (!Object.prototype.hasOwnProperty.call(markersByDomain, expectedDomain)) {
    return false;
  }
  const { phrases, accounts } = markersByDomain[expectedDomain];
  if (containsAny(reply, phrases)) {
    return true;
  }
  return accounts.some((account) => new RegExp(`(?<!\\d)${account}(?!\\d)`).test(reply));
}
|
||
|
||
/**
 * Flags replies that look like boilerplate: an empty reply, or a reply whose
 * lower-cased text contains at least two of the stock phrases listed below.
 *
 * @param {*} row - Raw case row; `row.assistant_reply` is inspected.
 * @returns {boolean} True when the reply is considered generic.
 */
function evaluateGenericAnswer(row) {
  const reply = lower(row?.assistant_reply);
  if (!reply) {
    return true;
  }

  const stockPhrases = [
    "коротко: проблема с закрытием расчета подтверждается частично",
    "сигнал проблемы есть, но механизм подтвержден не полностью",
    "вывод сделан по snapshot",
    "проверьте договор, объект расчетов, регистр расчетов",
    "проверьте договор и объект расчетов"
  ];

  let hits = 0;
  for (const phrase of stockPhrases) {
    if (reply.includes(phrase)) {
      hits += 1;
    }
  }
  return hits >= 2;
}
|
||
|
||
/**
 * Produces a single-line, length-limited version of a question: whitespace
 * runs are collapsed to one space, the text is trimmed, and anything longer
 * than `maxLength` is cut and ends with "...".
 *
 * @param {*} value - Question text.
 * @param {number} [maxLength=130] - Maximum length of the result.
 * @returns {string} Shortened text, never longer than `maxLength`.
 */
function shortQuestion(value, maxLength = 130) {
  const q = text(value).replace(/\s+/g, " ").trim();
  if (q.length <= maxLength) {
    return q;
  }
  // No room for the ellipsis: a hard cut keeps the result within the limit.
  if (maxLength <= 3) {
    return q.slice(0, Math.max(0, maxLength));
  }
  return `${q.slice(0, maxLength - 3)}...`;
}
|
||
|
||
/**
 * Rounds a ratio to four decimal places.
 *
 * Despite the name the value is not multiplied by 100: 1/3 becomes 0.3333.
 * The input is coerced with Number(), so numeric strings are accepted; NaN
 * stays NaN.
 *
 * @param {number|string} value - Ratio to round.
 * @returns {number} The value rounded to 4 decimals.
 */
function toPercent(value) {
  const rounded = Number(value).toFixed(4);
  return Number(rounded);
}
|
||
|
||
/**
 * Evaluates one raw case and produces its row for the case matrix.
 *
 * Failure reasons are collected in a fixed order: wrong_domain,
 * wrong_question_type, weak_company_anchor_usage (anchors exist but the reply
 * does not use them), wrong_first_check, generic_answer.
 *
 * Verdict: "PASS" with no reasons; "FAIL" when the domain or the first check
 * is wrong, or when there are three or more reasons; otherwise "SOFT_PASS".
 *
 * @param {number} index - Zero-based position of the case in the run.
 * @param {*} row - Raw case row.
 * @returns {Object} Matrix row: expected/actual values, evaluation flags,
 *   `verdict`, `failure_reasons` and their comma-joined form
 *   `failure_reason_short` ("none" when empty).
 */
function buildCaseRow(index, row) {
  const expectedDomain = expectedDomainByIndex(index);
  const actualDomain = pickActualDomain(row);
  const expectedQuestionType = EXPECTED_QUESTION_TYPES[index] || "unknown";
  const actualQuestionType = pickActualQuestionType(row);

  const anchors = extractCompanyAnchors(row);
  const anchorsPresent = anchors.length > 0;
  const anchorsUsed = hasAnchorUsageInAnswer(row, anchors);

  const evidenceStrength = evaluateEvidenceStrength(row);
  const confidenceStyle = evaluateConfidenceStyle(row);
  const firstCheckRelevant = evaluateFirstCheckRelevance(row, expectedDomain);
  const genericAnswer = evaluateGenericAnswer(row);

  // [triggered, reason] pairs; the order defines the order in the output.
  const checks = [
    [actualDomain !== expectedDomain, "wrong_domain"],
    [actualQuestionType !== expectedQuestionType, "wrong_question_type"],
    [anchorsPresent && !anchorsUsed, "weak_company_anchor_usage"],
    [!firstCheckRelevant, "wrong_first_check"],
    [genericAnswer, "generic_answer"]
  ];
  const reasons = checks.filter(([triggered]) => triggered).map(([, reason]) => reason);

  let verdict = "PASS";
  if (reasons.length > 0) {
    const hardFail = reasons.includes("wrong_domain") || reasons.includes("wrong_first_check");
    verdict = hardFail || reasons.length >= 3 ? "FAIL" : "SOFT_PASS";
  }

  return {
    // Fall back to a positional id such as "q01" when the row has none.
    case_id: text(row?.case_id) || `q${String(index + 1).padStart(2, "0")}`,
    question_short: shortQuestion(row?.user_message),
    expected_domain: expectedDomain,
    actual_domain: actualDomain,
    expected_question_type: expectedQuestionType,
    actual_question_type: actualQuestionType,
    company_anchors_present: anchorsPresent,
    company_anchors_used_in_answer: anchorsUsed,
    evidence_strength: evidenceStrength,
    answer_confidence_style: confidenceStyle,
    first_check_relevance: firstCheckRelevant,
    verdict,
    failure_reason_short: reasons.length ? reasons.join(", ") : "none",
    is_generic_answer: genericAnswer,
    failure_reasons: reasons
  };
}
|
||
|
||
/**
 * Makes a value safe to place inside a Markdown table cell: line breaks are
 * replaced by a space (a raw newline would end the table row) and pipe
 * characters are escaped.
 *
 * @param {*} value - Cell value; null/undefined render as "".
 * @returns {string} Escaped single-line cell text.
 */
function markdownCell(value) {
  return text(value)
    .replace(/\r\n|\r|\n/g, " ")
    .replace(/\|/g, "\\|");
}
|
||
|
||
/**
 * Renders the evaluated cases as a Markdown document: a title, then a table
 * with one line per case.
 *
 * The header, the separator line and the data rows are all derived from one
 * column list so they cannot drift apart.
 *
 * @param {Object[]} rows - Case rows as produced by buildCaseRow.
 * @returns {string} Markdown text.
 */
function buildCaseMatrixMarkdown(rows) {
  const columns = [
    "case_id",
    "question_short",
    "expected_domain",
    "actual_domain",
    "expected_question_type",
    "actual_question_type",
    "company_anchors_present",
    "company_anchors_used_in_answer",
    "evidence_strength",
    "answer_confidence_style",
    "first_check_relevance",
    "verdict",
    "failure_reason_short"
  ];

  const tableLine = (cells) => `| ${cells.join(" | ")} |`;

  const lines = [
    "# Wave 13 Chat20 Case Matrix (Updated)",
    "",
    tableLine(columns),
    `|${columns.map(() => "---").join("|")}|`
  ];
  for (const row of rows) {
    lines.push(tableLine(columns.map((column) => markdownCell(row[column]))));
  }
  lines.push("");

  return `${lines.join("\n")}\n`;
}
|
||
|
||
/**
 * Counts rows per key.
 *
 * @param {Iterable<*>} rows - Items to count.
 * @param {function(*): string} selector - Returns the grouping key of an item.
 * @returns {Object<string, number>} Number of items per key.
 */
function countBy(rows, selector) {
  const result = {};
  for (const row of rows) {
    const key = selector(row);
    // Own-property check so a key such as "toString" starts from 0 instead of
    // the member inherited from Object.prototype.
    const seen = Object.prototype.hasOwnProperty.call(result, key);
    result[key] = (seen ? result[key] : 0) + 1;
  }
  return result;
}
|
||
|
||
/**
 * Renders the regression report as Markdown.
 *
 * Sections: verdict totals, the five rate metrics, an optional
 * "Delta vs Baseline" section (only when `baselineMetrics` is given; a rate
 * missing on either side counts as 0), the five most frequent failure reasons
 * among non-PASS cases, and the list of FAIL cases. Empty "Top Defects" and
 * "FAIL Cases" sections get a placeholder line.
 *
 * @param {Object[]} rows - Case rows; `verdict`, `case_id`, `failure_reasons`
 *   and `failure_reason_short` are read.
 * @param {Object} metrics - Current metrics; `totals` and the rate fields are
 *   read.
 * @param {?Object} baselineMetrics - Metrics of the baseline run, or a falsy
 *   value to omit the comparison.
 * @returns {string} Markdown text.
 */
function buildRegressionReport(rows, metrics, baselineMetrics) {
  const rateKeys = [
    "domain_correctness_rate",
    "question_type_fit_rate",
    "company_anchor_usage_rate",
    "generic_answer_rate",
    "first_check_relevance_rate"
  ];

  const lines = [
    "# Wave 13 Regression Report",
    "",
    `- Cases: ${rows.length}`,
    `- PASS: ${metrics.totals.pass}`,
    `- SOFT_PASS: ${metrics.totals.soft_pass}`,
    `- FAIL: ${metrics.totals.fail}`,
    "",
    "## Metric Snapshot",
    ...rateKeys.map((key) => `- ${key}: ${metrics[key]}`),
    ""
  ];

  if (baselineMetrics) {
    lines.push("## Delta vs Baseline");
    for (const key of rateKeys) {
      const current = Number(metrics[key] ?? 0);
      const baseline = Number(baselineMetrics[key] ?? 0);
      // Rounded to 4 decimals to hide floating-point noise in the difference.
      const delta = Number((current - baseline).toFixed(4));
      const sign = delta >= 0 ? "+" : "";
      lines.push(`- ${key}: ${baseline} -> ${current} (delta ${sign}${delta})`);
    }
    lines.push("");
  }

  // A Map keeps first-seen order and is immune to inherited property names.
  const reasonCounts = new Map();
  for (const row of rows) {
    if (row.verdict === "PASS") {
      continue;
    }
    for (const reason of row.failure_reasons ?? []) {
      reasonCounts.set(reason, (reasonCounts.get(reason) ?? 0) + 1);
    }
  }
  const topReasons = [...reasonCounts.entries()]
    .sort((a, b) => b[1] - a[1])
    .slice(0, 5);

  lines.push("## Top Defects");
  if (topReasons.length === 0) {
    lines.push("- No defects detected.");
  } else {
    for (const [reason, count] of topReasons) {
      lines.push(`- ${reason}: ${count}`);
    }
  }
  lines.push("");

  lines.push("## FAIL Cases");
  const failed = rows.filter((row) => row.verdict === "FAIL");
  if (failed.length === 0) {
    lines.push("- No FAIL cases.");
  } else {
    for (const row of failed) {
      lines.push(`- ${row.case_id}: ${row.failure_reason_short}`);
    }
  }
  lines.push("");

  return `${lines.join("\n")}\n`;
}
|
||
|
||
/**
 * Script entry point: evaluates every row of the raw run file and writes the
 * case matrix, the metrics JSON and the regression report into the output
 * directory, then prints the produced paths to stdout.
 *
 * When --baseline-metrics-file points to an existing file, the metrics also
 * record the baseline rates and the deltas against them. A baseline path that
 * does not exist is reported on stderr and the comparison is skipped.
 *
 * @returns {Promise<void>}
 * @throws {Error} When --raw-file or --output-dir is missing, the raw file
 *   has no rows, or the baseline file does not contain a JSON object.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  if (!args.rawFile) {
    throw new Error("Missing required argument --raw-file");
  }
  if (!args.outputDir) {
    throw new Error("Missing required argument --output-dir");
  }

  const rawPath = path.resolve(args.rawFile);
  const outputDir = path.resolve(args.outputDir);

  const raw = readJson(rawPath);
  const rows = Array.isArray(raw?.rows) ? raw.rows : [];
  if (rows.length === 0) {
    throw new Error("Raw file contains no rows.");
  }

  const caseRows = rows.map((row, index) => buildCaseRow(index, row));
  const total = caseRows.length; // > 0, guaranteed by the check above
  const countWhere = (predicate) => caseRows.filter(predicate).length;

  const totalsByVerdict = countBy(caseRows, (row) => row.verdict);
  const domainCorrect = countWhere((row) => row.expected_domain === row.actual_domain);
  const qTypeFit = countWhere((row) => row.expected_question_type === row.actual_question_type);
  const anchorsPresentCount = countWhere((row) => row.company_anchors_present);
  const anchorsUsedCount = countWhere(
    (row) => row.company_anchors_present && row.company_anchors_used_in_answer
  );
  const genericCount = countWhere((row) => row.is_generic_answer);
  const firstCheckRelevantCount = countWhere((row) => row.first_check_relevance);

  const metrics = {
    schema_version: "wave13_chat20_metrics_v2",
    run_id: path.basename(outputDir),
    source_session_id: text(raw?.session_id),
    totals: {
      cases: total,
      pass: totalsByVerdict.PASS || 0,
      soft_pass: totalsByVerdict.SOFT_PASS || 0,
      fail: totalsByVerdict.FAIL || 0
    },
    domain_correctness_rate: toPercent(domainCorrect / total),
    question_type_fit_rate: toPercent(qTypeFit / total),
    // Share of anchor-bearing cases whose reply uses an anchor; 0 when no
    // case has anchors at all.
    company_anchor_usage_rate: toPercent(
      anchorsPresentCount > 0 ? anchorsUsedCount / anchorsPresentCount : 0
    ),
    company_anchor_usage_rate_global: toPercent(anchorsUsedCount / total),
    generic_answer_rate: toPercent(genericCount / total),
    first_check_relevance_rate: toPercent(firstCheckRelevantCount / total),
    anchors_present_cases: anchorsPresentCount,
    anchors_used_cases: anchorsUsedCount
  };

  const comparedRates = [
    "domain_correctness_rate",
    "question_type_fit_rate",
    "company_anchor_usage_rate",
    "generic_answer_rate",
    "first_check_relevance_rate"
  ];

  let baselineMetrics = null;
  if (args.baselineMetricsFile) {
    const baselinePath = path.resolve(args.baselineMetricsFile);
    if (fs.existsSync(baselinePath)) {
      baselineMetrics = readJson(baselinePath);
      if (baselineMetrics === null || typeof baselineMetrics !== "object") {
        throw new Error(`Baseline metrics file does not contain a JSON object: ${baselinePath}`);
      }
      metrics.baseline_reference = path.basename(baselinePath);
      metrics.baseline_metrics = Object.fromEntries(
        comparedRates.map((key) => [key, baselineMetrics[key]])
      );
      metrics.delta_vs_baseline = Object.fromEntries(
        comparedRates.map((key) => [
          `${key}_delta`,
          toPercent(Number(metrics[key]) - Number(metrics.baseline_metrics[key] || 0))
        ])
      );
    } else {
      // Make a mistyped baseline path visible instead of silently ignoring it.
      process.stderr.write(
        `Baseline metrics file not found, skipping comparison: ${baselinePath}\n`
      );
    }
  }

  const matrixPath = path.join(outputDir, args.caseMatrixFile);
  const metricsPath = path.join(outputDir, args.metricsFile);
  const reportPath = path.join(outputDir, args.reportFile);

  writeUtf8Bom(matrixPath, buildCaseMatrixMarkdown(caseRows));
  writeUtf8Bom(metricsPath, `${JSON.stringify(metrics, null, 2)}\n`);
  writeUtf8Bom(reportPath, buildRegressionReport(caseRows, metrics, baselineMetrics));

  process.stdout.write(
    [
      `rows=${caseRows.length}`,
      `matrix=${matrixPath}`,
      `metrics=${metricsPath}`,
      `report=${reportPath}`
    ].join("\n")
  );
}
|
||
|
||
// Run the script. Any failure is printed to stderr (with its stack when one
// is available) and reported through a non-zero exit code; exitCode is set
// instead of calling process.exit() so pending output can still be flushed.
main().catch((error) => {
  process.stderr.write(`${error?.stack || error}\n`);
  process.exitCode = 1;
});
|
||
|