NODEDC_1C/llm_normalizer/backend/scripts/analyzeWave13Chat20.js

615 lines
19 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
const fs = require("node:fs");
const path = require("node:path");
/**
 * Expected question-type label for every case, in case order: entry `i` is
 * the label that the row at index `i` is compared against.
 *
 * Frozen so that this shared, module-level expectation table cannot be
 * modified by accident while rows are being evaluated.
 */
const EXPECTED_QUESTION_TYPES = Object.freeze([
  "why_breaks",
  "prove_or_guess",
  "prove_or_guess",
  "why_breaks",
  "where_break_is",
  "prove_or_guess",
  "why_breaks",
  "which_chains_are_complete_vs_incomplete",
  "which_chains_are_complete_vs_incomplete",
  "prove_or_guess",
  "why_breaks",
  "prove_or_guess",
  "why_breaks",
  "what_is_it_grounded_on",
  "why_breaks",
  "which_chains_are_complete_vs_incomplete",
  "prove_or_guess",
  "what_is_it_grounded_on",
  "why_breaks",
  "prove_or_guess"
]);
/**
 * Turn a list of command-line tokens into an options object.
 *
 * Each recognised flag consumes the token that follows it as its value,
 * even when that token looks like another flag. Unrecognised tokens are
 * skipped. When a flag is the last token, the path-like options
 * (`--raw-file`, `--output-dir`, `--baseline-metrics-file`) become an empty
 * string, while the file-name options keep whatever value they currently hold.
 *
 * @param {Array} argv - Tokens to parse (without the node/script entries).
 * @returns {{rawFile: string, outputDir: string, caseMatrixFile: string,
 *   metricsFile: string, reportFile: string, baselineMetricsFile: string}}
 */
function parseArgs(argv) {
  const args = {
    rawFile: "",
    outputDir: "",
    caseMatrixFile: "wave13_chat20_case_matrix_updated.md",
    metricsFile: "wave13_chat20_metrics.json",
    reportFile: "wave13_regression_report.md",
    baselineMetricsFile: ""
  };

  // flag -> [option name, whether a missing value keeps the current setting]
  const flagTable = new Map([
    ["--raw-file", ["rawFile", false]],
    ["--output-dir", ["outputDir", false]],
    ["--case-matrix-file", ["caseMatrixFile", true]],
    ["--metrics-file", ["metricsFile", true]],
    ["--report-file", ["reportFile", true]],
    ["--baseline-metrics-file", ["baselineMetricsFile", false]]
  ]);

  let cursor = 0;
  while (cursor < argv.length) {
    const spec = flagTable.get(argv[cursor]);
    if (spec === undefined) {
      cursor += 1;
      continue;
    }
    const [optionName, keepsCurrent] = spec;
    const fallback = keepsCurrent ? args[optionName] : "";
    args[optionName] = String(argv[cursor + 1] ?? fallback);
    // Step over both the flag and the value it consumed.
    cursor += 2;
  }
  return args;
}
/**
 * Create a directory, including any missing parent directories.
 *
 * @param {string} dirPath - Directory to create.
 */
function ensureDir(dirPath) {
  const mkdirOptions = { recursive: true };
  fs.mkdirSync(dirPath, mkdirOptions);
}
/**
 * Read a UTF-8 file and parse it as JSON, dropping one leading byte-order
 * mark (U+FEFF) if the file starts with it.
 *
 * @param {string} filePath - File to read.
 * @returns {*} The parsed JSON value.
 */
function readJson(filePath) {
  const contents = fs.readFileSync(filePath, "utf8");
  const startsWithBom = contents.charCodeAt(0) === 0xfeff;
  const jsonText = startsWithBom ? contents.slice(1) : contents;
  return JSON.parse(jsonText);
}
/**
 * Write text to a file as UTF-8 with a leading byte-order mark (U+FEFF),
 * creating the file's parent directory first.
 *
 * @param {string} filePath - Destination file.
 * @param {*} content - Content to write; interpolated into a string.
 */
function writeUtf8Bom(filePath, content) {
  const parentDir = path.dirname(filePath);
  ensureDir(parentDir);
  const payload = `\uFEFF${content}`;
  fs.writeFileSync(filePath, payload, "utf8");
}
/**
 * Convert a value to a string, mapping `null` and `undefined` to "".
 *
 * @param {*} value - Any value.
 * @returns {string} String form of the value, or "" when it is nullish.
 */
function text(value) {
  if (value === null || value === undefined) {
    return "";
  }
  return String(value);
}
/**
 * Lower-cased string form of a value; nullish values yield "".
 *
 * @param {*} value - Any value.
 * @returns {string} The value converted with `text` and lower-cased.
 */
function lower(value) {
  const asString = text(value);
  return asString.toLowerCase();
}
/**
 * Expected domain for a case, decided purely by its position.
 *
 * With 1-based case numbers: cases up to 8 map to "settlements_60_62",
 * cases up to 16 map to "vat_document_register_book", and everything else
 * maps to "month_close_costs_20_44".
 *
 * @param {number} index - Zero-based case index.
 * @returns {string} Expected domain identifier.
 */
function expectedDomainByIndex(index) {
  const caseNo = index + 1;
  // Upper case-number bound (inclusive) of each band, in ascending order.
  const bands = [
    [8, "settlements_60_62"],
    [16, "vat_document_register_book"]
  ];
  const band = bands.find(([lastCaseNo]) => caseNo <= lastCaseNo);
  return band ? band[1] : "month_close_costs_20_44";
}
/**
 * Map an internal domain name onto one of the three evaluation domains.
 *
 * The name is lower-cased and tested for marker substrings, group by group
 * in the order listed below; the first group with a match decides the
 * result. An empty name, or one with no marker, yields "unknown".
 *
 * @param {*} domainName - Internal domain name.
 * @returns {string} Evaluation domain identifier or "unknown".
 */
function normalizeInternalDomain(domainName) {
  const needle = lower(domainName);
  if (needle === "") {
    return "unknown";
  }
  const rules = [
    ["settlements_60_62", ["settlement", "supplier", "customer", "bank"]],
    ["vat_document_register_book", ["vat", "nds"]],
    [
      "month_close_costs_20_44",
      ["period_close", "month_close", "deferred_expense", "fixed_asset", "close"]
    ]
  ];
  const matched = rules.find(([, markers]) =>
    markers.some((marker) => needle.includes(marker))
  );
  return matched ? matched[0] : "unknown";
}
/**
 * Add the counts of `source` into `target`, in place.
 *
 * Does nothing when `source` is falsy or not an object. Entries with an
 * empty key are skipped. A value that does not convert to a number greater
 * than zero contributes 1 instead.
 *
 * @param {Object} target - Accumulator that is mutated.
 * @param {*} source - Map of name -> count to merge in.
 */
function mergeCountMap(target, source) {
  if (!source || typeof source !== "object") {
    return;
  }
  Object.entries(source).forEach(([key, value]) => {
    const name = text(key);
    if (!name) {
      return;
    }
    const parsed = Number(value) || 0;
    const increment = parsed > 0 ? parsed : 1;
    target[name] = (target[name] || 0) + increment;
  });
}
/**
 * Build a score per domain name from a row's debug payload.
 *
 * For each entry of `debug.retrieval_results`:
 *   - both domain distributions of `problem_unit_summary` are merged in,
 *   - the domain card id of `summary.domain_purity_guard` adds 2,
 *   - every name in an item's `graph_domain_scope` adds 1.
 * Finally the active domain of the investigation snapshot focus adds 1.
 *
 * @param {*} row - Raw row; missing parts are tolerated.
 * @returns {Object} Map of domain name -> accumulated score.
 */
function collectDomainScores(row) {
  const scores = {};
  const addScore = (name, weight) => {
    scores[name] = (scores[name] || 0) + weight;
  };

  const retrievalResults = row?.debug?.retrieval_results;
  for (const result of Array.isArray(retrievalResults) ? retrievalResults : []) {
    const unitSummary = result?.problem_unit_summary;
    mergeCountMap(scores, unitSummary?.lifecycle_domain_distribution);
    mergeCountMap(scores, unitSummary?.graph_summary?.domain_distribution);

    const domainCard = text(result?.summary?.domain_purity_guard?.domain_card_id);
    if (domainCard) {
      addScore(domainCard, 2);
    }

    const entries = Array.isArray(result?.items) ? result.items : [];
    for (const entry of entries) {
      const scopeList = entry?.graph_domain_scope;
      for (const scope of Array.isArray(scopeList) ? scopeList : []) {
        const scopeName = text(scope);
        if (scopeName) {
          addScore(scopeName, 1);
        }
      }
    }
  }

  const activeDomain = text(row?.debug?.investigation_state_snapshot?.focus?.active_domain);
  if (activeDomain) {
    addScore(activeDomain, 1);
  }
  return scores;
}
/**
 * Determine the domain a row actually landed in.
 *
 * Takes the highest-scoring raw domain name from `collectDomainScores`
 * (ties broken by `localeCompare` order of the names) and normalizes that
 * single name. Returns "unknown" when no scores were collected.
 *
 * @param {*} row - Raw row.
 * @returns {string} Evaluation domain identifier or "unknown".
 */
function pickActualDomain(row) {
  let leader = null;
  for (const candidate of Object.entries(collectDomainScores(row))) {
    if (leader === null) {
      leader = candidate;
      continue;
    }
    const [name, score] = candidate;
    const [leaderName, leaderScore] = leader;
    // Higher score wins; equal scores fall back to name order.
    const outranks =
      score !== leaderScore
        ? score > leaderScore
        : String(name).localeCompare(String(leaderName)) < 0;
    if (outranks) {
      leader = candidate;
    }
  }
  return leader === null ? "unknown" : normalizeInternalDomain(leader[0]);
}
/**
 * Question type recorded in the row's debug payload.
 *
 * @param {*} row - Raw row.
 * @returns {string} `debug.question_type_class` as a string, or "unknown"
 *   when it is missing or empty.
 */
function pickActualQuestionType(row) {
  const reported = text(row?.debug?.question_type_class);
  return reported === "" ? "unknown" : reported;
}
/**
 * List the company anchors attached to a row.
 *
 * Reads `debug.company_anchors.all`; each entry is converted to a string
 * and trimmed, and entries that end up empty are dropped.
 *
 * @param {*} row - Raw row.
 * @returns {string[]} Trimmed, non-empty anchors; [] when the list is absent.
 */
function extractCompanyAnchors(row) {
  const anchorList = row?.debug?.company_anchors?.all;
  if (!Array.isArray(anchorList)) {
    return [];
  }
  const cleaned = [];
  for (const rawAnchor of anchorList) {
    const trimmed = text(rawAnchor).trim();
    if (trimmed) {
      cleaned.push(trimmed);
    }
  }
  return cleaned;
}
/**
 * Decide whether the assistant reply makes use of the company anchors.
 *
 * True when the lower-cased reply contains the fixed marker phrase, or
 * contains any anchor (lower-cased) that is at least 3 characters long.
 * False when there are no anchors or the reply is empty.
 *
 * @param {*} row - Raw row; `assistant_reply` is inspected.
 * @param {string[]} anchors - Anchors to look for.
 * @returns {boolean}
 */
function hasAnchorUsageInAnswer(row, anchors) {
  if (!anchors.length) {
    return false;
  }
  const reply = lower(row?.assistant_reply);
  if (!reply) {
    return false;
  }
  let used = reply.includes("в опоре использованы якоря вопроса");
  if (!used) {
    for (const anchor of anchors) {
      const needle = lower(anchor);
      // Very short anchors are ignored.
      if (needle.length >= 3 && reply.includes(needle)) {
        used = true;
        break;
      }
    }
  }
  return used;
}
/**
 * Translate the grounding-check status of a row into an evidence label.
 *
 * @param {*} row - Raw row; `debug.answer_grounding_check.status` is read
 *   case-insensitively.
 * @returns {string} "strong" for grounded, "weak" for partial, "none" for
 *   no_grounded_answer, and "limited" for any other status.
 */
function evaluateEvidenceStrength(row) {
  switch (lower(row?.debug?.answer_grounding_check?.status)) {
    case "grounded":
      return "strong";
    case "partial":
      return "weak";
    case "no_grounded_answer":
      return "none";
    default:
      return "limited";
  }
}
/**
 * Classify the tone of the assistant reply by keyword stems.
 *
 * The lower-cased reply is searched for limitation stems and for
 * confidence stems:
 *   - both kinds present -> "mixed"
 *   - only limitation    -> "limited"
 *   - only confidence    -> "confident"
 *   - neither            -> "neutral"
 * An empty reply yields "unknown".
 *
 * NOTE(review): the phrase "не подтверждено" contains both the limitation
 * stem "не подтвержден" and the confidence stem "подтверждено", so it is
 * reported as "mixed".
 *
 * @param {*} row - Raw row; `assistant_reply` is inspected.
 * @returns {string}
 */
function evaluateConfidenceStyle(row) {
  const reply = lower(row?.assistant_reply);
  if (!reply) {
    return "unknown";
  }
  const mentionsAny = (stems) => stems.some((stem) => reply.includes(stem));
  const hedged = mentionsAny(["ограничени", "частично", "низкая", "не подтвержден"]);
  const assertive = mentionsAny(["подтверждено", "доказ", "подтверждается"]);
  if (hedged) {
    return assertive ? "mixed" : "limited";
  }
  return assertive ? "confident" : "neutral";
}
/**
 * Case-insensitive check for whether any needle occurs in the text.
 *
 * @param {*} textValue - Text to search.
 * @param {Array} needles - Substrings to look for.
 * @returns {boolean} True when at least one needle occurs in the text.
 */
function containsAny(textValue, needles) {
  const haystack = lower(textValue);
  const occursInHaystack = (needle) => haystack.includes(lower(needle));
  return needles.some(occursInHaystack);
}
/**
 * Check whether the reply mentions something relevant to the expected domain.
 *
 * Each known domain has a list of marker substrings; the result is true
 * when the reply contains at least one marker of the expected domain
 * (case-insensitive). An empty reply or an unrecognised domain yields false.
 *
 * NOTE(review): the bare numeric markers ("19", "20", "25", "26", "44") are
 * plain substring matches, so they also match inside longer numbers.
 *
 * @param {*} row - Raw row; `assistant_reply` is inspected.
 * @param {string} expectedDomain - Evaluation domain identifier.
 * @returns {boolean}
 */
function evaluateFirstCheckRelevance(row, expectedDomain) {
  const reply = text(row?.assistant_reply);
  if (!reply) {
    return false;
  }
  const markersByDomain = new Map([
    [
      "settlements_60_62",
      ["договор", "объект расчет", "регистр расчет", "зачет аванс", "взаимозачет", "60/62/76"]
    ],
    [
      "vat_document_register_book",
      ["ндс", "счет-фактур", "книга покуп", "книга продаж", "регистр", "19"]
    ],
    [
      "month_close_costs_20_44",
      ["закрыти", "рбп", "амортизац", "косвен", "20", "25", "26", "44"]
    ]
  ]);
  const markers = markersByDomain.get(expectedDomain);
  return markers === undefined ? false : containsAny(reply, markers);
}
/**
 * Flag replies that look like boilerplate.
 *
 * A reply is generic when it is empty, or when its lower-cased text
 * contains at least two of the known boilerplate phrases.
 *
 * @param {*} row - Raw row; `assistant_reply` is inspected.
 * @returns {boolean}
 */
function evaluateGenericAnswer(row) {
  const reply = lower(row?.assistant_reply);
  if (!reply) {
    return true;
  }
  const boilerplate = [
    "коротко: проблема с закрытием расчета подтверждается частично",
    "сигнал проблемы есть, но механизм подтвержден не полностью",
    "вывод сделан по snapshot",
    "проверьте договор, объект расчетов, регистр расчетов",
    "проверьте договор и объект расчетов"
  ];
  let matched = 0;
  for (const phrase of boilerplate) {
    if (reply.includes(phrase)) {
      matched += 1;
    }
  }
  return matched >= 2;
}
/**
 * Produce a single-line, length-limited version of a question.
 *
 * Whitespace runs are collapsed to one space and the result is trimmed.
 * Text longer than `maxLength` is cut to `maxLength - 3` characters and
 * suffixed with "...".
 *
 * @param {*} value - Question text.
 * @param {number} [maxLength=130] - Maximum length of the result.
 * @returns {string}
 */
function shortQuestion(value, maxLength = 130) {
  const collapsed = text(value).replace(/\s+/g, " ").trim();
  const fits = collapsed.length <= maxLength;
  return fits ? collapsed : `${collapsed.slice(0, maxLength - 3)}...`;
}
/**
 * Round a number to four decimal places.
 *
 * Despite the name, the value is not scaled: a ratio goes in and the same
 * ratio, rounded, comes out.
 *
 * @param {number} value - Number to round.
 * @returns {number} The value rounded via `toFixed(4)`.
 */
function toPercent(value) {
  const fixed = value.toFixed(4);
  return Number(fixed);
}
/**
 * Evaluate one raw row and produce its case-matrix record.
 *
 * Collects the expected and actual domain and question type, anchor usage,
 * evidence strength, confidence style, first-check relevance and the
 * generic-answer flag, then derives failure reasons and a verdict:
 *   - no reasons -> "PASS"
 *   - "wrong_domain" or "wrong_first_check" present, or 3+ reasons -> "FAIL"
 *   - otherwise -> "SOFT_PASS"
 *
 * @param {number} index - Zero-based position of the row.
 * @param {*} row - Raw row.
 * @returns {Object} Case record; `failure_reasons` holds the reason list and
 *   `failure_reason_short` its comma-joined form ("none" when empty).
 */
function buildCaseRow(index, row) {
  const expectedDomain = expectedDomainByIndex(index);
  const actualDomain = pickActualDomain(row);
  const expectedQuestionType = EXPECTED_QUESTION_TYPES[index] || "unknown";
  const actualQuestionType = pickActualQuestionType(row);
  const anchors = extractCompanyAnchors(row);
  const anchorsPresent = anchors.length > 0;
  const anchorsUsed = hasAnchorUsageInAnswer(row, anchors);
  const evidenceStrength = evaluateEvidenceStrength(row);
  const confidenceStyle = evaluateConfidenceStyle(row);
  const firstCheckRelevant = evaluateFirstCheckRelevance(row, expectedDomain);
  const genericAnswer = evaluateGenericAnswer(row);

  // [reason, whether it applies], in reporting order.
  const checks = [
    ["wrong_domain", actualDomain !== expectedDomain],
    ["wrong_question_type", actualQuestionType !== expectedQuestionType],
    ["weak_company_anchor_usage", anchorsPresent && !anchorsUsed],
    ["wrong_first_check", !firstCheckRelevant],
    ["generic_answer", genericAnswer]
  ];
  const reasons = checks.filter(([, applies]) => applies).map(([reason]) => reason);

  const isHardFail = reasons.some(
    (reason) => reason === "wrong_domain" || reason === "wrong_first_check"
  );
  let verdict = "SOFT_PASS";
  if (reasons.length === 0) {
    verdict = "PASS";
  } else if (isHardFail || reasons.length >= 3) {
    verdict = "FAIL";
  }

  // Rows without their own id get a positional one: q01, q02, ...
  const fallbackCaseId = `q${String(index + 1).padStart(2, "0")}`;

  return {
    case_id: text(row?.case_id) || fallbackCaseId,
    question_short: shortQuestion(row?.user_message),
    expected_domain: expectedDomain,
    actual_domain: actualDomain,
    expected_question_type: expectedQuestionType,
    actual_question_type: actualQuestionType,
    company_anchors_present: anchorsPresent,
    company_anchors_used_in_answer: anchorsUsed,
    evidence_strength: evidenceStrength,
    answer_confidence_style: confidenceStyle,
    first_check_relevance: firstCheckRelevant,
    verdict,
    failure_reason_short: reasons.length ? reasons.join(", ") : "none",
    is_generic_answer: genericAnswer,
    failure_reasons: reasons
  };
}
/**
 * Render a value so it is safe inside one cell of a Markdown pipe table.
 *
 * Line breaks are replaced with a space, because a table row has to stay
 * on a single line, and every "|" is escaped as "\|" so that it is not
 * read as a column separator.
 *
 * @param {*} value - Cell value; nullish values render as "".
 * @returns {string} Single-line, pipe-escaped cell text.
 */
function markdownCell(value) {
  return text(value)
    .replace(/\r\n|\r|\n/g, " ")
    .replace(/\|/g, "\\|");
}
/**
 * Render case records as a Markdown document with one table.
 *
 * The output is a title, a blank line, the table header and separator, one
 * table row per record (cells passed through `markdownCell`), and a
 * trailing blank line.
 *
 * @param {Array<Object>} rows - Case records.
 * @returns {string} Markdown text.
 */
function buildCaseMatrixMarkdown(rows) {
  // Record fields in the same order as the header columns below.
  const columns = [
    "case_id",
    "question_short",
    "expected_domain",
    "actual_domain",
    "expected_question_type",
    "actual_question_type",
    "company_anchors_present",
    "company_anchors_used_in_answer",
    "evidence_strength",
    "answer_confidence_style",
    "first_check_relevance",
    "verdict",
    "failure_reason_short"
  ];
  const output = [
    "# Wave 13 Chat20 Case Matrix (Updated)",
    "",
    "| case_id | question_short | expected_domain | actual_domain | expected_question_type | actual_question_type | company_anchors_present | company_anchors_used_in_answer | evidence_strength | answer_confidence_style | first_check_relevance | verdict | failure_reason_short |",
    "|---|---|---|---|---|---|---|---|---|---|---|---|---|"
  ];
  for (const row of rows) {
    const cells = columns.map((column) => markdownCell(row[column]));
    output.push(`| ${cells.join(" | ")} |`);
  }
  output.push("");
  return `${output.join("\n")}\n`;
}
/**
 * Count rows per key.
 *
 * Counting is done in a Map and only then turned into a plain object, so
 * keys that collide with inherited object members ("constructor",
 * "toString", "__proto__", ...) are counted like any other key instead of
 * picking up the inherited value.
 *
 * @param {Iterable} rows - Items to count.
 * @param {function(*): *} selector - Returns the grouping key for an item.
 * @returns {Object} Plain object mapping each key to its number of rows.
 */
function countBy(rows, selector) {
  const tally = new Map();
  for (const row of rows) {
    const selected = selector(row);
    // Coerce the way a property key is coerced, so 1 and "1" share a bucket.
    const key = typeof selected === "symbol" ? selected : String(selected);
    tally.set(key, (tally.get(key) ?? 0) + 1);
  }
  return Object.fromEntries(tally);
}
/**
 * Render the regression report as Markdown.
 *
 * Sections: verdict totals, a snapshot of the five headline rates, an
 * optional delta-versus-baseline list (only when `baselineMetrics` is
 * truthy; a rate missing from the baseline is treated as 0), the five most
 * frequent failure reasons among non-PASS rows, and the list of FAIL cases.
 * Both of the last two sections print an explicit placeholder line when
 * they have nothing to list, so an empty section is never ambiguous.
 *
 * @param {Array<Object>} rows - Case records with `verdict`, `case_id`,
 *   `failure_reasons` and `failure_reason_short`.
 * @param {Object} metrics - Current metrics, including `totals`.
 * @param {?Object} baselineMetrics - Baseline metrics, or a falsy value.
 * @returns {string} Markdown text.
 */
function buildRegressionReport(rows, metrics, baselineMetrics) {
  const rateKeys = [
    "domain_correctness_rate",
    "question_type_fit_rate",
    "company_anchor_usage_rate",
    "generic_answer_rate",
    "first_check_relevance_rate"
  ];
  const lines = [];
  lines.push("# Wave 13 Regression Report");
  lines.push("");
  lines.push(`- Cases: ${rows.length}`);
  lines.push(`- PASS: ${metrics.totals.pass}`);
  lines.push(`- SOFT_PASS: ${metrics.totals.soft_pass}`);
  lines.push(`- FAIL: ${metrics.totals.fail}`);
  lines.push("");

  lines.push("## Metric Snapshot");
  for (const key of rateKeys) {
    lines.push(`- ${key}: ${metrics[key]}`);
  }
  lines.push("");

  if (baselineMetrics) {
    lines.push("## Delta vs Baseline");
    for (const key of rateKeys) {
      const current = Number(metrics[key] ?? 0);
      const baseline = Number(baselineMetrics[key] ?? 0);
      const delta = Number((current - baseline).toFixed(4));
      // Non-negative deltas get an explicit "+"; negatives already carry "-".
      const sign = delta >= 0 ? "+" : "";
      lines.push(`- ${key}: ${baseline} -> ${current} (delta ${sign}${delta})`);
    }
    lines.push("");
  }

  // Tally failure reasons over every row that is not a clean PASS.
  const reasonCounts = new Map();
  for (const row of rows) {
    if (row.verdict === "PASS") {
      continue;
    }
    for (const reason of row.failure_reasons) {
      reasonCounts.set(reason, (reasonCounts.get(reason) ?? 0) + 1);
    }
  }
  const topReasons = [...reasonCounts.entries()]
    .sort((a, b) => b[1] - a[1])
    .slice(0, 5);

  lines.push("## Top Defects");
  if (topReasons.length === 0) {
    lines.push("- No defects detected.");
  } else {
    for (const [reason, count] of topReasons) {
      lines.push(`- ${reason}: ${count}`);
    }
  }
  lines.push("");

  lines.push("## FAIL Cases");
  const failedRows = rows.filter((row) => row.verdict === "FAIL");
  if (failedRows.length === 0) {
    lines.push("- None.");
  } else {
    for (const row of failedRows) {
      lines.push(`- ${row.case_id}: ${row.failure_reason_short}`);
    }
  }
  lines.push("");
  return `${lines.join("\n")}\n`;
}
/**
 * Script entry point: evaluate a raw chat run and write the three artifacts.
 *
 * Steps:
 *   1. Parse CLI arguments; `--raw-file` and `--output-dir` are required.
 *   2. Read the raw JSON and evaluate every entry of its `rows` array.
 *   3. Aggregate verdict totals and headline rates into a metrics object.
 *   4. If a baseline metrics file was given and exists, embed its headline
 *      rates and the deltas against them. A baseline path that does not
 *      exist is reported on stderr and the comparison is skipped.
 *   5. Write the case matrix, the metrics JSON and the regression report
 *      into the output directory, then print their paths to stdout.
 *
 * @returns {Promise<void>}
 * @throws {Error} When a required argument is missing, the raw file has no
 *   rows, or the baseline file does not contain a JSON object.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  if (!args.rawFile) {
    throw new Error("Missing required argument --raw-file");
  }
  if (!args.outputDir) {
    throw new Error("Missing required argument --output-dir");
  }

  const rawPath = path.resolve(args.rawFile);
  const outputDir = path.resolve(args.outputDir);
  const raw = readJson(rawPath);
  const rows = Array.isArray(raw?.rows) ? raw.rows : [];
  if (rows.length === 0) {
    throw new Error("Raw file contains no rows.");
  }

  const caseRows = rows.map((row, index) => buildCaseRow(index, row));
  const totalsByVerdict = countBy(caseRows, (row) => row.verdict);
  const domainCorrect = caseRows.filter((row) => row.expected_domain === row.actual_domain).length;
  const qTypeFit = caseRows.filter((row) => row.expected_question_type === row.actual_question_type).length;
  const anchorsPresentCount = caseRows.filter((row) => row.company_anchors_present).length;
  const anchorsUsedCount = caseRows.filter(
    (row) => row.company_anchors_present && row.company_anchors_used_in_answer
  ).length;
  const genericCount = caseRows.filter((row) => row.is_generic_answer).length;
  const firstCheckRelevantCount = caseRows.filter((row) => row.first_check_relevance).length;

  // Share of all cases, rounded; caseRows is non-empty at this point.
  const shareOfCases = (count) => toPercent(count / caseRows.length);

  const metrics = {
    schema_version: "wave13_chat20_metrics_v2",
    run_id: path.basename(outputDir),
    source_session_id: text(raw?.session_id),
    totals: {
      cases: caseRows.length,
      pass: totalsByVerdict.PASS || 0,
      soft_pass: totalsByVerdict.SOFT_PASS || 0,
      fail: totalsByVerdict.FAIL || 0
    },
    domain_correctness_rate: shareOfCases(domainCorrect),
    question_type_fit_rate: shareOfCases(qTypeFit),
    // Relative to the cases that actually have anchors; 0 when none do.
    company_anchor_usage_rate: toPercent(
      anchorsPresentCount > 0 ? anchorsUsedCount / anchorsPresentCount : 0
    ),
    company_anchor_usage_rate_global: shareOfCases(anchorsUsedCount),
    generic_answer_rate: shareOfCases(genericCount),
    first_check_relevance_rate: shareOfCases(firstCheckRelevantCount),
    anchors_present_cases: anchorsPresentCount,
    anchors_used_cases: anchorsUsedCount
  };

  let baselineMetrics = null;
  if (args.baselineMetricsFile) {
    const baselinePath = path.resolve(args.baselineMetricsFile);
    if (fs.existsSync(baselinePath)) {
      baselineMetrics = readJson(baselinePath);
      if (baselineMetrics === null || typeof baselineMetrics !== "object") {
        throw new Error(`Baseline metrics file must contain a JSON object: ${baselinePath}`);
      }
      const comparedRates = [
        "domain_correctness_rate",
        "question_type_fit_rate",
        "company_anchor_usage_rate",
        "generic_answer_rate",
        "first_check_relevance_rate"
      ];
      const baselineSnapshot = {};
      const deltas = {};
      for (const key of comparedRates) {
        baselineSnapshot[key] = baselineMetrics[key];
        // A rate missing from the baseline counts as 0.
        deltas[`${key}_delta`] = toPercent(
          Number(metrics[key]) - Number(baselineSnapshot[key] || 0)
        );
      }
      metrics.baseline_reference = path.basename(baselinePath);
      metrics.baseline_metrics = baselineSnapshot;
      metrics.delta_vs_baseline = deltas;
    } else {
      // Keep going without a comparison, but make the skipped baseline visible.
      process.stderr.write(
        `Warning: baseline metrics file not found, skipping baseline comparison: ${baselinePath}\n`
      );
    }
  }

  const matrixPath = path.join(outputDir, args.caseMatrixFile);
  const metricsPath = path.join(outputDir, args.metricsFile);
  const reportPath = path.join(outputDir, args.reportFile);
  writeUtf8Bom(matrixPath, buildCaseMatrixMarkdown(caseRows));
  writeUtf8Bom(metricsPath, `${JSON.stringify(metrics, null, 2)}\n`);
  writeUtf8Bom(reportPath, buildRegressionReport(caseRows, metrics, baselineMetrics));

  process.stdout.write(
    [
      `rows=${caseRows.length}`,
      `matrix=${matrixPath}`,
      `metrics=${metricsPath}`,
      `report=${reportPath}`
    ].join("\n")
  );
}
// Run the script; on failure print the stack (or the error itself when it
// has no stack) to stderr and mark the process as failed.
main().catch((failure) => {
  const details = failure?.stack || failure;
  process.stderr.write(`${details}\n`);
  process.exitCode = 1;
});