From 3be06b5f937203b34edb22c70937f1c75874946a Mon Sep 17 00:00:00 2001 From: dctouch Date: Sun, 10 May 2026 08:38:52 +0300 Subject: [PATCH] =?UTF-8?q?=D0=9F=D0=BE=D1=87=D0=B8=D0=BD=D0=B8=D1=82?= =?UTF-8?q?=D1=8C=20=D0=B2=D0=BE=D1=81=D1=81=D1=82=D0=B0=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BA=D0=B8=D1=80=D0=B8=D0=BB?= =?UTF-8?q?=D0=BB=D0=B8=D1=86=D1=8B=20=D0=B2=20=D0=B0=D0=B2=D1=82=D0=BE?= =?UTF-8?q?=D0=BF=D1=80=D0=BE=D0=B3=D0=BE=D0=BD=D0=B0=D1=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../backend/dist/routes/autoRuns.js | 42 +++++- llm_normalizer/backend/dist/routes/eval.js | 8 +- .../dist/services/addressTextRepair.js | 63 ++++++--- .../services/assistantLivingModePolicy.js | 5 +- .../services/assistantOrganizationMatcher.js | 11 +- .../backend/dist/services/assistantService.js | 48 ++++++- llm_normalizer/backend/src/routes/autoRuns.ts | 45 ++++++- llm_normalizer/backend/src/routes/eval.ts | 8 +- .../backend/src/services/addressTextRepair.ts | 70 +++++++--- .../src/services/assistantLivingModePolicy.ts | 5 +- .../services/assistantOrganizationMatcher.ts | 11 +- .../backend/src/services/assistantService.ts | 48 ++++++- .../backend/tests/addressTextRepair.test.ts | 35 +++++ .../assistantAddressFollowupContext.test.ts | 95 ++++++++++++++ .../assistantOrganizationMatcher.test.ts | 20 +++ .../tests/autoRunsQuestionSplit.test.ts | 22 ++++ .../tests/evalRuntimeQuestionSplit.test.ts | 33 +++++ ..._saved_session_runtime_job-elT8zDcc9I.json | 123 ++++++++++++++++++ 18 files changed, 618 insertions(+), 74 deletions(-) create mode 100644 llm_normalizer/backend/tests/addressTextRepair.test.ts create mode 100644 llm_normalizer/data/eval_cases/assistant_saved_session_runtime_job-elT8zDcc9I.json diff --git a/llm_normalizer/backend/dist/routes/autoRuns.js b/llm_normalizer/backend/dist/routes/autoRuns.js index 53377d4..d248270 100644 --- a/llm_normalizer/backend/dist/routes/autoRuns.js +++ b/llm_normalizer/backend/dist/routes/autoRuns.js @@ -102,7 +102,7 @@ function parseAutoGenTitle(value) { if (!title) { return null; } - return title.slice(0, 160); + return repairAutogenMojibake(title).slice(0, 160); } function parseManualCaseDecision(value, fallback = "needs_dialog_policy_fix") { const normalized = toStringSafe(value); @@ -1186,7 +1186,9 @@ function textMojibakeScore(value) { const hardMarkers = (source.match(/[Ѓѓ‚„…†‡€‰‹ЉЊЌЋЏ‘’“”•–—™љ›њќћџ]/g) ?? []).length; const pairMarkers = (source.match(/(?:Р.|С.|Ð.|Ñ.)/g) ?? []).length; const doubleEncodedMarkers = (source.match(/(?:Г[Ђ-џ]|В[Ђ-џ]|Ã.|Â.)/gu) ?? []).length; - return cyrillic + latin - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2; + const replacement = (source.match(/\uFFFD/g) ?? []).length; + const c1Controls = (source.match(/[\u0080-\u009f]/g) ?? []).length; + return cyrillic + latin - replacement * 8 - c1Controls * 5 - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2; } function looksLikeMojibake(value) { const source = String(value ?? ""); @@ -1201,16 +1203,36 @@ function looksLikeMojibake(value) { } return (source.match(/(?:Г[Ђ-џ]|В[Ђ-џ]|Ã.|Â.)/gu) ?? []).length >= 2; } +function encodeWin1251MojibakeBytes(value) { + const chunks = []; + for (const char of String(value ?? "")) { + const code = char.codePointAt(0) ?? 0; + if (code >= 0x80 && code <= 0x9f) { + chunks.push(Buffer.from([code])); + continue; + } + chunks.push(iconv_lite_1.default.encode(char, "win1251")); + } + return Buffer.concat(chunks); +} +function decodeUtf8FromWin1251Mojibake(value) { + return encodeWin1251MojibakeBytes(value).toString("utf8"); +} +function repairKnownReplacementDamagedAutogenText(value) { + return String(value ?? "") + .replace(/\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422[\uFFFD?]+\u0412\u0410/giu, "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410") + .replace(/\u041e\u0411\u0429[\uFFFD?]+\u0419/giu, "\u041e\u0411\u0429\u0418\u0419"); +} function repairAutogenMojibake(value) { - const source = String(value ?? ""); - if (!looksLikeMojibake(source)) { + const source = repairKnownReplacementDamagedAutogenText(String(value ?? "")); + if (!looksLikeMojibake(source) && !/[\u0080-\u009f\uFFFD]/.test(source)) { return source; } let candidate = source; for (let pass = 0; pass < 3; pass += 1) { let improved = false; try { - const fromWin1251 = iconv_lite_1.default.encode(candidate, "win1251").toString("utf8"); + const fromWin1251 = decodeUtf8FromWin1251Mojibake(candidate); if (textMojibakeScore(fromWin1251) > textMojibakeScore(candidate)) { candidate = fromWin1251; improved = true; @@ -1229,11 +1251,16 @@ function repairAutogenMojibake(value) { catch { // ignore } + const repairedKnownText = repairKnownReplacementDamagedAutogenText(candidate); + if (repairedKnownText !== candidate) { + candidate = repairedKnownText; + improved = true; + } if (!improved) { break; } } - return candidate; + return repairKnownReplacementDamagedAutogenText(candidate); } function sanitizeGeneratedQuestion(value) { return repairAutogenMojibake(String(value ?? "")) @@ -1432,7 +1459,8 @@ function extractQuestionsFromAutogenOutput(rawText) { } exports.__autoRunsQuestionTestUtils = { splitQuestionCandidates, - extractQuestionsFromAutogenOutput + extractQuestionsFromAutogenOutput, + repairAutogenMojibake }; async function generateQwenSeedQuestionsLive(input) { const seedExamples = collectCanonicalQuestions(40); diff --git a/llm_normalizer/backend/dist/routes/eval.js b/llm_normalizer/backend/dist/routes/eval.js index 195791b..f6750d2 100644 --- a/llm_normalizer/backend/dist/routes/eval.js +++ b/llm_normalizer/backend/dist/routes/eval.js @@ -11,6 +11,7 @@ const nanoid_1 = require("nanoid"); const express_1 = require("express"); const config_1 = require("../config"); const http_1 = require("../utils/http"); +const addressTextRepair_1 = require("../services/addressTextRepair"); const ASYNC_JOBS = new Map(); const MAX_ASYNC_JOBS = 80; function toRecord(value) { @@ -30,7 +31,7 @@ function toArray(value) { return Array.isArray(value) ? value : []; } function normalizeQuestionChunk(value) { - return String(value ?? "") + return (0, addressTextRepair_1.repairAddressMojibakeText)(String(value ?? "")) .replace(/\r/g, " ") .replace(/\t/g, " ") .replace(/\s+/g, " ") @@ -92,7 +93,7 @@ function normalizeRuntimeQuestionList(items) { return normalized.filter((item) => item.length > 0); } function splitQuestionCandidate(raw) { - const normalized = String(raw ?? "").replace(/\r/g, "\n").trim(); + const normalized = (0, addressTextRepair_1.repairAddressMojibakeText)(String(raw ?? "")).replace(/\r/g, "\n").trim(); if (!normalized) { return []; } @@ -455,7 +456,8 @@ function buildEvalRouter(services) { } const questions = normalizeRuntimeQuestions(body.questions); const scenarioQuestions = normalizeRuntimeQuestions(body.scenarioQuestions, { dedupe: false, splitCandidates: false }); - const scenarioTitle = toStringSafe(body.scenarioTitle); + const scenarioTitleRaw = toStringSafe(body.scenarioTitle); + const scenarioTitle = scenarioTitleRaw ? (0, addressTextRepair_1.repairAddressMojibakeText)(scenarioTitleRaw) : null; const jobId = `job-${(0, nanoid_1.nanoid)(10)}`; const runId = `assistant-stage1-${(0, nanoid_1.nanoid)(10)}`; const runtimeCaseSetFile = scenarioQuestions.length > 0 diff --git a/llm_normalizer/backend/dist/services/addressTextRepair.js b/llm_normalizer/backend/dist/services/addressTextRepair.js index c8ff57f..7737040 100644 --- a/llm_normalizer/backend/dist/services/addressTextRepair.js +++ b/llm_normalizer/backend/dist/services/addressTextRepair.js @@ -6,44 +6,68 @@ Object.defineProperty(exports, "__esModule", { value: true }); exports.repairAddressMojibakeText = repairAddressMojibakeText; exports.normalizeRussianComparableText = normalizeRussianComparableText; const iconv_lite_1 = __importDefault(require("iconv-lite")); +const MOJIBAKE_CONTINUATION_CLASS = "[\\u0080-\\u00bf\\u0401-\\u040f\\u0451-\\u045f\\u2018-\\u201e\\u2020-\\u2022\\u2013-\\u2014\\u2122\\u20ac]"; +const MOJIBAKE_PAIR_PATTERN = new RegExp(`(?:[\\u0420\\u0421]${MOJIBAKE_CONTINUATION_CLASS})`, "gu"); function compactWhitespace(value) { return value.replace(/\s+/g, " ").trim(); } +function countMatches(value, pattern) { + return (String(value ?? "").match(pattern) ?? []).length; +} function textMojibakeScore(value) { const source = String(value ?? ""); - const cyrillic = (source.match(/[\u0400-\u04ff]/g) ?? []).length; - const latin = (source.match(/[A-Za-z]/g) ?? []).length; - const replacement = (source.match(/[�]/g) ?? []).length; - const pairMarkers = (source.match(/(?:Р.|С.|Ð.|Ñ.)/g) ?? []).length; - const doubleEncodedMarkers = (source.match(/(?:Р“[Р-џ]|Р’[Р-џ]|Ã.|Â.)/gu) ?? []).length; - return cyrillic + latin - replacement * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2; + const cyrillic = countMatches(source, /[\u0400-\u04ff]/g); + const latin = countMatches(source, /[A-Za-z]/g); + const replacement = countMatches(source, /\uFFFD/g); + const c1Controls = countMatches(source, /[\u0080-\u009f]/g); + const pairMarkers = countMatches(source, MOJIBAKE_PAIR_PATTERN); + const doubleEncodedMarkers = countMatches(source, /(?:\u0420[\u00a0-\u00bf]\u0421|\u0413[\u0080-\u00bf]|\u00c3.|\u00c2.)/gu); + return cyrillic + latin - replacement * 8 - c1Controls * 5 - pairMarkers * 3 - doubleEncodedMarkers * 2; } function looksLikeAddressMojibake(value) { const source = String(value ?? ""); if (!source.trim()) { return false; } - if (/[�]/.test(source)) { + if (/[\u0080-\u009f\uFFFD]/.test(source)) { return true; } - if ((source.match(/(?:Р.|С.|Ð.|Ñ.)/g) ?? []).length >= 2) { + if (countMatches(source, MOJIBAKE_PAIR_PATTERN) >= 2) { return true; } - if ((source.match(/(?:Р“[Р-џ]|Р’[Р-џ]|Ã.|Â.)/gu) ?? []).length >= 2) { - return true; + return countMatches(source, /(?:\u0420[\u00a0-\u00bf]\u0421|\u0413[\u0080-\u00bf]|\u00c3.|\u00c2.)/gu) >= 2; +} +function encodeWin1251MojibakeBytes(value) { + const chunks = []; + for (const char of String(value ?? "")) { + const code = char.codePointAt(0) ?? 0; + if (code >= 0x80 && code <= 0x9f) { + chunks.push(Buffer.from([code])); + continue; + } + chunks.push(iconv_lite_1.default.encode(char, "win1251")); } - return false; + return Buffer.concat(chunks); +} +function decodeUtf8FromWin1251Mojibake(value) { + return encodeWin1251MojibakeBytes(value).toString("utf8"); +} +function repairKnownReplacementDamagedRussianText(value) { + return String(value ?? "") + .replace(/\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422[\uFFFD?]+\u0412\u0410/giu, "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410") + .replace(/\u041e\u0411\u0429[\uFFFD?]+\u0419/giu, "\u041e\u0411\u0429\u0418\u0419"); } function repairAddressMojibakeText(value) { const source = String(value ?? ""); - if (!looksLikeAddressMojibake(source)) { - return source; + const sourceWithKnownRepairs = repairKnownReplacementDamagedRussianText(source); + if (!looksLikeAddressMojibake(sourceWithKnownRepairs)) { + return sourceWithKnownRepairs; } - let candidate = source; + let candidate = sourceWithKnownRepairs; for (let pass = 0; pass < 3; pass += 1) { let improved = false; try { - const fromWin1251 = iconv_lite_1.default.encode(candidate, "win1251").toString("utf8"); + const fromWin1251 = decodeUtf8FromWin1251Mojibake(candidate); if (textMojibakeScore(fromWin1251) > textMojibakeScore(candidate)) { candidate = fromWin1251; improved = true; @@ -62,12 +86,17 @@ function repairAddressMojibakeText(value) { catch { // Ignore decode failures and keep the current candidate. } + const repairedKnownText = repairKnownReplacementDamagedRussianText(candidate); + if (repairedKnownText !== candidate) { + candidate = repairedKnownText; + improved = true; + } if (!improved) { break; } } - return candidate; + return repairKnownReplacementDamagedRussianText(candidate); } function normalizeRussianComparableText(value) { - return compactWhitespace(repairAddressMojibakeText(String(value ?? "")).toLowerCase()).replace(/ё/g, "е"); + return compactWhitespace(repairAddressMojibakeText(String(value ?? "")).toLowerCase()).replace(/\u0451/g, "\u0435"); } diff --git a/llm_normalizer/backend/dist/services/assistantLivingModePolicy.js b/llm_normalizer/backend/dist/services/assistantLivingModePolicy.js index b11b108..8365fd4 100644 --- a/llm_normalizer/backend/dist/services/assistantLivingModePolicy.js +++ b/llm_normalizer/backend/dist/services/assistantLivingModePolicy.js @@ -209,7 +209,10 @@ function createAssistantLivingModePolicy(deps) { if (hasAffectiveReactionCue) { return false; } - return normalized.length <= 36 && !/[?]/.test(String(userMessage ?? "")); + const rawQuestionProbe = String(userMessage ?? "") + .replace(/\uFFFD\?/g, "\uFFFD") + .replace(/пїЅ\?/giu, "пїЅ"); + return normalized.length <= 36 && !/[?]/.test(rawQuestionProbe); } function hasAssistantDataScopeMetaQuestionSignal(text) { const repaired = repairAddressMojibake(String(text ?? "")); diff --git a/llm_normalizer/backend/dist/services/assistantOrganizationMatcher.js b/llm_normalizer/backend/dist/services/assistantOrganizationMatcher.js index ad3bfc3..142ba07 100644 --- a/llm_normalizer/backend/dist/services/assistantOrganizationMatcher.js +++ b/llm_normalizer/backend/dist/services/assistantOrganizationMatcher.js @@ -182,18 +182,25 @@ function scoreOrganizationMentionInMessage(message, organization) { variantScore = Math.max(variantScore, variant.length * 5); continue; } + let fuzzyTokenScore = 0; const fuzzyMatched = messageTokens.some((messageToken) => { if (messageToken === variant) { + fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * 5); return true; } if (messageToken.length >= 5 && variant.length >= 5) { - return messageToken.startsWith(variant) || variant.startsWith(messageToken); + const prefixMatched = messageToken.startsWith(variant) || variant.startsWith(messageToken); + if (prefixMatched) { + const prefixLength = Math.min(messageToken.length, variant.length); + fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * (prefixLength >= 7 ? 5 : 3)); + } + return prefixMatched; } return false; }); if (fuzzyMatched) { matched = true; - variantScore = Math.max(variantScore, Math.max(20, variant.length * 3)); + variantScore = Math.max(variantScore, Math.max(20, fuzzyTokenScore || variant.length * 3)); } } if (matched) { diff --git a/llm_normalizer/backend/dist/services/assistantService.js b/llm_normalizer/backend/dist/services/assistantService.js index f803445..0736573 100644 --- a/llm_normalizer/backend/dist/services/assistantService.js +++ b/llm_normalizer/backend/dist/services/assistantService.js @@ -1944,7 +1944,9 @@ function textMojibakeScoreForAddress(value) { const hardMarkers = (source.match(/[Ѓѓ‚„…†‡€‰‹ЉЊЌЋЏ\uFFFD?’“”•–—™љ›њќћџ]/g) ?? []).length; const pairMarkers = (source.match(/(?:Р.|С.|Ð.|Ñ.)/g) ?? []).length; const doubleEncodedMarkers = (source.match(/(?:Г[Ђ-џ]|В[Ђ-џ]|Ã.|Â.)/gu) ?? []).length; - return cyrillic + latin - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2; + const replacement = (source.match(/\uFFFD/g) ?? []).length; + const c1Controls = (source.match(/[\u0080-\u009f]/g) ?? []).length; + return cyrillic + latin - replacement * 8 - c1Controls * 5 - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2; } function looksLikeMojibakeForAddress(value) { const source = String(value ?? ""); @@ -1962,16 +1964,36 @@ function looksLikeMojibakeForAddress(value) { } return false; } +function encodeWin1251MojibakeBytesForAddress(value) { + const chunks = []; + for (const char of String(value ?? "")) { + const code = char.codePointAt(0) ?? 0; + if (code >= 0x80 && code <= 0x9f) { + chunks.push(Buffer.from([code])); + continue; + } + chunks.push(iconv_lite_1.default.encode(char, "win1251")); + } + return Buffer.concat(chunks); +} +function decodeUtf8FromWin1251MojibakeForAddress(value) { + return encodeWin1251MojibakeBytesForAddress(value).toString("utf8"); +} +function repairKnownReplacementDamagedAddressText(value) { + return String(value ?? "") + .replace(/\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422[\uFFFD?]+\u0412\u0410/giu, "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410") + .replace(/\u041e\u0411\u0429[\uFFFD?]+\u0419/giu, "\u041e\u0411\u0429\u0418\u0419"); +} function repairAddressMojibake(value) { - const source = String(value ?? ""); - if (!looksLikeMojibakeForAddress(source)) { + const source = repairKnownReplacementDamagedAddressText(String(value ?? "")); + if (!looksLikeMojibakeForAddress(source) && !/[\u0080-\u009f\uFFFD]/.test(source)) { return source; } let candidate = source; for (let pass = 0; pass < 3; pass += 1) { let improved = false; try { - const fromWin1251 = iconv_lite_1.default.encode(candidate, "win1251").toString("utf8"); + const fromWin1251 = decodeUtf8FromWin1251MojibakeForAddress(candidate); if (textMojibakeScoreForAddress(fromWin1251) > textMojibakeScoreForAddress(candidate)) { candidate = fromWin1251; improved = true; @@ -1986,11 +2008,16 @@ function repairAddressMojibake(value) { } } catch (_error) { } + const repairedKnownText = repairKnownReplacementDamagedAddressText(candidate); + if (repairedKnownText !== candidate) { + candidate = repairedKnownText; + improved = true; + } if (!improved) { break; } } - return candidate; + return repairKnownReplacementDamagedAddressText(candidate); } function sanitizeOutgoingAssistantText(value, fallback = "Не смог сформировать читаемый ответ. Уточните запрос.") { const repaired = repairAddressMojibake(String(value ?? "")); @@ -4311,18 +4338,25 @@ function scoreOrganizationMentionInMessage(message, organization) { variantScore = Math.max(variantScore, variant.length * 5); continue; } + let fuzzyTokenScore = 0; const fuzzyMatched = messageTokens.some((messageToken) => { if (messageToken === variant) { + fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * 5); return true; } if (messageToken.length >= 5 && variant.length >= 5) { - return messageToken.startsWith(variant) || variant.startsWith(messageToken); + const prefixMatched = messageToken.startsWith(variant) || variant.startsWith(messageToken); + if (prefixMatched) { + const prefixLength = Math.min(messageToken.length, variant.length); + fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * (prefixLength >= 7 ? 5 : 3)); + } + return prefixMatched; } return false; }); if (fuzzyMatched) { matched = true; - variantScore = Math.max(variantScore, Math.max(20, variant.length * 3)); + variantScore = Math.max(variantScore, Math.max(20, fuzzyTokenScore || variant.length * 3)); } } if (matched) { diff --git a/llm_normalizer/backend/src/routes/autoRuns.ts b/llm_normalizer/backend/src/routes/autoRuns.ts index abd1cfd..aeb71bd 100644 --- a/llm_normalizer/backend/src/routes/autoRuns.ts +++ b/llm_normalizer/backend/src/routes/autoRuns.ts @@ -321,7 +321,7 @@ function parseAutoGenTitle(value: unknown): string | null { if (!title) { return null; } - return title.slice(0, 160); + return repairAutogenMojibake(title).slice(0, 160); } function parseManualCaseDecision(value: unknown, fallback: ManualCaseDecision = "needs_dialog_policy_fix"): ManualCaseDecision { @@ -1504,7 +1504,9 @@ function textMojibakeScore(value: string): number { const hardMarkers = (source.match(/[Ѓѓ‚„…†‡€‰‹ЉЊЌЋЏ‘’“”•–—™љ›њќћџ]/g) ?? []).length; const pairMarkers = (source.match(/(?:Р.|С.|Ð.|Ñ.)/g) ?? []).length; const doubleEncodedMarkers = (source.match(/(?:Г[Ђ-џ]|В[Ђ-џ]|Ã.|Â.)/gu) ?? []).length; - return cyrillic + latin - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2; + const replacement = (source.match(/\uFFFD/g) ?? []).length; + const c1Controls = (source.match(/[\u0080-\u009f]/g) ?? []).length; + return cyrillic + latin - replacement * 8 - c1Controls * 5 - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2; } function looksLikeMojibake(value: string): boolean { @@ -1521,16 +1523,39 @@ function looksLikeMojibake(value: string): boolean { return (source.match(/(?:Г[Ђ-џ]|В[Ђ-џ]|Ã.|Â.)/gu) ?? []).length >= 2; } +function encodeWin1251MojibakeBytes(value: string): Buffer { + const chunks: Buffer[] = []; + for (const char of String(value ?? "")) { + const code = char.codePointAt(0) ?? 0; + if (code >= 0x80 && code <= 0x9f) { + chunks.push(Buffer.from([code])); + continue; + } + chunks.push(iconv.encode(char, "win1251")); + } + return Buffer.concat(chunks); +} + +function decodeUtf8FromWin1251Mojibake(value: string): string { + return encodeWin1251MojibakeBytes(value).toString("utf8"); +} + +function repairKnownReplacementDamagedAutogenText(value: string): string { + return String(value ?? "") + .replace(/\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422[\uFFFD?]+\u0412\u0410/giu, "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410") + .replace(/\u041e\u0411\u0429[\uFFFD?]+\u0419/giu, "\u041e\u0411\u0429\u0418\u0419"); +} + function repairAutogenMojibake(value: string): string { - const source = String(value ?? ""); - if (!looksLikeMojibake(source)) { + const source = repairKnownReplacementDamagedAutogenText(String(value ?? "")); + if (!looksLikeMojibake(source) && !/[\u0080-\u009f\uFFFD]/.test(source)) { return source; } let candidate = source; for (let pass = 0; pass < 3; pass += 1) { let improved = false; try { - const fromWin1251 = iconv.encode(candidate, "win1251").toString("utf8"); + const fromWin1251 = decodeUtf8FromWin1251Mojibake(candidate); if (textMojibakeScore(fromWin1251) > textMojibakeScore(candidate)) { candidate = fromWin1251; improved = true; @@ -1547,11 +1572,16 @@ function repairAutogenMojibake(value: string): string { } catch { // ignore } + const repairedKnownText = repairKnownReplacementDamagedAutogenText(candidate); + if (repairedKnownText !== candidate) { + candidate = repairedKnownText; + improved = true; + } if (!improved) { break; } } - return candidate; + return repairKnownReplacementDamagedAutogenText(candidate); } function sanitizeGeneratedQuestion(value: string): string { @@ -1775,7 +1805,8 @@ function extractQuestionsFromAutogenOutput(rawText: string): string[] { export const __autoRunsQuestionTestUtils = { splitQuestionCandidates, - extractQuestionsFromAutogenOutput + extractQuestionsFromAutogenOutput, + repairAutogenMojibake }; async function generateQwenSeedQuestionsLive(input: { diff --git a/llm_normalizer/backend/src/routes/eval.ts b/llm_normalizer/backend/src/routes/eval.ts index cd8fe51..7935004 100644 --- a/llm_normalizer/backend/src/routes/eval.ts +++ b/llm_normalizer/backend/src/routes/eval.ts @@ -7,6 +7,7 @@ import type { AppServices } from "../serverContext"; import { ApiError, ok } from "../utils/http"; import type { EvalRunMode, NormalizeRequestPayload } from "../types/normalizer"; import type { EvalTarget } from "../types/assistantEval"; +import { repairAddressMojibakeText } from "../services/addressTextRepair"; type EvalAsyncStatus = "queued" | "running" | "completed" | "failed" | "canceled"; @@ -67,7 +68,7 @@ function toArray(value: unknown): unknown[] { } function normalizeQuestionChunk(value: string): string { - return String(value ?? "") + return repairAddressMojibakeText(String(value ?? "")) .replace(/\r/g, " ") .replace(/\t/g, " ") .replace(/\s+/g, " ") @@ -136,7 +137,7 @@ function normalizeRuntimeQuestionList(items: string[]): string[] { } function splitQuestionCandidate(raw: string): string[] { - const normalized = String(raw ?? "").replace(/\r/g, "\n").trim(); + const normalized = repairAddressMojibakeText(String(raw ?? "")).replace(/\r/g, "\n").trim(); if (!normalized) { return []; } @@ -533,7 +534,8 @@ export function buildEvalRouter(services: AppServices): Router { } const questions = normalizeRuntimeQuestions(body.questions); const scenarioQuestions = normalizeRuntimeQuestions(body.scenarioQuestions, { dedupe: false, splitCandidates: false }); - const scenarioTitle = toStringSafe(body.scenarioTitle); + const scenarioTitleRaw = toStringSafe(body.scenarioTitle); + const scenarioTitle = scenarioTitleRaw ? repairAddressMojibakeText(scenarioTitleRaw) : null; const jobId = `job-${nanoid(10)}`; const runId = `assistant-stage1-${nanoid(10)}`; diff --git a/llm_normalizer/backend/src/services/addressTextRepair.ts b/llm_normalizer/backend/src/services/addressTextRepair.ts index 4cc3bb0..d589b98 100644 --- a/llm_normalizer/backend/src/services/addressTextRepair.ts +++ b/llm_normalizer/backend/src/services/addressTextRepair.ts @@ -1,17 +1,26 @@ import iconv from "iconv-lite"; +const MOJIBAKE_CONTINUATION_CLASS = + "[\\u0080-\\u00bf\\u0401-\\u040f\\u0451-\\u045f\\u2018-\\u201e\\u2020-\\u2022\\u2013-\\u2014\\u2122\\u20ac]"; +const MOJIBAKE_PAIR_PATTERN = new RegExp(`(?:[\\u0420\\u0421]${MOJIBAKE_CONTINUATION_CLASS})`, "gu"); + function compactWhitespace(value: string): string { return value.replace(/\s+/g, " ").trim(); } +function countMatches(value: string, pattern: RegExp): number { + return (String(value ?? "").match(pattern) ?? []).length; +} + function textMojibakeScore(value: string): number { const source = String(value ?? ""); - const cyrillic = (source.match(/[\u0400-\u04ff]/g) ?? []).length; - const latin = (source.match(/[A-Za-z]/g) ?? []).length; - const replacement = (source.match(/[�]/g) ?? []).length; - const pairMarkers = (source.match(/(?:Р.|С.|Ð.|Ñ.)/g) ?? []).length; - const doubleEncodedMarkers = (source.match(/(?:Р“[Р-џ]|Р’[Р-џ]|Ã.|Â.)/gu) ?? []).length; - return cyrillic + latin - replacement * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2; + const cyrillic = countMatches(source, /[\u0400-\u04ff]/g); + const latin = countMatches(source, /[A-Za-z]/g); + const replacement = countMatches(source, /\uFFFD/g); + const c1Controls = countMatches(source, /[\u0080-\u009f]/g); + const pairMarkers = countMatches(source, MOJIBAKE_PAIR_PATTERN); + const doubleEncodedMarkers = countMatches(source, /(?:\u0420[\u00a0-\u00bf]\u0421|\u0413[\u0080-\u00bf]|\u00c3.|\u00c2.)/gu); + return cyrillic + latin - replacement * 8 - c1Controls * 5 - pairMarkers * 3 - doubleEncodedMarkers * 2; } function looksLikeAddressMojibake(value: string): boolean { @@ -19,30 +28,51 @@ function looksLikeAddressMojibake(value: string): boolean { if (!source.trim()) { return false; } - if (/[�]/.test(source)) { + if (/[\u0080-\u009f\uFFFD]/.test(source)) { return true; } - if ((source.match(/(?:Р.|С.|Ð.|Ñ.)/g) ?? []).length >= 2) { + if (countMatches(source, MOJIBAKE_PAIR_PATTERN) >= 2) { return true; } - if ((source.match(/(?:Р“[Р-џ]|Р’[Р-џ]|Ã.|Â.)/gu) ?? []).length >= 2) { - return true; + return countMatches(source, /(?:\u0420[\u00a0-\u00bf]\u0421|\u0413[\u0080-\u00bf]|\u00c3.|\u00c2.)/gu) >= 2; +} + +function encodeWin1251MojibakeBytes(value: string): Buffer { + const chunks: Buffer[] = []; + for (const char of String(value ?? "")) { + const code = char.codePointAt(0) ?? 0; + if (code >= 0x80 && code <= 0x9f) { + chunks.push(Buffer.from([code])); + continue; + } + chunks.push(iconv.encode(char, "win1251")); } - return false; + return Buffer.concat(chunks); +} + +function decodeUtf8FromWin1251Mojibake(value: string): string { + return encodeWin1251MojibakeBytes(value).toString("utf8"); +} + +function repairKnownReplacementDamagedRussianText(value: string): string { + return String(value ?? "") + .replace(/\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422[\uFFFD?]+\u0412\u0410/giu, "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410") + .replace(/\u041e\u0411\u0429[\uFFFD?]+\u0419/giu, "\u041e\u0411\u0429\u0418\u0419"); } export function repairAddressMojibakeText(value: string): string { const source = String(value ?? ""); - if (!looksLikeAddressMojibake(source)) { - return source; + const sourceWithKnownRepairs = repairKnownReplacementDamagedRussianText(source); + if (!looksLikeAddressMojibake(sourceWithKnownRepairs)) { + return sourceWithKnownRepairs; } - let candidate = source; + let candidate = sourceWithKnownRepairs; for (let pass = 0; pass < 3; pass += 1) { let improved = false; try { - const fromWin1251 = iconv.encode(candidate, "win1251").toString("utf8"); + const fromWin1251 = decodeUtf8FromWin1251Mojibake(candidate); if (textMojibakeScore(fromWin1251) > textMojibakeScore(candidate)) { candidate = fromWin1251; improved = true; @@ -61,14 +91,20 @@ export function repairAddressMojibakeText(value: string): string { // Ignore decode failures and keep the current candidate. } + const repairedKnownText = repairKnownReplacementDamagedRussianText(candidate); + if (repairedKnownText !== candidate) { + candidate = repairedKnownText; + improved = true; + } + if (!improved) { break; } } - return candidate; + return repairKnownReplacementDamagedRussianText(candidate); } export function normalizeRussianComparableText(value: unknown): string { - return compactWhitespace(repairAddressMojibakeText(String(value ?? "")).toLowerCase()).replace(/ё/g, "е"); + return compactWhitespace(repairAddressMojibakeText(String(value ?? "")).toLowerCase()).replace(/\u0451/g, "\u0435"); } diff --git a/llm_normalizer/backend/src/services/assistantLivingModePolicy.ts b/llm_normalizer/backend/src/services/assistantLivingModePolicy.ts index c9ce125..3e2b4bb 100644 --- a/llm_normalizer/backend/src/services/assistantLivingModePolicy.ts +++ b/llm_normalizer/backend/src/services/assistantLivingModePolicy.ts @@ -279,7 +279,10 @@ export function createAssistantLivingModePolicy(deps: AssistantLivingModePolicyD if (hasAffectiveReactionCue) { return false; } - return normalized.length <= 36 && !/[?]/.test(String(userMessage ?? "")); + const rawQuestionProbe = String(userMessage ?? "") + .replace(/\uFFFD\?/g, "\uFFFD") + .replace(/пїЅ\?/giu, "пїЅ"); + return normalized.length <= 36 && !/[?]/.test(rawQuestionProbe); } function hasAssistantDataScopeMetaQuestionSignal(text) { diff --git a/llm_normalizer/backend/src/services/assistantOrganizationMatcher.ts b/llm_normalizer/backend/src/services/assistantOrganizationMatcher.ts index 3dfcad8..1b7d4c6 100644 --- a/llm_normalizer/backend/src/services/assistantOrganizationMatcher.ts +++ b/llm_normalizer/backend/src/services/assistantOrganizationMatcher.ts @@ -189,18 +189,25 @@ export function scoreOrganizationMentionInMessage(message: unknown, organization variantScore = Math.max(variantScore, variant.length * 5); continue; } + let fuzzyTokenScore = 0; const fuzzyMatched = messageTokens.some((messageToken) => { if (messageToken === variant) { + fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * 5); return true; } if (messageToken.length >= 5 && variant.length >= 5) { - return messageToken.startsWith(variant) || variant.startsWith(messageToken); + const prefixMatched = messageToken.startsWith(variant) || variant.startsWith(messageToken); + if (prefixMatched) { + const prefixLength = Math.min(messageToken.length, variant.length); + fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * (prefixLength >= 7 ? 5 : 3)); + } + return prefixMatched; } return false; }); if (fuzzyMatched) { matched = true; - variantScore = Math.max(variantScore, Math.max(20, variant.length * 3)); + variantScore = Math.max(variantScore, Math.max(20, fuzzyTokenScore || variant.length * 3)); } } if (matched) { diff --git a/llm_normalizer/backend/src/services/assistantService.ts b/llm_normalizer/backend/src/services/assistantService.ts index c0a67fe..9601a5e 100644 --- a/llm_normalizer/backend/src/services/assistantService.ts +++ b/llm_normalizer/backend/src/services/assistantService.ts @@ -1900,7 +1900,9 @@ function textMojibakeScoreForAddress(value) { const hardMarkers = (source.match(/[Ѓѓ‚„…†‡€‰‹ЉЊЌЋЏ\uFFFD?’“”•–—™љ›њќћџ]/g) ?? []).length; const pairMarkers = (source.match(/(?:Р.|С.|Ð.|Ñ.)/g) ?? []).length; const doubleEncodedMarkers = (source.match(/(?:Г[Ђ-џ]|В[Ђ-џ]|Ã.|Â.)/gu) ?? []).length; - return cyrillic + latin - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2; + const replacement = (source.match(/\uFFFD/g) ?? []).length; + const c1Controls = (source.match(/[\u0080-\u009f]/g) ?? []).length; + return cyrillic + latin - replacement * 8 - c1Controls * 5 - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2; } function looksLikeMojibakeForAddress(value) { const source = String(value ?? ""); @@ -1918,16 +1920,36 @@ function looksLikeMojibakeForAddress(value) { } return false; } +function encodeWin1251MojibakeBytesForAddress(value) { + const chunks = []; + for (const char of String(value ?? "")) { + const code = char.codePointAt(0) ?? 0; + if (code >= 0x80 && code <= 0x9f) { + chunks.push(Buffer.from([code])); + continue; + } + chunks.push(iconv.encode(char, "win1251")); + } + return Buffer.concat(chunks); +} +function decodeUtf8FromWin1251MojibakeForAddress(value) { + return encodeWin1251MojibakeBytesForAddress(value).toString("utf8"); +} +function repairKnownReplacementDamagedAddressText(value) { + return String(value ?? "") + .replace(/\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422[\uFFFD?]+\u0412\u0410/giu, "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410") + .replace(/\u041e\u0411\u0429[\uFFFD?]+\u0419/giu, "\u041e\u0411\u0429\u0418\u0419"); +} function repairAddressMojibake(value) { - const source = String(value ?? ""); - if (!looksLikeMojibakeForAddress(source)) { + const source = repairKnownReplacementDamagedAddressText(String(value ?? "")); + if (!looksLikeMojibakeForAddress(source) && !/[\u0080-\u009f\uFFFD]/.test(source)) { return source; } let candidate = source; for (let pass = 0; pass < 3; pass += 1) { let improved = false; try { - const fromWin1251 = iconv.encode(candidate, "win1251").toString("utf8"); + const fromWin1251 = decodeUtf8FromWin1251MojibakeForAddress(candidate); if (textMojibakeScoreForAddress(fromWin1251) > textMojibakeScoreForAddress(candidate)) { candidate = fromWin1251; improved = true; @@ -1942,11 +1964,16 @@ function repairAddressMojibake(value) { } } catch (_error) { } + const repairedKnownText = repairKnownReplacementDamagedAddressText(candidate); + if (repairedKnownText !== candidate) { + candidate = repairedKnownText; + improved = true; + } if (!improved) { break; } } - return candidate; + return repairKnownReplacementDamagedAddressText(candidate); } function sanitizeOutgoingAssistantText(value, fallback = "Не смог сформировать читаемый ответ. Уточните запрос.") { const repaired = repairAddressMojibake(String(value ?? "")); @@ -4268,18 +4295,25 @@ function scoreOrganizationMentionInMessage(message, organization) { variantScore = Math.max(variantScore, variant.length * 5); continue; } + let fuzzyTokenScore = 0; const fuzzyMatched = messageTokens.some((messageToken) => { if (messageToken === variant) { + fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * 5); return true; } if (messageToken.length >= 5 && variant.length >= 5) { - return messageToken.startsWith(variant) || variant.startsWith(messageToken); + const prefixMatched = messageToken.startsWith(variant) || variant.startsWith(messageToken); + if (prefixMatched) { + const prefixLength = Math.min(messageToken.length, variant.length); + fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * (prefixLength >= 7 ? 5 : 3)); + } + return prefixMatched; } return false; }); if (fuzzyMatched) { matched = true; - variantScore = Math.max(variantScore, Math.max(20, variant.length * 3)); + variantScore = Math.max(variantScore, Math.max(20, fuzzyTokenScore || variant.length * 3)); } } if (matched) { diff --git a/llm_normalizer/backend/tests/addressTextRepair.test.ts b/llm_normalizer/backend/tests/addressTextRepair.test.ts new file mode 100644 index 0000000..1e673df --- /dev/null +++ b/llm_normalizer/backend/tests/addressTextRepair.test.ts @@ -0,0 +1,35 @@ +import { describe, expect, it } from "vitest"; +import { repairAddressMojibakeText } from "../src/services/addressTextRepair"; + +describe("address text mojibake repair", () => { + it("preserves C1 bytes when repairing old autorun Russian text", () => { + const damagedTitle = String.fromCharCode( + 0x420, 0x2018, 0x420, 0x45b, 0x420, 0x203a, 0x420, 0xac, 0x420, 0x401, 0x420, 0x45b, 0x420, + 0x2122, 0x20, 0x420, 0x45b, 0x420, 0x2018, 0x420, 0xa9, 0x420, 0x98, 0x420, 0x2122, 0x20, + 0x420, 0xa0, 0x421, 0x453, 0x421, 0x2021, 0x420, 0x405, 0x420, 0xb0, 0x421, 0x40f, 0x20, + 0x421, 0x403, 0x420, 0xb5, 0x421, 0x403, 0x421, 0x403, 0x420, 0x451, 0x421, 0x40f, 0x20, + 0x31, 0x36, 0x2e, 0x30, 0x34, 0x2e, 0x32, 0x30, 0x32, 0x36, 0x2c, 0x20, 0x32, 0x31, + 0x3a, 0x32, 0x36, 0x3a, 0x30, 0x36 + ); + const damagedAlternative = String.fromCharCode( + 0x420, 0x452, 0x420, 0x203a, 0x420, 0xac, 0x420, 0x45e, 0x420, 0x2022, 0x420, 0xa0, 0x420, + 0x45c, 0x420, 0x452, 0x420, 0x45e, 0x420, 0x98, 0x420, 0x2019, 0x420, 0x452 + ); + + expect(repairAddressMojibakeText(damagedTitle)).toBe( + "\u0411\u041e\u041b\u042c\u0428\u041e\u0419 \u041e\u0411\u0429\u0418\u0419 \u0420\u0443\u0447\u043d\u0430\u044f \u0441\u0435\u0441\u0441\u0438\u044f 16.04.2026, 21:26:06" + ); + expect(repairAddressMojibakeText(damagedAlternative)).toBe( + "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410" + ); + }); + + it("repairs already lossy known replacement fragments", () => { + expect(repairAddressMojibakeText("\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\uFFFD?\u0412\u0410")).toBe( + "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410" + ); + expect(repairAddressMojibakeText("\u0411\u041e\u041b\u042c\u0428\u041e\u0419 \u041e\u0411\u0429\uFFFD\u0419")).toBe( + "\u0411\u041e\u041b\u042c\u0428\u041e\u0419 \u041e\u0411\u0429\u0418\u0419" + ); + }); +}); diff --git a/llm_normalizer/backend/tests/assistantAddressFollowupContext.test.ts b/llm_normalizer/backend/tests/assistantAddressFollowupContext.test.ts index ab3c519..599eea1 100644 --- a/llm_normalizer/backend/tests/assistantAddressFollowupContext.test.ts +++ b/llm_normalizer/backend/tests/assistantAddressFollowupContext.test.ts @@ -2276,6 +2276,101 @@ describe("assistant address follow-up carryover", () => { expect(normalizerService.normalize).not.toHaveBeenCalled(); }); + it("continues the original inventory query after replacement-damaged organization clarification", async () => { + const calls: Array<{ message: string; options?: any }> = []; + const firstMessage = "кайф - что там на складе по остаткам?"; + const secondMessage = "АЛЬТЕРНАТ\uFFFD?ВА"; + const repairedSecondMessage = "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410"; + const addressQueryService = { + tryHandle: vi.fn(async (message: string, options?: any) => { + calls.push({ message, options }); + if (message === firstMessage) { + return buildAddressLimitedLaneResult("missing_anchor", { + reply_text: [ + "Нужно уточнить организацию, чтобы не смешивать компании в одном ответе.", + "Сейчас в доступном контуре вижу такие организации:", + "- ООО Альтернатива Плюс", + "- ООО Лайсвуд", + "- РАЙМ" + ].join("\n"), + debug: { + ...buildAddressLimitedLaneResult("missing_anchor").debug, + detected_intent: "inventory_on_hand_as_of_date", + extracted_filters: { + as_of_date: "2026-04-15" + }, + selected_recipe: null, + organization_candidates: ["ООО Альтернатива Плюс", "ООО Лайсвуд", "РАЙМ"], + reasons: ["organization_clarification_required", "multiple_known_organizations_detected"] + } + }); + } + if ( + (message === secondMessage || message === repairedSecondMessage) && + options?.followupContext && + options?.activeOrganization === "ООО Альтернатива Плюс" + ) { + return buildAddressLaneResult({ + reply_text: "На 15.04.2026 по ООО Альтернатива Плюс подтвержден складской остаток по всем складам.", + debug: { + ...buildAddressLaneResult().debug, + detected_intent: "inventory_on_hand_as_of_date", + extracted_filters: { + as_of_date: "2026-04-15", + organization: "ООО Альтернатива Плюс" + }, + reasons: ["address_followup_context_applied", "organization_grounded_from_scope_candidates"] + } + }); + } + return null; + }) + } as any; + + const normalizerService = { + normalize: vi.fn(async () => ({ + assistant_reply: "normalizer_fallback_should_not_be_used", + reply_type: "partial_coverage", + debug: {} + })) + } as any; + + const sessions = new AssistantSessionStore(); + const service = new AssistantService( + normalizerService, + sessions as any, + {} as any, + { persistSession: vi.fn() } as any, + addressQueryService + ); + + const sessionId = `asst-address-org-clarification-damaged-${Date.now()}`; + const first = await service.handleMessage({ + session_id: sessionId, + user_message: firstMessage, + useMock: true + } as any); + expect(first.ok).toBe(true); + expect(first.reply_type).toBe("partial_coverage"); + + const second = await service.handleMessage({ + session_id: sessionId, + user_message: secondMessage, + useMock: true + } as any); + + expect(second.ok).toBe(true); + expect(second.reply_type).toBe("factual"); + expect(calls).toHaveLength(2); + expect(calls[1].message).toBe(repairedSecondMessage); + expect(calls[1].options?.activeOrganization).toBe("ООО Альтернатива Плюс"); + expect(calls[1].options?.knownOrganizations).toEqual(["ООО Альтернатива Плюс", "ООО Лайсвуд", "РАЙМ"]); + expect(calls[1].options?.followupContext?.previous_intent).toBe("inventory_on_hand_as_of_date"); + expect(calls[1].options?.followupContext?.previous_filters?.organization).toBe("ООО Альтернатива Плюс"); + expect(calls[1].options?.followupContext?.root_filters?.organization).toBe("ООО Альтернатива Плюс"); + expect(normalizerService.normalize).not.toHaveBeenCalled(); + }); + it("keeps historical inventory date follow-up alive after company clarification and a capability answer", async () => { const calls: Array<{ message: string; options?: any }> = []; const firstMessage = "покажи остатки по складу"; diff --git a/llm_normalizer/backend/tests/assistantOrganizationMatcher.test.ts b/llm_normalizer/backend/tests/assistantOrganizationMatcher.test.ts index 9138278..79987ac 100644 --- a/llm_normalizer/backend/tests/assistantOrganizationMatcher.test.ts +++ b/llm_normalizer/backend/tests/assistantOrganizationMatcher.test.ts @@ -42,6 +42,26 @@ describe("assistant organization matcher", () => { expect(score).toBeGreaterThanOrEqual(90); }); + it("matches replacement-damaged organization clarification when the live candidate is unique", () => { + const resolved = resolveOrganizationSelectionFromMessage("АЛЬТЕРНАТ\uFFFD?ВА", [ + "ООО Альтернатива Плюс", + "ООО Лайсвуд", + "РАЙМ" + ]); + + expect(resolved).toBe("ООО Альтернатива Плюс"); + }); + + it("keeps replacement-damaged organization clarification ambiguous when candidates share the token", () => { + const resolved = resolveOrganizationSelectionFromMessage("АЛЬТЕРНАТ\uFFFD?ВА", [ + "ООО Альтернатива Плюс", + "ООО Альтернатива Минус", + "ООО Лайсвуд" + ]); + + expect(resolved).toBeNull(); + }); + it("treats minor live label corruption as the same organization entity", () => { expect(organizationsLikelySameEntity("Альтернатива Плюс", 'ООО "Альтернати"а Плюс"')).toBe(true); }); diff --git a/llm_normalizer/backend/tests/autoRunsQuestionSplit.test.ts b/llm_normalizer/backend/tests/autoRunsQuestionSplit.test.ts index bb8ccc3..f4ea5b0 100644 --- a/llm_normalizer/backend/tests/autoRunsQuestionSplit.test.ts +++ b/llm_normalizer/backend/tests/autoRunsQuestionSplit.test.ts @@ -31,4 +31,26 @@ describe("autoruns question extraction", () => { expect(parsed[0]).toMatch(/поставщик/i); expect(parsed[0]).toMatch(/коротко/i); }); + + it("repairs old autorun C1-control mojibake before exposing cards and questions", () => { + const damagedTitle = String.fromCharCode( + 0x420, 0x2018, 0x420, 0x45b, 0x420, 0x203a, 0x420, 0xac, 0x420, 0x401, 0x420, 0x45b, 0x420, + 0x2122, 0x20, 0x420, 0x45b, 0x420, 0x2018, 0x420, 0xa9, 0x420, 0x98, 0x420, 0x2122, 0x20, + 0x420, 0xa0, 0x421, 0x453, 0x421, 0x2021, 0x420, 0x405, 0x420, 0xb0, 0x421, 0x40f, 0x20, + 0x421, 0x403, 0x420, 0xb5, 0x421, 0x403, 0x421, 0x403, 0x420, 0x451, 0x421, 0x40f, 0x20, + 0x31, 0x36, 0x2e, 0x30, 0x34, 0x2e, 0x32, 0x30, 0x32, 0x36, 0x2c, 0x20, 0x32, 0x31, + 0x3a, 0x32, 0x36, 0x3a, 0x30, 0x36 + ); + const damagedAlternative = String.fromCharCode( + 0x420, 0x452, 0x420, 0x203a, 0x420, 0xac, 0x420, 0x45e, 0x420, 0x2022, 0x420, 0xa0, 0x420, + 0x45c, 0x420, 0x452, 0x420, 0x45e, 0x420, 0x98, 0x420, 0x2019, 0x420, 0x452 + ); + + expect(__autoRunsQuestionTestUtils.repairAutogenMojibake(damagedTitle)).toBe( + "\u0411\u041e\u041b\u042c\u0428\u041e\u0419 \u041e\u0411\u0429\u0418\u0419 \u0420\u0443\u0447\u043d\u0430\u044f \u0441\u0435\u0441\u0441\u0438\u044f 16.04.2026, 21:26:06" + ); + expect(__autoRunsQuestionTestUtils.repairAutogenMojibake(damagedAlternative)).toBe( + "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410" + ); + }); }); diff --git a/llm_normalizer/backend/tests/evalRuntimeQuestionSplit.test.ts b/llm_normalizer/backend/tests/evalRuntimeQuestionSplit.test.ts index 0474b00..347d51d 100644 --- a/llm_normalizer/backend/tests/evalRuntimeQuestionSplit.test.ts +++ b/llm_normalizer/backend/tests/evalRuntimeQuestionSplit.test.ts @@ -33,4 +33,37 @@ describe("eval runtime question splitting", () => { expect(parsed[0]).toMatch(/поставщик/i); expect(parsed[0]).toMatch(/коротко/i); }); + + it("repairs mojibake questions before runtime job materialization", () => { + const parsed = __evalRouteTestUtils.normalizeRuntimeQuestions([ + "кайф - что там РЅР° складе РїРѕ остаткам?" + ]); + + expect(parsed).toEqual(["кайф - что там на складе по остаткам?"]); + }); + + it("repairs damaged clarification as one scenario turn when splitting is disabled", () => { + const parsed = __evalRouteTestUtils.normalizeRuntimeQuestions( + ["\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\uFFFD?\u0412\u0410"], + { + dedupe: false, + splitCandidates: false + } + ); + + expect(parsed).toEqual(["\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410"]); + }); + + it("repairs C1-control autorun clarification before runtime job materialization", () => { + const damagedAlternative = String.fromCharCode( + 0x420, 0x452, 0x420, 0x203a, 0x420, 0xac, 0x420, 0x45e, 0x420, 0x2022, 0x420, 0xa0, 0x420, + 0x45c, 0x420, 0x452, 0x420, 0x45e, 0x420, 0x98, 0x420, 0x2019, 0x420, 0x452 + ); + const parsed = __evalRouteTestUtils.normalizeRuntimeQuestions([damagedAlternative], { + dedupe: false, + splitCandidates: false + }); + + expect(parsed).toEqual(["\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410"]); + }); }); diff --git a/llm_normalizer/data/eval_cases/assistant_saved_session_runtime_job-elT8zDcc9I.json b/llm_normalizer/data/eval_cases/assistant_saved_session_runtime_job-elT8zDcc9I.json new file mode 100644 index 0000000..3b9458a --- /dev/null +++ b/llm_normalizer/data/eval_cases/assistant_saved_session_runtime_job-elT8zDcc9I.json @@ -0,0 +1,123 @@ +{ + "suite_id": "assistant_saved_session_runtime_job-elT8zDcc9I", + "suite_version": "0.1.0", + "schema_version": "assistant_saved_session_runtime_v0_1", + "title": "БОЛЬШОЙ ОБЩИЙ Ручная сессия 16.04.2026, 21:26:06", + "scenario_count": 1, + "case_ids": [ + "SAVED-001" + ], + "cases": [ + { + "case_id": "SAVED-001", + "scenario_tag": "saved_user_sessions_runtime", + "title": "БОЛЬШОЙ ОБЩИЙ Ручная сессия 16.04.2026, 21:26:06", + "question_type": "followup", + "broadness_level": "medium", + "turns": [ + { + "user_message": "приветик - че как там дела" + }, + { + "user_message": "расскажи что можешь интересного" + }, + { + "user_message": "кайф - что там на складе по остаткам?" + }, + { + "user_message": "АЛЬТЕРНАТ�?ВА" + }, + { + "user_message": "а исторические остатки на другие даты умеешь?" + }, + { + "user_message": "давай на июль 2017" + }, + { + "user_message": "март 2016" + }, + { + "user_message": "По выбранному объекту \"Рабочая станция универсального специалиста (индивидуальное изготовление)\": где взяли это?" + }, + { + "user_message": "а кому продали?" + }, + { + "user_message": "у тебя написано кто контрагент: рабочая станция - это ошибка?" + }, + { + "user_message": "ндс можешь прикинуть на дату покупки рабочей станции?" + }, + { + "user_message": "а какой ндс мы должны сгрузить на март 2020?" + }, + { + "user_message": "прикинь какой ндс нам надо заплатить на февраль 2017" + }, + { + "user_message": "кто у нас самый доходный клиент за все время" + }, + { + "user_message": "кто нам должен денег на май 2017" + }, + { + "user_message": "а какой ндс мы должны примерно заплатить за этот период?" + }, + { + "user_message": "мы должны комуто денег на сегодня?" + }, + { + "user_message": "а нам?" + }, + { + "user_message": "какой у нас самый доходный год" + }, + { + "user_message": "а за 2017 мы скок заработали?" + }, + { + "user_message": "сколько вообще денег мы заработали за все время?" + }, + { + "user_message": "ты умеешь считать дельту по договорам?" + }, + { + "user_message": "по чепурнову покажи все доки" + }, + { + "user_message": "а по свк" + }, + { + "user_message": "а сейчас у нас есть что на складе?" + }, + { + "user_message": "что нам отгружал чепурнов? какой товар или услугу?" + }, + { + "user_message": "какие остатки на складе на сегодня" + }, + { + "user_message": "остатки на март 2016" + }, + { + "user_message": "хвосты покажи по счету 60 на август 2022" + }, + { + "user_message": "Есть ли остатки товара, которые закупались очень давно" + }, + { + "user_message": "Какие конкретно номенклатуры формируют остаток по складу на май 2020" + }, + { + "user_message": "а по Альтернативе Плюс сколько лет активности в базе 1С?" + }, + { + "user_message": "Как ты оценишь деятельность компании?" + }, + { + "user_message": "какое нетто по деньгам с Группа СВК за 2020 год: сколько получили и сколько заплатили?" + } + ] + } + ] +} \ No newline at end of file