Починить восстановление кириллицы в автопрогонах
This commit is contained in:
parent
9b02083493
commit
3be06b5f93
|
|
@ -102,7 +102,7 @@ function parseAutoGenTitle(value) {
|
|||
if (!title) {
|
||||
return null;
|
||||
}
|
||||
return title.slice(0, 160);
|
||||
return repairAutogenMojibake(title).slice(0, 160);
|
||||
}
|
||||
function parseManualCaseDecision(value, fallback = "needs_dialog_policy_fix") {
|
||||
const normalized = toStringSafe(value);
|
||||
|
|
@ -1186,7 +1186,9 @@ function textMojibakeScore(value) {
|
|||
const hardMarkers = (source.match(/[Ѓѓ‚„…†‡€‰‹ЉЊЌЋЏ‘’“”•–—™љ›њќћџ]/g) ?? []).length;
|
||||
const pairMarkers = (source.match(/(?:Р.|С.|Ð.|Ñ.)/g) ?? []).length;
|
||||
const doubleEncodedMarkers = (source.match(/(?:Г[Ђ-џ]|В[Ђ-џ]|Ã.|Â.)/gu) ?? []).length;
|
||||
return cyrillic + latin - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2;
|
||||
const replacement = (source.match(/\uFFFD/g) ?? []).length;
|
||||
const c1Controls = (source.match(/[\u0080-\u009f]/g) ?? []).length;
|
||||
return cyrillic + latin - replacement * 8 - c1Controls * 5 - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2;
|
||||
}
|
||||
function looksLikeMojibake(value) {
|
||||
const source = String(value ?? "");
|
||||
|
|
@ -1201,16 +1203,36 @@ function looksLikeMojibake(value) {
|
|||
}
|
||||
return (source.match(/(?:Г[Ђ-џ]|В[Ђ-џ]|Ã.|Â.)/gu) ?? []).length >= 2;
|
||||
}
|
||||
/**
 * Builds a byte buffer from text, one character (code point) at a time.
 *
 * Code points U+0080..U+009F are written as a single byte with the same value;
 * every other character is encoded with iconv-lite's "win1251" codec.
 *
 * @param {unknown} value - Text to convert; null/undefined are treated as "".
 * @returns {Buffer} The bytes of all characters, concatenated in input order.
 */
function encodeWin1251MojibakeBytes(value) {
    const chunks = [];
    for (const char of String(value ?? "")) {
        const code = char.codePointAt(0) ?? 0;
        if (code >= 0x80 && code <= 0x9f) {
            // C1 control code points bypass the codec and are kept as raw bytes.
            chunks.push(Buffer.from([code]));
            continue;
        }
        chunks.push(iconv_lite_1.default.encode(char, "win1251"));
    }
    return Buffer.concat(chunks);
}
|
||||
/**
 * Converts text to bytes via encodeWin1251MojibakeBytes and reads those bytes as UTF-8.
 *
 * @param {unknown} value - Text to re-decode.
 * @returns {string} The UTF-8 interpretation of the produced bytes.
 */
function decodeUtf8FromWin1251Mojibake(value) {
    return encodeWin1251MojibakeBytes(value).toString("utf8");
}
|
||||
/**
 * Restores two known words whose middle letter was lost to U+FFFD or "?" characters.
 *
 * A run of U+FFFD / "?" between "АЛЬТЕРНАТ" and "ВА" becomes "АЛЬТЕРНАТИВА", and a run
 * between "ОБЩ" and "Й" becomes "ОБЩИЙ". Matching is case-insensitive; the replacement
 * text is always upper-case.
 *
 * @param {unknown} value - Text to patch; null/undefined are treated as "".
 * @returns {string} The text with every such occurrence replaced.
 */
function repairKnownReplacementDamagedAutogenText(value) {
    return String(value ?? "")
        .replace(/\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422[\uFFFD?]+\u0412\u0410/giu, "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410")
        .replace(/\u041e\u0411\u0429[\uFFFD?]+\u0419/giu, "\u041e\u0411\u0429\u0418\u0419");
}
|
||||
function repairAutogenMojibake(value) {
|
||||
const source = String(value ?? "");
|
||||
if (!looksLikeMojibake(source)) {
|
||||
const source = repairKnownReplacementDamagedAutogenText(String(value ?? ""));
|
||||
if (!looksLikeMojibake(source) && !/[\u0080-\u009f\uFFFD]/.test(source)) {
|
||||
return source;
|
||||
}
|
||||
let candidate = source;
|
||||
for (let pass = 0; pass < 3; pass += 1) {
|
||||
let improved = false;
|
||||
try {
|
||||
const fromWin1251 = iconv_lite_1.default.encode(candidate, "win1251").toString("utf8");
|
||||
const fromWin1251 = decodeUtf8FromWin1251Mojibake(candidate);
|
||||
if (textMojibakeScore(fromWin1251) > textMojibakeScore(candidate)) {
|
||||
candidate = fromWin1251;
|
||||
improved = true;
|
||||
|
|
@ -1229,11 +1251,16 @@ function repairAutogenMojibake(value) {
|
|||
catch {
|
||||
// ignore
|
||||
}
|
||||
const repairedKnownText = repairKnownReplacementDamagedAutogenText(candidate);
|
||||
if (repairedKnownText !== candidate) {
|
||||
candidate = repairedKnownText;
|
||||
improved = true;
|
||||
}
|
||||
if (!improved) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return candidate;
|
||||
return repairKnownReplacementDamagedAutogenText(candidate);
|
||||
}
|
||||
function sanitizeGeneratedQuestion(value) {
|
||||
return repairAutogenMojibake(String(value ?? ""))
|
||||
|
|
@ -1432,7 +1459,8 @@ function extractQuestionsFromAutogenOutput(rawText) {
|
|||
}
|
||||
exports.__autoRunsQuestionTestUtils = {
|
||||
splitQuestionCandidates,
|
||||
extractQuestionsFromAutogenOutput
|
||||
extractQuestionsFromAutogenOutput,
|
||||
repairAutogenMojibake
|
||||
};
|
||||
async function generateQwenSeedQuestionsLive(input) {
|
||||
const seedExamples = collectCanonicalQuestions(40);
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ const nanoid_1 = require("nanoid");
|
|||
const express_1 = require("express");
|
||||
const config_1 = require("../config");
|
||||
const http_1 = require("../utils/http");
|
||||
const addressTextRepair_1 = require("../services/addressTextRepair");
|
||||
const ASYNC_JOBS = new Map();
|
||||
const MAX_ASYNC_JOBS = 80;
|
||||
function toRecord(value) {
|
||||
|
|
@ -30,7 +31,7 @@ function toArray(value) {
|
|||
return Array.isArray(value) ? value : [];
|
||||
}
|
||||
function normalizeQuestionChunk(value) {
|
||||
return String(value ?? "")
|
||||
return (0, addressTextRepair_1.repairAddressMojibakeText)(String(value ?? ""))
|
||||
.replace(/\r/g, " ")
|
||||
.replace(/\t/g, " ")
|
||||
.replace(/\s+/g, " ")
|
||||
|
|
@ -92,7 +93,7 @@ function normalizeRuntimeQuestionList(items) {
|
|||
return normalized.filter((item) => item.length > 0);
|
||||
}
|
||||
function splitQuestionCandidate(raw) {
|
||||
const normalized = String(raw ?? "").replace(/\r/g, "\n").trim();
|
||||
const normalized = (0, addressTextRepair_1.repairAddressMojibakeText)(String(raw ?? "")).replace(/\r/g, "\n").trim();
|
||||
if (!normalized) {
|
||||
return [];
|
||||
}
|
||||
|
|
@ -455,7 +456,8 @@ function buildEvalRouter(services) {
|
|||
}
|
||||
const questions = normalizeRuntimeQuestions(body.questions);
|
||||
const scenarioQuestions = normalizeRuntimeQuestions(body.scenarioQuestions, { dedupe: false, splitCandidates: false });
|
||||
const scenarioTitle = toStringSafe(body.scenarioTitle);
|
||||
const scenarioTitleRaw = toStringSafe(body.scenarioTitle);
|
||||
const scenarioTitle = scenarioTitleRaw ? (0, addressTextRepair_1.repairAddressMojibakeText)(scenarioTitleRaw) : null;
|
||||
const jobId = `job-${(0, nanoid_1.nanoid)(10)}`;
|
||||
const runId = `assistant-stage1-${(0, nanoid_1.nanoid)(10)}`;
|
||||
const runtimeCaseSetFile = scenarioQuestions.length > 0
|
||||
|
|
|
|||
|
|
@ -6,44 +6,68 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|||
exports.repairAddressMojibakeText = repairAddressMojibakeText;
|
||||
exports.normalizeRussianComparableText = normalizeRussianComparableText;
|
||||
const iconv_lite_1 = __importDefault(require("iconv-lite"));
|
||||
const MOJIBAKE_CONTINUATION_CLASS = "[\\u0080-\\u00bf\\u0401-\\u040f\\u0451-\\u045f\\u2018-\\u201e\\u2020-\\u2022\\u2013-\\u2014\\u2122\\u20ac]";
|
||||
const MOJIBAKE_PAIR_PATTERN = new RegExp(`(?:[\\u0420\\u0421]${MOJIBAKE_CONTINUATION_CLASS})`, "gu");
|
||||
/**
 * Collapses every run of whitespace into a single space and trims both ends.
 *
 * @param {string} value - Text to compact.
 * @returns {string} The compacted text.
 */
function compactWhitespace(value) {
    return value.replace(/\s+/g, " ").trim();
}
|
||||
/**
 * Returns the length of the array produced by String.prototype.match for the pattern.
 *
 * With a global pattern this is the number of matches; when nothing matches it is 0.
 *
 * @param {unknown} value - Text to search; null/undefined are treated as "".
 * @param {RegExp} pattern - Pattern to apply.
 * @returns {number} Length of the match result, or 0 when there is no match.
 */
function countMatches(value, pattern) {
    return (String(value ?? "").match(pattern) ?? []).length;
}
|
||||
/**
 * Scores how readable a string looks; repairAddressMojibakeText keeps a re-decoded
 * candidate only when its score is strictly higher than the current one.
 *
 * Each Cyrillic (U+0400..U+04FF) or ASCII Latin letter adds 1. Penalties are subtracted:
 * 8 per U+FFFD, 5 per C1 control (U+0080..U+009F), 3 per MOJIBAKE_PAIR_PATTERN match and
 * 2 per double-encoding marker.
 *
 * @param {unknown} value - Text to score; null/undefined are treated as "".
 * @returns {number} The score; may be negative.
 */
function textMojibakeScore(value) {
    const source = String(value ?? "");
    const cyrillic = countMatches(source, /[\u0400-\u04ff]/g);
    const latin = countMatches(source, /[A-Za-z]/g);
    const replacement = countMatches(source, /\uFFFD/g);
    const c1Controls = countMatches(source, /[\u0080-\u009f]/g);
    const pairMarkers = countMatches(source, MOJIBAKE_PAIR_PATTERN);
    const doubleEncodedMarkers = countMatches(source, /(?:\u0420[\u00a0-\u00bf]\u0421|\u0413[\u0080-\u00bf]|\u00c3.|\u00c2.)/gu);
    return cyrillic + latin - replacement * 8 - c1Controls * 5 - pairMarkers * 3 - doubleEncodedMarkers * 2;
}
|
||||
/**
 * Decides whether a string shows signs of mojibake.
 *
 * Blank input is never flagged. Otherwise the string is flagged when it contains a C1
 * control (U+0080..U+009F) or U+FFFD, when MOJIBAKE_PAIR_PATTERN matches at least twice,
 * or when at least two double-encoding markers are found.
 *
 * @param {unknown} value - Text to inspect; null/undefined are treated as "".
 * @returns {boolean} True when any of the checks fires.
 */
function looksLikeAddressMojibake(value) {
    const source = String(value ?? "");
    if (!source.trim()) {
        return false;
    }
    if (/[\u0080-\u009f\uFFFD]/.test(source)) {
        return true;
    }
    if (countMatches(source, MOJIBAKE_PAIR_PATTERN) >= 2) {
        return true;
    }
    return countMatches(source, /(?:\u0420[\u00a0-\u00bf]\u0421|\u0413[\u0080-\u00bf]|\u00c3.|\u00c2.)/gu) >= 2;
}
|
||||
/**
 * Builds a byte buffer from text, one character (code point) at a time.
 *
 * Code points U+0080..U+009F are written as a single byte with the same value;
 * every other character is encoded with iconv-lite's "win1251" codec.
 *
 * @param {unknown} value - Text to convert; null/undefined are treated as "".
 * @returns {Buffer} The bytes of all characters, concatenated in input order.
 */
function encodeWin1251MojibakeBytes(value) {
    const chunks = [];
    for (const char of String(value ?? "")) {
        const code = char.codePointAt(0) ?? 0;
        if (code >= 0x80 && code <= 0x9f) {
            // C1 control code points bypass the codec and are kept as raw bytes.
            chunks.push(Buffer.from([code]));
            continue;
        }
        chunks.push(iconv_lite_1.default.encode(char, "win1251"));
    }
    return Buffer.concat(chunks);
}
|
||||
/**
 * Converts text to bytes via encodeWin1251MojibakeBytes and reads those bytes as UTF-8.
 *
 * @param {unknown} value - Text to re-decode.
 * @returns {string} The UTF-8 interpretation of the produced bytes.
 */
function decodeUtf8FromWin1251Mojibake(value) {
    return encodeWin1251MojibakeBytes(value).toString("utf8");
}
|
||||
/**
 * Restores two known words whose middle letter was lost to U+FFFD or "?" characters.
 *
 * A run of U+FFFD / "?" between "АЛЬТЕРНАТ" and "ВА" becomes "АЛЬТЕРНАТИВА", and a run
 * between "ОБЩ" and "Й" becomes "ОБЩИЙ". Matching is case-insensitive; the replacement
 * text is always upper-case.
 *
 * @param {unknown} value - Text to patch; null/undefined are treated as "".
 * @returns {string} The text with every such occurrence replaced.
 */
function repairKnownReplacementDamagedRussianText(value) {
    return String(value ?? "")
        .replace(/\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422[\uFFFD?]+\u0412\u0410/giu, "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410")
        .replace(/\u041e\u0411\u0429[\uFFFD?]+\u0419/giu, "\u041e\u0411\u0429\u0418\u0419");
}
|
||||
function repairAddressMojibakeText(value) {
|
||||
const source = String(value ?? "");
|
||||
if (!looksLikeAddressMojibake(source)) {
|
||||
return source;
|
||||
const sourceWithKnownRepairs = repairKnownReplacementDamagedRussianText(source);
|
||||
if (!looksLikeAddressMojibake(sourceWithKnownRepairs)) {
|
||||
return sourceWithKnownRepairs;
|
||||
}
|
||||
let candidate = source;
|
||||
let candidate = sourceWithKnownRepairs;
|
||||
for (let pass = 0; pass < 3; pass += 1) {
|
||||
let improved = false;
|
||||
try {
|
||||
const fromWin1251 = iconv_lite_1.default.encode(candidate, "win1251").toString("utf8");
|
||||
const fromWin1251 = decodeUtf8FromWin1251Mojibake(candidate);
|
||||
if (textMojibakeScore(fromWin1251) > textMojibakeScore(candidate)) {
|
||||
candidate = fromWin1251;
|
||||
improved = true;
|
||||
|
|
@ -62,12 +86,17 @@ function repairAddressMojibakeText(value) {
|
|||
catch {
|
||||
// Ignore decode failures and keep the current candidate.
|
||||
}
|
||||
const repairedKnownText = repairKnownReplacementDamagedRussianText(candidate);
|
||||
if (repairedKnownText !== candidate) {
|
||||
candidate = repairedKnownText;
|
||||
improved = true;
|
||||
}
|
||||
if (!improved) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return candidate;
|
||||
return repairKnownReplacementDamagedRussianText(candidate);
|
||||
}
|
||||
/**
 * Produces a comparison-friendly form of Russian text.
 *
 * The input is passed through repairAddressMojibakeText, lower-cased, whitespace-compacted,
 * and finally every U+0451 ("ё") is replaced with U+0435 ("е").
 *
 * @param {unknown} value - Text to normalize; null/undefined are treated as "".
 * @returns {string} The normalized text.
 */
function normalizeRussianComparableText(value) {
    // The letters are written as escapes so the pattern survives a mis-encoded source file.
    return compactWhitespace(repairAddressMojibakeText(String(value ?? "")).toLowerCase()).replace(/\u0451/g, "\u0435");
}
|
||||
|
|
|
|||
|
|
@ -209,7 +209,10 @@ function createAssistantLivingModePolicy(deps) {
|
|||
if (hasAffectiveReactionCue) {
|
||||
return false;
|
||||
}
|
||||
return normalized.length <= 36 && !/[?]/.test(String(userMessage ?? ""));
|
||||
const rawQuestionProbe = String(userMessage ?? "")
|
||||
.replace(/\uFFFD\?/g, "\uFFFD")
|
||||
.replace(/пїЅ\?/giu, "пїЅ");
|
||||
return normalized.length <= 36 && !/[?]/.test(rawQuestionProbe);
|
||||
}
|
||||
function hasAssistantDataScopeMetaQuestionSignal(text) {
|
||||
const repaired = repairAddressMojibake(String(text ?? ""));
|
||||
|
|
|
|||
|
|
@ -182,18 +182,25 @@ function scoreOrganizationMentionInMessage(message, organization) {
|
|||
variantScore = Math.max(variantScore, variant.length * 5);
|
||||
continue;
|
||||
}
|
||||
let fuzzyTokenScore = 0;
|
||||
const fuzzyMatched = messageTokens.some((messageToken) => {
|
||||
if (messageToken === variant) {
|
||||
fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * 5);
|
||||
return true;
|
||||
}
|
||||
if (messageToken.length >= 5 && variant.length >= 5) {
|
||||
return messageToken.startsWith(variant) || variant.startsWith(messageToken);
|
||||
const prefixMatched = messageToken.startsWith(variant) || variant.startsWith(messageToken);
|
||||
if (prefixMatched) {
|
||||
const prefixLength = Math.min(messageToken.length, variant.length);
|
||||
fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * (prefixLength >= 7 ? 5 : 3));
|
||||
}
|
||||
return prefixMatched;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
if (fuzzyMatched) {
|
||||
matched = true;
|
||||
variantScore = Math.max(variantScore, Math.max(20, variant.length * 3));
|
||||
variantScore = Math.max(variantScore, Math.max(20, fuzzyTokenScore || variant.length * 3));
|
||||
}
|
||||
}
|
||||
if (matched) {
|
||||
|
|
|
|||
|
|
@ -1944,7 +1944,9 @@ function textMojibakeScoreForAddress(value) {
|
|||
const hardMarkers = (source.match(/[Ѓѓ‚„…†‡€‰‹ЉЊЌЋЏ\uFFFD?’“”•–—™љ›њќћџ]/g) ?? []).length;
|
||||
const pairMarkers = (source.match(/(?:Р.|С.|Ð.|Ñ.)/g) ?? []).length;
|
||||
const doubleEncodedMarkers = (source.match(/(?:Г[Ђ-џ]|В[Ђ-џ]|Ã.|Â.)/gu) ?? []).length;
|
||||
return cyrillic + latin - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2;
|
||||
const replacement = (source.match(/\uFFFD/g) ?? []).length;
|
||||
const c1Controls = (source.match(/[\u0080-\u009f]/g) ?? []).length;
|
||||
return cyrillic + latin - replacement * 8 - c1Controls * 5 - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2;
|
||||
}
|
||||
function looksLikeMojibakeForAddress(value) {
|
||||
const source = String(value ?? "");
|
||||
|
|
@ -1962,16 +1964,36 @@ function looksLikeMojibakeForAddress(value) {
|
|||
}
|
||||
return false;
|
||||
}
|
||||
/**
 * Builds a byte buffer from text, one character (code point) at a time.
 *
 * Code points U+0080..U+009F are written as a single byte with the same value;
 * every other character is encoded with iconv-lite's "win1251" codec.
 *
 * @param {unknown} value - Text to convert; null/undefined are treated as "".
 * @returns {Buffer} The bytes of all characters, concatenated in input order.
 */
function encodeWin1251MojibakeBytesForAddress(value) {
    const chunks = [];
    for (const char of String(value ?? "")) {
        const code = char.codePointAt(0) ?? 0;
        if (code >= 0x80 && code <= 0x9f) {
            // C1 control code points bypass the codec and are kept as raw bytes.
            chunks.push(Buffer.from([code]));
            continue;
        }
        chunks.push(iconv_lite_1.default.encode(char, "win1251"));
    }
    return Buffer.concat(chunks);
}
|
||||
/**
 * Converts text to bytes via encodeWin1251MojibakeBytesForAddress and reads those bytes as UTF-8.
 *
 * @param {unknown} value - Text to re-decode.
 * @returns {string} The UTF-8 interpretation of the produced bytes.
 */
function decodeUtf8FromWin1251MojibakeForAddress(value) {
    return encodeWin1251MojibakeBytesForAddress(value).toString("utf8");
}
|
||||
/**
 * Restores two known words whose middle letter was lost to U+FFFD or "?" characters.
 *
 * A run of U+FFFD / "?" between "АЛЬТЕРНАТ" and "ВА" becomes "АЛЬТЕРНАТИВА", and a run
 * between "ОБЩ" and "Й" becomes "ОБЩИЙ". Matching is case-insensitive; the replacement
 * text is always upper-case.
 *
 * @param {unknown} value - Text to patch; null/undefined are treated as "".
 * @returns {string} The text with every such occurrence replaced.
 */
function repairKnownReplacementDamagedAddressText(value) {
    return String(value ?? "")
        .replace(/\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422[\uFFFD?]+\u0412\u0410/giu, "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410")
        .replace(/\u041e\u0411\u0429[\uFFFD?]+\u0419/giu, "\u041e\u0411\u0429\u0418\u0419");
}
|
||||
function repairAddressMojibake(value) {
|
||||
const source = String(value ?? "");
|
||||
if (!looksLikeMojibakeForAddress(source)) {
|
||||
const source = repairKnownReplacementDamagedAddressText(String(value ?? ""));
|
||||
if (!looksLikeMojibakeForAddress(source) && !/[\u0080-\u009f\uFFFD]/.test(source)) {
|
||||
return source;
|
||||
}
|
||||
let candidate = source;
|
||||
for (let pass = 0; pass < 3; pass += 1) {
|
||||
let improved = false;
|
||||
try {
|
||||
const fromWin1251 = iconv_lite_1.default.encode(candidate, "win1251").toString("utf8");
|
||||
const fromWin1251 = decodeUtf8FromWin1251MojibakeForAddress(candidate);
|
||||
if (textMojibakeScoreForAddress(fromWin1251) > textMojibakeScoreForAddress(candidate)) {
|
||||
candidate = fromWin1251;
|
||||
improved = true;
|
||||
|
|
@ -1986,11 +2008,16 @@ function repairAddressMojibake(value) {
|
|||
}
|
||||
}
|
||||
catch (_error) { }
|
||||
const repairedKnownText = repairKnownReplacementDamagedAddressText(candidate);
|
||||
if (repairedKnownText !== candidate) {
|
||||
candidate = repairedKnownText;
|
||||
improved = true;
|
||||
}
|
||||
if (!improved) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return candidate;
|
||||
return repairKnownReplacementDamagedAddressText(candidate);
|
||||
}
|
||||
function sanitizeOutgoingAssistantText(value, fallback = "Не смог сформировать читаемый ответ. Уточните запрос.") {
|
||||
const repaired = repairAddressMojibake(String(value ?? ""));
|
||||
|
|
@ -4311,18 +4338,25 @@ function scoreOrganizationMentionInMessage(message, organization) {
|
|||
variantScore = Math.max(variantScore, variant.length * 5);
|
||||
continue;
|
||||
}
|
||||
let fuzzyTokenScore = 0;
|
||||
const fuzzyMatched = messageTokens.some((messageToken) => {
|
||||
if (messageToken === variant) {
|
||||
fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * 5);
|
||||
return true;
|
||||
}
|
||||
if (messageToken.length >= 5 && variant.length >= 5) {
|
||||
return messageToken.startsWith(variant) || variant.startsWith(messageToken);
|
||||
const prefixMatched = messageToken.startsWith(variant) || variant.startsWith(messageToken);
|
||||
if (prefixMatched) {
|
||||
const prefixLength = Math.min(messageToken.length, variant.length);
|
||||
fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * (prefixLength >= 7 ? 5 : 3));
|
||||
}
|
||||
return prefixMatched;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
if (fuzzyMatched) {
|
||||
matched = true;
|
||||
variantScore = Math.max(variantScore, Math.max(20, variant.length * 3));
|
||||
variantScore = Math.max(variantScore, Math.max(20, fuzzyTokenScore || variant.length * 3));
|
||||
}
|
||||
}
|
||||
if (matched) {
|
||||
|
|
|
|||
|
|
@ -321,7 +321,7 @@ function parseAutoGenTitle(value: unknown): string | null {
|
|||
if (!title) {
|
||||
return null;
|
||||
}
|
||||
return title.slice(0, 160);
|
||||
return repairAutogenMojibake(title).slice(0, 160);
|
||||
}
|
||||
|
||||
function parseManualCaseDecision(value: unknown, fallback: ManualCaseDecision = "needs_dialog_policy_fix"): ManualCaseDecision {
|
||||
|
|
@ -1504,7 +1504,9 @@ function textMojibakeScore(value: string): number {
|
|||
const hardMarkers = (source.match(/[Ѓѓ‚„…†‡€‰‹ЉЊЌЋЏ‘’“”•–—™љ›њќћџ]/g) ?? []).length;
|
||||
const pairMarkers = (source.match(/(?:Р.|С.|Ð.|Ñ.)/g) ?? []).length;
|
||||
const doubleEncodedMarkers = (source.match(/(?:Г[Ђ-џ]|В[Ђ-џ]|Ã.|Â.)/gu) ?? []).length;
|
||||
return cyrillic + latin - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2;
|
||||
const replacement = (source.match(/\uFFFD/g) ?? []).length;
|
||||
const c1Controls = (source.match(/[\u0080-\u009f]/g) ?? []).length;
|
||||
return cyrillic + latin - replacement * 8 - c1Controls * 5 - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2;
|
||||
}
|
||||
|
||||
function looksLikeMojibake(value: string): boolean {
|
||||
|
|
@ -1521,16 +1523,39 @@ function looksLikeMojibake(value: string): boolean {
|
|||
return (source.match(/(?:Г[Ђ-џ]|В[Ђ-џ]|Ã.|Â.)/gu) ?? []).length >= 2;
|
||||
}
|
||||
|
||||
/**
 * Builds a byte buffer from text, one character (code point) at a time.
 *
 * Code points U+0080..U+009F are written as a single byte with the same value;
 * every other character is encoded with iconv-lite's "win1251" codec.
 *
 * @param value - Text to convert; null/undefined are treated as "".
 * @returns The bytes of all characters, concatenated in input order.
 */
function encodeWin1251MojibakeBytes(value: string): Buffer {
  const chunks: Buffer[] = [];
  for (const char of String(value ?? "")) {
    const code = char.codePointAt(0) ?? 0;
    if (code >= 0x80 && code <= 0x9f) {
      // C1 control code points bypass the codec and are kept as raw bytes.
      chunks.push(Buffer.from([code]));
      continue;
    }
    chunks.push(iconv.encode(char, "win1251"));
  }
  return Buffer.concat(chunks);
}
|
||||
|
||||
/**
 * Converts text to bytes via encodeWin1251MojibakeBytes and reads those bytes as UTF-8.
 *
 * @param value - Text to re-decode.
 * @returns The UTF-8 interpretation of the produced bytes.
 */
function decodeUtf8FromWin1251Mojibake(value: string): string {
  return encodeWin1251MojibakeBytes(value).toString("utf8");
}
|
||||
|
||||
/**
 * Restores two known words whose middle letter was lost to U+FFFD or "?" characters.
 *
 * A run of U+FFFD / "?" between "АЛЬТЕРНАТ" and "ВА" becomes "АЛЬТЕРНАТИВА", and a run
 * between "ОБЩ" and "Й" becomes "ОБЩИЙ". Matching is case-insensitive; the replacement
 * text is always upper-case.
 *
 * @param value - Text to patch; null/undefined are treated as "".
 * @returns The text with every such occurrence replaced.
 */
function repairKnownReplacementDamagedAutogenText(value: string): string {
  return String(value ?? "")
    .replace(/\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422[\uFFFD?]+\u0412\u0410/giu, "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410")
    .replace(/\u041e\u0411\u0429[\uFFFD?]+\u0419/giu, "\u041e\u0411\u0429\u0418\u0419");
}
|
||||
|
||||
function repairAutogenMojibake(value: string): string {
|
||||
const source = String(value ?? "");
|
||||
if (!looksLikeMojibake(source)) {
|
||||
const source = repairKnownReplacementDamagedAutogenText(String(value ?? ""));
|
||||
if (!looksLikeMojibake(source) && !/[\u0080-\u009f\uFFFD]/.test(source)) {
|
||||
return source;
|
||||
}
|
||||
let candidate = source;
|
||||
for (let pass = 0; pass < 3; pass += 1) {
|
||||
let improved = false;
|
||||
try {
|
||||
const fromWin1251 = iconv.encode(candidate, "win1251").toString("utf8");
|
||||
const fromWin1251 = decodeUtf8FromWin1251Mojibake(candidate);
|
||||
if (textMojibakeScore(fromWin1251) > textMojibakeScore(candidate)) {
|
||||
candidate = fromWin1251;
|
||||
improved = true;
|
||||
|
|
@ -1547,11 +1572,16 @@ function repairAutogenMojibake(value: string): string {
|
|||
} catch {
|
||||
// ignore
|
||||
}
|
||||
const repairedKnownText = repairKnownReplacementDamagedAutogenText(candidate);
|
||||
if (repairedKnownText !== candidate) {
|
||||
candidate = repairedKnownText;
|
||||
improved = true;
|
||||
}
|
||||
if (!improved) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return candidate;
|
||||
return repairKnownReplacementDamagedAutogenText(candidate);
|
||||
}
|
||||
|
||||
function sanitizeGeneratedQuestion(value: string): string {
|
||||
|
|
@ -1775,7 +1805,8 @@ function extractQuestionsFromAutogenOutput(rawText: string): string[] {
|
|||
|
||||
export const __autoRunsQuestionTestUtils = {
|
||||
splitQuestionCandidates,
|
||||
extractQuestionsFromAutogenOutput
|
||||
extractQuestionsFromAutogenOutput,
|
||||
repairAutogenMojibake
|
||||
};
|
||||
|
||||
async function generateQwenSeedQuestionsLive(input: {
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import type { AppServices } from "../serverContext";
|
|||
import { ApiError, ok } from "../utils/http";
|
||||
import type { EvalRunMode, NormalizeRequestPayload } from "../types/normalizer";
|
||||
import type { EvalTarget } from "../types/assistantEval";
|
||||
import { repairAddressMojibakeText } from "../services/addressTextRepair";
|
||||
|
||||
type EvalAsyncStatus = "queued" | "running" | "completed" | "failed" | "canceled";
|
||||
|
||||
|
|
@ -67,7 +68,7 @@ function toArray(value: unknown): unknown[] {
|
|||
}
|
||||
|
||||
function normalizeQuestionChunk(value: string): string {
|
||||
return String(value ?? "")
|
||||
return repairAddressMojibakeText(String(value ?? ""))
|
||||
.replace(/\r/g, " ")
|
||||
.replace(/\t/g, " ")
|
||||
.replace(/\s+/g, " ")
|
||||
|
|
@ -136,7 +137,7 @@ function normalizeRuntimeQuestionList(items: string[]): string[] {
|
|||
}
|
||||
|
||||
function splitQuestionCandidate(raw: string): string[] {
|
||||
const normalized = String(raw ?? "").replace(/\r/g, "\n").trim();
|
||||
const normalized = repairAddressMojibakeText(String(raw ?? "")).replace(/\r/g, "\n").trim();
|
||||
if (!normalized) {
|
||||
return [];
|
||||
}
|
||||
|
|
@ -533,7 +534,8 @@ export function buildEvalRouter(services: AppServices): Router {
|
|||
}
|
||||
const questions = normalizeRuntimeQuestions(body.questions);
|
||||
const scenarioQuestions = normalizeRuntimeQuestions(body.scenarioQuestions, { dedupe: false, splitCandidates: false });
|
||||
const scenarioTitle = toStringSafe(body.scenarioTitle);
|
||||
const scenarioTitleRaw = toStringSafe(body.scenarioTitle);
|
||||
const scenarioTitle = scenarioTitleRaw ? repairAddressMojibakeText(scenarioTitleRaw) : null;
|
||||
|
||||
const jobId = `job-${nanoid(10)}`;
|
||||
const runId = `assistant-stage1-${nanoid(10)}`;
|
||||
|
|
|
|||
|
|
@ -1,17 +1,26 @@
|
|||
import iconv from "iconv-lite";
|
||||
|
||||
const MOJIBAKE_CONTINUATION_CLASS =
|
||||
"[\\u0080-\\u00bf\\u0401-\\u040f\\u0451-\\u045f\\u2018-\\u201e\\u2020-\\u2022\\u2013-\\u2014\\u2122\\u20ac]";
|
||||
const MOJIBAKE_PAIR_PATTERN = new RegExp(`(?:[\\u0420\\u0421]${MOJIBAKE_CONTINUATION_CLASS})`, "gu");
|
||||
|
||||
/**
 * Collapses every run of whitespace into a single space and trims both ends.
 *
 * @param value - Text to compact.
 * @returns The compacted text.
 */
function compactWhitespace(value: string): string {
  return value.replace(/\s+/g, " ").trim();
}
|
||||
|
||||
/**
 * Returns the length of the array produced by String.prototype.match for the pattern.
 *
 * With a global pattern this is the number of matches; when nothing matches it is 0.
 *
 * @param value - Text to search; null/undefined are treated as "".
 * @param pattern - Pattern to apply.
 * @returns Length of the match result, or 0 when there is no match.
 */
function countMatches(value: string, pattern: RegExp): number {
  return (String(value ?? "").match(pattern) ?? []).length;
}
|
||||
|
||||
/**
 * Scores how readable a string looks; repairAddressMojibakeText keeps a re-decoded
 * candidate only when its score is strictly higher than the current one.
 *
 * Each Cyrillic (U+0400..U+04FF) or ASCII Latin letter adds 1. Penalties are subtracted:
 * 8 per U+FFFD, 5 per C1 control (U+0080..U+009F), 3 per MOJIBAKE_PAIR_PATTERN match and
 * 2 per double-encoding marker.
 *
 * @param value - Text to score; null/undefined are treated as "".
 * @returns The score; may be negative.
 */
function textMojibakeScore(value: string): number {
  const source = String(value ?? "");
  const cyrillic = countMatches(source, /[\u0400-\u04ff]/g);
  const latin = countMatches(source, /[A-Za-z]/g);
  const replacement = countMatches(source, /\uFFFD/g);
  const c1Controls = countMatches(source, /[\u0080-\u009f]/g);
  const pairMarkers = countMatches(source, MOJIBAKE_PAIR_PATTERN);
  const doubleEncodedMarkers = countMatches(source, /(?:\u0420[\u00a0-\u00bf]\u0421|\u0413[\u0080-\u00bf]|\u00c3.|\u00c2.)/gu);
  return cyrillic + latin - replacement * 8 - c1Controls * 5 - pairMarkers * 3 - doubleEncodedMarkers * 2;
}
|
||||
|
||||
function looksLikeAddressMojibake(value: string): boolean {
|
||||
|
|
@ -19,30 +28,51 @@ function looksLikeAddressMojibake(value: string): boolean {
|
|||
if (!source.trim()) {
|
||||
return false;
|
||||
}
|
||||
if (/[<EFBFBD>]/.test(source)) {
|
||||
if (/[\u0080-\u009f\uFFFD]/.test(source)) {
|
||||
return true;
|
||||
}
|
||||
if ((source.match(/(?:Р.|С.|Ð.|Ñ.)/g) ?? []).length >= 2) {
|
||||
if (countMatches(source, MOJIBAKE_PAIR_PATTERN) >= 2) {
|
||||
return true;
|
||||
}
|
||||
if ((source.match(/(?:Р“[Р-џ]|Р’[Р-џ]|Ã.|Â.)/gu) ?? []).length >= 2) {
|
||||
return true;
|
||||
return countMatches(source, /(?:\u0420[\u00a0-\u00bf]\u0421|\u0413[\u0080-\u00bf]|\u00c3.|\u00c2.)/gu) >= 2;
|
||||
}
|
||||
|
||||
/**
 * Builds a byte buffer from text, one character (code point) at a time.
 *
 * Code points U+0080..U+009F are written as a single byte with the same value;
 * every other character is encoded with iconv-lite's "win1251" codec.
 *
 * @param value - Text to convert; null/undefined are treated as "".
 * @returns The bytes of all characters, concatenated in input order.
 */
function encodeWin1251MojibakeBytes(value: string): Buffer {
  const chunks: Buffer[] = [];
  for (const char of String(value ?? "")) {
    const code = char.codePointAt(0) ?? 0;
    if (code >= 0x80 && code <= 0x9f) {
      // C1 control code points bypass the codec and are kept as raw bytes.
      chunks.push(Buffer.from([code]));
      continue;
    }
    chunks.push(iconv.encode(char, "win1251"));
  }
  return Buffer.concat(chunks);
}
|
||||
|
||||
/**
 * Converts text to bytes via encodeWin1251MojibakeBytes and reads those bytes as UTF-8.
 *
 * @param value - Text to re-decode.
 * @returns The UTF-8 interpretation of the produced bytes.
 */
function decodeUtf8FromWin1251Mojibake(value: string): string {
  return encodeWin1251MojibakeBytes(value).toString("utf8");
}
|
||||
|
||||
/**
 * Restores two known words whose middle letter was lost to U+FFFD or "?" characters.
 *
 * A run of U+FFFD / "?" between "АЛЬТЕРНАТ" and "ВА" becomes "АЛЬТЕРНАТИВА", and a run
 * between "ОБЩ" and "Й" becomes "ОБЩИЙ". Matching is case-insensitive; the replacement
 * text is always upper-case.
 *
 * @param value - Text to patch; null/undefined are treated as "".
 * @returns The text with every such occurrence replaced.
 */
function repairKnownReplacementDamagedRussianText(value: string): string {
  return String(value ?? "")
    .replace(/\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422[\uFFFD?]+\u0412\u0410/giu, "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410")
    .replace(/\u041e\u0411\u0429[\uFFFD?]+\u0419/giu, "\u041e\u0411\u0429\u0418\u0419");
}
|
||||
|
||||
export function repairAddressMojibakeText(value: string): string {
|
||||
const source = String(value ?? "");
|
||||
if (!looksLikeAddressMojibake(source)) {
|
||||
return source;
|
||||
const sourceWithKnownRepairs = repairKnownReplacementDamagedRussianText(source);
|
||||
if (!looksLikeAddressMojibake(sourceWithKnownRepairs)) {
|
||||
return sourceWithKnownRepairs;
|
||||
}
|
||||
|
||||
let candidate = source;
|
||||
let candidate = sourceWithKnownRepairs;
|
||||
for (let pass = 0; pass < 3; pass += 1) {
|
||||
let improved = false;
|
||||
|
||||
try {
|
||||
const fromWin1251 = iconv.encode(candidate, "win1251").toString("utf8");
|
||||
const fromWin1251 = decodeUtf8FromWin1251Mojibake(candidate);
|
||||
if (textMojibakeScore(fromWin1251) > textMojibakeScore(candidate)) {
|
||||
candidate = fromWin1251;
|
||||
improved = true;
|
||||
|
|
@ -61,14 +91,20 @@ export function repairAddressMojibakeText(value: string): string {
|
|||
// Ignore decode failures and keep the current candidate.
|
||||
}
|
||||
|
||||
const repairedKnownText = repairKnownReplacementDamagedRussianText(candidate);
|
||||
if (repairedKnownText !== candidate) {
|
||||
candidate = repairedKnownText;
|
||||
improved = true;
|
||||
}
|
||||
|
||||
if (!improved) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return candidate;
|
||||
return repairKnownReplacementDamagedRussianText(candidate);
|
||||
}
|
||||
|
||||
/**
 * Produces a comparison-friendly form of Russian text.
 *
 * The input is passed through repairAddressMojibakeText, lower-cased, whitespace-compacted,
 * and finally every U+0451 ("ё") is replaced with U+0435 ("е").
 *
 * @param value - Value to normalize; null/undefined are treated as "".
 * @returns The normalized text.
 */
export function normalizeRussianComparableText(value: unknown): string {
  // The letters are written as escapes so the pattern survives a mis-encoded source file.
  return compactWhitespace(repairAddressMojibakeText(String(value ?? "")).toLowerCase()).replace(/\u0451/g, "\u0435");
}
|
||||
|
|
|
|||
|
|
@ -279,7 +279,10 @@ export function createAssistantLivingModePolicy(deps: AssistantLivingModePolicyD
|
|||
if (hasAffectiveReactionCue) {
|
||||
return false;
|
||||
}
|
||||
return normalized.length <= 36 && !/[?]/.test(String(userMessage ?? ""));
|
||||
const rawQuestionProbe = String(userMessage ?? "")
|
||||
.replace(/\uFFFD\?/g, "\uFFFD")
|
||||
.replace(/пїЅ\?/giu, "пїЅ");
|
||||
return normalized.length <= 36 && !/[?]/.test(rawQuestionProbe);
|
||||
}
|
||||
|
||||
function hasAssistantDataScopeMetaQuestionSignal(text) {
|
||||
|
|
|
|||
|
|
@ -189,18 +189,25 @@ export function scoreOrganizationMentionInMessage(message: unknown, organization
|
|||
variantScore = Math.max(variantScore, variant.length * 5);
|
||||
continue;
|
||||
}
|
||||
let fuzzyTokenScore = 0;
|
||||
const fuzzyMatched = messageTokens.some((messageToken) => {
|
||||
if (messageToken === variant) {
|
||||
fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * 5);
|
||||
return true;
|
||||
}
|
||||
if (messageToken.length >= 5 && variant.length >= 5) {
|
||||
return messageToken.startsWith(variant) || variant.startsWith(messageToken);
|
||||
const prefixMatched = messageToken.startsWith(variant) || variant.startsWith(messageToken);
|
||||
if (prefixMatched) {
|
||||
const prefixLength = Math.min(messageToken.length, variant.length);
|
||||
fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * (prefixLength >= 7 ? 5 : 3));
|
||||
}
|
||||
return prefixMatched;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
if (fuzzyMatched) {
|
||||
matched = true;
|
||||
variantScore = Math.max(variantScore, Math.max(20, variant.length * 3));
|
||||
variantScore = Math.max(variantScore, Math.max(20, fuzzyTokenScore || variant.length * 3));
|
||||
}
|
||||
}
|
||||
if (matched) {
|
||||
|
|
|
|||
|
|
@ -1900,7 +1900,9 @@ function textMojibakeScoreForAddress(value) {
|
|||
const hardMarkers = (source.match(/[Ѓѓ‚„…†‡€‰‹ЉЊЌЋЏ\uFFFD?’“”•–—™љ›њќћџ]/g) ?? []).length;
|
||||
const pairMarkers = (source.match(/(?:Р.|С.|Ð.|Ñ.)/g) ?? []).length;
|
||||
const doubleEncodedMarkers = (source.match(/(?:Г[Ђ-џ]|В[Ђ-џ]|Ã.|Â.)/gu) ?? []).length;
|
||||
return cyrillic + latin - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2;
|
||||
const replacement = (source.match(/\uFFFD/g) ?? []).length;
|
||||
const c1Controls = (source.match(/[\u0080-\u009f]/g) ?? []).length;
|
||||
return cyrillic + latin - replacement * 8 - c1Controls * 5 - hardMarkers * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2;
|
||||
}
|
||||
function looksLikeMojibakeForAddress(value) {
|
||||
const source = String(value ?? "");
|
||||
|
|
@ -1918,16 +1920,36 @@ function looksLikeMojibakeForAddress(value) {
|
|||
}
|
||||
return false;
|
||||
}
|
||||
/**
 * Builds a byte buffer from text, one character (code point) at a time.
 *
 * Code points U+0080..U+009F are written as a single byte with the same value;
 * every other character is encoded with iconv-lite's "win1251" codec.
 *
 * @param {unknown} value - Text to convert; null/undefined are treated as "".
 * @returns {Buffer} The bytes of all characters, concatenated in input order.
 */
function encodeWin1251MojibakeBytesForAddress(value) {
    const chunks = [];
    for (const char of String(value ?? "")) {
        const code = char.codePointAt(0) ?? 0;
        if (code >= 0x80 && code <= 0x9f) {
            // C1 control code points bypass the codec and are kept as raw bytes.
            chunks.push(Buffer.from([code]));
            continue;
        }
        chunks.push(iconv.encode(char, "win1251"));
    }
    return Buffer.concat(chunks);
}
|
||||
/**
 * Converts text to bytes via encodeWin1251MojibakeBytesForAddress and reads those bytes as UTF-8.
 *
 * @param {unknown} value - Text to re-decode.
 * @returns {string} The UTF-8 interpretation of the produced bytes.
 */
function decodeUtf8FromWin1251MojibakeForAddress(value) {
    return encodeWin1251MojibakeBytesForAddress(value).toString("utf8");
}
|
||||
/**
 * Restores two known words whose Cyrillic letter "И" was lost and replaced by
 * a run of U+FFFD and/or "?" characters: "АЛЬТЕРНАТ…ВА" and "ОБЩ…Й".
 *
 * Matching is case-insensitive. Only the damaged run is replaced; the
 * surrounding letters are kept as written, and the restored letter takes the
 * case of the letter that follows the gap, so lower- or mixed-case text is not
 * forced to upper case. All-caps input yields the same result as a plain
 * upper-case substitution.
 *
 * @param {unknown} value - Text to repair; null/undefined are treated as "".
 * @returns {string} The text with the known damaged fragments restored.
 */
function repairKnownReplacementDamagedAddressText(value) {
    // Replacer shared by both patterns: head and tail are the intact letters
    // captured on either side of the damaged run.
    const restoreMissingLetter = (_match, head, tail) => {
        const following = tail.charAt(0);
        const followingIsLower = following === following.toLowerCase() && following !== following.toUpperCase();
        return `${head}${followingIsLower ? "\u0438" : "\u0418"}${tail}`;
    };
    return String(value ?? "")
        .replace(/(\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422)[\uFFFD?]+(\u0412\u0410)/giu, restoreMissingLetter)
        .replace(/(\u041e\u0411\u0429)[\uFFFD?]+(\u0419)/giu, restoreMissingLetter);
}
|
||||
function repairAddressMojibake(value) {
|
||||
const source = String(value ?? "");
|
||||
if (!looksLikeMojibakeForAddress(source)) {
|
||||
const source = repairKnownReplacementDamagedAddressText(String(value ?? ""));
|
||||
if (!looksLikeMojibakeForAddress(source) && !/[\u0080-\u009f\uFFFD]/.test(source)) {
|
||||
return source;
|
||||
}
|
||||
let candidate = source;
|
||||
for (let pass = 0; pass < 3; pass += 1) {
|
||||
let improved = false;
|
||||
try {
|
||||
const fromWin1251 = iconv.encode(candidate, "win1251").toString("utf8");
|
||||
const fromWin1251 = decodeUtf8FromWin1251MojibakeForAddress(candidate);
|
||||
if (textMojibakeScoreForAddress(fromWin1251) > textMojibakeScoreForAddress(candidate)) {
|
||||
candidate = fromWin1251;
|
||||
improved = true;
|
||||
|
|
@ -1942,11 +1964,16 @@ function repairAddressMojibake(value) {
|
|||
}
|
||||
}
|
||||
catch (_error) { }
|
||||
const repairedKnownText = repairKnownReplacementDamagedAddressText(candidate);
|
||||
if (repairedKnownText !== candidate) {
|
||||
candidate = repairedKnownText;
|
||||
improved = true;
|
||||
}
|
||||
if (!improved) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return candidate;
|
||||
return repairKnownReplacementDamagedAddressText(candidate);
|
||||
}
|
||||
function sanitizeOutgoingAssistantText(value, fallback = "Не смог сформировать читаемый ответ. Уточните запрос.") {
|
||||
const repaired = repairAddressMojibake(String(value ?? ""));
|
||||
|
|
@ -4268,18 +4295,25 @@ function scoreOrganizationMentionInMessage(message, organization) {
|
|||
variantScore = Math.max(variantScore, variant.length * 5);
|
||||
continue;
|
||||
}
|
||||
let fuzzyTokenScore = 0;
|
||||
const fuzzyMatched = messageTokens.some((messageToken) => {
|
||||
if (messageToken === variant) {
|
||||
fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * 5);
|
||||
return true;
|
||||
}
|
||||
if (messageToken.length >= 5 && variant.length >= 5) {
|
||||
return messageToken.startsWith(variant) || variant.startsWith(messageToken);
|
||||
const prefixMatched = messageToken.startsWith(variant) || variant.startsWith(messageToken);
|
||||
if (prefixMatched) {
|
||||
const prefixLength = Math.min(messageToken.length, variant.length);
|
||||
fuzzyTokenScore = Math.max(fuzzyTokenScore, variant.length * (prefixLength >= 7 ? 5 : 3));
|
||||
}
|
||||
return prefixMatched;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
if (fuzzyMatched) {
|
||||
matched = true;
|
||||
variantScore = Math.max(variantScore, Math.max(20, variant.length * 3));
|
||||
variantScore = Math.max(variantScore, Math.max(20, fuzzyTokenScore || variant.length * 3));
|
||||
}
|
||||
}
|
||||
if (matched) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,35 @@
|
|||
import { describe, expect, it } from "vitest";
|
||||
import { repairAddressMojibakeText } from "../src/services/addressTextRepair";
|
||||
|
||||
describe("address text mojibake repair", () => {
|
||||
it("preserves C1 bytes when repairing old autorun Russian text", () => {
|
||||
const damagedTitle = String.fromCharCode(
|
||||
0x420, 0x2018, 0x420, 0x45b, 0x420, 0x203a, 0x420, 0xac, 0x420, 0x401, 0x420, 0x45b, 0x420,
|
||||
0x2122, 0x20, 0x420, 0x45b, 0x420, 0x2018, 0x420, 0xa9, 0x420, 0x98, 0x420, 0x2122, 0x20,
|
||||
0x420, 0xa0, 0x421, 0x453, 0x421, 0x2021, 0x420, 0x405, 0x420, 0xb0, 0x421, 0x40f, 0x20,
|
||||
0x421, 0x403, 0x420, 0xb5, 0x421, 0x403, 0x421, 0x403, 0x420, 0x451, 0x421, 0x40f, 0x20,
|
||||
0x31, 0x36, 0x2e, 0x30, 0x34, 0x2e, 0x32, 0x30, 0x32, 0x36, 0x2c, 0x20, 0x32, 0x31,
|
||||
0x3a, 0x32, 0x36, 0x3a, 0x30, 0x36
|
||||
);
|
||||
const damagedAlternative = String.fromCharCode(
|
||||
0x420, 0x452, 0x420, 0x203a, 0x420, 0xac, 0x420, 0x45e, 0x420, 0x2022, 0x420, 0xa0, 0x420,
|
||||
0x45c, 0x420, 0x452, 0x420, 0x45e, 0x420, 0x98, 0x420, 0x2019, 0x420, 0x452
|
||||
);
|
||||
|
||||
expect(repairAddressMojibakeText(damagedTitle)).toBe(
|
||||
"\u0411\u041e\u041b\u042c\u0428\u041e\u0419 \u041e\u0411\u0429\u0418\u0419 \u0420\u0443\u0447\u043d\u0430\u044f \u0441\u0435\u0441\u0441\u0438\u044f 16.04.2026, 21:26:06"
|
||||
);
|
||||
expect(repairAddressMojibakeText(damagedAlternative)).toBe(
|
||||
"\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410"
|
||||
);
|
||||
});
|
||||
|
||||
it("repairs already lossy known replacement fragments", () => {
|
||||
expect(repairAddressMojibakeText("\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\uFFFD?\u0412\u0410")).toBe(
|
||||
"\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410"
|
||||
);
|
||||
expect(repairAddressMojibakeText("\u0411\u041e\u041b\u042c\u0428\u041e\u0419 \u041e\u0411\u0429\uFFFD\u0419")).toBe(
|
||||
"\u0411\u041e\u041b\u042c\u0428\u041e\u0419 \u041e\u0411\u0429\u0418\u0419"
|
||||
);
|
||||
});
|
||||
});
|
||||
|
|
@ -2276,6 +2276,101 @@ describe("assistant address follow-up carryover", () => {
|
|||
expect(normalizerService.normalize).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("continues the original inventory query after replacement-damaged organization clarification", async () => {
|
||||
const calls: Array<{ message: string; options?: any }> = [];
|
||||
const firstMessage = "кайф - что там на складе по остаткам?";
|
||||
const secondMessage = "АЛЬТЕРНАТ\uFFFD?ВА";
|
||||
const repairedSecondMessage = "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410";
|
||||
const addressQueryService = {
|
||||
tryHandle: vi.fn(async (message: string, options?: any) => {
|
||||
calls.push({ message, options });
|
||||
if (message === firstMessage) {
|
||||
return buildAddressLimitedLaneResult("missing_anchor", {
|
||||
reply_text: [
|
||||
"Нужно уточнить организацию, чтобы не смешивать компании в одном ответе.",
|
||||
"Сейчас в доступном контуре вижу такие организации:",
|
||||
"- ООО Альтернатива Плюс",
|
||||
"- ООО Лайсвуд",
|
||||
"- РАЙМ"
|
||||
].join("\n"),
|
||||
debug: {
|
||||
...buildAddressLimitedLaneResult("missing_anchor").debug,
|
||||
detected_intent: "inventory_on_hand_as_of_date",
|
||||
extracted_filters: {
|
||||
as_of_date: "2026-04-15"
|
||||
},
|
||||
selected_recipe: null,
|
||||
organization_candidates: ["ООО Альтернатива Плюс", "ООО Лайсвуд", "РАЙМ"],
|
||||
reasons: ["organization_clarification_required", "multiple_known_organizations_detected"]
|
||||
}
|
||||
});
|
||||
}
|
||||
if (
|
||||
(message === secondMessage || message === repairedSecondMessage) &&
|
||||
options?.followupContext &&
|
||||
options?.activeOrganization === "ООО Альтернатива Плюс"
|
||||
) {
|
||||
return buildAddressLaneResult({
|
||||
reply_text: "На 15.04.2026 по ООО Альтернатива Плюс подтвержден складской остаток по всем складам.",
|
||||
debug: {
|
||||
...buildAddressLaneResult().debug,
|
||||
detected_intent: "inventory_on_hand_as_of_date",
|
||||
extracted_filters: {
|
||||
as_of_date: "2026-04-15",
|
||||
organization: "ООО Альтернатива Плюс"
|
||||
},
|
||||
reasons: ["address_followup_context_applied", "organization_grounded_from_scope_candidates"]
|
||||
}
|
||||
});
|
||||
}
|
||||
return null;
|
||||
})
|
||||
} as any;
|
||||
|
||||
const normalizerService = {
|
||||
normalize: vi.fn(async () => ({
|
||||
assistant_reply: "normalizer_fallback_should_not_be_used",
|
||||
reply_type: "partial_coverage",
|
||||
debug: {}
|
||||
}))
|
||||
} as any;
|
||||
|
||||
const sessions = new AssistantSessionStore();
|
||||
const service = new AssistantService(
|
||||
normalizerService,
|
||||
sessions as any,
|
||||
{} as any,
|
||||
{ persistSession: vi.fn() } as any,
|
||||
addressQueryService
|
||||
);
|
||||
|
||||
const sessionId = `asst-address-org-clarification-damaged-${Date.now()}`;
|
||||
const first = await service.handleMessage({
|
||||
session_id: sessionId,
|
||||
user_message: firstMessage,
|
||||
useMock: true
|
||||
} as any);
|
||||
expect(first.ok).toBe(true);
|
||||
expect(first.reply_type).toBe("partial_coverage");
|
||||
|
||||
const second = await service.handleMessage({
|
||||
session_id: sessionId,
|
||||
user_message: secondMessage,
|
||||
useMock: true
|
||||
} as any);
|
||||
|
||||
expect(second.ok).toBe(true);
|
||||
expect(second.reply_type).toBe("factual");
|
||||
expect(calls).toHaveLength(2);
|
||||
expect(calls[1].message).toBe(repairedSecondMessage);
|
||||
expect(calls[1].options?.activeOrganization).toBe("ООО Альтернатива Плюс");
|
||||
expect(calls[1].options?.knownOrganizations).toEqual(["ООО Альтернатива Плюс", "ООО Лайсвуд", "РАЙМ"]);
|
||||
expect(calls[1].options?.followupContext?.previous_intent).toBe("inventory_on_hand_as_of_date");
|
||||
expect(calls[1].options?.followupContext?.previous_filters?.organization).toBe("ООО Альтернатива Плюс");
|
||||
expect(calls[1].options?.followupContext?.root_filters?.organization).toBe("ООО Альтернатива Плюс");
|
||||
expect(normalizerService.normalize).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("keeps historical inventory date follow-up alive after company clarification and a capability answer", async () => {
|
||||
const calls: Array<{ message: string; options?: any }> = [];
|
||||
const firstMessage = "покажи остатки по складу";
|
||||
|
|
|
|||
|
|
@ -42,6 +42,26 @@ describe("assistant organization matcher", () => {
|
|||
expect(score).toBeGreaterThanOrEqual(90);
|
||||
});
|
||||
|
||||
it("matches replacement-damaged organization clarification when the live candidate is unique", () => {
|
||||
const resolved = resolveOrganizationSelectionFromMessage("АЛЬТЕРНАТ\uFFFD?ВА", [
|
||||
"ООО Альтернатива Плюс",
|
||||
"ООО Лайсвуд",
|
||||
"РАЙМ"
|
||||
]);
|
||||
|
||||
expect(resolved).toBe("ООО Альтернатива Плюс");
|
||||
});
|
||||
|
||||
it("keeps replacement-damaged organization clarification ambiguous when candidates share the token", () => {
|
||||
const resolved = resolveOrganizationSelectionFromMessage("АЛЬТЕРНАТ\uFFFD?ВА", [
|
||||
"ООО Альтернатива Плюс",
|
||||
"ООО Альтернатива Минус",
|
||||
"ООО Лайсвуд"
|
||||
]);
|
||||
|
||||
expect(resolved).toBeNull();
|
||||
});
|
||||
|
||||
it("treats minor live label corruption as the same organization entity", () => {
|
||||
expect(organizationsLikelySameEntity("Альтернатива Плюс", 'ООО "Альтернати"а Плюс"')).toBe(true);
|
||||
});
|
||||
|
|
|
|||
|
|
@ -31,4 +31,26 @@ describe("autoruns question extraction", () => {
|
|||
expect(parsed[0]).toMatch(/поставщик/i);
|
||||
expect(parsed[0]).toMatch(/коротко/i);
|
||||
});
|
||||
|
||||
it("repairs old autorun C1-control mojibake before exposing cards and questions", () => {
|
||||
const damagedTitle = String.fromCharCode(
|
||||
0x420, 0x2018, 0x420, 0x45b, 0x420, 0x203a, 0x420, 0xac, 0x420, 0x401, 0x420, 0x45b, 0x420,
|
||||
0x2122, 0x20, 0x420, 0x45b, 0x420, 0x2018, 0x420, 0xa9, 0x420, 0x98, 0x420, 0x2122, 0x20,
|
||||
0x420, 0xa0, 0x421, 0x453, 0x421, 0x2021, 0x420, 0x405, 0x420, 0xb0, 0x421, 0x40f, 0x20,
|
||||
0x421, 0x403, 0x420, 0xb5, 0x421, 0x403, 0x421, 0x403, 0x420, 0x451, 0x421, 0x40f, 0x20,
|
||||
0x31, 0x36, 0x2e, 0x30, 0x34, 0x2e, 0x32, 0x30, 0x32, 0x36, 0x2c, 0x20, 0x32, 0x31,
|
||||
0x3a, 0x32, 0x36, 0x3a, 0x30, 0x36
|
||||
);
|
||||
const damagedAlternative = String.fromCharCode(
|
||||
0x420, 0x452, 0x420, 0x203a, 0x420, 0xac, 0x420, 0x45e, 0x420, 0x2022, 0x420, 0xa0, 0x420,
|
||||
0x45c, 0x420, 0x452, 0x420, 0x45e, 0x420, 0x98, 0x420, 0x2019, 0x420, 0x452
|
||||
);
|
||||
|
||||
expect(__autoRunsQuestionTestUtils.repairAutogenMojibake(damagedTitle)).toBe(
|
||||
"\u0411\u041e\u041b\u042c\u0428\u041e\u0419 \u041e\u0411\u0429\u0418\u0419 \u0420\u0443\u0447\u043d\u0430\u044f \u0441\u0435\u0441\u0441\u0438\u044f 16.04.2026, 21:26:06"
|
||||
);
|
||||
expect(__autoRunsQuestionTestUtils.repairAutogenMojibake(damagedAlternative)).toBe(
|
||||
"\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410"
|
||||
);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -33,4 +33,37 @@ describe("eval runtime question splitting", () => {
|
|||
expect(parsed[0]).toMatch(/поставщик/i);
|
||||
expect(parsed[0]).toMatch(/коротко/i);
|
||||
});
|
||||
|
||||
it("repairs mojibake questions before runtime job materialization", () => {
|
||||
const parsed = __evalRouteTestUtils.normalizeRuntimeQuestions([
|
||||
"кайф - что там на складе по остаткам?"
|
||||
]);
|
||||
|
||||
expect(parsed).toEqual(["кайф - что там на складе по остаткам?"]);
|
||||
});
|
||||
|
||||
it("repairs damaged clarification as one scenario turn when splitting is disabled", () => {
|
||||
const parsed = __evalRouteTestUtils.normalizeRuntimeQuestions(
|
||||
["\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\uFFFD?\u0412\u0410"],
|
||||
{
|
||||
dedupe: false,
|
||||
splitCandidates: false
|
||||
}
|
||||
);
|
||||
|
||||
expect(parsed).toEqual(["\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410"]);
|
||||
});
|
||||
|
||||
it("repairs C1-control autorun clarification before runtime job materialization", () => {
|
||||
const damagedAlternative = String.fromCharCode(
|
||||
0x420, 0x452, 0x420, 0x203a, 0x420, 0xac, 0x420, 0x45e, 0x420, 0x2022, 0x420, 0xa0, 0x420,
|
||||
0x45c, 0x420, 0x452, 0x420, 0x45e, 0x420, 0x98, 0x420, 0x2019, 0x420, 0x452
|
||||
);
|
||||
const parsed = __evalRouteTestUtils.normalizeRuntimeQuestions([damagedAlternative], {
|
||||
dedupe: false,
|
||||
splitCandidates: false
|
||||
});
|
||||
|
||||
expect(parsed).toEqual(["\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410"]);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -0,0 +1,123 @@
|
|||
{
|
||||
"suite_id": "assistant_saved_session_runtime_job-elT8zDcc9I",
|
||||
"suite_version": "0.1.0",
|
||||
"schema_version": "assistant_saved_session_runtime_v0_1",
|
||||
"title": "БОЛЬШОЙ ОБЩРР™ Ручная сессия 16.04.2026, 21:26:06",
|
||||
"scenario_count": 1,
|
||||
"case_ids": [
|
||||
"SAVED-001"
|
||||
],
|
||||
"cases": [
|
||||
{
|
||||
"case_id": "SAVED-001",
|
||||
"scenario_tag": "saved_user_sessions_runtime",
|
||||
"title": "БОЛЬШОЙ ОБЩРР™ Ручная сессия 16.04.2026, 21:26:06",
|
||||
"question_type": "followup",
|
||||
"broadness_level": "medium",
|
||||
"turns": [
|
||||
{
|
||||
"user_message": "приветик - че как там дела"
|
||||
},
|
||||
{
|
||||
"user_message": "расскажи что можешь интересного"
|
||||
},
|
||||
{
|
||||
"user_message": "кайф - что там на складе по остаткам?"
|
||||
},
|
||||
{
|
||||
"user_message": "АЛЬТЕРНАТ<D090>?ВА"
|
||||
},
|
||||
{
|
||||
"user_message": "а исторические остатки на другие даты умеешь?"
|
||||
},
|
||||
{
|
||||
"user_message": "давай на июль 2017"
|
||||
},
|
||||
{
|
||||
"user_message": "март 2016"
|
||||
},
|
||||
{
|
||||
"user_message": "По выбранному объекту \"Рабочая станция универсального специалиста (индивидуальное изготовление)\": где взяли это?"
|
||||
},
|
||||
{
|
||||
"user_message": "а кому продали?"
|
||||
},
|
||||
{
|
||||
"user_message": "у тебя написано кто контрагент: рабочая станция - это ошибка?"
|
||||
},
|
||||
{
|
||||
"user_message": "ндс можешь прикинуть на дату покупки рабочей станции?"
|
||||
},
|
||||
{
|
||||
"user_message": "а какой ндс мы должны сгрузить на март 2020?"
|
||||
},
|
||||
{
|
||||
"user_message": "прикинь какой ндс нам надо заплатить на февраль 2017"
|
||||
},
|
||||
{
|
||||
"user_message": "кто у нас самый доходный клиент за все время"
|
||||
},
|
||||
{
|
||||
"user_message": "кто нам должен денег на май 2017"
|
||||
},
|
||||
{
|
||||
"user_message": "а какой ндс мы должны примерно заплатить за этот период?"
|
||||
},
|
||||
{
|
||||
"user_message": "мы должны комуто денег на сегодня?"
|
||||
},
|
||||
{
|
||||
"user_message": "а нам?"
|
||||
},
|
||||
{
|
||||
"user_message": "какой у нас самый доходный год"
|
||||
},
|
||||
{
|
||||
"user_message": "а за 2017 мы скок заработали?"
|
||||
},
|
||||
{
|
||||
"user_message": "сколько вообще денег мы заработали за все время?"
|
||||
},
|
||||
{
|
||||
"user_message": "ты умеешь считать дельту по договорам?"
|
||||
},
|
||||
{
|
||||
"user_message": "по чепурнову покажи все доки"
|
||||
},
|
||||
{
|
||||
"user_message": "а по свк"
|
||||
},
|
||||
{
|
||||
"user_message": "а сейчас у нас есть что на складе?"
|
||||
},
|
||||
{
|
||||
"user_message": "что нам отгружал чепурнов? какой товар или услугу?"
|
||||
},
|
||||
{
|
||||
"user_message": "какие остатки на складе на сегодня"
|
||||
},
|
||||
{
|
||||
"user_message": "остатки на март 2016"
|
||||
},
|
||||
{
|
||||
"user_message": "хвосты покажи по счету 60 на август 2022"
|
||||
},
|
||||
{
|
||||
"user_message": "Есть ли остатки товара, которые закупались очень давно"
|
||||
},
|
||||
{
|
||||
"user_message": "Какие конкретно номенклатуры формируют остаток по складу на май 2020"
|
||||
},
|
||||
{
|
||||
"user_message": "а по Альтернативе Плюс сколько лет активности в базе 1С?"
|
||||
},
|
||||
{
|
||||
"user_message": "Как ты оценишь деятельность компании?"
|
||||
},
|
||||
{
|
||||
"user_message": "какое нетто по деньгам с Группа СВК за 2020 год: сколько получили и сколько заплатили?"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
Loading…
Reference in New Issue