"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.repairAddressMojibakeText = repairAddressMojibakeText;
exports.normalizeRussianComparableText = normalizeRussianComparableText;
const iconv_lite_1 = __importDefault(require("iconv-lite"));

/**
 * Character class (regex source) for the second character of a
 * "UTF-8 read as Windows-1251" pair.
 *
 * Besides the raw U+0080-U+00BF range it lists the code points that
 * Windows-1251 assigns to bytes 0x80-0xBF. U+0490, U+0491, U+2026, U+2030,
 * U+2039, U+203A and U+2116 (bytes 0xA5, 0xB4, 0x85, 0x89, 0x8B, 0x9B, 0xB9)
 * are included so that every continuation byte that has a Windows-1251
 * mapping is recognised.
 */
const MOJIBAKE_CONTINUATION_CLASS = "[\\u0080-\\u00bf\\u0401-\\u040f\\u0451-\\u045f\\u0490\\u0491\\u2018-\\u201e\\u2020-\\u2022\\u2013-\\u2014\\u2026\\u2030\\u2039\\u203a\\u2116\\u2122\\u20ac]";

/**
 * Matches U+0420 or U+0421 (the Windows-1251 reading of the UTF-8 lead bytes
 * 0xD0 / 0xD1) followed by one character from MOJIBAKE_CONTINUATION_CLASS.
 */
const MOJIBAKE_PAIR_PATTERN = new RegExp(`(?:[\\u0420\\u0421]${MOJIBAKE_CONTINUATION_CLASS})`, "gu");

/**
 * Sequences this module treats as signs of text that was mis-decoded more
 * than once: U+0420 + U+00A0-U+00BF + U+0421, U+0413 + U+0080-U+00BF, and
 * U+00C3 / U+00C2 followed by any character.
 *
 * Shared by scoring and detection. It is only ever used through
 * String.prototype.match, which resets lastIndex for global patterns, so
 * sharing one instance is safe.
 */
const DOUBLE_ENCODED_MARKER_PATTERN = /(?:\u0420[\u00a0-\u00bf]\u0421|\u0413[\u0080-\u00bf]|\u00c3.|\u00c2.)/gu;

/** Upper bound on decode rounds attempted by repairAddressMojibakeText. */
const MAX_REPAIR_PASSES = 3;

/**
 * Collapses every whitespace run to a single space and trims both ends.
 *
 * @param {string} value - Text to compact.
 * @returns {string} The compacted text.
 */
function compactWhitespace(value) {
    return value.replace(/\s+/g, " ").trim();
}

/**
 * Counts the matches of a global pattern in a value.
 *
 * @param {*} value - Value to inspect; null/undefined count as "".
 * @param {RegExp} pattern - Pattern to count; callers pass a /g pattern.
 * @returns {number} Number of matches (0 when there are none).
 */
function countMatches(value, pattern) {
    return (String(value ?? "").match(pattern) ?? []).length;
}

/**
 * Scores how "healthy" a piece of text looks; higher is better.
 *
 * Cyrillic and ASCII Latin letters add one point each. U+FFFD characters,
 * C1 control characters, mojibake pairs and double-encoded markers subtract
 * 8, 5, 3 and 2 points each respectively.
 *
 * @param {*} value - Text to score; null/undefined count as "".
 * @returns {number} The score.
 */
function textMojibakeScore(value) {
    const source = String(value ?? "");
    const cyrillic = countMatches(source, /[\u0400-\u04ff]/g);
    const latin = countMatches(source, /[A-Za-z]/g);
    const replacement = countMatches(source, /\uFFFD/g);
    const c1Controls = countMatches(source, /[\u0080-\u009f]/g);
    const pairMarkers = countMatches(source, MOJIBAKE_PAIR_PATTERN);
    const doubleEncodedMarkers = countMatches(source, DOUBLE_ENCODED_MARKER_PATTERN);
    return cyrillic + latin
        - replacement * 8
        - c1Controls * 5
        - pairMarkers * 3
        - doubleEncodedMarkers * 2;
}

/**
 * Decides whether a repair attempt is worthwhile for the given text.
 *
 * @param {*} value - Text to inspect; null/undefined count as "".
 * @returns {boolean} True when the text contains a C1 control or U+FFFD
 *   character, at least two mojibake pairs, or at least two double-encoded
 *   markers. Blank text always yields false.
 */
function looksLikeAddressMojibake(value) {
    const source = String(value ?? "");
    if (!source.trim()) {
        return false;
    }
    if (/[\u0080-\u009f\uFFFD]/.test(source)) {
        return true;
    }
    if (countMatches(source, MOJIBAKE_PAIR_PATTERN) >= 2) {
        return true;
    }
    return countMatches(source, DOUBLE_ENCODED_MARKER_PATTERN) >= 2;
}

/**
 * Converts text back into the bytes it would have had as Windows-1251.
 *
 * @param {*} value - Text to encode; null/undefined count as "".
 * @returns {Buffer} The concatenated bytes of all characters.
 */
function encodeWin1251MojibakeBytes(value) {
    const chunks = [];
    for (const char of String(value ?? "")) {
        const code = char.codePointAt(0) ?? 0;
        if (code >= 0x80 && code <= 0x9f) {
            // C1 controls are emitted as the byte of the same value instead
            // of going through iconv (presumably because they stand for raw
            // bytes that had no Windows-1251 mapping when first decoded).
            chunks.push(Buffer.from([code]));
            continue;
        }
        chunks.push(iconv_lite_1.default.encode(char, "win1251"));
    }
    return Buffer.concat(chunks);
}

/**
 * Re-reads text as UTF-8 after encoding it back to Windows-1251 bytes.
 *
 * @param {*} value - Suspected mojibake text.
 * @returns {string} The bytes from encodeWin1251MojibakeBytes decoded as UTF-8.
 */
function decodeUtf8FromWin1251Mojibake(value) {
    return encodeWin1251MojibakeBytes(value).toString("utf8");
}

/**
 * Re-reads text as UTF-8 after encoding it back to Latin-1 bytes.
 *
 * @param {string} value - Suspected mojibake text.
 * @returns {string} The Latin-1 bytes of the text decoded as UTF-8.
 */
function decodeUtf8FromLatin1Mojibake(value) {
    return Buffer.from(value, "latin1").toString("utf8");
}

/**
 * Decoders tried on every repair pass, in this order: Windows-1251 first,
 * then Latin-1. Each later decoder sees the result of the earlier ones.
 */
const MOJIBAKE_DECODERS = [decodeUtf8FromWin1251Mojibake, decodeUtf8FromLatin1Mojibake];

/**
 * Restores two known words whose middle letter was replaced by one or more
 * U+FFFD or "?" characters.
 *
 * Matching is case-insensitive; the replacement is always the upper-case
 * word (U+0410 U+041B U+042C U+0422 U+0415 U+0420 U+041D U+0410 U+0422
 * U+0418 U+0412 U+0410, and U+041E U+0411 U+0429 U+0418 U+0419).
 *
 * @param {*} value - Text to patch; null/undefined count as "".
 * @returns {string} The text with the known damaged words restored.
 */
function repairKnownReplacementDamagedRussianText(value) {
    return String(value ?? "")
        .replace(/\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422[\uFFFD?]+\u0412\u0410/giu, "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410")
        .replace(/\u041e\u0411\u0429[\uFFFD?]+\u0419/giu, "\u041e\u0411\u0429\u0418\u0419");
}

/**
 * Attempts to undo encoding damage in a piece of text.
 *
 * Known damaged words are patched first. If the result does not look like
 * mojibake it is returned as is. Otherwise up to MAX_REPAIR_PASSES rounds
 * are run; in each round every decoder's output replaces the current
 * candidate only when it scores strictly higher, and the rounds stop as
 * soon as one changes nothing.
 *
 * @param {*} value - Text to repair; null/undefined count as "".
 * @returns {string} The best-scoring candidate found.
 */
function repairAddressMojibakeText(value) {
    const source = String(value ?? "");
    const sourceWithKnownRepairs = repairKnownReplacementDamagedRussianText(source);
    if (!looksLikeAddressMojibake(sourceWithKnownRepairs)) {
        return sourceWithKnownRepairs;
    }
    let candidate = sourceWithKnownRepairs;
    for (let pass = 0; pass < MAX_REPAIR_PASSES; pass += 1) {
        let improved = false;
        for (const decode of MOJIBAKE_DECODERS) {
            try {
                const decoded = decode(candidate);
                if (textMojibakeScore(decoded) > textMojibakeScore(candidate)) {
                    candidate = decoded;
                    improved = true;
                }
            }
            catch {
                // Best effort: ignore decode failures and keep the current candidate.
            }
        }
        const repairedKnownText = repairKnownReplacementDamagedRussianText(candidate);
        if (repairedKnownText !== candidate) {
            candidate = repairedKnownText;
            improved = true;
        }
        if (!improved) {
            break;
        }
    }
    return repairKnownReplacementDamagedRussianText(candidate);
}

/**
 * Produces a comparison key for Russian text: the repaired text is
 * lower-cased, its whitespace is compacted, and U+0451 is replaced by U+0435.
 *
 * @param {*} value - Text to normalise; null/undefined count as "".
 * @returns {string} The normalised text.
 */
function normalizeRussianComparableText(value) {
    const repaired = repairAddressMojibakeText(String(value ?? ""));
    return compactWhitespace(repaired.toLowerCase()).replace(/\u0451/g, "\u0435");
}