// Source listing metadata: 103 lines, 4.3 KiB, JavaScript.
"use strict";
/**
 * Default-import interop helper (looks like the usual compiler-emitted shim).
 *
 * Reuses `this.__importDefault` when one is already present; otherwise defines
 * a function that returns `mod` unchanged when it is flagged `__esModule`, and
 * wraps anything else as `{ default: mod }` so `.default` is always readable.
 *
 * @param {*} mod - Value returned by `require(...)`.
 * @returns {*} `mod` itself, or `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
// Flag the export object so interop helpers treat it as an ES-module namespace.
Object.defineProperty(exports, "__esModule", { value: true });
// Public API. Both names are function declarations later in this file, so
// hoisting has already bound them when these assignments run.
exports.repairAddressMojibakeText = repairAddressMojibakeText;
exports.normalizeRussianComparableText = normalizeRussianComparableText;
// iconv-lite supplies the "win1251" encoder used by encodeWin1251MojibakeBytes.
const iconv_lite_1 = __importDefault(require("iconv-lite"));
/**
 * Regex character class (as source text) for the characters that a UTF-8
 * continuation byte (0x80-0xBF) turns into when the bytes are misread as
 * Windows-1251 (or left as raw Latin-1/C1 code points).
 *
 * The last seven entries cover Windows-1251 bytes 0x85, 0x89, 0x8B, 0x9B,
 * 0xA5, 0xB4 and 0xB9. Without them the mojibake forms of common letters such
 * as "д" (Р + U+0491), "й" (Р + U+2116) and "х" (С + U+2026) are not counted.
 */
const MOJIBAKE_CONTINUATION_CLASS = `[${[
    "\\u0080-\\u00bf", // raw C1 controls and the Latin-1 symbol range
    "\\u0401-\\u040f", // upper-case Cyrillic extensions
    "\\u0451-\\u045f", // lower-case Cyrillic extensions
    "\\u2018-\\u201e", // curly quotes and low quotes
    "\\u2020-\\u2022", // dagger, double dagger, bullet
    "\\u2013-\\u2014", // en dash, em dash
    "\\u2122", // trade mark sign
    "\\u20ac", // euro sign
    "\\u2026", // horizontal ellipsis (byte 0x85)
    "\\u2030", // per mille sign (byte 0x89)
    "\\u2039\\u203a", // single angle quotes (bytes 0x8B, 0x9B)
    "\\u0490\\u0491", // GHE with upturn (bytes 0xA5, 0xB4)
    "\\u2116", // numero sign (byte 0xB9)
].join("")}]`;
/**
 * Matches one mojibake pair: "Р" (U+0420) or "С" (U+0421) — what the UTF-8
 * lead bytes 0xD0 / 0xD1 look like in Windows-1251 — followed by one
 * continuation character from the class above. Global, so it can be counted.
 */
const MOJIBAKE_PAIR_PATTERN = new RegExp(`(?:[\\u0420\\u0421]${MOJIBAKE_CONTINUATION_CLASS})`, "gu");
/**
 * Collapses every run of whitespace into a single space and trims both ends.
 *
 * @param {*} value - Text to compact; null/undefined are treated as "".
 * @returns {string} The compacted text.
 */
function compactWhitespace(value) {
    // Coerce like the other helpers in this file instead of throwing on null.
    return String(value ?? "").replace(/\s+/g, " ").trim();
}
/**
 * Counts how many times `pattern` matches in `value`.
 *
 * @param {*} value - Text to search; null/undefined are treated as "".
 * @param {RegExp|string} pattern - Pattern to count.
 * @returns {number} Number of matches (0 when there are none).
 */
function countMatches(value, pattern) {
    // A non-global RegExp makes String#match return [fullMatch, ...groups],
    // whose length is not a match count, so count with a global copy instead.
    const countingPattern = pattern instanceof RegExp && !pattern.global
        ? new RegExp(pattern.source, `${pattern.flags}g`)
        : pattern;
    return (String(value ?? "").match(countingPattern) ?? []).length;
}
/**
 * Heuristic readability score used to compare decoding candidates; higher
 * means the text looks less like mojibake.
 *
 * Each Cyrillic (U+0400-U+04FF) or ASCII Latin letter adds 1. Penalties:
 * U+FFFD costs 8, a C1 control (U+0080-U+009F) costs 5, a
 * MOJIBAKE_PAIR_PATTERN match costs 3, a double-encoding marker costs 2.
 *
 * @param {*} value - Text to score; null/undefined are treated as "".
 * @returns {number} The score (may be negative).
 */
function textMojibakeScore(value) {
    const source = String(value ?? "");
    const cyrillic = countMatches(source, /[\u0400-\u04ff]/g);
    const latin = countMatches(source, /[A-Za-z]/g);
    const replacement = countMatches(source, /\uFFFD/g);
    const c1Controls = countMatches(source, /[\u0080-\u009f]/g);
    const pairMarkers = countMatches(source, MOJIBAKE_PAIR_PATTERN);
    // Same marker pattern as in looksLikeAddressMojibake; keep the two in sync.
    const doubleEncodedMarkers = countMatches(source, /(?:\u0420[\u00a0-\u00bf]\u0421|\u0413[\u0080-\u00bf]|\u00c3.|\u00c2.)/gu);
    return cyrillic + latin - replacement * 8 - c1Controls * 5 - pairMarkers * 3 - doubleEncodedMarkers * 2;
}
/**
 * Cheap pre-check deciding whether a repair attempt is worthwhile.
 *
 * Returns true when the text contains any C1 control (U+0080-U+009F) or
 * U+FFFD, at least two MOJIBAKE_PAIR_PATTERN matches, or at least two
 * double-encoding markers. Blank text is never considered mojibake.
 *
 * @param {*} value - Text to inspect; null/undefined are treated as "".
 * @returns {boolean} True when the text looks mis-decoded.
 */
function looksLikeAddressMojibake(value) {
    const source = String(value ?? "");
    if (!source.trim()) {
        return false;
    }
    if (/[\u0080-\u009f\uFFFD]/.test(source)) {
        return true;
    }
    if (countMatches(source, MOJIBAKE_PAIR_PATTERN) >= 2) {
        return true;
    }
    // Same marker pattern as in textMojibakeScore; keep the two in sync.
    return countMatches(source, /(?:\u0420[\u00a0-\u00bf]\u0421|\u0413[\u0080-\u00bf]|\u00c3.|\u00c2.)/gu) >= 2;
}
/**
 * Turns mis-decoded text back into the bytes it was presumably decoded from.
 *
 * Code points U+0080-U+009F are emitted as the raw byte of the same value;
 * every other character is encoded with iconv-lite's "win1251" codec.
 *
 * @param {*} value - Mis-decoded text; null/undefined are treated as "".
 * @returns {Buffer} The reconstructed byte sequence.
 */
function encodeWin1251MojibakeBytes(value) {
    const chunks = [];
    // for...of walks whole code points, so `char` is never empty.
    for (const char of String(value ?? "")) {
        const code = char.codePointAt(0);
        if (code >= 0x80 && code <= 0x9f) {
            // Pass C1 controls through as their own byte value instead of
            // handing them to the win1251 encoder.
            chunks.push(Buffer.from([code]));
            continue;
        }
        chunks.push(iconv_lite_1.default.encode(char, "win1251"));
    }
    return Buffer.concat(chunks);
}
/**
 * Re-encodes the text with encodeWin1251MojibakeBytes and decodes the
 * resulting bytes as UTF-8.
 *
 * @param {*} value - Mis-decoded text.
 * @returns {string} The UTF-8 reading of the reconstructed bytes.
 */
function decodeUtf8FromWin1251Mojibake(value) {
    return encodeWin1251MojibakeBytes(value).toString("utf8");
}
/**
 * Restores the letter "И" in two known words where it was lost and replaced
 * by one or more U+FFFD or "?" characters: "АЛЬТЕРНАТ…ВА" and "ОБЩ…Й".
 *
 * Matching is case-insensitive. Only the damaged run is replaced, and the
 * restored letter takes the case of the letter before it, so the rest of the
 * word keeps its original casing.
 *
 * @param {*} value - Text to repair; null/undefined are treated as "".
 * @returns {string} The text with the known damaged words repaired.
 */
function repairKnownReplacementDamagedRussianText(value) {
    // Rebuilds the word from its intact parts plus a case-matched "И"/"и".
    const restoreMissingLetter = (match, head, tail) => {
        const precedingLetter = head.slice(-1);
        const isUpperCase = precedingLetter === precedingLetter.toUpperCase();
        return `${head}${isUpperCase ? "\u0418" : "\u0438"}${tail}`;
    };
    return String(value ?? "")
        .replace(/(\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422)[\uFFFD?]+(\u0412\u0410)/giu, restoreMissingLetter)
        .replace(/(\u041e\u0411\u0429)[\uFFFD?]+(\u0419)/giu, restoreMissingLetter);
}
/**
 * Best-effort repair of mis-decoded (mojibake) Russian text.
 *
 * Known damaged words are repaired first. If the result does not look like
 * mojibake it is returned as is. Otherwise up to three passes are made; each
 * pass tries a Windows-1251 → UTF-8 reinterpretation, then a Latin-1 → UTF-8
 * reinterpretation, and keeps a reinterpretation only when it scores strictly
 * higher in textMojibakeScore. The loop stops once a pass changes nothing.
 *
 * @param {*} value - Text to repair; null/undefined are treated as "".
 * @returns {string} The repaired text, or the input text when no candidate
 *     scored better.
 */
function repairAddressMojibakeText(value) {
    const source = String(value ?? "");
    const sourceWithKnownRepairs = repairKnownReplacementDamagedRussianText(source);
    if (!looksLikeAddressMojibake(sourceWithKnownRepairs)) {
        return sourceWithKnownRepairs;
    }
    let candidate = sourceWithKnownRepairs;
    let candidateScore = textMojibakeScore(candidate);
    // Applies one reinterpretation and adopts it only if it scores higher.
    const adoptIfBetter = (reinterpret) => {
        try {
            const decoded = reinterpret(candidate);
            const decodedScore = textMojibakeScore(decoded);
            if (decodedScore > candidateScore) {
                candidate = decoded;
                candidateScore = decodedScore;
                return true;
            }
        }
        catch {
            // Deliberate best effort: a failed decode keeps the current candidate.
        }
        return false;
    };
    for (let pass = 0; pass < 3; pass += 1) {
        // Both reinterpretations run on every pass (no short-circuit); the
        // Latin-1 attempt sees whatever the Windows-1251 attempt produced.
        const improvedByWin1251 = adoptIfBetter(decodeUtf8FromWin1251Mojibake);
        const improvedByLatin1 = adoptIfBetter((text) => Buffer.from(text, "latin1").toString("utf8"));
        const repairedKnownText = repairKnownReplacementDamagedRussianText(candidate);
        const improvedByKnownRepair = repairedKnownText !== candidate;
        if (improvedByKnownRepair) {
            candidate = repairedKnownText;
            candidateScore = textMojibakeScore(candidate);
        }
        if (!improvedByWin1251 && !improvedByLatin1 && !improvedByKnownRepair) {
            break;
        }
    }
    return repairKnownReplacementDamagedRussianText(candidate);
}
/**
 * Produces a canonical form of Russian text for comparisons: mojibake is
 * repaired, the text is lower-cased, whitespace is compacted, and "ё" is
 * folded to "е".
 *
 * @param {*} value - Text to normalize; null/undefined are treated as "".
 * @returns {string} The normalized, comparable text.
 */
function normalizeRussianComparableText(value) {
    const repaired = repairAddressMojibakeText(String(value ?? ""));
    // Lower-casing first means a single "ё" → "е" replacement also covers "Ё".
    return compactWhitespace(repaired.toLowerCase()).replace(/\u0451/g, "\u0435");
}