74 lines
2.7 KiB
JavaScript
74 lines
2.7 KiB
JavaScript
"use strict";
|
||
/**
 * CommonJS/ES-module interop helper.
 *
 * Reuses an `__importDefault` already present on `this` when there is one;
 * otherwise defines it. The helper returns `mod` untouched when it is flagged
 * with `__esModule`, and wraps anything else as `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
|
||
Object.defineProperty(exports, "__esModule", { value: true });
|
||
exports.repairAddressMojibakeText = repairAddressMojibakeText;
|
||
exports.normalizeRussianComparableText = normalizeRussianComparableText;
|
||
const iconv_lite_1 = __importDefault(require("iconv-lite"));
|
||
/**
 * Collapses every run of whitespace into a single space and strips
 * leading/trailing whitespace.
 *
 * @param {string} value - Text to compact.
 * @returns {string} The compacted text.
 */
function compactWhitespace(value) {
    const collapsed = value.replace(/\s+/g, " ");
    return collapsed.trim();
}
|
||
/**
 * Heuristic readability score used to compare candidate decodings of a text.
 * Higher means "looks more like real text".
 *
 * Each Cyrillic or ASCII Latin letter adds 1. Each U+FFFD replacement
 * character subtracts 3. Each mojibake marker pair subtracts 2.
 *
 * @param {*} value - Text to score; null/undefined are treated as "".
 * @returns {number} The score (may be negative).
 */
function textMojibakeScore(value) {
    const source = String(value ?? "");
    const count = (pattern) => (source.match(pattern) ?? []).length;
    const cyrillic = count(/[\u0400-\u04ff]/g);
    const latin = count(/[A-Za-z]/g);
    // U+FFFD is what a decoder emits for bytes it could not decode. It is
    // written as an escape so the pattern cannot be garbled into matching
    // ordinary characters such as "<", "5", "B" or ">".
    const replacement = count(/\uFFFD/g);
    // Lead characters that appear when 2-byte UTF-8 sequences are read as a
    // single-byte code page, each followed by any one character.
    const pairMarkers = count(/(?:Р.|С.|Ð.|Ñ.)/g);
    // Markers of text that has been mis-decoded more than once.
    const doubleEncodedMarkers = count(/(?:Р“[Р-џ]|Р’[Р-џ]|Ã.|Â.)/gu);
    return cyrillic + latin - replacement * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2;
}
|
||
/**
 * Cheap pre-check that decides whether a text is worth attempting to repair.
 *
 * Returns true when the text contains a U+FFFD replacement character, or at
 * least two mojibake marker pairs of either kind. Blank input is never
 * considered mojibake.
 *
 * @param {*} value - Text to inspect; null/undefined are treated as "".
 * @returns {boolean} True when the text looks mis-decoded.
 */
function looksLikeAddressMojibake(value) {
    const source = String(value ?? "");
    if (!source.trim()) {
        return false;
    }
    // U+FFFD is written as an escape so that plain "<", "5", "B" or ">" in an
    // address (e.g. "5B Main St") cannot be mistaken for it.
    if (source.includes("\uFFFD")) {
        return true;
    }
    const count = (pattern) => (source.match(pattern) ?? []).length;
    // A single marker pair can occur in legitimate text, so require two.
    if (count(/(?:Р.|С.|Ð.|Ñ.)/g) >= 2) {
        return true;
    }
    return count(/(?:Р“[Р-џ]|Р’[Р-џ]|Ã.|Â.)/gu) >= 2;
}
|
||
/**
 * Attempts to undo mis-decoded (mojibake) text.
 *
 * Text that does not look like mojibake is returned as-is (after String
 * coercion). Otherwise up to three rounds are run. In each round two
 * re-decodings are tried in order: re-encode as windows-1251 and read the
 * bytes as UTF-8, then re-encode as latin1 and read the bytes as UTF-8.
 * An attempt replaces the current text only when it scores strictly higher
 * under textMojibakeScore. The loop stops early after a round with no gain.
 *
 * @param {*} value - Text to repair; null/undefined are treated as "".
 * @returns {string} The best-scoring text found, or the input unchanged.
 */
function repairAddressMojibakeText(value) {
    const source = String(value ?? "");
    if (!looksLikeAddressMojibake(source)) {
        return source;
    }
    // Order matters: the latin1 attempt sees the result of the win1251 one.
    const redecoders = [
        (text) => iconv_lite_1.default.encode(text, "win1251").toString("utf8"),
        (text) => Buffer.from(text, "latin1").toString("utf8"),
    ];
    let best = source;
    for (let round = 0; round < 3; round += 1) {
        let gained = false;
        for (const redecode of redecoders) {
            try {
                const attempt = redecode(best);
                if (textMojibakeScore(attempt) > textMojibakeScore(best)) {
                    best = attempt;
                    gained = true;
                }
            }
            catch {
                // Best effort: a failed attempt leaves the current text in place.
            }
        }
        if (!gained) {
            break;
        }
    }
    return best;
}
|
||
/**
 * Produces a canonical form of a Russian text for comparison.
 *
 * Steps, in order: coerce to string (null/undefined become ""), repair
 * mojibake, lower-case, collapse whitespace, then replace "ё" with "е".
 *
 * @param {*} value - Text to normalise.
 * @returns {string} The normalised text.
 */
function normalizeRussianComparableText(value) {
    const repaired = repairAddressMojibakeText(String(value ?? ""));
    const lowered = repaired.toLowerCase();
    const compacted = compactWhitespace(lowered);
    return compacted.replace(/ё/g, "е");
}
|