// File listing metadata: 111 lines, 3.8 KiB, TypeScript.
import iconv from "iconv-lite";
|
|
|
|
/**
 * Regex character class (as source text, for use with the `u` flag) of the
 * characters that a UTF-8 continuation byte (0x80-0xBF) turns into when the
 * bytes are mis-read as Windows-1251.
 *
 * The class is a superset of the original one: it additionally lists the
 * Windows-1251 characters for bytes 0x85, 0x89, 0x8B, 0x9B, 0xA5, 0xB4 and
 * 0xB9, which were missing. Without them the mojibake of common letters such
 * as "д" (U+0420 U+0491), "й" (U+0420 U+2116) or "х" (U+0421 U+2026) was not
 * recognised as a pair.
 */
const MOJIBAKE_CONTINUATION_CLASS =
  "[" +
  // C1 controls and the Latin-1 symbols shared with Windows-1251.
  "\\u0080-\\u00bf" +
  // Cyrillic extension letters located in the 0x80-0xBF half of Windows-1251.
  "\\u0401-\\u040f\\u0451-\\u045f\\u0490\\u0491" +
  // Typographic quotes, daggers, bullet, ellipsis, per-mille, angle quotes, dashes.
  "\\u2018-\\u201e\\u2020-\\u2022\\u2026\\u2030\\u2039\\u203a\\u2013-\\u2014" +
  // Trade mark, numero and euro signs.
  "\\u2122\\u2116\\u20ac" +
  "]";

/**
 * Matches one "mojibake pair": U+0420 or U+0421 (what the UTF-8 lead bytes
 * 0xD0 / 0xD1 look like in Windows-1251) immediately followed by a character
 * from {@link MOJIBAKE_CONTINUATION_CLASS}.
 *
 * The pattern is global, so it is meant to be used with `String.prototype.match`
 * for counting; do not rely on its `lastIndex`.
 */
const MOJIBAKE_PAIR_PATTERN = new RegExp(`(?:[\\u0420\\u0421]${MOJIBAKE_CONTINUATION_CLASS})`, "gu");
|
|
|
|
/**
 * Collapses every run of whitespace (as matched by `\s`) into a single space
 * and trims the result.
 *
 * @param value Text to compact; `null`/`undefined` are treated as an empty
 *   string, matching the coercion used by the other helpers in this file.
 * @returns The compacted text.
 */
function compactWhitespace(value: string): string {
  return String(value ?? "")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
|
|
/**
 * Counts the non-overlapping matches of `pattern` in `value`.
 *
 * @param value Text to search; `null`/`undefined` are treated as an empty string.
 * @param pattern Pattern to count. If it is not global, a global copy is used:
 *   `String.prototype.match` with a non-global pattern returns the first match
 *   plus its capture groups, whose length is not a match count.
 * @returns The number of matches, `0` when there are none.
 */
function countMatches(value: string, pattern: RegExp): number {
  const source = String(value ?? "");
  const globalPattern = pattern.global ? pattern : new RegExp(pattern.source, `${pattern.flags}g`);
  return source.match(globalPattern)?.length ?? 0;
}
|
|
|
|
/**
 * Heuristic readability score used to compare decoding candidates: higher
 * means "looks more like real text".
 *
 * Every Cyrillic (U+0400-U+04FF) or ASCII Latin letter adds 1. Penalties are
 * subtracted per occurrence:
 * - U+FFFD replacement character: 8
 * - C1 control (U+0080-U+009F): 5
 * - Windows-1251 mojibake pair (`MOJIBAKE_PAIR_PATTERN`): 3
 * - double-encoding marker sequence: 2
 *
 * The score is only meaningful relative to the score of another candidate
 * for the same text.
 *
 * @param value Text to score; `null`/`undefined` are treated as an empty string.
 * @returns The score; may be negative.
 */
function textMojibakeScore(value: string): number {
  const source = String(value ?? "");

  // Evidence of readable text.
  const readableLetters = countMatches(source, /[\u0400-\u04ff]/g) + countMatches(source, /[A-Za-z]/g);

  // Evidence of damage, each kind with its own weight.
  const replacementPenalty = countMatches(source, /\uFFFD/g) * 8;
  const c1ControlPenalty = countMatches(source, /[\u0080-\u009f]/g) * 5;
  const pairPenalty = countMatches(source, MOJIBAKE_PAIR_PATTERN) * 3;
  const doubleEncodedPenalty =
    countMatches(source, /(?:\u0420[\u00a0-\u00bf]\u0421|\u0413[\u0080-\u00bf]|\u00c3.|\u00c2.)/gu) * 2;

  return readableLetters - replacementPenalty - c1ControlPenalty - pairPenalty - doubleEncodedPenalty;
}
|
|
|
|
/**
 * Cheap pre-check that decides whether a repair attempt is worthwhile.
 *
 * The text is considered suspicious when any of the following holds:
 * - it contains a C1 control (U+0080-U+009F) or the U+FFFD replacement character;
 * - it contains at least two Windows-1251 mojibake pairs (`MOJIBAKE_PAIR_PATTERN`);
 * - it contains at least two double-encoding marker sequences.
 *
 * @param value Text to inspect; `null`/`undefined` are treated as an empty string.
 * @returns `true` when the text looks damaged; always `false` for blank text.
 */
function looksLikeAddressMojibake(value: string): boolean {
  const source = String(value ?? "");
  if (source.trim() === "") {
    return false;
  }

  // A single control or replacement character is already conclusive.
  if (/[\u0080-\u009f\uFFFD]/.test(source)) {
    return true;
  }

  // Pair-style evidence needs two hits so that one accidental pair is ignored.
  if (countMatches(source, MOJIBAKE_PAIR_PATTERN) >= 2) {
    return true;
  }

  const doubleEncodedMarkers = countMatches(
    source,
    /(?:\u0420[\u00a0-\u00bf]\u0421|\u0413[\u0080-\u00bf]|\u00c3.|\u00c2.)/gu,
  );
  return doubleEncodedMarkers >= 2;
}
|
|
|
|
/**
 * Converts mojibake text back into the bytes it was presumably decoded from.
 *
 * Characters U+0080-U+009F are emitted as the single byte with the same
 * value; everything else is encoded with iconv's "win1251" codec.
 *
 * The text is split into runs so that iconv is called once per run instead of
 * once per character. The codec maps each UTF-16 code unit independently, so
 * the resulting bytes are the same as with per-character encoding.
 *
 * @param value Text to encode; `null`/`undefined` are treated as an empty string.
 * @returns The reconstructed byte sequence (empty for empty input).
 */
function encodeWin1251MojibakeBytes(value: string): Buffer {
  const source = String(value ?? "");

  // Each segment is either exactly one C1 control or a maximal run without any.
  const segments = source.match(/[\u0080-\u009f]|[^\u0080-\u009f]+/g) ?? [];

  const chunks = segments.map((segment) => {
    const code = segment.charCodeAt(0);
    const isC1Control = code >= 0x80 && code <= 0x9f;
    // C1 controls bypass iconv and keep their code point as the byte value.
    return isC1Control ? Buffer.from([code]) : iconv.encode(segment, "win1251");
  });

  return Buffer.concat(chunks);
}
|
|
|
|
/**
 * Undoes one "UTF-8 bytes read as Windows-1251" round: the text is turned
 * back into bytes with `encodeWin1251MojibakeBytes` and those bytes are
 * decoded as UTF-8.
 *
 * @param value Suspected mojibake text.
 * @returns The re-decoded text. The caller is expected to check whether the
 *   result is actually better than the input.
 */
function decodeUtf8FromWin1251Mojibake(value: string): string {
  const rawBytes = encodeWin1251MojibakeBytes(value);
  return rawBytes.toString("utf8");
}
|
|
|
|
/**
 * Restores specific Russian words in which one letter was lost and replaced
 * by one or more U+FFFD characters or literal question marks.
 *
 * Matching is case-insensitive, but the restored word is always written in
 * upper case, whatever the case of the damaged input.
 *
 * @param value Text to repair; `null`/`undefined` are treated as an empty string.
 * @returns The text with every known damaged word replaced.
 */
function repairKnownReplacementDamagedRussianText(value: string): string {
  // Ordered list of [damaged-word pattern, restored word].
  const knownRepairs: ReadonlyArray<readonly [RegExp, string]> = [
    [
      /\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422[\uFFFD?]+\u0412\u0410/giu,
      "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410",
    ],
    [/\u041e\u0411\u0429[\uFFFD?]+\u0419/giu, "\u041e\u0411\u0429\u0418\u0419"],
  ];

  let repaired = String(value ?? "");
  for (const [damagedWord, restoredWord] of knownRepairs) {
    repaired = repaired.replace(damagedWord, restoredWord);
  }
  return repaired;
}
|
|
|
|
/**
 * Attempts to repair text that was decoded with the wrong character encoding.
 *
 * Steps:
 * 1. Apply the known damaged-word repairs. If the result does not look like
 *    mojibake (`looksLikeAddressMojibake`), return it as is.
 * 2. Otherwise run up to three passes. In each pass the current candidate is
 *    re-decoded first as "UTF-8 read as Windows-1251", then as "UTF-8 read as
 *    Latin-1". A re-decoding is accepted only when its `textMojibakeScore` is
 *    strictly higher than the candidate's. The known-word repairs are applied
 *    again at the end of the pass.
 * 3. Stop as soon as a pass changes nothing.
 *
 * The function is best-effort: decoding failures are ignored, and the input
 * is returned (with known-word repairs) when nothing improves it.
 *
 * @param value Possibly damaged text; `null`/`undefined` are treated as an
 *   empty string.
 * @returns The best candidate found.
 */
export function repairAddressMojibakeText(value: string): string {
  const source = String(value ?? "");
  let candidate = repairKnownReplacementDamagedRussianText(source);
  if (!looksLikeAddressMojibake(candidate)) {
    return candidate;
  }

  // Tried in this order on every pass; the second decoder sees the result of
  // the first one when that was accepted.
  const decoders: ReadonlyArray<(text: string) => string> = [
    decodeUtf8FromWin1251Mojibake,
    // NOTE(review): Node documents that the "latin1" encoding truncates code
    // points above U+00FF to their low byte - confirm this is acceptable for
    // text that mixes mojibake with intact non-Latin-1 characters.
    (text) => Buffer.from(text, "latin1").toString("utf8"),
  ];

  // Several passes handle text that was mis-decoded more than once.
  const maxPasses = 3;
  for (let pass = 0; pass < maxPasses; pass += 1) {
    let improved = false;

    for (const decode of decoders) {
      try {
        const decoded = decode(candidate);
        if (textMojibakeScore(decoded) > textMojibakeScore(candidate)) {
          candidate = decoded;
          improved = true;
        }
      } catch {
        // Ignore decode failures and keep the current candidate.
      }
    }

    // Decoding may have exposed a known damaged word.
    const withKnownRepairs = repairKnownReplacementDamagedRussianText(candidate);
    if (withKnownRepairs !== candidate) {
      candidate = withKnownRepairs;
      improved = true;
    }

    if (!improved) {
      break;
    }
  }

  return repairKnownReplacementDamagedRussianText(candidate);
}
|
|
|
|
/**
 * Produces a canonical form of Russian text suitable for equality comparison.
 *
 * The value is converted to a string (`null`/`undefined` become `""`), passed
 * through `repairAddressMojibakeText`, lower-cased, whitespace-compacted, and
 * finally every "ё" (U+0451) is replaced by "е" (U+0435).
 *
 * @param value Any value; non-strings are converted with `String()`.
 * @returns The normalized text.
 */
export function normalizeRussianComparableText(value: unknown): string {
  const repaired = repairAddressMojibakeText(String(value ?? ""));
  const compacted = compactWhitespace(repaired.toLowerCase());
  // Lower-casing has already turned "Ё" into "ё", so one replacement covers both.
  return compacted.replace(/\u0451/g, "\u0435");
}
|