NODEDC_1C/llm_normalizer/backend/dist/services/addressTextRepair.js

103 lines
4.3 KiB
JavaScript

"use strict";
// Module interop helper (this has the shape of the helper the TypeScript
// compiler emits for default imports — NOTE(review): confirm it is generated).
// Reuses an existing `this.__importDefault` when one is present; otherwise
// defines a function that returns `mod` untouched if it carries a truthy
// `__esModule` flag, and wraps it as `{ default: mod }` in every other case.
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.repairAddressMojibakeText = repairAddressMojibakeText;
exports.normalizeRussianComparableText = normalizeRussianComparableText;
const iconv_lite_1 = __importDefault(require("iconv-lite"));
// Regex character-class source (a string, spliced into the pattern below).
// It lists the characters accepted as the second half of a "mojibake pair":
// U+0080-U+00BF, U+0401-U+040F, U+0451-U+045F, U+2018-U+201E, U+2020-U+2022,
// U+2013-U+2014, U+2122 and U+20AC.
// NOTE(review): this looks like an approximation of what Windows-1251 shows
// for bytes 0x80-0xBF (UTF-8 continuation bytes) — confirm against the
// code page table.
const MOJIBAKE_CONTINUATION_CLASS = "[\\u0080-\\u00bf\\u0401-\\u040f\\u0451-\\u045f\\u2018-\\u201e\\u2020-\\u2022\\u2013-\\u2014\\u2122\\u20ac]";
// Global, Unicode-mode pattern matching U+0420 or U+0421 immediately followed
// by one character from the class above. It is only used through
// String.prototype.match in this file, so the `g` flag serves to collect every
// occurrence for counting.
// NOTE(review): U+0420/U+0421 are presumably the Windows-1251 renderings of
// the UTF-8 lead bytes 0xD0/0xD1 used by Cyrillic letters — confirm.
const MOJIBAKE_PAIR_PATTERN = new RegExp(`(?:[\\u0420\\u0421]${MOJIBAKE_CONTINUATION_CLASS})`, "gu");
/**
 * Squeeze every run of whitespace down to one space and drop leading and
 * trailing whitespace.
 *
 * @param {string} value - Text to tidy. No coercion is performed, so a
 *   non-string (e.g. `null`) makes the `.replace` call throw.
 * @returns {string} The single-spaced, trimmed text.
 */
function compactWhitespace(value) {
    const singleSpaced = value.replace(/\s+/g, " ");
    return singleSpaced.trim();
}
/**
 * Report how many entries `String.prototype.match` yields for a pattern.
 *
 * With a global pattern this is the number of occurrences. With a non-global
 * pattern it is the length of the single match array (full match plus capture
 * groups), because the result of `.match` is measured as-is.
 *
 * @param {*} value - Text to search; `null`/`undefined` are treated as "" and
 *   anything else is converted with `String(...)`.
 * @param {RegExp} pattern - Pattern handed to `.match`.
 * @returns {number} Length of the match array, or 0 when nothing matched.
 */
function countMatches(value, pattern) {
    const text = String(value ?? "");
    const found = text.match(pattern);
    if (found === null || found === undefined) {
        return 0;
    }
    return found.length;
}
/**
 * Heuristic readability score for a text: higher means "less damaged".
 *
 * Every character in U+0400-U+04FF and every ASCII letter adds 1. Points are
 * deducted for signs of encoding damage: 8 per U+FFFD, 5 per character in
 * U+0080-U+009F, 3 per MOJIBAKE_PAIR_PATTERN hit and 2 per hit of the
 * double-encoding marker pattern. The result may be negative.
 *
 * @param {*} value - Text to score; nullish values count as "".
 * @returns {number} Integer score used to compare decode candidates.
 */
function textMojibakeScore(value) {
    const text = String(value ?? "");
    const tally = (pattern) => countMatches(text, pattern);
    // Readable letters: the Cyrillic block plus ASCII letters.
    const readableLetters = tally(/[\u0400-\u04ff]/g) + tally(/[A-Za-z]/g);
    // Weighted evidence of damage, strongest signal first.
    const damagePenalty = tally(/\uFFFD/g) * 8
        + tally(/[\u0080-\u009f]/g) * 5
        + tally(MOJIBAKE_PAIR_PATTERN) * 3
        + tally(/(?:\u0420[\u00a0-\u00bf]\u0421|\u0413[\u0080-\u00bf]|\u00c3.|\u00c2.)/gu) * 2;
    return readableLetters - damagePenalty;
}
/**
 * Cheap pre-check deciding whether a text is worth running through the
 * decode-and-score repair loop.
 *
 * Blank input is never flagged. Otherwise the text is flagged when it contains
 * any character in U+0080-U+009F or U+FFFD, or at least two
 * MOJIBAKE_PAIR_PATTERN hits, or at least two double-encoding marker hits.
 *
 * @param {*} value - Text to inspect; nullish values count as "".
 * @returns {boolean} True when the text shows signs of encoding damage.
 */
function looksLikeAddressMojibake(value) {
    const text = String(value ?? "");
    if (text.trim() === "") {
        return false;
    }
    // A single C1 control or replacement character is enough on its own.
    const hasHardEvidence = /[\u0080-\u009f\uFFFD]/.test(text);
    // The softer markers must occur at least twice before they count.
    return hasHardEvidence
        || countMatches(text, MOJIBAKE_PAIR_PATTERN) >= 2
        || countMatches(text, /(?:\u0420[\u00a0-\u00bf]\u0421|\u0413[\u0080-\u00bf]|\u00c3.|\u00c2.)/gu) >= 2;
}
/**
 * Turn a text back into the byte sequence it would have had before being
 * (mis)read as Windows-1251.
 *
 * The text is walked code point by code point. Code points U+0080-U+009F are
 * emitted as the single raw byte of the same value; every other character is
 * encoded through iconv-lite with the "win1251" codec.
 * NOTE(review): what iconv-lite emits for characters outside Windows-1251 is
 * not visible here — presumably a substitute byte; confirm.
 *
 * @param {*} value - Text to encode; nullish values count as "".
 * @returns {Buffer} Concatenation of the per-character byte chunks.
 */
function encodeWin1251MojibakeBytes(value) {
    const perCharBytes = Array.from(String(value ?? ""), (char) => {
        const codePoint = char.codePointAt(0) ?? 0;
        const isC1Control = codePoint >= 0x80 && codePoint <= 0x9f;
        // C1 controls pass through as their own byte value instead of going
        // through the win1251 codec.
        return isC1Control
            ? Buffer.from([codePoint])
            : iconv_lite_1.default.encode(char, "win1251");
    });
    return Buffer.concat(perCharBytes);
}
/**
 * Re-read a text as UTF-8 after converting it back to bytes with
 * encodeWin1251MojibakeBytes.
 *
 * @param {*} value - Text to reinterpret.
 * @returns {string} The recovered bytes decoded as UTF-8.
 */
function decodeUtf8FromWin1251Mojibake(value) {
    const recoveredBytes = encodeWin1251MojibakeBytes(value);
    return recoveredBytes.toString("utf8");
}
/**
 * Restore two specific Russian words whose middle letter was lost to one or
 * more U+FFFD or "?" characters.
 *
 * Matching ignores case, but the replacement is always the upper-case
 * spelling, so a lower-case damaged word comes back upper-cased.
 *
 * @param {*} value - Text to patch; nullish values count as "".
 * @returns {string} The text with every damaged occurrence replaced.
 */
function repairKnownReplacementDamagedRussianText(value) {
    // Each entry: [damaged-word pattern, intact upper-case spelling].
    const knownRepairs = [
        [
            /\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422[\uFFFD?]+\u0412\u0410/giu,
            "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410",
        ],
        [
            /\u041e\u0411\u0429[\uFFFD?]+\u0419/giu,
            "\u041e\u0411\u0429\u0418\u0419",
        ],
    ];
    let text = String(value ?? "");
    for (const [damagedWord, intactWord] of knownRepairs) {
        text = text.replace(damagedWord, intactWord);
    }
    return text;
}
/**
 * Best-effort repair of text that was decoded with the wrong character
 * encoding.
 *
 * The known-word repairs are applied first. If the result does not look
 * damaged (see looksLikeAddressMojibake) it is returned right away. Otherwise
 * up to three passes are made; each pass tries, in order, a
 * Windows-1251-to-UTF-8 reinterpretation and a Latin-1-to-UTF-8
 * reinterpretation, accepting a candidate only when its textMojibakeScore is
 * strictly higher than the current text's, and then re-applies the known-word
 * repairs. The loop stops early after a pass that changed nothing.
 *
 * @param {*} value - Text to repair; nullish values count as "".
 * @returns {string} The best-scoring text found.
 */
function repairAddressMojibakeText(value) {
    const original = String(value ?? "");
    let current = repairKnownReplacementDamagedRussianText(original);
    if (!looksLikeAddressMojibake(current)) {
        return current;
    }
    // Order matters: the second attempt is scored against whatever the first
    // attempt left in `current`.
    const reinterpretations = [
        (text) => decodeUtf8FromWin1251Mojibake(text),
        (text) => Buffer.from(text, "latin1").toString("utf8"),
    ];
    const maxPasses = 3;
    for (let pass = 0; pass < maxPasses; pass += 1) {
        let improved = false;
        for (const reinterpret of reinterpretations) {
            try {
                const decoded = reinterpret(current);
                if (textMojibakeScore(decoded) > textMojibakeScore(current)) {
                    current = decoded;
                    improved = true;
                }
            }
            catch {
                // Ignore decode failures and keep the current candidate.
            }
        }
        // A successful decode can surface new damaged known words.
        const patched = repairKnownReplacementDamagedRussianText(current);
        if (patched !== current) {
            current = patched;
            improved = true;
        }
        if (!improved) {
            break;
        }
    }
    return repairKnownReplacementDamagedRussianText(current);
}
/**
 * Produce a canonical form of a text for comparisons.
 *
 * Steps, in order: repair encoding damage, lower-case, collapse and trim
 * whitespace, then replace every U+0451 with U+0435.
 *
 * @param {*} value - Text to normalize; nullish values count as "".
 * @returns {string} The normalized text.
 */
function normalizeRussianComparableText(value) {
    const repaired = repairAddressMojibakeText(String(value ?? ""));
    const lowered = repaired.toLowerCase();
    const compacted = compactWhitespace(lowered);
    return compacted.replace(/\u0451/g, "\u0435");
}