NODEDC_1C/llm_normalizer/backend/dist/services/addressTextRepair.js

74 lines
2.7 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"use strict";
// Module-interop helper. Reuses `this.__importDefault` when the top-level
// `this` already provides one; otherwise defines a function that returns
// `mod` unchanged when it carries a truthy `__esModule` flag and wraps any
// other value as `{ default: mod }`, so callers can always read `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.repairAddressMojibakeText = repairAddressMojibakeText;
exports.normalizeRussianComparableText = normalizeRussianComparableText;
const iconv_lite_1 = __importDefault(require("iconv-lite"));
/**
 * Collapses every run of whitespace into a single space and strips
 * whitespace from both ends.
 *
 * @param {string} value - Text to compact; must be a string.
 * @returns {string} The compacted text.
 */
function compactWhitespace(value) {
    const singleSpaced = value.replace(/\s+/g, " ");
    return singleSpaced.trim();
}
/**
 * Heuristic readability score: higher means the text looks more like real
 * text, lower means it looks more like mojibake.
 *
 * Each Cyrillic (U+0400-U+04FF) or ASCII Latin letter adds 1. Each U+FFFD
 * replacement character subtracts 3. Each two-character "marker" sequence
 * (presumably the lead bytes of UTF-8 Cyrillic seen through Windows-1251 or
 * Latin-1, once or twice over) subtracts 2.
 *
 * @param {unknown} value - Text to score; null/undefined are scored as "".
 * @returns {number} Signed score, meaningful only when compared with the
 *   score of another candidate string.
 */
function textMojibakeScore(value) {
    const source = String(value ?? "");
    const countMatches = (pattern) => (source.match(pattern) ?? []).length;

    const cyrillic = countMatches(/[\u0400-\u04ff]/g);
    const latin = countMatches(/[A-Za-z]/g);
    // Count U+FFFD only. Written as an escape so the pattern cannot be
    // mangled into a class of ordinary characters (digits, letters, angle
    // brackets) that would penalise perfectly valid addresses.
    const replacement = countMatches(/\uFFFD/g);
    // NOTE: "Р" and "С" below are Cyrillic capitals, not Latin P/C.
    const pairMarkers = countMatches(/(?:Р.|С.|Ð.|Ñ.)/g);
    const doubleEncodedMarkers = countMatches(/(?:Р“[Р-џ]|Р[Р-џ]|Ã.|Â.)/gu);

    return cyrillic + latin - replacement * 3 - pairMarkers * 2 - doubleEncodedMarkers * 2;
}
/**
 * Cheap pre-check deciding whether a repair attempt is worthwhile.
 *
 * Returns true when the text contains a U+FFFD replacement character, or at
 * least two single-pass marker pairs, or at least two double-encoded marker
 * sequences. Blank or whitespace-only input is never considered mojibake.
 *
 * @param {unknown} value - Text to inspect; null/undefined are treated as "".
 * @returns {boolean} True when the text shows signs of a broken encoding.
 */
function looksLikeAddressMojibake(value) {
    const source = String(value ?? "");
    if (!source.trim()) {
        return false;
    }
    // Look for the actual replacement character. A character class of
    // ordinary symbols here would flag any address containing e.g. "5".
    if (source.includes("\uFFFD")) {
        return true;
    }
    const countMatches = (pattern) => (source.match(pattern) ?? []).length;
    // A single hit is tolerated because these sequences can occur in valid
    // text; two or more are treated as a signal.
    // NOTE: "Р" and "С" below are Cyrillic capitals, not Latin P/C.
    if (countMatches(/(?:Р.|С.|Ð.|Ñ.)/g) >= 2) {
        return true;
    }
    return countMatches(/(?:Р“[Р-џ]|Р[Р-џ]|Ã.|Â.)/gu) >= 2;
}
/**
 * Attempts to undo mojibake in a piece of text.
 *
 * Input that does not look like mojibake is returned unchanged (after string
 * coercion). Otherwise up to three passes are made; in each pass the current
 * text is re-encoded as Windows-1251 and then as Latin-1, each time reading
 * the resulting bytes back as UTF-8. A decoded variant replaces the current
 * text only when its `textMojibakeScore` is strictly higher. The loop stops
 * early once a full pass brings no improvement.
 *
 * @param {unknown} value - Text to repair; null/undefined are treated as "".
 * @returns {string} The best-scoring variant found, or the coerced input.
 */
function repairAddressMojibakeText(value) {
    const original = String(value ?? "");
    if (!looksLikeAddressMojibake(original)) {
        return original;
    }

    // Order matters: the Latin-1 attempt sees whatever the Windows-1251
    // attempt may already have produced within the same pass.
    const decoders = [
        (text) => iconv_lite_1.default.encode(text, "win1251").toString("utf8"),
        (text) => Buffer.from(text, "latin1").toString("utf8"),
    ];

    let best = original;
    let passesLeft = 3;
    while (passesLeft > 0) {
        passesLeft -= 1;
        let changed = false;
        for (const decode of decoders) {
            try {
                const decoded = decode(best);
                if (textMojibakeScore(decoded) > textMojibakeScore(best)) {
                    best = decoded;
                    changed = true;
                }
            }
            catch {
                // Ignore decode failures and keep the current candidate.
            }
        }
        if (!changed) {
            break;
        }
    }
    return best;
}
/**
 * Produces a canonical form of Russian text suitable for comparison:
 * mojibake is repaired, the result is lower-cased, whitespace runs are
 * collapsed and trimmed, and finally every "ё" is replaced with "е".
 *
 * @param {unknown} value - Text to normalise; null/undefined are treated as "".
 * @returns {string} The normalised text.
 */
function normalizeRussianComparableText(value) {
    const repaired = repairAddressMojibakeText(String(value ?? ""));
    const lowered = repaired.toLowerCase();
    const compacted = compactWhitespace(lowered);
    return compacted.replace(/ё/g, "е");
}