// Source path: NODEDC_1C/llm_normalizer/backend/tests/addressTextRepair.test.ts
// (file-viewer header captured with this copy: 36 lines, 1.9 KiB, TypeScript)

import { describe, expect, it } from "vitest";
import { repairAddressMojibakeText } from "../src/services/addressTextRepair";
// Regression tests for repairAddressMojibakeText: each case feeds a damaged
// string to the function and checks the exact repaired Cyrillic text.
describe("address text mojibake repair", () => {
  // Turns a list of UTF-16 code units into the string they spell.
  const decodeUnits = (units: readonly number[]): string => String.fromCharCode(...units);

  it("preserves C1 bytes when repairing old autorun Russian text", () => {
    // The code units below appear to be the UTF-8 bytes of the expected text
    // read back as Windows-1251. Note the raw U+0098 unit (a C1 control code),
    // which the repair must not lose for the expected text to come out intact.
    const garbledTitleUnits = [
      // first word of the expected title, then a space
      0x420, 0x2018, 0x420, 0x45b, 0x420, 0x203a, 0x420, 0xac, 0x420, 0x401, 0x420, 0x45b, 0x420, 0x2122, 0x20,
      // second word (contains the U+0098 unit), then a space
      0x420, 0x45b, 0x420, 0x2018, 0x420, 0xa9, 0x420, 0x98, 0x420, 0x2122, 0x20,
      // third word, then a space
      0x420, 0xa0, 0x421, 0x453, 0x421, 0x2021, 0x420, 0x405, 0x420, 0xb0, 0x421, 0x40f, 0x20,
      // fourth word, then a space
      0x421, 0x403, 0x420, 0xb5, 0x421, 0x403, 0x421, 0x403, 0x420, 0x451, 0x421, 0x40f, 0x20,
      // plain ASCII date "16.04.2026," and a space
      0x31, 0x36, 0x2e, 0x30, 0x34, 0x2e, 0x32, 0x30, 0x32, 0x36, 0x2c, 0x20,
      // plain ASCII time "21:26:06"
      0x32, 0x31, 0x3a, 0x32, 0x36, 0x3a, 0x30, 0x36,
    ];
    // Single damaged word; it also carries a raw U+0098 unit.
    const garbledAlternativeUnits = [
      0x420, 0x452, 0x420, 0x203a, 0x420, 0xac, 0x420, 0x45e,
      0x420, 0x2022, 0x420, 0xa0, 0x420, 0x45c, 0x420, 0x452,
      0x420, 0x45e, 0x420, 0x98, 0x420, 0x2019, 0x420, 0x452,
    ];

    const titleInput = decodeUnits(garbledTitleUnits);
    const alternativeInput = decodeUnits(garbledAlternativeUnits);

    const repairedTitle = repairAddressMojibakeText(titleInput);
    expect(repairedTitle).toBe(
      "\u0411\u041e\u041b\u042c\u0428\u041e\u0419 \u041e\u0411\u0429\u0418\u0419 \u0420\u0443\u0447\u043d\u0430\u044f \u0441\u0435\u0441\u0441\u0438\u044f 16.04.2026, 21:26:06"
    );

    const repairedAlternative = repairAddressMojibakeText(alternativeInput);
    expect(repairedAlternative).toBe(
      "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410"
    );
  });

  it("repairs already lossy known replacement fragments", () => {
    // Each pair is [input containing U+FFFD, expected repaired text].
    // In both cases the expected text has U+0418 where the input has the
    // replacement character (followed by "?" in the first input).
    const lossyCases: ReadonlyArray<readonly [string, string]> = [
      [
        "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\uFFFD?\u0412\u0410",
        "\u0410\u041b\u042c\u0422\u0415\u0420\u041d\u0410\u0422\u0418\u0412\u0410",
      ],
      [
        "\u0411\u041e\u041b\u042c\u0428\u041e\u0419 \u041e\u0411\u0429\uFFFD\u0419",
        "\u0411\u041e\u041b\u042c\u0428\u041e\u0419 \u041e\u0411\u0429\u0418\u0419",
      ],
    ];

    for (const [lossyInput, expectedText] of lossyCases) {
      expect(repairAddressMojibakeText(lossyInput)).toBe(expectedText);
    }
  });
});