NODEDC_1C/llm_normalizer/backend/dist/services/companyAnchorResolver.js

152 lines
6.5 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"use strict";
// Flags the exports object as an ES-module interop target (presumably emitted by
// the compiler that produced this dist file — confirm against the build setup).
Object.defineProperty(exports, "__esModule", { value: true });
// Export binding for resolveCompanyAnchors, the function declared at the bottom
// of this file (function declarations are hoisted, so the early reference is safe).
exports.resolveCompanyAnchors = resolveCompanyAnchors;
// Anchor-extraction patterns. All of them carry the `g` flag because they are
// consumed through String.prototype.matchAll.
//
// NOTE on word boundaries: in JavaScript `\b` is defined over ASCII word
// characters only, even with the `u` flag. It therefore never fires between
// whitespace and a Cyrillic letter. Wherever a boundary touches Cyrillic text the
// patterns below use the Unicode-aware lookarounds `(?<![\p{L}\p{N}_])` and
// `(?![\p{L}\p{N}_])` instead; plain `\b` is kept only next to ASCII digits.

// "договор" (optionally inflected), an optional number sign (№, #, n), then the
// contract number, which is the single capture group.
const CONTRACT_PATTERN = /(?:\u0434\u043e\u0433\u043e\u0432\u043e\u0440(?:а|у|ом|е)?\s*(?:№|#|n)?\s*([a-zа-я0-9./_-]+))/giu;
// Document keyword ("счет"/"счёт", optionally "-фактура/-фактуры", "реализация/реализации",
// "акт"), a mandatory number sign, then the document number (single capture group).
const DOCUMENT_NUMBER_PATTERN = /(?:(?:\u0441\u0447(?:\u0435|\u0451)\u0442(?:-\u0444\u0430\u043a\u0442\u0443\u0440(?:а|ы))?|\u0440\u0435\u0430\u043b\u0438\u0437\u0430\u0446(?:ия|ии)|\u0430\u043a\u0442)\s*(?:№|#|n)\s*([a-zа-я0-9./_-]+))/giu;
// Either a numeric date (d.m.y or d/m/y) or "<day> <month name in genitive>".
// The numeric branch keeps the ASCII `\b` so that "12.01.2024г." still matches;
// the month-name branch ends in a Unicode-aware lookahead, because a trailing `\b`
// after a Cyrillic letter made "5 января" unmatchable.
const DATE_PATTERN = /\b(?:\d{1,2}[./]\d{1,2}[./]\d{2,4}\b|\d{1,2}\s+(?:\u044f\u043d\u0432\u0430\u0440\u044f|\u0444\u0435\u0432\u0440\u0430\u043b\u044f|\u043c\u0430\u0440\u0442\u0430|\u0430\u043f\u0440\u0435\u043b\u044f|\u043c\u0430\u044f|\u0438\u044e\u043d\u044f|\u0438\u044e\u043b\u044f|\u0430\u0432\u0433\u0443\u0441\u0442\u0430|\u0441\u0435\u043d\u0442\u044f\u0431\u0440\u044f|\u043e\u043a\u0442\u044f\u0431\u0440\u044f|\u043d\u043e\u044f\u0431\u0440\u044f|\u0434\u0435\u043a\u0430\u0431\u0440\u044f)(?![\p{L}\p{N}_]))/giu;
// Amounts: digit groups separated by a space or NBSP with optional 2-digit
// fraction, or a plain number with a mandatory 2-digit fraction.
const AMOUNT_PATTERN = /\b(?:\d{1,3}(?:[ \u00A0]\d{3})+(?:[.,]\d{2})?|\d+[.,]\d{2})\b/gu;
// Account keyword ("счет"/"счёт" with a few inflections, "account", "schet"),
// an optional №/#/: separator, then the account code "NN" or "NN.NN" (capture 1).
// Boundaries around the keyword are Unicode-aware so Russian text actually
// matches; `(?!\d)` keeps the capture from being the first two digits of a
// longer number such as "6000".
const CONTEXTUAL_ACCOUNT_PATTERN = /(?:(?<![\p{L}\p{N}_])(?:\u0441\u0447(?:\u0435|\u0451)\u0442(?:а|у|ом|ов)?|account|schet)(?![\p{L}\p{N}_])\s*(?:№|#|:)?\s*)(\d{2}(?:\.\d{2})?)(?!\d)/giu;
// Two "NN.NN" codes separated by a slash; capture 1 is the left code, capture 2 the right.
const ACCOUNT_PAIR_PATTERN = /\b(\d{2}\.\d{2})\s*\/\s*(\d{2}\.\d{2})\b/gu;
// Periods: "20YY" with an optional "-MM" / ".MM" / "/MM" part, or
// "<month name in nominative> 20YY". The month branch is guarded by a
// Unicode-aware lookbehind; the previous leading `\b` could not match before a
// Cyrillic letter, so only the bare year was ever returned.
const PERIOD_PATTERN = /(?:\b20\d{2}(?:[-./](?:0?[1-9]|1[0-2]))?|(?<![\p{L}\p{N}_])(?:\u0438\u044e\u043b\u044c|\u0438\u044e\u043d\u044c|\u0430\u0432\u0433\u0443\u0441\u0442|\u0441\u0435\u043d\u0442\u044f\u0431\u0440\u044c|\u043e\u043a\u0442\u044f\u0431\u0440\u044c|\u043d\u043e\u044f\u0431\u0440\u044c|\u0434\u0435\u043a\u0430\u0431\u0440\u044c|\u044f\u043d\u0432\u0430\u0440\u044c|\u0444\u0435\u0432\u0440\u0430\u043b\u044c|\u043c\u0430\u0440\u0442|\u0430\u043f\u0440\u0435\u043b\u044c|\u043c\u0430\u0439)\s+20\d{2})\b/giu;
// Keyword detectors for document types. Each entry pairs a type name with a
// case-insensitive pattern that is applied with RegExp.prototype.test; the
// patterns are deliberately NOT global, so `test` carries no lastIndex state.
//
// Most Russian alternatives are word stems ("реализац", "оплат", "амортиз"), so
// matching is prefix-based: a keyword must START at a word edge but may be
// followed by any ending. The start edge is the Unicode-aware lookbehind
// `(?<![\p{L}\p{N}_])` because JavaScript's `\b` only knows ASCII word
// characters and never fires next to a Cyrillic letter; with `\b...\b` none of
// the Russian keywords could match in ordinary text.
const DOCUMENT_TYPE_PATTERNS = [
    { name: "invoice", pattern: /(?<![\p{L}\p{N}_])(?:\u0441\u0447(?:\u0435|\u0451)\u0442-\u0444\u0430\u043a\u0442\u0443\u0440|invoice)/iu },
    { name: "realization", pattern: /(?<![\p{L}\p{N}_])(?:\u0440\u0435\u0430\u043b\u0438\u0437\u0430\u0446|realization)/iu },
    { name: "payment", pattern: /(?<![\p{L}\p{N}_])(?:\u043e\u043f\u043b\u0430\u0442|payment|\u043f\u043b\u0430\u0442\u0435\u0436)/iu },
    { name: "receipt", pattern: /(?<![\p{L}\p{N}_])(?:\u043f\u043e\u0441\u0442\u0443\u043f\u043b\u0435\u043d|receipt)/iu },
    { name: "close", pattern: /(?<![\p{L}\p{N}_])(?:\u0437\u0430\u043a\u0440\u044b\u0442\u0438|\u0440\u0435\u0433\u043b\u0430\u043c\u0435\u043d\u0442)/iu },
    { name: "rbp_writeoff", pattern: /(?<![\p{L}\p{N}_])(?:\u0440\u0431\u043f|\u0441\u043f\u0438\u0441\u0430\u043d\u0438\u0435)/iu },
    { name: "amortization", pattern: /(?<![\p{L}\p{N}_])(?:\u0430\u043c\u043e\u0440\u0442\u0438\u0437|amortization)/iu }
];
// Whitelist of two-digit account prefixes used by isKnownAccount to decide
// whether a matched code is kept as an account anchor. Rows are grouped by
// leading digit for readability only.
// NOTE(review): presumably a subset of the accounting chart of accounts used by
// this project — confirm before adding or removing entries.
const KNOWN_ACCOUNT_PREFIXES = new Set([
    "01", "02", "07", "08",
    "10", "13", "19",
    "20", "21", "23", "25", "26",
    "41", "43", "44", "45",
    "50", "51", "52", "55", "57", "58",
    "60", "62", "66", "67", "68", "69",
    "70", "71", "73", "76",
    "90", "91", "94", "96", "97"
]);
/**
 * Converts every entry to a trimmed string, drops empty results, removes
 * duplicates while keeping first-seen order, and caps the list length.
 *
 * @param {Array<*>} values - Entries to normalise; null/undefined become "".
 * @param {number} [limit=48] - Upper bound passed to Array.prototype.slice.
 * @returns {string[]} Distinct non-empty strings, at most `limit` of them.
 */
function uniqueStrings(values, limit = 48) {
    const seen = new Set();
    values.forEach((entry) => {
        const text = String(entry ?? "").trim();
        if (text) {
            seen.add(text);
        }
    });
    // The cap is applied after de-duplication, so it counts distinct values.
    return [...seen].slice(0, limit);
}
/**
 * Produces a canonical single-spaced form of a token: every whitespace run
 * becomes one space and surrounding whitespace is removed.
 *
 * @param {*} value - Raw token; null/undefined are treated as an empty string.
 * @returns {string} The collapsed, trimmed string.
 */
function normalizeAnchorToken(value) {
    const text = value == null ? "" : String(value);
    // After trimming there is no edge whitespace left, so splitting on
    // whitespace runs and re-joining with a single space collapses them.
    return text.trim().split(/\s+/).join(" ");
}
/**
 * Runs a global pattern over the text and returns the distinct normalised hits.
 *
 * When `useCaptures` is true and the pattern defines capture groups, every
 * non-empty group of each match is collected (the full match is ignored, even
 * if all groups turn out empty). Otherwise the full match text is collected.
 *
 * @param {string} text - Text to scan.
 * @param {RegExp} pattern - Global regular expression; its lastIndex is reset to 0.
 * @param {boolean} [useCaptures=true] - Prefer capture groups over the full match.
 * @returns {string[]} Distinct tokens as returned by uniqueStrings (default cap).
 */
function collectMatches(text, pattern, useCaptures = true) {
    // Start scanning from the beginning regardless of earlier use of the regex.
    pattern.lastIndex = 0;
    const tokens = Array.from(text.matchAll(pattern)).flatMap((found) => {
        const preferGroups = useCaptures && found.length > 1;
        const candidates = preferGroups ? found.slice(1) : [found[0]];
        return candidates
            .map((candidate) => normalizeAnchorToken(candidate ?? ""))
            .filter(Boolean);
    });
    return uniqueStrings(tokens);
}
/**
 * Tells whether a token starts with two digits that are listed in
 * KNOWN_ACCOUNT_PREFIXES.
 *
 * @param {*} value - Candidate token; null/undefined are treated as "".
 * @returns {boolean} True only when the trimmed token begins with a listed prefix.
 */
function isKnownAccount(value) {
    const leadingDigits = /^(\d{2})/.exec(String(value ?? "").trim());
    return leadingDigits !== null && KNOWN_ACCOUNT_PREFIXES.has(leadingDigits[1]);
}
/**
 * Gathers account codes mentioned in the text from two sources: codes that
 * follow an account keyword (CONTEXTUAL_ACCOUNT_PATTERN) and both sides of
 * "NN.NN / NN.NN" pairs (ACCOUNT_PAIR_PATTERN). Only codes accepted by
 * isKnownAccount are kept.
 *
 * @param {string} text - Text to scan.
 * @returns {string[]} Distinct account codes in discovery order, at most 24.
 */
function collectAccountAnchors(text) {
    const accounts = new Set();
    const keepIfKnown = (candidate) => {
        if (candidate && isKnownAccount(candidate)) {
            accounts.add(candidate);
        }
    };
    for (const contextual of collectMatches(text, CONTEXTUAL_ACCOUNT_PATTERN, true)) {
        keepIfKnown(contextual);
    }
    // The pair pattern is global; rewind it before scanning.
    ACCOUNT_PAIR_PATTERN.lastIndex = 0;
    for (const [, first, second] of text.matchAll(ACCOUNT_PAIR_PATTERN)) {
        const leftCode = normalizeAnchorToken(first ?? "");
        const rightCode = normalizeAnchorToken(second ?? "");
        keepIfKnown(leftCode);
        keepIfKnown(rightCode);
    }
    return [...accounts].slice(0, 24);
}
/**
 * Lists the names of all DOCUMENT_TYPE_PATTERNS entries whose pattern matches
 * the text, in table order.
 *
 * @param {string} text - Text to test.
 * @returns {string[]} Matching type names, de-duplicated, at most 12.
 */
function collectDocumentTypeAnchors(text) {
    const detected = [];
    for (const { name, pattern } of DOCUMENT_TYPE_PATTERNS) {
        if (pattern.test(text)) {
            detected.push(name);
        }
    }
    return uniqueStrings(detected, 12);
}
/**
 * Merges all anchor groups into one de-duplicated list. Contract numbers,
 * document numbers, dates and amounts are taken as-is; accounts, periods and
 * document types are tagged with "account:", "period:" and "doc_type:".
 *
 * @param {object} input - Object with the iterable fields contract_numbers,
 *   document_numbers, dates, amounts and the array fields accounts, periods,
 *   document_types.
 * @returns {string[]} Distinct anchors in group order, at most 64.
 */
function flattenAnchors(input) {
    const tagged = (tag, items) => items.map((item) => `${tag}:${item}`);
    const groups = [
        input.contract_numbers,
        input.document_numbers,
        input.dates,
        input.amounts,
        tagged("account", input.accounts),
        tagged("period", input.periods),
        tagged("doc_type", input.document_types)
    ];
    const combined = [];
    for (const group of groups) {
        combined.push(...group);
    }
    return uniqueStrings(combined, 64);
}
/**
 * Extracts anchors from free text: contract numbers, document numbers, dates,
 * amounts, account codes, periods and document types, plus a combined `all`
 * list built by flattenAnchors.
 *
 * @param {*} input - Source text; it is coerced with String(), and
 *   null/undefined are treated as an empty string.
 * @returns {{contract_numbers: string[], document_numbers: string[], dates: string[],
 *   amounts: string[], accounts: string[], periods: string[], document_types: string[],
 *   all: string[]}} Per-category anchors and their flattened union.
 */
function resolveCompanyAnchors(input) {
    const text = String(input ?? "");
    const fromGroups = (pattern) => collectMatches(text, pattern, true);
    const fromWholeMatch = (pattern) => collectMatches(text, pattern, false);
    // Captured numbers are re-labelled with a fixed Russian prefix before capping.
    const anchors = {
        contract_numbers: uniqueStrings(fromGroups(CONTRACT_PATTERN).map((item) => `\u0434\u043e\u0433\u043e\u0432\u043e\u0440 № ${item}`), 12),
        document_numbers: uniqueStrings(fromGroups(DOCUMENT_NUMBER_PATTERN).map((item) => `\u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442 № ${item}`), 16),
        dates: uniqueStrings(fromWholeMatch(DATE_PATTERN), 16),
        amounts: uniqueStrings(fromWholeMatch(AMOUNT_PATTERN), 16),
        accounts: uniqueStrings(collectAccountAnchors(text), 24),
        periods: uniqueStrings(fromWholeMatch(PERIOD_PATTERN), 12),
        document_types: collectDocumentTypeAnchors(text)
    };
    const result = { ...anchors };
    result.all = flattenAnchors(anchors);
    return result;
}