NODEDC_1C/llm_normalizer/backend/dist/services/assistantOrganizationMatche...

308 lines
10 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.normalizeOrganizationScopeValue = normalizeOrganizationScopeValue;
exports.normalizeOrganizationScopeSearchText = normalizeOrganizationScopeSearchText;
exports.scoreOrganizationMentionInMessage = scoreOrganizationMentionInMessage;
exports.organizationsLikelySameEntity = organizationsLikelySameEntity;
exports.mergeKnownOrganizations = mergeKnownOrganizations;
exports.resolveOrganizationSelectionFromMessage = resolveOrganizationSelectionFromMessage;
/**
 * Lower-case words that are dropped when an organization name or a user
 * message is split into tokens (see tokenizeOrganizationScope).
 * Entries are grouped by kind purely for readability; membership is all that
 * matters to the lookup.
 */
const ORGANIZATION_SCOPE_STOPWORDS = new Set([
    // Russian legal-form abbreviations.
    "ооо", "зао", "оао", "пао", "ао", "ип",
    // English legal forms and generic company words.
    "llc", "inc", "ltd", "corp", "group", "company", "co", "the", "and", "org", "organization",
    // Russian generic nouns for "company" / "organization" / "database".
    "компания", "организация", "контора", "фирма", "база",
    // Russian prepositions.
    "по", "в", "во", "на", "для", "из", "у", "к", "от",
    // Russian demonstratives.
    "это", "эта", "этой", "этот",
    // Russian "today" / "now" / "current".
    "сегодня", "сейчас", "текущая", "текущей",
    // Russian possessives ("our").
    "наш", "наша", "нашей", "нашу", "наши"
]);
/**
 * Cleans a raw organization label while keeping its original letter case.
 *
 * @param {*} value - Any value; null/undefined are treated as an empty string,
 *   everything else is coerced with String().
 * @returns {string} The cleaned label (possibly empty).
 */
function normalizeScopeLabel(value) {
    const text = String(value ?? "");
    // Backslashes become spaces.
    const withoutBackslashes = text.replace(/\\/g, " ");
    // Curly and angle quotes are unified to the straight double quote.
    const withStraightQuotes = withoutBackslashes.replace(/[“”«»]/g, '"');
    // NOTE(review): a straight quote sitting directly between two letters is
    // replaced by the Cyrillic letter "в". The reason is not evident from this
    // file — confirm the intent before changing it.
    const withInnerQuotesRewritten = withStraightQuotes.replace(/([\p{L}])"(?=[\p{L}])/gu, "$1в");
    // Collapse whitespace runs and strip the edges.
    return withInnerQuotesRewritten.replace(/\s+/g, " ").trim();
}
/**
 * Builds a case-insensitive comparison key from a label: the cleaned label is
 * lower-cased and "ё" is folded into "е".
 *
 * @param {*} value - Raw label (any value accepted by normalizeScopeLabel).
 * @returns {string} The comparison key (possibly empty).
 */
function normalizeScopeKey(value) {
    const lowered = normalizeScopeLabel(value).toLowerCase();
    return lowered.replace(/ё/g, "е");
}
/**
 * Normalizes an organization name for storage/display.
 *
 * The value is cleaned with normalizeScopeLabel and, when the whole result is
 * wrapped in a matching pair of double or single quotes, that outer pair is
 * removed.
 *
 * @param {*} value - Raw organization name.
 * @returns {string|null} The cleaned name, or null when nothing is left.
 */
function normalizeOrganizationScopeValue(value) {
    const label = normalizeScopeLabel(value).trim();
    if (label.length === 0) {
        return null;
    }
    const isWrappedIn = (quote) => label.startsWith(quote) && label.endsWith(quote);
    const inner = isWrappedIn('"') || isWrappedIn("'")
        ? label.slice(1, -1).trim()
        : label;
    return inner.length > 0 ? inner : null;
}
/**
 * Converts a value into search text: the normalized key with every run of
 * characters that are neither letters nor digits replaced by a single space.
 *
 * @param {*} value - Raw text (organization name or user message).
 * @returns {string} Space-separated lower-case words (possibly empty).
 */
function normalizeOrganizationScopeSearchText(value) {
    const key = normalizeScopeKey(value);
    const lettersAndDigitsOnly = key.replace(/[^\p{L}\p{N}]+/gu, " ");
    return lettersAndDigitsOnly.replace(/\s+/g, " ").trim();
}
/**
 * Splits a value into significant tokens: words of the search text that are
 * at least three characters long and are not listed in
 * ORGANIZATION_SCOPE_STOPWORDS.
 *
 * @param {*} value - Raw text (organization name or user message).
 * @returns {string[]} Significant tokens in their original order.
 */
function tokenizeOrganizationScope(value) {
    const searchText = normalizeOrganizationScopeSearchText(value);
    const tokens = [];
    if (!searchText) {
        return tokens;
    }
    for (const piece of searchText.split(" ")) {
        const word = piece.trim();
        const isSignificant = word.length >= 3 && !ORGANIZATION_SCOPE_STOPWORDS.has(word);
        if (isSignificant) {
            tokens.push(word);
        }
    }
    return tokens;
}
/**
 * Lists the spellings under which a token may appear in a message: the token
 * itself plus up to two stems obtained by cutting a Russian inflectional
 * ending (a multi-letter ending, or a single trailing vowel).
 * A stem is only kept when at least four characters remain.
 *
 * @param {*} token - Token to expand; it is trimmed and lower-cased first.
 * @returns {string[]} Unique variants, the token itself first; empty when the
 *   token is blank.
 */
function organizationTokenVariants(token) {
    const base = String(token ?? "").trim().toLowerCase();
    if (base.length === 0) {
        return [];
    }
    const stems = [
        base.replace(/(?:ами|ями|ого|ему|ому|ыми|ими|иях|ях|ах|ей|ой|ом|ем|ам|ям|ую|юю|ая|яя|ое|ее|ые|ие|ов|ев|ий|ый|ой)$/iu, ""),
        base.replace(/[аеёиоуыэюя]$/iu, "")
    ];
    const result = [base];
    for (const stem of stems) {
        if (stem.length >= 4 && !result.includes(stem)) {
            result.push(stem);
        }
    }
    return result;
}
/**
 * Tells whether one string can be turned into the other by inserting or
 * deleting exactly one character. Strings of equal length (including
 * identical strings) never qualify.
 *
 * @param {string} left
 * @param {string} right
 * @returns {boolean}
 */
function isSingleInsertionOrDeletionAway(left, right) {
    const [shorter, longer] = left.length >= right.length ? [right, left] : [left, right];
    if (longer.length !== shorter.length + 1) {
        return false;
    }
    // Length of the common prefix.
    let split = 0;
    while (split < shorter.length && longer[split] === shorter[split]) {
        split += 1;
    }
    // Skipping the extra character at `split` in the longer string must make
    // the remainders line up exactly.
    for (let index = split; index < shorter.length; index += 1) {
        if (longer[index + 1] !== shorter[index]) {
            return false;
        }
    }
    return true;
}
/**
 * Decides whether two tokens should be treated as the same word.
 *
 * Tokens are equivalent when they are identical, when both have at least five
 * characters and one is a prefix of the other, or when — ignoring whitespace —
 * they are identical or (both at least six characters long) differ by a single
 * inserted/deleted character.
 *
 * @param {string} left
 * @param {string} right
 * @returns {boolean} False when either token is empty.
 */
function organizationTokensLookEquivalent(left, right) {
    if (!left || !right) {
        return false;
    }
    if (left === right) {
        return true;
    }
    const bothLongEnoughForPrefix = left.length >= 5 && right.length >= 5;
    if (bothLongEnoughForPrefix && (left.startsWith(right) || right.startsWith(left))) {
        return true;
    }
    const compactLeft = left.replace(/\s+/g, "");
    const compactRight = right.replace(/\s+/g, "");
    if (compactLeft.length === 0 || compactRight.length === 0) {
        return false;
    }
    return compactLeft === compactRight ||
        (compactLeft.length >= 6 &&
            compactRight.length >= 6 &&
            isSingleInsertionOrDeletionAway(compactLeft, compactRight));
}
/**
 * Scores how well a single message token matches one variant of an
 * organization token.
 *
 * @param {string} variant - Variant of an organization token.
 * @param {string} messageToken - Token taken from the message.
 * @returns {number} 0 when they do not match; `variant.length * 5` for an
 *   exact match; for a prefix match (both at least five characters long)
 *   `variant.length * 5` when the shared prefix has seven or more characters
 *   and `variant.length * 3` otherwise.
 */
function scoreVariantAgainstMessageToken(variant, messageToken) {
    if (messageToken === variant) {
        return variant.length * 5;
    }
    if (messageToken.length < 5 || variant.length < 5) {
        return 0;
    }
    if (!messageToken.startsWith(variant) && !variant.startsWith(messageToken)) {
        return 0;
    }
    const prefixLength = Math.min(messageToken.length, variant.length);
    return variant.length * (prefixLength >= 7 ? 5 : 3);
}
/**
 * Scores how strongly a message mentions an organization.
 *
 * Scoring tiers:
 *  - the whole normalized organization name occurs in the normalized message:
 *    `10_000 + name length`;
 *  - otherwise each significant organization token is scored by its best
 *    variant: `variant.length * 5` when the variant occurs anywhere in the
 *    message, or the best per-token fuzzy score (never below 20) when it only
 *    matches a message token exactly or by prefix;
 *  - a bonus of 400 is added when every organization token matched, otherwise
 *    50 per matched token.
 *
 * @param {*} message - User message (raw or already normalized).
 * @param {*} organization - Organization name.
 * @returns {number} 0 when nothing matches, a positive score otherwise.
 */
function scoreOrganizationMentionInMessage(message, organization) {
    const messageNorm = normalizeOrganizationScopeSearchText(message);
    const organizationNorm = normalizeOrganizationScopeSearchText(organization);
    if (!messageNorm || !organizationNorm) {
        return 0;
    }
    if (messageNorm.includes(organizationNorm)) {
        return 10_000 + organizationNorm.length;
    }
    const organizationTokens = tokenizeOrganizationScope(organizationNorm);
    const messageTokens = tokenizeOrganizationScope(messageNorm);
    if (organizationTokens.length === 0 || messageTokens.length === 0) {
        return 0;
    }
    let matchedTokens = 0;
    let score = 0;
    for (const token of organizationTokens) {
        // Best score over all variants of this token; stays 0 when none matched.
        let variantScore = 0;
        for (const variant of organizationTokenVariants(token)) {
            if (!variant) {
                continue;
            }
            if (messageNorm.includes(variant)) {
                variantScore = Math.max(variantScore, variant.length * 5);
                continue;
            }
            // Every message token is examined and the best score kept, so the
            // result does not depend on the order of words in the message
            // (stopping at the first match could let a weak prefix match hide
            // a stronger one that appears later).
            let fuzzyTokenScore = 0;
            for (const messageToken of messageTokens) {
                fuzzyTokenScore = Math.max(fuzzyTokenScore, scoreVariantAgainstMessageToken(variant, messageToken));
            }
            if (fuzzyTokenScore > 0) {
                variantScore = Math.max(variantScore, Math.max(20, fuzzyTokenScore));
            }
        }
        // Any successful match yields a strictly positive variantScore.
        if (variantScore > 0) {
            matchedTokens += 1;
            score += variantScore;
        }
    }
    if (matchedTokens === 0) {
        return 0;
    }
    if (matchedTokens === organizationTokens.length) {
        score += 400;
    }
    else {
        score += matchedTokens * 50;
    }
    return score;
}
/**
 * Heuristically decides whether two organization names denote the same entity.
 *
 * Names match when their search texts are equal, when their significant
 * tokens glued together are equal or (both at least eight characters long)
 * differ by one inserted/deleted character, or when every token on each side
 * has an equivalent token on the other side.
 *
 * @param {*} left - First organization name.
 * @param {*} right - Second organization name.
 * @returns {boolean} False when either name has no search text or no
 *   significant tokens (unless the search texts are equal).
 */
function organizationsLikelySameEntity(left, right) {
    const leftNorm = normalizeOrganizationScopeSearchText(left);
    const rightNorm = normalizeOrganizationScopeSearchText(right);
    if (!leftNorm || !rightNorm) {
        return false;
    }
    if (leftNorm === rightNorm) {
        return true;
    }
    const leftTokens = tokenizeOrganizationScope(leftNorm);
    const rightTokens = tokenizeOrganizationScope(rightNorm);
    if (leftTokens.length === 0 || rightTokens.length === 0) {
        return false;
    }
    const leftCompact = leftTokens.join("");
    const rightCompact = rightTokens.join("");
    if (leftCompact && rightCompact) {
        if (leftCompact === rightCompact) {
            return true;
        }
        const bothLongEnough = leftCompact.length >= 8 && rightCompact.length >= 8;
        if (bothLongEnough && isSingleInsertionOrDeletionAway(leftCompact, rightCompact)) {
            return true;
        }
    }
    // Mutual coverage: each token needs an equivalent partner on the other side.
    const hasPartnerOnRight = (leftToken) => rightTokens.some((rightToken) => organizationTokensLookEquivalent(leftToken, rightToken));
    const hasPartnerOnLeft = (rightToken) => leftTokens.some((leftToken) => organizationTokensLookEquivalent(leftToken, rightToken));
    return leftTokens.every(hasPartnerOnRight) && rightTokens.every(hasPartnerOnLeft);
}
/**
 * De-duplicates a list of organization names.
 *
 * Each entry is normalized; entries judged to be the same entity as an
 * already kept one are merged into it, keeping whichever spelling is longer
 * (by search text or by raw length). The first `limit` survivors are returned
 * in order of first appearance.
 *
 * @param {*} values - Candidate names; anything that is not an array is
 *   treated as an empty list.
 * @param {number} [limit=50] - Maximum number of names returned.
 * @returns {string[]} Unique normalized organization names.
 */
function mergeKnownOrganizations(values, limit = 50) {
    const candidates = Array.isArray(values) ? values : [];
    const merged = [];
    for (const raw of candidates) {
        const name = normalizeOrganizationScopeValue(raw);
        if (!name) {
            continue;
        }
        const searchKey = normalizeOrganizationScopeSearchText(name);
        if (!searchKey) {
            continue;
        }
        const slot = merged.findIndex((kept) => organizationsLikelySameEntity(kept, name));
        if (slot < 0) {
            merged.push(name);
            continue;
        }
        const current = merged[slot];
        const currentKey = normalizeOrganizationScopeSearchText(current);
        const isLongerSpelling = searchKey.length > currentKey.length || name.length > current.length;
        if (isLongerSpelling) {
            merged[slot] = name;
        }
    }
    return merged.slice(0, limit);
}
/**
 * Picks the known organization that a user message refers to, if any.
 *
 * Known organizations are de-duplicated, each is scored against the message,
 * and the top scorer is returned — unless its score is below 90 or the
 * runner-up has exactly the same score (ambiguous mention).
 *
 * @param {*} userMessage - Text written by the user.
 * @param {*} knownOrganizations - Candidate organization names; anything that
 *   is not an array is treated as an empty list.
 * @returns {string|null} The selected organization name, or null when there
 *   is no confident, unambiguous match.
 */
function resolveOrganizationSelectionFromMessage(userMessage, knownOrganizations) {
    const candidates = mergeKnownOrganizations(Array.isArray(knownOrganizations) ? knownOrganizations : []);
    if (!userMessage || candidates.length === 0) {
        return null;
    }
    const messageNorm = normalizeOrganizationScopeSearchText(userMessage);
    if (!messageNorm) {
        return null;
    }
    const ranked = [];
    for (const organization of candidates) {
        const score = scoreOrganizationMentionInMessage(messageNorm, organization);
        if (score > 0) {
            ranked.push({ organization, score });
        }
    }
    // Highest score first; among equal scores the shorter name comes first.
    ranked.sort((a, b) => b.score - a.score || a.organization.length - b.organization.length);
    const [best, runnerUp] = ranked;
    if (best === undefined || best.score < 90) {
        return null;
    }
    if (runnerUp !== undefined && runnerUp.score === best.score) {
        return null;
    }
    return best.organization;
}