NODEDC_1C/llm_normalizer/backend/src/services/addressQueryClassifier.ts

307 lines
6.9 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import type { AddressModeDetection } from "../types/addressQuery";
/**
 * Lower-case fragments that mark a message as a "show / list / how much"
 * style request. `detectAddressQuestionMode` tests them through
 * `hasAnyToken`, i.e. as plain substrings, which is why several Russian
 * entries are truncated stems rather than full words.
 */
const ADDRESS_ACTION_TOKENS = [
  // English verbs and balance/debt words.
  "show", "list", "find", "get", "lookup", "open", "balance", "debt", "owe",
  // Russian "show" / "list" / "find" / "output" forms and stems.
  "покажи", "покаж", "показ", "список", "найди", "найд", "выведи", "вывед",
  // Russian question words and colloquial "what about ..." phrases.
  "кто", "кому", "какие", "что по", "че по", "чё по",
  // Russian balance / amount / debt vocabulary (including colloquial forms).
  "остаток", "скока", "сколько", "долг", "задолж", "хвост", "незакрыт"
];
/**
 * Lower-case fragments naming the business objects an address query can be
 * about (counterparties, contracts, accounts, documents, bank movements,
 * debts). Matched as substrings via `hasAnyToken`, so longer entries that
 * contain a shorter one (e.g. "поступлени" / "поступлен") are redundant
 * but harmless.
 */
const ADDRESS_ENTITY_TOKENS = [
  // English: parties.
  "counterparty", "counterparties", "company", "organization",
  "supplier", "vendor", "customer", "client", "partner",
  // English: contracts, accounts, documents.
  "contract", "contracts", "account", "accounts", "document", "documents",
  // English: balances and debts.
  "balance", "payable", "payables", "receivable", "receivables",
  "owe", "owes", "owed",
  // Russian: parties (stems).
  "контрагент", "контра", "компан", "организац",
  "поставщик", "клиент", "покупател", "партнер",
  // Russian: bank, statements, operations.
  "банк", "выписк", "операц", "транзак",
  // Russian: contracts, accounts, documents.
  "договор", "счет", "счёт", "документ", "доки", "док",
  // Russian: balances, settlements, payments.
  "остаток", "дебитор", "кредитор", "аванс", "оплат",
  "поступлен", "поступлени", "списан", "списани",
  // Russian: debt wording.
  "долг", "должен", "должны", "должна"
];
/**
 * Lower-case fragments asking for an explanation rather than a lookup.
 * When any of them occurs in the message (substring match through
 * `hasAnyToken`), `detectAddressQuestionMode` skips every address branch
 * and reports "deep_analysis".
 */
const DEEP_REASONING_TOKENS = [
  // English.
  "why", "because", "root cause", "mechanism", "prove", "chain",
  // Russian words and stems.
  "почему", "причин", "механизм", "докажи", "цепоч", "разрыв", "ошибк"
];
/**
 * Generic nouns that may follow the Russian preposition "по" without naming
 * a concrete anchor (e.g. "по контрагенту", "по договору", "по периоду").
 * Built once at module load instead of on every call.
 */
const LOOSE_BY_ANCHOR_STOP_WORDS: ReadonlySet<string> = new Set([
  "контрагенту",
  "контрагента",
  "контре",
  "компании",
  "компанию",
  "организации",
  "организацию",
  "поставщику",
  "поставщика",
  "клиенту",
  "клиента",
  "покупателю",
  "покупателя",
  "партнеру",
  "партнера",
  "договору",
  "договора",
  "счету",
  "счёту",
  "дате",
  "периоду",
  "период",
  "документам",
  "докам",
  "взаиморасчетам",
  "взаиморасчётам"
]);

/**
 * Reports whether the text contains "по <token>" where the token looks like
 * a concrete name rather than one of the generic stop words.
 *
 * A token starts with a Latin or Cyrillic letter, is at least two characters
 * long, and may contain letters, digits, ".", "_" and "-".
 *
 * @param text Message text to scan.
 * @returns `true` if at least one "по <token>" occurrence has a token that
 *          is not a stop word; `false` otherwise.
 */
function hasLooseByAnchorMention(text: string): boolean {
  // A fresh literal per call: with the `g` flag exec() keeps its position in
  // `lastIndex`, so the regex object must not be shared between calls.
  const byAnchorPattern = /(?:^|\s)по\s+([a-zа-яё][a-zа-яё0-9._-]{1,})(?=[\s,.;:!?)]|$)/giu;

  // Every occurrence is inspected: a generic first mention ("по счету")
  // must not hide a concrete one later in the message ("... по свк").
  let match: RegExpExecArray | null;
  while ((match = byAnchorPattern.exec(text)) !== null) {
    // The token class admits ".", "_" and "-", so sentence-final punctuation
    // ("по контрагенту.") ends up inside the capture; strip it before the
    // stop-word lookup.
    const token = (match[1] ?? "").toLowerCase().replace(/[._-]+$/u, "");
    // Keep the two-character minimum the pattern asks for after stripping.
    if (token.length >= 2 && !LOOSE_BY_ANCHOR_STOP_WORDS.has(token)) {
      return true;
    }
  }
  return false;
}
/**
 * Detects short follow-up phrasing that usually continues an address query:
 * an "any period / all time" qualifier, or an existence question such as
 * "есть ли", "есть что-то", "что есть".
 *
 * @param text Message text to scan.
 * @returns `true` when either kind of phrase occurs in the text.
 */
function hasAddressFollowupSignal(text: string): boolean {
  const anyPeriodPattern = /(?:за\s+любой\s+период|за\s+вс[её]\s+время|for\s+all\s+time|all\s+time)/iu;

  // JavaScript's `\b` only treats ASCII [A-Za-z0-9_] as word characters, even
  // with the `u` flag, so it never fires between a space and a Cyrillic
  // letter. Unicode-aware lookarounds give real word boundaries here.
  const existencePattern =
    /(?<![\p{L}\p{N}_])(?:есть\s+что(?:-|\s)?то|есть\s+ли|что\s+есть)(?![\p{L}\p{N}_])/iu;

  return anyPeriodPattern.test(text) || existencePattern.test(text);
}
/**
 * Reports whether the text mentions documents or bank activity (documents,
 * bank, statements, payments, receipts, write-offs, transactions) in Russian
 * or English. The pattern is unanchored and case-insensitive, so a hit
 * anywhere inside a longer word counts.
 *
 * @param text Message text to scan.
 * @returns `true` if any of the alternatives occurs in the text.
 */
function hasDocsOrBankSignal(text: string): boolean {
  const docsOrBankPattern =
    /(?:док(?:и|умент|ументы|ументов)|docs?|documents?|банк|выписк|платеж|платёж|оплат|поступлен|списан|транзак|transactions?|bank\s+ops|bank\s+operations?)/iu;
  return docsOrBankPattern.test(text);
}
/**
 * Reports whether the text contains something shaped like a two-level
 * account code: exactly two digits, a "." or ",", then one or two digits,
 * with no digit or "-" directly before or after (e.g. "60.01", "62,1").
 *
 * NOTE(review): the pattern only looks at shape, so other two-digit groups
 * separated by a dot, such as the "01.02" in "01.02.2024", also match.
 *
 * @param text Message text to scan.
 * @returns `true` if such a fragment is present.
 */
function hasAccountCodeAnchor(text: string): boolean {
  const accountCodePattern = /(?<![\d-])\d{2}(?:[.,]\d{1,2})(?![\d-])/u;
  return text.search(accountCodePattern) !== -1;
}
/**
 * Words that never count as a counterparty name: prepositions and
 * conjunctions, generic document/bank/party nouns, year markers, politeness
 * and filler words, and request verbs. Built once at module load instead of
 * on every call.
 */
const COUNTERPARTY_TOKEN_STOP_WORDS: ReadonlySet<string> = new Set([
  "за",
  "с",
  "по",
  "на",
  "и",
  "или",
  "док",
  "доки",
  "документ",
  "документы",
  "документов",
  "банк",
  "банковские",
  "операции",
  "платежи",
  "платеж",
  "платёж",
  "контрагент",
  "контрагенту",
  "контрагента",
  "компания",
  "компании",
  "организация",
  "организации",
  "год",
  "года",
  "г",
  "плс",
  "pls",
  "пж",
  "пжлст",
  "пожалуйста",
  "бля",
  "блять",
  "епт",
  "ёпт",
  "епта",
  "нах",
  "нахуй",
  "покеж",
  "покажи",
  "показать",
  "покаж",
  "выведи",
  "show",
  "list",
  "please",
  "all",
  "vse"
]);

/**
 * Reports whether the text contains at least one token that could be a
 * counterparty name.
 *
 * The text is split on every character that is not a Latin/Cyrillic letter,
 * a digit, ".", "_" or "-". A token qualifies when, after leading and
 * trailing ".", "_" and "-" are removed, it is at least two characters long,
 * contains at least one letter, and is not a stop word.
 *
 * @param text Message text to scan; `null`/`undefined` is treated as empty.
 * @returns `true` if any token qualifies.
 */
function hasLikelyCounterpartyToken(text: string): boolean {
  return String(text ?? "")
    .toLowerCase()
    .split(/[^a-zа-яё0-9._-]+/iu)
    // Sentence punctuation is part of the token alphabet, so "платежи." or
    // "г." would otherwise miss their stop-word entries.
    .map((token) => token.replace(/^[._-]+|[._-]+$/gu, ""))
    .some(
      (token) =>
        token.length >= 2 &&
        // Requiring a letter rejects bare numbers, years, dates such as
        // "01.02.2024", account codes and punctuation-only runs in one check.
        /[a-zа-яё]/iu.test(token) &&
        !COUNTERPARTY_TOKEN_STOP_WORDS.has(token)
    );
}
/**
 * Reports whether `text` contains at least one of `tokens` as a plain
 * substring. No word boundaries are applied and the comparison is
 * case-sensitive.
 *
 * @param text   Haystack to search in.
 * @param tokens Candidate substrings.
 * @returns `true` on the first token found in `text`, `false` if none is.
 */
function hasAnyToken(text: string, tokens: string[]): boolean {
  for (const candidate of tokens) {
    if (text.indexOf(candidate) !== -1) {
      return true;
    }
  }
  return false;
}
/**
 * Classifies a user message as an address query, a deep-analysis request,
 * or unsupported, using keyword and pattern heuristics on the trimmed,
 * lower-cased text.
 *
 * Precedence:
 *  1. Empty message: "unsupported" / low.
 *  2. Any deep-reasoning token: "deep_analysis" / high. This vetoes every
 *     address branch.
 *  3. Action token plus entity token or account code: "address_query" / high.
 *  4. "по <name>" anchor plus any of action, entity, account code or
 *     follow-up phrase: "address_query" / medium.
 *  5. Entity token or account code alone: "address_query" / medium.
 *  6. Docs/bank wording plus an anchor or a name-like token:
 *     "address_query" / medium.
 *  7. Otherwise: "unsupported" / low.
 *
 * @param userMessage Raw user message; `null`/`undefined` is treated as empty.
 * @returns The detected mode with a confidence level and reason codes.
 */
export function detectAddressQuestionMode(userMessage: string): AddressModeDetection {
  const normalized = String(userMessage ?? "").trim().toLowerCase();
  if (normalized.length === 0) {
    return { mode: "unsupported", confidence: "low", reasons: ["empty_message"] };
  }

  // A deep-reasoning cue disables every address branch, so settle it first.
  if (hasAnyToken(normalized, DEEP_REASONING_TOKENS)) {
    return { mode: "deep_analysis", confidence: "high", reasons: ["deep_reasoning_signal_detected"] };
  }

  const actionFound = hasAnyToken(normalized, ADDRESS_ACTION_TOKENS);
  const entityOrAccountCodeFound =
    hasAnyToken(normalized, ADDRESS_ENTITY_TOKENS) || hasAccountCodeAnchor(normalized);

  if (actionFound && entityOrAccountCodeFound) {
    return {
      mode: "address_query",
      confidence: "high",
      reasons: ["address_action_detected", "address_entity_detected"]
    };
  }

  const looseAnchorFound = hasLooseByAnchorMention(normalized);
  const followupFound = hasAddressFollowupSignal(normalized);

  if (looseAnchorFound && (actionFound || entityOrAccountCodeFound || followupFound)) {
    return {
      mode: "address_query",
      confidence: "medium",
      reasons: followupFound
        ? ["loose_by_anchor_detected", "address_followup_signal_detected"]
        : ["loose_by_anchor_detected"]
    };
  }

  if (entityOrAccountCodeFound) {
    return { mode: "address_query", confidence: "medium", reasons: ["address_entity_detected"] };
  }

  // Last resort: docs/bank wording together with something that looks like a name.
  if (hasDocsOrBankSignal(normalized) && (looseAnchorFound || hasLikelyCounterpartyToken(normalized))) {
    return {
      mode: "address_query",
      confidence: "medium",
      reasons: ["docs_or_bank_signal_detected", "anchor_like_token_detected"]
    };
  }

  return { mode: "unsupported", confidence: "low", reasons: ["no_address_or_deep_signal"] };
}