From a4dd9c7c6618b62e9bc7f9528b145d509dbec318 Mon Sep 17 00:00:00 2001 From: dctouch Date: Sat, 23 May 2026 22:01:29 +0300 Subject: [PATCH] =?UTF-8?q?=D0=A3=D1=81=D0=B8=D0=BB=D0=B8=D1=82=D1=8C=20?= =?UTF-8?q?=D1=81=D0=B5=D0=BC=D0=B0=D0=BD=D1=82=D0=B8=D1=87=D0=B5=D1=81?= =?UTF-8?q?=D0=BA=D0=B8=D0=B9=20AGENT=20gate=20=D0=B4=D0=BB=D1=8F=201?= =?UTF-8?q?=D0=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../assistantMcpDiscoveryResponsePolicy.js | 17 +- .../assistantMcpDiscoveryResponsePolicy.ts | 18 +- ...ssistantMcpDiscoveryResponsePolicy.test.ts | 61 ++++ .../data/autorun_generators/history.json | 80 +++++ ..._20260523185848_gen-ag05231858-323f86.json | 287 ++++++++++++++++++ ..._20260523185848_gen-ag05231858-323f86.json | 70 +++++ scripts/domain_case_loop.py | 3 + scripts/domain_truth_harness.py | 26 +- scripts/test_domain_case_loop_step_state.py | 85 ++++++ 9 files changed, 634 insertions(+), 13 deletions(-) create mode 100644 llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260523185848_gen-ag05231858-323f86.json create mode 100644 llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260523185848_gen-ag05231858-323f86.json diff --git a/llm_normalizer/backend/dist/services/assistantMcpDiscoveryResponsePolicy.js b/llm_normalizer/backend/dist/services/assistantMcpDiscoveryResponsePolicy.js index 94b567a..322555b 100644 --- a/llm_normalizer/backend/dist/services/assistantMcpDiscoveryResponsePolicy.js +++ b/llm_normalizer/backend/dist/services/assistantMcpDiscoveryResponsePolicy.js @@ -381,6 +381,9 @@ function hasValueFlowActionConflictWithDiscoveryTurnMeaning(input, entryPoint) { if (askedDomain !== "counterparty_value") { return false; } + if (askedAction === "net_value_flow") { + return true; + } if (hasExactBankOperationsAddressReply(input, entryPoint)) { return false; } @@ -388,9 +391,6 @@ function hasValueFlowActionConflictWithDiscoveryTurnMeaning(input, entryPoint) { if (askedAction === "payout") { return detectedIntent !== "supplier_payouts_profile"; } - if (askedAction === "net_value_flow") { - return true; - } return false; } function hasEvidenceLaneConflictWithDiscoveryTurnMeaning(input, entryPoint) { @@ -534,6 +534,10 @@ function hasSemanticConflictWithDiscoveryTurnMeaning(input, entryPoint) { return false; } if (hasExactBankOperationsAddressReply(input, entryPoint)) { + const askedAction = toNonEmptyString(readDiscoveryTurnMeaning(entryPoint)?.asked_action_family); + if (askedAction === "net_value_flow") { + return true; + } return false; } const detectedIntent = toNonEmptyString(input.addressRuntimeMeta?.detected_intent); @@ -644,6 +648,9 @@ function applyAssistantMcpDiscoveryResponsePolicy(input) { const metadataDiscoveryPriority = hasMetadataDiscoveryPriority(input, entryPoint); const valueFlowActionConflictWithDiscoveryTurnMeaning = hasValueFlowActionConflictWithDiscoveryTurnMeaning(input, entryPoint); const evidenceLaneConflictWithDiscoveryTurnMeaning = hasEvidenceLaneConflictWithDiscoveryTurnMeaning(input, entryPoint); + const exactBankOperationsProtectsCurrent = exactBankOperationsAddressReply && + !semanticConflictWithDiscoveryTurnMeaning && + !valueFlowActionConflictWithDiscoveryTurnMeaning; if (!entryPoint) { pushReason(reasonCodes, "mcp_discovery_response_policy_no_entry_point"); } @@ -701,7 +708,7 @@ function applyAssistantMcpDiscoveryResponsePolicy(input) { if (exactValueFlowReplyForBusinessOverviewDirectMoneyNeed) { pushReason(reasonCodes, "mcp_discovery_response_policy_keep_exact_value_flow_reply_over_business_overview_direct_money_clarification"); } - if (exactBankOperationsAddressReply) { + if (exactBankOperationsProtectsCurrent) { pushReason(reasonCodes, "mcp_discovery_response_policy_keep_exact_bank_operations_address_reply"); } if (inventoryMarginRankingAddressReply) { @@ -736,7 +743,7 @@ function applyAssistantMcpDiscoveryResponsePolicy(input) { !runtimeMatchedExactReply && !staleMetadataDiscoveryFallbackAgainstExactAddressReply && !exactValueFlowReplyForBusinessOverviewDirectMoneyNeed && - !exactBankOperationsAddressReply && + !exactBankOperationsProtectsCurrent && !inventoryMarginRankingAddressReply && !(deterministicBroadBusinessEvaluationReply && candidate.candidate_status === "clarification_candidate") && ALLOWED_CANDIDATE_STATUSES.has(candidate.candidate_status) && diff --git a/llm_normalizer/backend/src/services/assistantMcpDiscoveryResponsePolicy.ts b/llm_normalizer/backend/src/services/assistantMcpDiscoveryResponsePolicy.ts index 5e6e2d3..1657785 100644 --- a/llm_normalizer/backend/src/services/assistantMcpDiscoveryResponsePolicy.ts +++ b/llm_normalizer/backend/src/services/assistantMcpDiscoveryResponsePolicy.ts @@ -537,6 +537,9 @@ function hasValueFlowActionConflictWithDiscoveryTurnMeaning( if (askedDomain !== "counterparty_value") { return false; } + if (askedAction === "net_value_flow") { + return true; + } if (hasExactBankOperationsAddressReply(input, entryPoint)) { return false; } @@ -544,9 +547,6 @@ function hasValueFlowActionConflictWithDiscoveryTurnMeaning( if (askedAction === "payout") { return detectedIntent !== "supplier_payouts_profile"; } - if (askedAction === "net_value_flow") { - return true; - } return false; } @@ -726,6 +726,10 @@ function hasSemanticConflictWithDiscoveryTurnMeaning( return false; } if (hasExactBankOperationsAddressReply(input, entryPoint)) { + const askedAction = toNonEmptyString(readDiscoveryTurnMeaning(entryPoint)?.asked_action_family); + if (askedAction === "net_value_flow") { + return true; + } return false; } const detectedIntent = toNonEmptyString(input.addressRuntimeMeta?.detected_intent); @@ -870,6 +874,10 @@ export function applyAssistantMcpDiscoveryResponsePolicy( input, entryPoint ); + const exactBankOperationsProtectsCurrent = + exactBankOperationsAddressReply && + !semanticConflictWithDiscoveryTurnMeaning && + !valueFlowActionConflictWithDiscoveryTurnMeaning; if (!entryPoint) { pushReason(reasonCodes, "mcp_discovery_response_policy_no_entry_point"); @@ -940,7 +948,7 @@ export function applyAssistantMcpDiscoveryResponsePolicy( "mcp_discovery_response_policy_keep_exact_value_flow_reply_over_business_overview_direct_money_clarification" ); } - if (exactBankOperationsAddressReply) { + if (exactBankOperationsProtectsCurrent) { pushReason(reasonCodes, "mcp_discovery_response_policy_keep_exact_bank_operations_address_reply"); } if (inventoryMarginRankingAddressReply) { @@ -980,7 +988,7 @@ export function applyAssistantMcpDiscoveryResponsePolicy( !runtimeMatchedExactReply && !staleMetadataDiscoveryFallbackAgainstExactAddressReply && !exactValueFlowReplyForBusinessOverviewDirectMoneyNeed && - !exactBankOperationsAddressReply && + !exactBankOperationsProtectsCurrent && !inventoryMarginRankingAddressReply && !(deterministicBroadBusinessEvaluationReply && candidate.candidate_status === "clarification_candidate") && ALLOWED_CANDIDATE_STATUSES.has(candidate.candidate_status) && diff --git a/llm_normalizer/backend/tests/assistantMcpDiscoveryResponsePolicy.test.ts b/llm_normalizer/backend/tests/assistantMcpDiscoveryResponsePolicy.test.ts index 557b6b1..6eef701 100644 --- a/llm_normalizer/backend/tests/assistantMcpDiscoveryResponsePolicy.test.ts +++ b/llm_normalizer/backend/tests/assistantMcpDiscoveryResponsePolicy.test.ts @@ -700,6 +700,67 @@ describe("assistant MCP discovery response policy", () => { expect(result.reason_codes).not.toContain("mcp_discovery_response_policy_semantic_conflict_allows_candidate_override"); }); + it("overrides exact bank operation replies for explicit counterparty received-paid-net questions", () => { + const result = applyAssistantMcpDiscoveryResponsePolicy({ + currentReply: + "Exact bank operation answer: found 13 bank operations for Group SVK, showing rows and payment purposes.", + currentReplySource: "address_query_runtime_v1", + currentReplyType: "factual", + addressRuntimeMeta: { + detected_intent: "bank_operations_by_counterparty", + selected_recipe: "address_bank_operations_by_counterparty_v1", + mcp_call_status: "matched_non_empty", + capability_route_mode: "exact", + answer_grounding_check: { + status: "grounded" + }, + assistant_mcp_discovery_entry_point_v1: entryPoint({ + turn_input: { + adapter_status: "ready", + should_run_discovery: true, + data_need_graph: { + business_fact_family: "value_flow", + subject_candidates: ["Group SVK"], + reason_codes: ["data_need_graph_built", "data_need_graph_bidirectional_value_flow"] + }, + turn_meaning_ref: { + asked_domain_family: "counterparty_value", + asked_action_family: "net_value_flow", + explicit_entity_candidates: ["Group SVK"], + explicit_date_scope: "2020", + unsupported_but_understood_family: "counterparty_bidirectional_value_flow_or_netting" + } + }, + bridge: { + bridge_status: "answer_draft_ready", + user_facing_response_allowed: true, + business_fact_answer_allowed: true, + requires_user_clarification: false, + answer_draft: { + answer_mode: "confirmed_with_bounded_inference", + headline: "Group SVK 2020: received 12 093 465 rub., paid 0 rub.; net 12 093 465 rub. toward us.", + confirmed_lines: ["received 12 093 465 rub.; paid 0 rub.; net 12 093 465 rub."], + inference_lines: [], + unknown_lines: [], + limitation_lines: [], + next_step_line: null + } + } + }) + } + }); + + expect(result.applied).toBe(true); + expect(result.decision).toBe("apply_candidate"); + expect(result.reply_source).toBe("mcp_discovery_response_candidate_guarded"); + expect(result.reply_text).toContain("received 12 093 465"); + expect(result.reply_text).toContain("paid 0"); + expect(result.reply_text).toContain("net 12 093 465"); + expect(result.reason_codes).toContain("mcp_discovery_response_policy_value_flow_action_conflict_allows_candidate_override"); + expect(result.reason_codes).toContain("mcp_discovery_response_policy_semantic_conflict_allows_candidate_override"); + expect(result.reason_codes).not.toContain("mcp_discovery_response_policy_keep_exact_bank_operations_address_reply"); + }); + it("overrides an exact ranking-shaped address reply when open-scope ranking still needs organization", () => { const result = applyAssistantMcpDiscoveryResponsePolicy({ currentReply: diff --git a/llm_normalizer/data/autorun_generators/history.json b/llm_normalizer/data/autorun_generators/history.json index 16c7cad..7553e3d 100644 --- a/llm_normalizer/data/autorun_generators/history.json +++ b/llm_normalizer/data/autorun_generators/history.json @@ -1,4 +1,84 @@ [ + { + "generation_id": "gen-ag05231858-323f86", + "created_at": "2026-05-23T18:58:48+00:00", + "mode": "saved_user_sessions", + "title": "AGENT | Autonomy business quality pack", + "count": 15, + "domain": "autonomy_business_quality", + "questions": [ + "Сколько входящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?", + "Сколько исходящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?", + "А всего сколько денег пришло в ООО Альтернатива Плюс за 2020, без топов и без контрагентов?", + "Теперь дай взрослый обзор за 2020 по компании: входящие, исходящие, нетто, топы, но банк в топах отдельно объясни как финансовый поток.", + "скока денег альтернатива заработала за 20 год?", + "а это чистая прибыль?", + "Какая чистая прибыль по ООО Альтернатива Плюс за 2020?", + "А отдельно по СБЕРБАНКУ: он для нас клиент, поставщик или финансовый поток? Дай коротко по подтвержденным строкам.", + "какое нетто по деньгам с Группа СВК за 2020 год: сколько получили и сколько заплатили?", + "кому мы должны на конец 2020?", + "а нам кто должен на конец 2020?", + "сколько НДС надо заплатить в налоговую за декабрь 2019?", + "Как ты оценишь деятельность компании?", + "Какая номенклатура товара реализована с высокой прибылью какая с низкой?", + "май 2020" + ], + "generated_by": "codex_agent", + "saved_case_set_file": "assistant_autogen_saved_user_sessions_20260523185848_gen-ag05231858-323f86.json", + "context": { + "llm_provider": null, + "model": null, + "assistant_prompt_version": null, + "decomposition_prompt_version": null, + "prompt_fingerprint": null, + "autogen_personality_id": null, + "autogen_personality_prompt": null, + "source_session_id": null, + "saved_session_file": "assistant_saved_session_20260523185848_gen-ag05231858-323f86.json", + "saved_case_set_kind": "agent_semantic_scenario", + "agent_run": true, + "agent_focus": "Expanded targeted AGENT replay for the autonomy milestone: value-flow, business overview, debts, VAT, profit/cashflow distinction, nomenclature margin boundary, and final answer quality must survive realistic business questions.", + "architecture_phase": "turnaround_11", + "source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\agent_autonomy_business_quality_20260523.json", + "scenario_id": "agent_autonomy_business_quality_20260523", + "semantic_tags": [ + "autonomy_core", + "bank_boundary", + "bank_classification", + "business_answer_quality", + "business_evaluation", + "business_overview", + "cashflow_overview", + "cashflow_vs_profit", + "clarification", + "colloquial_money", + "colloquial_total", + "counterparty_value_flow", + "debt_answer_quality", + "direct_profit", + "domain_purity", + "followup_context", + "incoming_total", + "limit_honesty", + "limited_answer", + "net_flow", + "next_action", + "no_profit_substitution", + "no_top_guard", + "nomenclature_margin", + "outgoing_total", + "payables", + "profit_vs_cashflow", + "receivables", + "technical_garbage_guard", + "value_flow", + "vat" + ], + "validation_status": "accepted_live_replay", + "validated_run_dir": "artifacts\\domain_runs\\agent_autonomy_business_quality_live_semantic_gate_accepted_20260523", + "saved_after_validated_replay": true + } + }, { "generation_id": "gen-ag05231427-70915a", "created_at": "2026-05-23T14:27:55+00:00", diff --git a/llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260523185848_gen-ag05231858-323f86.json b/llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260523185848_gen-ag05231858-323f86.json new file mode 100644 index 0000000..7892a0e --- /dev/null +++ b/llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260523185848_gen-ag05231858-323f86.json @@ -0,0 +1,287 @@ +{ + "saved_at": "2026-05-23T18:58:48+00:00", + "generation_id": "gen-ag05231858-323f86", + "mode": "saved_user_sessions", + "title": "AGENT | Autonomy business quality pack", + "agent_run": true, + "questions": [ + "Сколько входящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?", + "Сколько исходящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?", + "А всего сколько денег пришло в ООО Альтернатива Плюс за 2020, без топов и без контрагентов?", + "Теперь дай взрослый обзор за 2020 по компании: входящие, исходящие, нетто, топы, но банк в топах отдельно объясни как финансовый поток.", + "скока денег альтернатива заработала за 20 год?", + "а это чистая прибыль?", + "Какая чистая прибыль по ООО Альтернатива Плюс за 2020?", + "А отдельно по СБЕРБАНКУ: он для нас клиент, поставщик или финансовый поток? Дай коротко по подтвержденным строкам.", + "какое нетто по деньгам с Группа СВК за 2020 год: сколько получили и сколько заплатили?", + "кому мы должны на конец 2020?", + "а нам кто должен на конец 2020?", + "сколько НДС надо заплатить в налоговую за декабрь 2019?", + "Как ты оценишь деятельность компании?", + "Какая номенклатура товара реализована с высокой прибылью какая с низкой?", + "май 2020" + ], + "metadata": { + "assistant_prompt_version": null, + "decomposition_prompt_version": null, + "prompt_fingerprint": null, + "agent_focus": "Expanded targeted AGENT replay for the autonomy milestone: value-flow, business overview, debts, VAT, profit/cashflow distinction, nomenclature margin boundary, and final answer quality must survive realistic business questions.", + "architecture_phase": "turnaround_11", + "source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\agent_autonomy_business_quality_20260523.json", + "scenario_id": "agent_autonomy_business_quality_20260523", + "semantic_tags": [ + "autonomy_core", + "bank_boundary", + "bank_classification", + "business_answer_quality", + "business_evaluation", + "business_overview", + "cashflow_overview", + "cashflow_vs_profit", + "clarification", + "colloquial_money", + "colloquial_total", + "counterparty_value_flow", + "debt_answer_quality", + "direct_profit", + "domain_purity", + "followup_context", + "incoming_total", + "limit_honesty", + "limited_answer", + "net_flow", + "next_action", + "no_profit_substitution", + "no_top_guard", + "nomenclature_margin", + "outgoing_total", + "payables", + "profit_vs_cashflow", + "receivables", + "technical_garbage_guard", + "value_flow", + "vat" + ], + "validation_status": "accepted_live_replay", + "validated_run_dir": "artifacts\\domain_runs\\agent_autonomy_business_quality_live_semantic_gate_accepted_20260523", + "saved_after_validated_replay": true, + "save_gate": { + "schema_version": "agent_semantic_save_gate_v1", + "validation_status": "accepted_live_replay", + "validated_run_dir": "artifacts\\domain_runs\\agent_autonomy_business_quality_live_semantic_gate_accepted_20260523", + "final_status": "accepted", + "review_overall_status": "pass", + "business_overall_status": "pass", + "steps_total": 15, + "steps_passed": 15, + "steps_failed": 0, + "steps_with_business_failures": 0, + "steps_with_business_warnings": 0, + "acceptance_gate_passed": true, + "saved_after_validated_replay": true + } + }, + "source_session_id": null, + "session": { + "session_id": null, + "mode": "agent_semantic_run", + "items": [ + { + "message_id": "agent-user-001", + "role": "user", + "text": "Сколько входящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?", + "created_at": "2026-05-23T18:58:48+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-002", + "role": "user", + "text": "Сколько исходящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?", + "created_at": "2026-05-23T18:58:48+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-003", + "role": "user", + "text": "А всего сколько денег пришло в ООО Альтернатива Плюс за 2020, без топов и без контрагентов?", + "created_at": "2026-05-23T18:58:48+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-004", + "role": "user", + "text": "Теперь дай взрослый обзор за 2020 по компании: входящие, исходящие, нетто, топы, но банк в топах отдельно объясни как финансовый поток.", + "created_at": "2026-05-23T18:58:48+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-005", + "role": "user", + "text": "скока денег альтернатива заработала за 20 год?", + "created_at": "2026-05-23T18:58:48+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-006", + "role": "user", + "text": "а это чистая прибыль?", + "created_at": "2026-05-23T18:58:48+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-007", + "role": "user", + "text": "Какая чистая прибыль по ООО Альтернатива Плюс за 2020?", + "created_at": "2026-05-23T18:58:48+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-008", + "role": "user", + "text": "А отдельно по СБЕРБАНКУ: он для нас клиент, поставщик или финансовый поток? Дай коротко по подтвержденным строкам.", + "created_at": "2026-05-23T18:58:48+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-009", + "role": "user", + "text": "какое нетто по деньгам с Группа СВК за 2020 год: сколько получили и сколько заплатили?", + "created_at": "2026-05-23T18:58:48+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-010", + "role": "user", + "text": "кому мы должны на конец 2020?", + "created_at": "2026-05-23T18:58:48+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-011", + "role": "user", + "text": "а нам кто должен на конец 2020?", + "created_at": "2026-05-23T18:58:48+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-012", + "role": "user", + "text": "сколько НДС надо заплатить в налоговую за декабрь 2019?", + "created_at": "2026-05-23T18:58:48+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-013", + "role": "user", + "text": "Как ты оценишь деятельность компании?", + "created_at": "2026-05-23T18:58:48+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-014", + "role": "user", + "text": "Какая номенклатура товара реализована с высокой прибылью какая с низкой?", + "created_at": "2026-05-23T18:58:48+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-015", + "role": "user", + "text": "май 2020", + "created_at": "2026-05-23T18:58:48+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + } + ], + "agent_run": true, + "metadata": { + "assistant_prompt_version": null, + "decomposition_prompt_version": null, + "prompt_fingerprint": null, + "agent_focus": "Expanded targeted AGENT replay for the autonomy milestone: value-flow, business overview, debts, VAT, profit/cashflow distinction, nomenclature margin boundary, and final answer quality must survive realistic business questions.", + "architecture_phase": "turnaround_11", + "source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\agent_autonomy_business_quality_20260523.json", + "scenario_id": "agent_autonomy_business_quality_20260523", + "semantic_tags": [ + "autonomy_core", + "bank_boundary", + "bank_classification", + "business_answer_quality", + "business_evaluation", + "business_overview", + "cashflow_overview", + "cashflow_vs_profit", + "clarification", + "colloquial_money", + "colloquial_total", + "counterparty_value_flow", + "debt_answer_quality", + "direct_profit", + "domain_purity", + "followup_context", + "incoming_total", + "limit_honesty", + "limited_answer", + "net_flow", + "next_action", + "no_profit_substitution", + "no_top_guard", + "nomenclature_margin", + "outgoing_total", + "payables", + "profit_vs_cashflow", + "receivables", + "technical_garbage_guard", + "value_flow", + "vat" + ], + "validation_status": "accepted_live_replay", + "validated_run_dir": "artifacts\\domain_runs\\agent_autonomy_business_quality_live_semantic_gate_accepted_20260523", + "saved_after_validated_replay": true, + "save_gate": { + "schema_version": "agent_semantic_save_gate_v1", + "validation_status": "accepted_live_replay", + "validated_run_dir": "artifacts\\domain_runs\\agent_autonomy_business_quality_live_semantic_gate_accepted_20260523", + "final_status": "accepted", + "review_overall_status": "pass", + "business_overall_status": "pass", + "steps_total": 15, + "steps_passed": 15, + "steps_failed": 0, + "steps_with_business_failures": 0, + "steps_with_business_warnings": 0, + "acceptance_gate_passed": true, + "saved_after_validated_replay": true + } + } + } +} diff --git a/llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260523185848_gen-ag05231858-323f86.json b/llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260523185848_gen-ag05231858-323f86.json new file mode 100644 index 0000000..0c7b288 --- /dev/null +++ b/llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260523185848_gen-ag05231858-323f86.json @@ -0,0 +1,70 @@ +{ + "suite_id": "assistant_saved_session_gen-ag05231858-323f86", + "suite_version": "0.1.0", + "schema_version": "assistant_saved_session_suite_v0_1", + "generated_at": "2026-05-23T18:58:48+00:00", + "generation_id": "gen-ag05231858-323f86", + "mode": "saved_user_sessions", + "title": "AGENT | Autonomy business quality pack", + "domain": "autonomy_business_quality", + "scenario_count": 1, + "case_ids": [ + "SAVED-001" + ], + "cases": [ + { + "case_id": "SAVED-001", + "scenario_tag": "agent_saved_user_sessions", + "title": "AGENT | Autonomy business quality pack", + "question_type": "followup", + "broadness_level": "medium", + "turns": [ + { + "user_message": "Сколько входящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?" + }, + { + "user_message": "Сколько исходящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?" + }, + { + "user_message": "А всего сколько денег пришло в ООО Альтернатива Плюс за 2020, без топов и без контрагентов?" + }, + { + "user_message": "Теперь дай взрослый обзор за 2020 по компании: входящие, исходящие, нетто, топы, но банк в топах отдельно объясни как финансовый поток." + }, + { + "user_message": "скока денег альтернатива заработала за 20 год?" + }, + { + "user_message": "а это чистая прибыль?" + }, + { + "user_message": "Какая чистая прибыль по ООО Альтернатива Плюс за 2020?" + }, + { + "user_message": "А отдельно по СБЕРБАНКУ: он для нас клиент, поставщик или финансовый поток? Дай коротко по подтвержденным строкам." + }, + { + "user_message": "какое нетто по деньгам с Группа СВК за 2020 год: сколько получили и сколько заплатили?" + }, + { + "user_message": "кому мы должны на конец 2020?" + }, + { + "user_message": "а нам кто должен на конец 2020?" + }, + { + "user_message": "сколько НДС надо заплатить в налоговую за декабрь 2019?" + }, + { + "user_message": "Как ты оценишь деятельность компании?" + }, + { + "user_message": "Какая номенклатура товара реализована с высокой прибылью какая с низкой?" + }, + { + "user_message": "май 2020" + } + ] + } + ] +} diff --git a/scripts/domain_case_loop.py b/scripts/domain_case_loop.py index 6d7325b..8e4364b 100644 --- a/scripts/domain_case_loop.py +++ b/scripts/domain_case_loop.py @@ -118,6 +118,9 @@ BUSINESS_TOP_LINE_SCAFFOLD_MARKERS = ( "\u043e\u0433\u0440\u0430\u043d\u0438\u0447\u0435\u043d\u043d\u044b\u0439 \u0431\u0438\u0437\u043d\u0435\u0441-\u043e\u0431\u0437\u043e\u0440", "\u0447\u0442\u043e \u043f\u043e\u0434\u0442\u0432\u0435\u0440\u0436\u0434\u0435\u043d\u043e", "\u043f\u0440\u043e\u0432\u0435\u0440\u0435\u043d\u043d\u044b\u0435 \u043a\u043e\u043d\u0442\u0443\u0440\u044b", + "\u043f\u043e \u0434\u0430\u043d\u043d\u044b\u043c 1\u0441 \u043d\u0430\u0439\u0434\u0435\u043d\u044b", + "\u043d\u0430\u0439\u0434\u0435\u043d\u044b \u0441\u0442\u0440\u043e\u043a\u0438", + "\u0441\u0443\u043c\u043c\u0443 \u043c\u043e\u0436\u043d\u043e \u043d\u0430\u0437\u044b\u0432\u0430\u0442\u044c \u0442\u043e\u043b\u044c\u043a\u043e", "\u0431\u043b\u043e\u043a 1", "\u0441\u0442\u0430\u0442\u0443\u0441", ) diff --git a/scripts/domain_truth_harness.py b/scripts/domain_truth_harness.py index 0bdda66..9d6e6dc 100644 --- a/scripts/domain_truth_harness.py +++ b/scripts/domain_truth_harness.py @@ -1156,8 +1156,15 @@ def build_business_review_summary(spec: dict[str, Any], scenario_state: dict[str step_outputs = scenario_state.get("step_outputs") if isinstance(scenario_state.get("step_outputs"), dict) else {} steps: list[dict[str, Any]] = [] issue_counts: dict[str, int] = {} + review_failures = 0 + review_warnings = 0 for index, step in enumerate(spec["steps"], start=1): step_state = step_outputs.get(step["step_id"], {}) + review_status = step_state.get("review_status") if isinstance(step_state, dict) else None + if review_status == "fail": + review_failures += 1 + elif review_status == "warning": + review_warnings += 1 business_review = ( step_state.get("business_first_review") if isinstance(step_state, dict) and isinstance(step_state.get("business_first_review"), dict) @@ -1171,7 +1178,7 @@ def build_business_review_summary(spec: dict[str, Any], scenario_state: dict[str "index": index, "step_id": step["step_id"], "question": step["question_template"], - "review_status": step_state.get("review_status") if isinstance(step_state, dict) else None, + "review_status": review_status, "direct_answer": business_review.get("actual_direct_answer"), "answer_length_chars": business_review.get("answer_length_chars"), "direct_answer_required": business_review.get("direct_answer_required"), @@ -1193,6 +1200,7 @@ def build_business_review_summary(spec: dict[str, Any], scenario_state: dict[str ) ) warnings = sum(1 for step in steps if "business_answer_too_verbose" in step["issue_codes"]) + semantic_status = "fail" if failed or review_failures else ("warning" if warnings or review_warnings else "pass") return { "schema_version": "business_first_run_review_v1", "scenario_id": spec["scenario_id"], @@ -1202,24 +1210,32 @@ def build_business_review_summary(spec: dict[str, Any], scenario_state: dict[str "steps_total": len(steps), "steps_with_business_failures": failed, "steps_with_business_warnings": warnings, + "steps_with_review_failures": review_failures, + "steps_with_review_warnings": review_warnings, "issue_counts": issue_counts, "overall_business_status": "fail" if failed else ("warning" if warnings else "pass"), + "overall_semantic_status": semantic_status, + "semantic_gate_passed": semantic_status == "pass", "steps": steps, } -def build_business_review_markdown(business_review: dict[str, Any]) -> str: +def build_business_review_markdown(business_review: dict[str, Any], *, title: str = "Business-first review") -> str: lines = [ - "# Business-first review", + f"# {title}", "", f"- scenario_id: `{business_review.get('scenario_id') or 'n/a'}`", f"- domain: `{business_review.get('domain') or 'n/a'}`", f"- title: {business_review.get('title') or 'n/a'}", f"- session_id: `{business_review.get('session_id') or 'n/a'}`", f"- overall_business_status: `{business_review.get('overall_business_status') or 'n/a'}`", + f"- overall_semantic_status: `{business_review.get('overall_semantic_status') or 'n/a'}`", + f"- semantic_gate_passed: `{business_review.get('semantic_gate_passed') is True}`", f"- steps_total: `{business_review.get('steps_total')}`", f"- steps_with_business_failures: `{business_review.get('steps_with_business_failures')}`", f"- steps_with_business_warnings: `{business_review.get('steps_with_business_warnings')}`", + f"- steps_with_review_failures: `{business_review.get('steps_with_review_failures')}`", + f"- steps_with_review_warnings: `{business_review.get('steps_with_review_warnings')}`", f"- issue_counts: `{dump_json(business_review.get('issue_counts') or {})}`", "", "## Human Answer Surface", @@ -1479,6 +1495,8 @@ def review_export(spec: dict[str, Any], export_path: Path, output_dir: Path) -> write_text(output_dir / "truth_review.md", review_markdown) write_json(output_dir / "business_review.json", business_review) write_text(output_dir / "business_review.md", build_business_review_markdown(business_review)) + write_json(output_dir / "semantic_audit.json", business_review) + write_text(output_dir / "semantic_audit.md", build_business_review_markdown(business_review, title="Semantic audit")) acceptance_bundle = write_acceptance_artifacts(output_dir, spec, scenario_state, review_summary) return { "scenario_state": scenario_state, @@ -1575,6 +1593,8 @@ def run_live(spec: dict[str, Any], output_dir: Path, args: argparse.Namespace) - write_text(output_dir / "truth_review.md", review_markdown) write_json(output_dir / "business_review.json", business_review) write_text(output_dir / "business_review.md", build_business_review_markdown(business_review)) + write_json(output_dir / "semantic_audit.json", business_review) + write_text(output_dir / "semantic_audit.md", build_business_review_markdown(business_review, title="Semantic audit")) acceptance_bundle = write_acceptance_artifacts(output_dir, spec, scenario_state, review_summary) print(f"[truth-harness] saved artifacts to {output_dir}") print(f"[truth-harness] overall_status={review_summary['overall_status']}") diff --git a/scripts/test_domain_case_loop_step_state.py b/scripts/test_domain_case_loop_step_state.py index 724942d..079a62f 100644 --- a/scripts/test_domain_case_loop_step_state.py +++ b/scripts/test_domain_case_loop_step_state.py @@ -927,6 +927,91 @@ class DomainCaseLoopStepStateTests(unittest.TestCase): self.assertIn("technical_garbage_in_answer", review["issue_codes"]) self.assertNotIn("business_direct_answer_missing", review["issue_codes"]) + def test_business_first_review_rejects_found_rows_scaffold_as_direct_answer(self) -> None: + question = "\u0441\u043a\u043e\u043b\u044c\u043a\u043e \u0432\u0445\u043e\u0434\u044f\u0449\u0438\u0445 \u0434\u0435\u043d\u0435\u0433 \u0437\u0430 2020" + step_state = dcl.build_scenario_step_state( + scenario_id="semantic_gate_demo", + domain="value_flow", + step={ + "step_id": "step_01", + "title": "Direct money answer", + "depends_on": [], + "question_template": question, + }, + step_index=1, + question_resolved=question, + analysis_context={}, + turn_artifact={ + "assistant_message": { + "reply_type": "factual", + "text": "\u041a\u043e\u0440\u043e\u0442\u043a\u043e: \u041f\u043e \u0434\u0430\u043d\u043d\u044b\u043c 1\u0421 \u043d\u0430\u0439\u0434\u0435\u043d\u044b \u0441\u0442\u0440\u043e\u043a\u0438 \u0432\u0445\u043e\u0434\u044f\u0449\u0438\u0445 \u0434\u0435\u043d\u0435\u0436\u043d\u044b\u0445 \u043f\u043e\u0441\u0442\u0443\u043f\u043b\u0435\u043d\u0438\u0439; \u0441\u0443\u043c\u043c\u0443 \u043c\u043e\u0436\u043d\u043e \u043d\u0430\u0437\u044b\u0432\u0430\u0442\u044c \u0442\u043e\u043b\u044c\u043a\u043e \u0432 \u0440\u0430\u043c\u043a\u0430\u0445 \u043f\u0440\u043e\u0432\u0435\u0440\u043a\u0438.\n\u0427\u0442\u043e \u043f\u043e\u0434\u0442\u0432\u0435\u0440\u0436\u0434\u0435\u043d\u043e: 47 628 853,03 \u0440\u0443\u0431.", + "message_id": "msg-1", + "trace_id": "trace-1", + }, + "technical_debug_payload": {}, + "session_summary": {}, + }, + entries=[], + ) + + review = step_state["business_first_review"] + self.assertFalse(review["direct_answer_first_ok"]) + self.assertFalse(review["answer_layering_ok"]) + self.assertIn("business_direct_answer_missing", review["issue_codes"]) + self.assertIn("answer_layering_noise", review["issue_codes"]) + + def test_business_first_review_allows_direct_answer_with_evidence_boundary(self) -> None: + question = "\u043a\u0430\u043a\u043e\u0435 \u043d\u0435\u0442\u0442\u043e \u043f\u043e \u0434\u0435\u043d\u044c\u0433\u0430\u043c \u0441 \u0413\u0440\u0443\u043f\u043f\u0430 \u0421\u0412\u041a \u0437\u0430 2020" + step_state = dcl.build_scenario_step_state( + scenario_id="semantic_gate_demo", + domain="value_flow", + step={ + "step_id": "step_01", + "title": "Counterparty net flow", + "depends_on": [], + "question_template": question, + }, + step_index=1, + question_resolved=question, + analysis_context={}, + turn_artifact={ + "assistant_message": { + "reply_type": "factual_with_explanation", + "text": "\u041a\u043e\u0440\u043e\u0442\u043a\u043e: \u043f\u043e \u043a\u043e\u043d\u0442\u0440\u0430\u0433\u0435\u043d\u0442\u0443 \u0413\u0440\u0443\u043f\u043f\u0430 \u0421\u0412\u041a \u0437\u0430 2020 \u043f\u043e \u043d\u0430\u0439\u0434\u0435\u043d\u043d\u044b\u043c \u0441\u0442\u0440\u043e\u043a\u0430\u043c 1\u0421 \u043f\u043e\u043b\u0443\u0447\u0438\u043b\u0438 12 093 465 \u0440\u0443\u0431., \u0437\u0430\u043f\u043b\u0430\u0442\u0438\u043b\u0438 0 \u0440\u0443\u0431.; \u043d\u0435\u0442\u0442\u043e \u0432 \u043d\u0430\u0448\u0443 \u0441\u0442\u043e\u0440\u043e\u043d\u0443 12 093 465 \u0440\u0443\u0431.", + "message_id": "msg-1", + "trace_id": "trace-1", + }, + "technical_debug_payload": {}, + "session_summary": {}, + }, + entries=[], + ) + + review = step_state["business_first_review"] + self.assertTrue(review["direct_answer_first_ok"]) + self.assertTrue(review["answer_layering_ok"]) + self.assertEqual(review["issue_codes"], []) + + def test_semantic_audit_markdown_exposes_semantic_gate(self) -> None: + markdown = dth.build_business_review_markdown( + { + "scenario_id": "semantic_gate_demo", + "domain": "value_flow", + "title": "Semantic gate demo", + "session_id": "session-1", + "overall_business_status": "fail", + "steps_total": 1, + "steps_with_business_failures": 1, + "steps_with_business_warnings": 0, + "issue_counts": {"answer_layering_noise": 1}, + "steps": [], + }, + title="Semantic audit", + ) + + self.assertIn("# Semantic audit", markdown) + self.assertIn("semantic_gate_passed: `False`", markdown) + def test_truth_harness_promotes_business_review_issues_to_findings(self) -> None: step_state = dcl.build_scenario_step_state( scenario_id="business_surface_demo",