diff --git a/docs/orchestration/agent_hot_value_flow_handoff_20260522.json b/docs/orchestration/agent_hot_value_flow_handoff_20260522.json new file mode 100644 index 0000000..cba59b8 --- /dev/null +++ b/docs/orchestration/agent_hot_value_flow_handoff_20260522.json @@ -0,0 +1,157 @@ +{ + "schema_version": "domain_truth_harness_spec_v1", + "scenario_id": "agent_hot_value_flow_handoff_20260522", + "domain": "autonomy_hot_value_flow_handoff", + "title": "AGENT | Hot value-flow discovery handoff", + "description": "Targeted AGENT replay for the current Autonomy Core slice: organization-scoped value-flow questions must be answered through guarded MCP discovery response with hot handoff, not through stale exact fallback or a counterparty-only route.", + "bindings": {}, + "steps": [ + { + "step_id": "step_01_incoming_total_hot_handoff", + "title": "Organization-scoped incoming total uses hot value-flow discovery candidate", + "question": "Сколько входящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?", + "allowed_reply_types": [ + "partial_coverage", + "factual_with_explanation", + "factual" + ], + "expected_mcp_discovery_response_applied": true, + "expected_mcp_discovery_selected_chain_id": "value_flow", + "expected_mcp_discovery_response_candidate_status": "ready_for_guarded_use", + "expected_mcp_discovery_candidate_hot_runtime_wired": true, + "expected_mcp_discovery_hot_runtime_wired": true, + "expected_mcp_discovery_execution_handoff_status": "ready_for_guarded_response", + "expected_mcp_discovery_execution_handoff_can_use_guarded_response": true, + "expected_catalog_alignment_status": "selected_matches_top", + "expected_catalog_chain_top_match": "value_flow", + "expected_catalog_selected_matches_top": true, + "expected_route_candidate_status": "ready_for_reviewed_execution", + "expected_route_candidate_executable_now": true, + "required_answer_patterns_all": [ + "(?i)2020", + "(?i)входящ|получ|поступ", + "(?i)руб" + ], + "required_answer_patterns_any": [ + "(?i)Альтернатива", + "(?i)проверенн", + "(?i)1С" + ], + "forbidden_answer_patterns": [ + "(?i)уточните контрагента", + "(?i)по какому контрагенту", + "(?i)не найден контрагент", + "(?i)runtime_", + "(?i)planner_", + "(?i)query_movements", + "(?i)primitive" + ], + "criticality": "critical", + "semantic_tags": [ + "autonomy_core", + "value_flow", + "hot_handoff", + "guarded_response", + "incoming_total" + ] + }, + { + "step_id": "step_02_outgoing_total_hot_handoff", + "title": "Organization-scoped outgoing total uses hot value-flow discovery candidate", + "question": "Сколько исходящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?", + "allowed_reply_types": [ + "partial_coverage", + "factual_with_explanation", + "factual" + ], + "expected_mcp_discovery_response_applied": true, + "expected_mcp_discovery_selected_chain_id": "value_flow", + "expected_mcp_discovery_response_candidate_status": "ready_for_guarded_use", + "expected_mcp_discovery_candidate_hot_runtime_wired": true, + "expected_mcp_discovery_hot_runtime_wired": true, + "expected_mcp_discovery_execution_handoff_status": "ready_for_guarded_response", + "expected_mcp_discovery_execution_handoff_can_use_guarded_response": true, + "expected_catalog_alignment_status": "selected_matches_top", + "expected_catalog_chain_top_match": "value_flow", + "expected_catalog_selected_matches_top": true, + "expected_route_candidate_status": "ready_for_reviewed_execution", + "expected_route_candidate_executable_now": true, + "required_answer_patterns_all": [ + "(?i)2020", + "(?i)исходящ|списан|заплат", + "(?i)руб" + ], + "required_answer_patterns_any": [ + "(?i)Альтернатива", + "(?i)проверенн", + "(?i)1С" + ], + "forbidden_answer_patterns": [ + "(?i)уточните контрагента", + "(?i)по какому контрагенту", + "(?i)не найден контрагент", + "(?i)runtime_", + "(?i)planner_", + "(?i)query_movements", + "(?i)primitive" + ], + "criticality": "critical", + "semantic_tags": [ + "autonomy_core", + "value_flow", + "hot_handoff", + "guarded_response", + "outgoing_total" + ] + }, + { + "step_id": "step_03_colloquial_money_total_hot_handoff", + "title": "Colloquial company money wording still uses hot value-flow discovery candidate", + "question": "А всего сколько денег пришло в ООО Альтернатива Плюс за 2020, без топов и без контрагентов?", + "allowed_reply_types": [ + "partial_coverage", + "factual_with_explanation", + "factual" + ], + "expected_mcp_discovery_response_applied": true, + "expected_mcp_discovery_selected_chain_id": "value_flow", + "expected_mcp_discovery_response_candidate_status": "ready_for_guarded_use", + "expected_mcp_discovery_candidate_hot_runtime_wired": true, + "expected_mcp_discovery_hot_runtime_wired": true, + "expected_mcp_discovery_execution_handoff_status": "ready_for_guarded_response", + "expected_mcp_discovery_execution_handoff_can_use_guarded_response": true, + "expected_catalog_alignment_status": "selected_matches_top", + "expected_catalog_chain_top_match": "value_flow", + "expected_catalog_selected_matches_top": true, + "expected_route_candidate_status": "ready_for_reviewed_execution", + "expected_route_candidate_executable_now": true, + "required_answer_patterns_all": [ + "(?i)2020", + "(?i)пришл|получ|поступ|входящ", + "(?i)руб" + ], + "required_answer_patterns_any": [ + "(?i)Альтернатива", + "(?i)проверенн", + "(?i)1С" + ], + "forbidden_answer_patterns": [ + "(?i)уточните контрагента", + "(?i)по какому контрагенту", + "(?i)не найден контрагент", + "(?i)runtime_", + "(?i)planner_", + "(?i)query_movements", + "(?i)primitive" + ], + "criticality": "critical", + "semantic_tags": [ + "autonomy_core", + "value_flow", + "hot_handoff", + "guarded_response", + "colloquial_total" + ] + } + ] +} diff --git a/llm_normalizer/backend/dist/services/assistantMcpDiscoveryDataNeedGraph.js b/llm_normalizer/backend/dist/services/assistantMcpDiscoveryDataNeedGraph.js index fb377c6..7d8517d 100644 --- a/llm_normalizer/backend/dist/services/assistantMcpDiscoveryDataNeedGraph.js +++ b/llm_normalizer/backend/dist/services/assistantMcpDiscoveryDataNeedGraph.js @@ -193,6 +193,12 @@ function rankingNeedFromRawUtterance(value) { if (!text) { return null; } + if (/\u0438\u0441\u043a\u043b\u044e\u0447[\p{L}\p{N}_]*\s+\u0442\u043e\u043f/iu.test(text)) { + return null; + } + if (/(?:\u0431\u0435\u0437\s+\u0442\u043e\u043f(?:\u043e\u0432|\u0430)?\b|\u043d\u0435\s+\u0442\u043e\u043f\b|\u0438\u0441\u043a\u043b\u044e\u0447\w*\s+\u0442\u043e\u043f|\u0431\u0435\u0437\s+\u0440\u0435\u0439\u0442\u0438\u043d\u0433\u0430\b|без\s+топ(?:РѕРІ|Р°)?\b|РЅРµ\s+топ\b|исключ\S*\s+топ|без\s+рейтинга\b)/iu.test(text)) { + return null; + } if (/(?:\u0442\u043e\u043f[-\s]?\d*|\u0441\u0430\u043c(?:\u044b\u0439|\u0430\u044f|\u043e\u0435|\u044b\u0435)|\u0431\u043e\u043b\u044c\u0448\u0435\s+\u0432\u0441\u0435\u0433\u043e|\u043d\u0430\u0438\u0431\u043e\u043b[\u0435\u0451]\u0435|\u043c\u0430\u043a\u0441\u0438\u043c\u0430\u043b\u044c\u043d|\u043c\u0430\u043a\u0441\u0438\u043c\u0443\u043c|\u043a\u0440\u0443\u043f\u043d\u0435\u0439\u0448|\u043b\u0443\u0447\u0448\u0438\u0439)/iu.test(text)) { return "top_desc"; } diff --git a/llm_normalizer/backend/dist/services/assistantMcpDiscoveryDebugAttachment.js b/llm_normalizer/backend/dist/services/assistantMcpDiscoveryDebugAttachment.js index 618b8af..20f9044 100644 --- a/llm_normalizer/backend/dist/services/assistantMcpDiscoveryDebugAttachment.js +++ b/llm_normalizer/backend/dist/services/assistantMcpDiscoveryDebugAttachment.js @@ -68,11 +68,14 @@ function buildAssistantMcpDiscoveryDebugAttachmentFields(input) { const routeCandidate = isRouteCandidateContract(bridge?.route_candidate) ? bridge.route_candidate : null; const executionHandoff = isExecutionHandoffContract(bridge?.execution_handoff) ? bridge.execution_handoff : null; const answerDraft = toRecordObject(bridge?.answer_draft); + const hotRuntimeWired = entryPoint?.hot_runtime_wired === true || + bridge?.hot_runtime_wired === true || + executionHandoff?.can_use_guarded_response === true; return { assistant_mcp_discovery_entry_point_v1: entryPoint, mcp_discovery_entry_status: toNonEmptyString(entryPoint?.entry_status), mcp_discovery_attempted: Boolean(entryPoint?.discovery_attempted), - mcp_discovery_hot_runtime_wired: false, + mcp_discovery_hot_runtime_wired: hotRuntimeWired, mcp_discovery_bridge_status: toNonEmptyString(bridge?.bridge_status), mcp_discovery_selected_chain_id: toNonEmptyString(planner?.selected_chain_id), mcp_discovery_evidence_plan_v1: evidencePlan, diff --git a/llm_normalizer/backend/dist/services/assistantMcpDiscoveryResponsePolicy.js b/llm_normalizer/backend/dist/services/assistantMcpDiscoveryResponsePolicy.js index a88fd8c..520604d 100644 --- a/llm_normalizer/backend/dist/services/assistantMcpDiscoveryResponsePolicy.js +++ b/llm_normalizer/backend/dist/services/assistantMcpDiscoveryResponsePolicy.js @@ -204,12 +204,16 @@ function hasMetadataDiscoveryPriority(input, entryPoint) { } function isOpenScopeValueFlowWithoutSubject(entryPoint) { const graph = readDiscoveryDataNeedGraph(entryPoint); + const turnMeaning = readDiscoveryTurnMeaning(entryPoint); const businessFactFamily = toNonEmptyString(graph?.business_fact_family); const subjectCandidates = Array.isArray(graph?.subject_candidates) ? graph.subject_candidates : []; - const reasonCodes = Array.isArray(graph?.reason_codes) ? graph.reason_codes : []; + const reasonCodes = readStringArray(graph?.reason_codes); + const clarificationGaps = readStringArray(graph?.clarification_gaps); + const explicitOrganizationScope = toNonEmptyString(turnMeaning?.explicit_organization_scope); return (businessFactFamily === "value_flow" && subjectCandidates.length === 0 && - reasonCodes.some((reason) => toNonEmptyString(reason) === "data_need_graph_open_scope_total_without_subject")); + (reasonCodes.includes("data_need_graph_open_scope_total_without_subject") || + (Boolean(explicitOrganizationScope) && clarificationGaps.includes("subject")))); } function needsOpenScopeValueFlowOrganizationClarification(entryPoint) { const graph = readDiscoveryDataNeedGraph(entryPoint); @@ -440,6 +444,9 @@ function hasRuntimeAdjustedExactReply(input, entryPoint) { if (hasMetadataDiscoveryPriority(input, entryPoint)) { return false; } + if (hasOpenScopeValueFlowDiscoveryPriority(input, entryPoint)) { + return false; + } if (hasEvidenceLaneConflictWithDiscoveryTurnMeaning(input, entryPoint)) { return false; } @@ -463,6 +470,9 @@ function hasRuntimeMatchedExactReply(input, entryPoint) { if (hasMetadataDiscoveryPriority(input, entryPoint)) { return false; } + if (hasOpenScopeValueFlowDiscoveryPriority(input, entryPoint)) { + return false; + } if (hasEvidenceLaneConflictWithDiscoveryTurnMeaning(input, entryPoint)) { return false; } @@ -483,6 +493,9 @@ function hasAlignedFactualAddressReply(input, entryPoint) { if (hasMetadataDiscoveryPriority(input, entryPoint)) { return false; } + if (hasOpenScopeValueFlowDiscoveryPriority(input, entryPoint)) { + return false; + } if (hasSemanticConflictWithDiscoveryTurnMeaning(input, entryPoint)) { return false; } @@ -538,6 +551,9 @@ function hasMatchedFactualAddressContinuationTarget(input, entryPoint) { if (hasMetadataDiscoveryPriority(input, entryPoint)) { return false; } + if (hasOpenScopeValueFlowDiscoveryPriority(input, entryPoint)) { + return false; + } const detectedIntent = toNonEmptyString(input.addressRuntimeMeta?.detected_intent); const dialogContinuationContract = toRecordObject(input.addressRuntimeMeta?.dialogContinuationContract) ?? toRecordObject(input.addressRuntimeMeta?.dialog_continuation_contract_v2); @@ -578,6 +594,9 @@ function hasFullConfirmedFactualAddressReply(input, entryPoint) { if (hasMetadataDiscoveryPriority(input, entryPoint)) { return false; } + if (hasOpenScopeValueFlowDiscoveryPriority(input, entryPoint)) { + return false; + } return hasFullConfirmedTruth(input); } function applyAssistantMcpDiscoveryResponsePolicy(input) { diff --git a/llm_normalizer/backend/src/services/assistantMcpDiscoveryDataNeedGraph.ts b/llm_normalizer/backend/src/services/assistantMcpDiscoveryDataNeedGraph.ts index e3f8498..7b776f6 100644 --- a/llm_normalizer/backend/src/services/assistantMcpDiscoveryDataNeedGraph.ts +++ b/llm_normalizer/backend/src/services/assistantMcpDiscoveryDataNeedGraph.ts @@ -293,6 +293,12 @@ function rankingNeedFromRawUtterance(value: string): string | null { if (!text) { return null; } + if (/\u0438\u0441\u043a\u043b\u044e\u0447[\p{L}\p{N}_]*\s+\u0442\u043e\u043f/iu.test(text)) { + return null; + } + if (/(?:\u0431\u0435\u0437\s+\u0442\u043e\u043f(?:\u043e\u0432|\u0430)?\b|\u043d\u0435\s+\u0442\u043e\u043f\b|\u0438\u0441\u043a\u043b\u044e\u0447\w*\s+\u0442\u043e\u043f|\u0431\u0435\u0437\s+\u0440\u0435\u0439\u0442\u0438\u043d\u0433\u0430\b|без\s+топ(?:РѕРІ|Р°)?\b|РЅРµ\s+топ\b|исключ\S*\s+топ|без\s+рейтинга\b)/iu.test(text)) { + return null; + } if (/(?:\u0442\u043e\u043f[-\s]?\d*|\u0441\u0430\u043c(?:\u044b\u0439|\u0430\u044f|\u043e\u0435|\u044b\u0435)|\u0431\u043e\u043b\u044c\u0448\u0435\s+\u0432\u0441\u0435\u0433\u043e|\u043d\u0430\u0438\u0431\u043e\u043b[\u0435\u0451]\u0435|\u043c\u0430\u043a\u0441\u0438\u043c\u0430\u043b\u044c\u043d|\u043c\u0430\u043a\u0441\u0438\u043c\u0443\u043c|\u043a\u0440\u0443\u043f\u043d\u0435\u0439\u0448|\u043b\u0443\u0447\u0448\u0438\u0439)/iu.test(text)) { return "top_desc"; } diff --git a/llm_normalizer/backend/src/services/assistantMcpDiscoveryDebugAttachment.ts b/llm_normalizer/backend/src/services/assistantMcpDiscoveryDebugAttachment.ts index 3d7b0a8..c36d82e 100644 --- a/llm_normalizer/backend/src/services/assistantMcpDiscoveryDebugAttachment.ts +++ b/llm_normalizer/backend/src/services/assistantMcpDiscoveryDebugAttachment.ts @@ -7,7 +7,7 @@ export interface AssistantMcpDiscoveryDebugAttachmentFields { assistant_mcp_discovery_entry_point_v1: AssistantMcpDiscoveryRuntimeEntryPointContract | null; mcp_discovery_entry_status: string | null; mcp_discovery_attempted: boolean; - mcp_discovery_hot_runtime_wired: false; + mcp_discovery_hot_runtime_wired: boolean; mcp_discovery_bridge_status: string | null; mcp_discovery_selected_chain_id: string | null; mcp_discovery_evidence_plan_v1: AssistantEvidencePlannerContract | null; @@ -130,12 +130,16 @@ export function buildAssistantMcpDiscoveryDebugAttachmentFields( const routeCandidate = isRouteCandidateContract(bridge?.route_candidate) ? bridge.route_candidate : null; const executionHandoff = isExecutionHandoffContract(bridge?.execution_handoff) ? bridge.execution_handoff : null; const answerDraft = toRecordObject(bridge?.answer_draft); + const hotRuntimeWired = + entryPoint?.hot_runtime_wired === true || + bridge?.hot_runtime_wired === true || + executionHandoff?.can_use_guarded_response === true; return { assistant_mcp_discovery_entry_point_v1: entryPoint, mcp_discovery_entry_status: toNonEmptyString(entryPoint?.entry_status), mcp_discovery_attempted: Boolean(entryPoint?.discovery_attempted), - mcp_discovery_hot_runtime_wired: false, + mcp_discovery_hot_runtime_wired: hotRuntimeWired, mcp_discovery_bridge_status: toNonEmptyString(bridge?.bridge_status), mcp_discovery_selected_chain_id: toNonEmptyString(planner?.selected_chain_id), mcp_discovery_evidence_plan_v1: evidencePlan, diff --git a/llm_normalizer/backend/src/services/assistantMcpDiscoveryResponsePolicy.ts b/llm_normalizer/backend/src/services/assistantMcpDiscoveryResponsePolicy.ts index 5bb04bb..7f513b9 100644 --- a/llm_normalizer/backend/src/services/assistantMcpDiscoveryResponsePolicy.ts +++ b/llm_normalizer/backend/src/services/assistantMcpDiscoveryResponsePolicy.ts @@ -306,13 +306,17 @@ function isOpenScopeValueFlowWithoutSubject( entryPoint: AssistantMcpDiscoveryRuntimeEntryPointContract | null ): boolean { const graph = readDiscoveryDataNeedGraph(entryPoint); + const turnMeaning = readDiscoveryTurnMeaning(entryPoint); const businessFactFamily = toNonEmptyString(graph?.business_fact_family); const subjectCandidates = Array.isArray(graph?.subject_candidates) ? graph.subject_candidates : []; - const reasonCodes = Array.isArray(graph?.reason_codes) ? graph.reason_codes : []; + const reasonCodes = readStringArray(graph?.reason_codes); + const clarificationGaps = readStringArray(graph?.clarification_gaps); + const explicitOrganizationScope = toNonEmptyString(turnMeaning?.explicit_organization_scope); return ( businessFactFamily === "value_flow" && subjectCandidates.length === 0 && - reasonCodes.some((reason) => toNonEmptyString(reason) === "data_need_graph_open_scope_total_without_subject") + (reasonCodes.includes("data_need_graph_open_scope_total_without_subject") || + (Boolean(explicitOrganizationScope) && clarificationGaps.includes("subject"))) ); } @@ -609,6 +613,9 @@ function hasRuntimeAdjustedExactReply( if (hasMetadataDiscoveryPriority(input, entryPoint)) { return false; } + if (hasOpenScopeValueFlowDiscoveryPriority(input, entryPoint)) { + return false; + } if (hasEvidenceLaneConflictWithDiscoveryTurnMeaning(input, entryPoint)) { return false; } @@ -638,6 +645,9 @@ function hasRuntimeMatchedExactReply( if (hasMetadataDiscoveryPriority(input, entryPoint)) { return false; } + if (hasOpenScopeValueFlowDiscoveryPriority(input, entryPoint)) { + return false; + } if (hasEvidenceLaneConflictWithDiscoveryTurnMeaning(input, entryPoint)) { return false; } @@ -664,6 +674,9 @@ function hasAlignedFactualAddressReply( if (hasMetadataDiscoveryPriority(input, entryPoint)) { return false; } + if (hasOpenScopeValueFlowDiscoveryPriority(input, entryPoint)) { + return false; + } if (hasSemanticConflictWithDiscoveryTurnMeaning(input, entryPoint)) { return false; } @@ -729,6 +742,9 @@ function hasMatchedFactualAddressContinuationTarget( if (hasMetadataDiscoveryPriority(input, entryPoint)) { return false; } + if (hasOpenScopeValueFlowDiscoveryPriority(input, entryPoint)) { + return false; + } const detectedIntent = toNonEmptyString(input.addressRuntimeMeta?.detected_intent); const dialogContinuationContract = toRecordObject(input.addressRuntimeMeta?.dialogContinuationContract) ?? @@ -781,6 +797,9 @@ function hasFullConfirmedFactualAddressReply( if (hasMetadataDiscoveryPriority(input, entryPoint)) { return false; } + if (hasOpenScopeValueFlowDiscoveryPriority(input, entryPoint)) { + return false; + } return hasFullConfirmedTruth(input); } diff --git a/llm_normalizer/backend/tests/assistantMcpDiscoveryDataNeedGraph.test.ts b/llm_normalizer/backend/tests/assistantMcpDiscoveryDataNeedGraph.test.ts index 731844c..bbb6ad4 100644 --- a/llm_normalizer/backend/tests/assistantMcpDiscoveryDataNeedGraph.test.ts +++ b/llm_normalizer/backend/tests/assistantMcpDiscoveryDataNeedGraph.test.ts @@ -60,6 +60,25 @@ describe("assistant MCP discovery data need graph", () => { ); }); + it("does not turn explicit no-top wording into a value-flow ranking", () => { + const result = buildAssistantMcpDiscoveryDataNeedGraph({ + semanticDataNeed: "counterparty value-flow evidence", + rawUtterance: + "Определить общую сумму поступлений в ООО Альтернатива Плюс за 2020 год, исключая топ-контрагентов и детализацию по контрагентам", + turnMeaning: { + asked_domain_family: "counterparty_value", + asked_action_family: "counterparty_value_or_turnover", + explicit_organization_scope: "ООО Альтернатива Плюс", + explicit_date_scope: "2020" + } + }); + + expect(result.business_fact_family).toBe("value_flow"); + expect(result.ranking_need).toBeNull(); + expect(result.decomposition_candidates).toContain("aggregate_checked_amounts"); + expect(result.decomposition_candidates).not.toContain("aggregate_ranked_axis_values"); + }); + it("marks metadata lane choice as a clarification-required graph", () => { const result = buildAssistantMcpDiscoveryDataNeedGraph({ semanticDataNeed: "metadata lane clarification", diff --git a/llm_normalizer/backend/tests/assistantMcpDiscoveryDebugAttachment.test.ts b/llm_normalizer/backend/tests/assistantMcpDiscoveryDebugAttachment.test.ts index 781c8b9..53cc5e9 100644 --- a/llm_normalizer/backend/tests/assistantMcpDiscoveryDebugAttachment.test.ts +++ b/llm_normalizer/backend/tests/assistantMcpDiscoveryDebugAttachment.test.ts @@ -100,7 +100,7 @@ describe("assistant MCP discovery debug attachment", () => { ); expect(debug.mcp_discovery_entry_status).toBe("bridge_executed"); expect(debug.mcp_discovery_attempted).toBe(true); - expect(debug.mcp_discovery_hot_runtime_wired).toBe(false); + expect(debug.mcp_discovery_hot_runtime_wired).toBe(true); expect(debug.mcp_discovery_bridge_status).toBe("answer_draft_ready"); expect(debug.mcp_discovery_selected_chain_id).toBe("value_flow"); expect(debug.mcp_discovery_evidence_plan_status).toBe("ready_for_execution"); diff --git a/llm_normalizer/data/autorun_generators/history.json b/llm_normalizer/data/autorun_generators/history.json index a2964c0..b88b6c4 100644 --- a/llm_normalizer/data/autorun_generators/history.json +++ b/llm_normalizer/data/autorun_generators/history.json @@ -1,4 +1,48 @@ [ + { + "generation_id": "gen-ag05221957-713bbd", + "created_at": "2026-05-22T19:57:37+00:00", + "mode": "saved_user_sessions", + "title": "AGENT | Hot value-flow discovery handoff", + "count": 3, + "domain": "autonomy_hot_value_flow_handoff", + "questions": [ + "Сколько входящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?", + "Сколько исходящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?", + "А всего сколько денег пришло в ООО Альтернатива Плюс за 2020, без топов и без контрагентов?" + ], + "generated_by": "codex_agent", + "saved_case_set_file": "assistant_autogen_saved_user_sessions_20260522195737_gen-ag05221957-713bbd.json", + "context": { + "llm_provider": null, + "model": null, + "assistant_prompt_version": null, + "decomposition_prompt_version": null, + "prompt_fingerprint": null, + "autogen_personality_id": null, + "autogen_personality_prompt": null, + "source_session_id": null, + "saved_session_file": "assistant_saved_session_20260522195737_gen-ag05221957-713bbd.json", + "saved_case_set_kind": "agent_semantic_scenario", + "agent_run": true, + "agent_focus": "Targeted AGENT replay for the current Autonomy Core slice: organization-scoped value-flow questions must be answered through guarded MCP discovery response with hot handoff, not through stale exact fallback or a counterparty-only route.", + "architecture_phase": "turnaround_11", + "source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\agent_hot_value_flow_handoff_20260522.json", + "scenario_id": "agent_hot_value_flow_handoff_20260522", + "semantic_tags": [ + "autonomy_core", + "colloquial_total", + "guarded_response", + "hot_handoff", + "incoming_total", + "outgoing_total", + "value_flow" + ], + "validation_status": "accepted_live_replay", + "validated_run_dir": "artifacts\\domain_runs\\agent_hot_value_flow_handoff_live5", + "saved_after_validated_replay": true + } + }, { "generation_id": "gen-ag05221319-4035f5", "created_at": "2026-05-22T13:19:31+00:00", diff --git a/llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260522195737_gen-ag05221957-713bbd.json b/llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260522195737_gen-ag05221957-713bbd.json new file mode 100644 index 0000000..d09d310 --- /dev/null +++ b/llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260522195737_gen-ag05221957-713bbd.json @@ -0,0 +1,119 @@ +{ + "saved_at": "2026-05-22T19:57:37+00:00", + "generation_id": "gen-ag05221957-713bbd", + "mode": "saved_user_sessions", + "title": "AGENT | Hot value-flow discovery handoff", + "agent_run": true, + "questions": [ + "Сколько входящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?", + "Сколько исходящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?", + "А всего сколько денег пришло в ООО Альтернатива Плюс за 2020, без топов и без контрагентов?" + ], + "metadata": { + "assistant_prompt_version": null, + "decomposition_prompt_version": null, + "prompt_fingerprint": null, + "agent_focus": "Targeted AGENT replay for the current Autonomy Core slice: organization-scoped value-flow questions must be answered through guarded MCP discovery response with hot handoff, not through stale exact fallback or a counterparty-only route.", + "architecture_phase": "turnaround_11", + "source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\agent_hot_value_flow_handoff_20260522.json", + "scenario_id": "agent_hot_value_flow_handoff_20260522", + "semantic_tags": [ + "autonomy_core", + "colloquial_total", + "guarded_response", + "hot_handoff", + "incoming_total", + "outgoing_total", + "value_flow" + ], + "validation_status": "accepted_live_replay", + "validated_run_dir": "artifacts\\domain_runs\\agent_hot_value_flow_handoff_live5", + "saved_after_validated_replay": true, + "save_gate": { + "schema_version": "agent_semantic_save_gate_v1", + "validation_status": "accepted_live_replay", + "validated_run_dir": "artifacts\\domain_runs\\agent_hot_value_flow_handoff_live5", + "final_status": "accepted", + "review_overall_status": "pass", + "business_overall_status": "pass", + "steps_total": 3, + "steps_passed": 3, + "steps_failed": 0, + "steps_with_business_failures": 0, + "steps_with_business_warnings": 0, + "acceptance_gate_passed": true, + "saved_after_validated_replay": true + } + }, + "source_session_id": null, + "session": { + "session_id": null, + "mode": "agent_semantic_run", + "items": [ + { + "message_id": "agent-user-001", + "role": "user", + "text": "Сколько входящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?", + "created_at": "2026-05-22T19:57:37+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-002", + "role": "user", + "text": "Сколько исходящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?", + "created_at": "2026-05-22T19:57:37+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-003", + "role": "user", + "text": "А всего сколько денег пришло в ООО Альтернатива Плюс за 2020, без топов и без контрагентов?", + "created_at": "2026-05-22T19:57:37+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + } + ], + "agent_run": true, + "metadata": { + "assistant_prompt_version": null, + "decomposition_prompt_version": null, + "prompt_fingerprint": null, + "agent_focus": "Targeted AGENT replay for the current Autonomy Core slice: organization-scoped value-flow questions must be answered through guarded MCP discovery response with hot handoff, not through stale exact fallback or a counterparty-only route.", + "architecture_phase": "turnaround_11", + "source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\agent_hot_value_flow_handoff_20260522.json", + "scenario_id": "agent_hot_value_flow_handoff_20260522", + "semantic_tags": [ + "autonomy_core", + "colloquial_total", + "guarded_response", + "hot_handoff", + "incoming_total", + "outgoing_total", + "value_flow" + ], + "validation_status": "accepted_live_replay", + "validated_run_dir": "artifacts\\domain_runs\\agent_hot_value_flow_handoff_live5", + "saved_after_validated_replay": true, + "save_gate": { + "schema_version": "agent_semantic_save_gate_v1", + "validation_status": "accepted_live_replay", + "validated_run_dir": "artifacts\\domain_runs\\agent_hot_value_flow_handoff_live5", + "final_status": "accepted", + "review_overall_status": "pass", + "business_overall_status": "pass", + "steps_total": 3, + "steps_passed": 3, + "steps_failed": 0, + "steps_with_business_failures": 0, + "steps_with_business_warnings": 0, + "acceptance_gate_passed": true, + "saved_after_validated_replay": true + } + } + } +} diff --git a/llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260522195737_gen-ag05221957-713bbd.json b/llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260522195737_gen-ag05221957-713bbd.json new file mode 100644 index 0000000..10d317b --- /dev/null +++ b/llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260522195737_gen-ag05221957-713bbd.json @@ -0,0 +1,34 @@ +{ + "suite_id": "assistant_saved_session_gen-ag05221957-713bbd", + "suite_version": "0.1.0", + "schema_version": "assistant_saved_session_suite_v0_1", + "generated_at": "2026-05-22T19:57:37+00:00", + "generation_id": "gen-ag05221957-713bbd", + "mode": "saved_user_sessions", + "title": "AGENT | Hot value-flow discovery handoff", + "domain": "autonomy_hot_value_flow_handoff", + "scenario_count": 1, + "case_ids": [ + "SAVED-001" + ], + "cases": [ + { + "case_id": "SAVED-001", + "scenario_tag": "agent_saved_user_sessions", + "title": "AGENT | Hot value-flow discovery handoff", + "question_type": "followup", + "broadness_level": "medium", + "turns": [ + { + "user_message": "Сколько входящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?" + }, + { + "user_message": "Сколько исходящих денег за 2020 год по ООО Альтернатива Плюс без разреза по контрагентам?" + }, + { + "user_message": "А всего сколько денег пришло в ООО Альтернатива Плюс за 2020, без топов и без контрагентов?" + } + ] + } + ] +} diff --git a/scripts/domain_case_loop.py b/scripts/domain_case_loop.py index c71be67..6d7325b 100644 --- a/scripts/domain_case_loop.py +++ b/scripts/domain_case_loop.py @@ -2343,12 +2343,25 @@ def build_scenario_step_state( ), "mcp_discovery_route_candidate_next_action": debug.get("mcp_discovery_route_candidate_next_action"), "mcp_discovery_response_applied": debug.get("mcp_discovery_response_applied"), + "mcp_discovery_hot_runtime_wired": debug.get("mcp_discovery_hot_runtime_wired"), "mcp_discovery_selected_chain_id": debug.get("mcp_discovery_selected_chain_id"), + "mcp_discovery_execution_handoff_status": debug.get("mcp_discovery_execution_handoff_status"), + "mcp_discovery_execution_handoff_allowed_hot_chain": debug.get( + "mcp_discovery_execution_handoff_allowed_hot_chain" + ), + "mcp_discovery_execution_handoff_can_use_guarded_response": debug.get( + "mcp_discovery_execution_handoff_can_use_guarded_response" + ), "mcp_discovery_response_candidate_status": ( debug.get("mcp_discovery_response_candidate_v1", {}).get("candidate_status") if isinstance(debug.get("mcp_discovery_response_candidate_v1"), dict) else None ), + "mcp_discovery_response_candidate_hot_runtime_wired": ( + debug.get("mcp_discovery_response_candidate_v1", {}).get("hot_runtime_wired") + if isinstance(debug.get("mcp_discovery_response_candidate_v1"), dict) + else None + ), "mcp_discovery_response_reply_type": ( debug.get("mcp_discovery_response_candidate_v1", {}).get("reply_type") if isinstance(debug.get("mcp_discovery_response_candidate_v1"), dict) diff --git a/scripts/domain_truth_harness.py b/scripts/domain_truth_harness.py index b4a240a..0bdda66 100644 --- a/scripts/domain_truth_harness.py +++ b/scripts/domain_truth_harness.py @@ -28,6 +28,13 @@ TECHNICAL_QUESTION_FIELDS = ( "expected_catalog_alignment_status", "expected_catalog_chain_top_match", "expected_catalog_selected_matches_top", + "expected_mcp_discovery_response_applied", + "expected_mcp_discovery_selected_chain_id", + "expected_mcp_discovery_response_candidate_status", + "expected_mcp_discovery_candidate_hot_runtime_wired", + "expected_mcp_discovery_hot_runtime_wired", + "expected_mcp_discovery_execution_handoff_status", + "expected_mcp_discovery_execution_handoff_can_use_guarded_response", "expected_route_candidate_status", "expected_route_candidate_executable_now", "expected_route_candidate_missing_axes", @@ -103,6 +110,27 @@ def normalize_step_spec(index: int, raw_step: Any) -> dict[str, Any]: str(step.get("expected_catalog_chain_top_match") or "").strip() or None ) normalized_step["expected_catalog_selected_matches_top"] = step.get("expected_catalog_selected_matches_top") + normalized_step["expected_mcp_discovery_response_applied"] = step.get( + "expected_mcp_discovery_response_applied" + ) + normalized_step["expected_mcp_discovery_selected_chain_id"] = ( + str(step.get("expected_mcp_discovery_selected_chain_id") or "").strip() or None + ) + normalized_step["expected_mcp_discovery_response_candidate_status"] = ( + str(step.get("expected_mcp_discovery_response_candidate_status") or "").strip() or None + ) + normalized_step["expected_mcp_discovery_candidate_hot_runtime_wired"] = step.get( + "expected_mcp_discovery_candidate_hot_runtime_wired" + ) + normalized_step["expected_mcp_discovery_hot_runtime_wired"] = step.get( + "expected_mcp_discovery_hot_runtime_wired" + ) + normalized_step["expected_mcp_discovery_execution_handoff_status"] = ( + str(step.get("expected_mcp_discovery_execution_handoff_status") or "").strip() or None + ) + normalized_step["expected_mcp_discovery_execution_handoff_can_use_guarded_response"] = step.get( + "expected_mcp_discovery_execution_handoff_can_use_guarded_response" + ) normalized_step["expected_route_candidate_status"] = ( str(step.get("expected_route_candidate_status") or "").strip() or None ) @@ -486,6 +514,13 @@ def evaluate_truth_step( capability_id = str(step_state.get("capability_id") or "").strip() catalog_alignment_status = str(step_state.get("mcp_discovery_catalog_chain_alignment_status") or "").strip() catalog_chain_top_match = str(step_state.get("mcp_discovery_catalog_chain_top_match") or "").strip() + mcp_discovery_selected_chain_id = str(step_state.get("mcp_discovery_selected_chain_id") or "").strip() + mcp_discovery_response_candidate_status = str( + step_state.get("mcp_discovery_response_candidate_status") or "" + ).strip() + mcp_discovery_execution_handoff_status = str( + step_state.get("mcp_discovery_execution_handoff_status") or "" + ).strip() route_candidate_status = str(step_state.get("mcp_discovery_route_candidate_status") or "").strip() limited_reason_category = str(step_state.get("limited_reason_category") or "").strip() extracted_filters = ( @@ -569,6 +604,156 @@ def evaluate_truth_step( expected=expected_catalog_selected_matches_top, ) + expected_mcp_discovery_response_applied = normalize_optional_bool( + resolve_nested_placeholders( + step.get("expected_mcp_discovery_response_applied"), + step_results, + bindings, + runtime_bindings, + ) + ) + if expected_mcp_discovery_response_applied is not None: + actual_mcp_discovery_response_applied = step_state.get("mcp_discovery_response_applied") is True + if actual_mcp_discovery_response_applied != expected_mcp_discovery_response_applied: + append_finding( + findings, + step, + "wrong_mcp_discovery_response_applied", + "MCP discovery response replacement flag does not match the expected hot handoff behavior.", + actual=actual_mcp_discovery_response_applied, + expected=expected_mcp_discovery_response_applied, + ) + + expected_mcp_discovery_selected_chain_id = str( + resolve_nested_placeholders( + step.get("expected_mcp_discovery_selected_chain_id"), + step_results, + bindings, + runtime_bindings, + ) + or "" + ).strip() + if ( + expected_mcp_discovery_selected_chain_id + and mcp_discovery_selected_chain_id != expected_mcp_discovery_selected_chain_id + ): + append_finding( + findings, + step, + "wrong_mcp_discovery_selected_chain_id", + "MCP discovery selected chain does not match the expected autonomy chain for this step.", + actual=mcp_discovery_selected_chain_id or None, + expected=expected_mcp_discovery_selected_chain_id, + ) + + expected_mcp_discovery_response_candidate_status = str( + resolve_nested_placeholders( + step.get("expected_mcp_discovery_response_candidate_status"), + step_results, + bindings, + runtime_bindings, + ) + or "" + ).strip() + if ( + expected_mcp_discovery_response_candidate_status + and mcp_discovery_response_candidate_status != expected_mcp_discovery_response_candidate_status + ): + append_finding( + findings, + step, + "wrong_mcp_discovery_response_candidate_status", + "MCP discovery response candidate status does not match the expected guarded response readiness.", + actual=mcp_discovery_response_candidate_status or None, + expected=expected_mcp_discovery_response_candidate_status, + ) + + expected_mcp_discovery_candidate_hot_runtime_wired = normalize_optional_bool( + resolve_nested_placeholders( + step.get("expected_mcp_discovery_candidate_hot_runtime_wired"), + step_results, + bindings, + runtime_bindings, + ) + ) + if expected_mcp_discovery_candidate_hot_runtime_wired is not None: + actual_candidate_hot_runtime_wired = ( + step_state.get("mcp_discovery_response_candidate_hot_runtime_wired") is True + ) + if actual_candidate_hot_runtime_wired != expected_mcp_discovery_candidate_hot_runtime_wired: + append_finding( + findings, + step, + "wrong_mcp_discovery_candidate_hot_runtime_wired", + "MCP discovery response candidate hot-runtime flag does not match the expected guarded handoff.", + actual=actual_candidate_hot_runtime_wired, + expected=expected_mcp_discovery_candidate_hot_runtime_wired, + ) + + expected_mcp_discovery_hot_runtime_wired = normalize_optional_bool( + resolve_nested_placeholders( + step.get("expected_mcp_discovery_hot_runtime_wired"), + step_results, + bindings, + runtime_bindings, + ) + ) + if expected_mcp_discovery_hot_runtime_wired is not None: + actual_hot_runtime_wired = step_state.get("mcp_discovery_hot_runtime_wired") is True + if actual_hot_runtime_wired != expected_mcp_discovery_hot_runtime_wired: + append_finding( + findings, + step, + "wrong_mcp_discovery_hot_runtime_wired", + "Top-level MCP discovery hot-runtime flag does not match the expected guarded handoff.", + actual=actual_hot_runtime_wired, + expected=expected_mcp_discovery_hot_runtime_wired, + ) + + expected_mcp_discovery_execution_handoff_status = str( + resolve_nested_placeholders( + step.get("expected_mcp_discovery_execution_handoff_status"), + step_results, + bindings, + runtime_bindings, + ) + or "" + ).strip() + if ( + expected_mcp_discovery_execution_handoff_status + and mcp_discovery_execution_handoff_status != expected_mcp_discovery_execution_handoff_status + ): + append_finding( + findings, + step, + "wrong_mcp_discovery_execution_handoff_status", + "MCP discovery execution handoff status does not match the expected guarded response status.", + actual=mcp_discovery_execution_handoff_status or None, + expected=expected_mcp_discovery_execution_handoff_status, + ) + + expected_mcp_discovery_execution_handoff_can_use_guarded_response = normalize_optional_bool( + resolve_nested_placeholders( + step.get("expected_mcp_discovery_execution_handoff_can_use_guarded_response"), + step_results, + bindings, + runtime_bindings, + ) + ) + if expected_mcp_discovery_execution_handoff_can_use_guarded_response is not None: + actual_can_use_guarded_response = ( + step_state.get("mcp_discovery_execution_handoff_can_use_guarded_response") is True + ) + if actual_can_use_guarded_response != expected_mcp_discovery_execution_handoff_can_use_guarded_response: + append_finding( + findings, + step, + "wrong_mcp_discovery_execution_handoff_guarded_response", + "MCP discovery execution handoff guarded-response flag does not match the expected hot path.", + actual=actual_can_use_guarded_response, + expected=expected_mcp_discovery_execution_handoff_can_use_guarded_response, + ) + expected_route_candidate_status = str( resolve_nested_placeholders( step.get("expected_route_candidate_status"),