From a63742f0d6ebe1519a7ee6d75cb9dc051d1d0915 Mon Sep 17 00:00:00 2001 From: dctouch Date: Fri, 1 May 2026 15:38:06 +0300 Subject: [PATCH] =?UTF-8?q?Planner=20Autonomy:=20=D0=B2=D1=8B=D0=B2=D0=B5?= =?UTF-8?q?=D1=81=D1=82=D0=B8=20catalog-alignment=20=D0=B2=20replay=20arti?= =?UTF-8?q?facts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...anner_autonomy_consolidation_2026-05-01.md | 8 ++- .../11 - architecture_turnaround/README.md | 6 ++- scripts/domain_case_loop.py | 3 ++ scripts/domain_truth_harness.py | 3 ++ scripts/scenario_acceptance_policy.py | 6 +++ scripts/test_domain_case_loop_step_state.py | 54 +++++++++++++++++++ scripts/test_scenario_acceptance_policy.py | 12 +++++ 7 files changed, 89 insertions(+), 3 deletions(-) create mode 100644 scripts/test_domain_case_loop_step_state.py diff --git a/docs/ARCH/11 - architecture_turnaround/20 - planner_autonomy_consolidation_2026-05-01.md b/docs/ARCH/11 - architecture_turnaround/20 - planner_autonomy_consolidation_2026-05-01.md index 0c2c3de..d957983 100644 --- a/docs/ARCH/11 - architecture_turnaround/20 - planner_autonomy_consolidation_2026-05-01.md +++ b/docs/ARCH/11 - architecture_turnaround/20 - planner_autonomy_consolidation_2026-05-01.md @@ -121,6 +121,7 @@ The following consolidation step added catalog-level chain-template scoring: - `catalog_chain_template_alignment` now records whether the selected chain is the top catalog match, its rank, and whether it appeared in the catalog search results; runtime loop state and debug summary expose the same verdict. - planner reason codes now emit stable catalog-alignment telemetry for evaluated top-match, selected-equals-top, selected-lower-rank, selected-outside-match-set, and unscored selected-chain states. - `catalog_chain_template_alignment.alignment_status` now carries the same verdict as one enum-like field, and debug summary exposes it as `mcp_discovery_catalog_chain_alignment_status`. +- `domain_truth_harness` and `scenario_acceptance_policy` now carry the alignment status, top catalog match, and selected-matches-top flag into replay artifacts instead of leaving them buried in raw debug JSON. ## Why This Matters @@ -251,9 +252,14 @@ Latest validation after explicit catalog-alignment status propagation: - `npm.cmd run build`: passed - graphify rebuild: `5943 nodes`, `12915 edges`, `136 communities` +Latest validation after truth-harness catalog-alignment artifact surfacing: + +- Python replay-tooling tests: passed, `4 passed` +- graphify rebuild: `5946 nodes`, `12918 edges`, `136 communities` + ## Next Step -The next safe step is still to re-run live replay once the 1C side is actively polling the proxy. In parallel, local-only consolidation can continue by using `alignment_status`, alignment reason-code telemetry, and the representative guard to find remaining manual branches where selected chains diverge from reviewed catalog-fabric intent. +The next safe step is still to re-run live replay once the 1C side is actively polling the proxy. In parallel, local-only consolidation can continue by using `alignment_status`, alignment reason-code telemetry, truth-harness artifact surfacing, and the representative guard to find remaining manual branches where selected chains diverge from reviewed catalog-fabric intent. Recommended order: diff --git a/docs/ARCH/11 - architecture_turnaround/README.md b/docs/ARCH/11 - architecture_turnaround/README.md index 9cb1055..47dbf0b 100644 --- a/docs/ARCH/11 - architecture_turnaround/README.md +++ b/docs/ARCH/11 - architecture_turnaround/README.md @@ -84,6 +84,7 @@ It now documents a turnaround that is already operational in code, already mater - planner/runtime/debug surfaces now expose `catalog_chain_template_alignment`, so semantic replay can see whether selected chains match the catalog top match, fall back to a lower-ranked template, or bypass catalog search; - planner reason codes now also emit stable catalog-alignment telemetry, so automated replay review can filter top-match, lower-rank, outside-match, and unscored selected-chain states without hand-parsing debug JSON; - catalog-alignment now carries a single `alignment_status` verdict through planner/runtime/debug, making replay divergence detection explicit instead of reconstructing it from booleans; + - truth-harness and scenario acceptance artifacts now preserve catalog-alignment status/top-match fields, so AGENT replay review can spot planner-vs-catalog divergence directly in `truth_review.md` and `scenario_acceptance_matrix.json`; - explicit-counterparty incoming-vs-outgoing data-need graphs now select the reviewed `value_flow_comparison` chain instead of falling back to generic `value_flow`; - live map sync: [20 - planner_autonomy_consolidation_2026-05-01.md](./20%20-%20planner_autonomy_consolidation_2026-05-01.md) @@ -96,8 +97,8 @@ Current honest status: - open-world bounded-autonomy readiness: `~85%` - Post-F semantic integrity module progress: `~99%` operationally closed, with remaining risk now treated as next-slice discovery rather than an open blocker inside the closed slice - active inventory-stock breadth slice progress: `100%` for the declared scenario pack, not for arbitrary inventory questions -- Planner Autonomy Consolidation progress: `~87%` for the declared module, with catalog-fabric, value-flow arbitration, lifecycle bounded inference, broad-evaluation bridge, inventory catalog templates, inventory runtime-boundary honesty, exact inventory recipe bridging, unambiguous metadata-surface lane inference, catalog chain-template scoring, structured chain-match contract exposure, runtime/debug propagation, subject-aware bidirectional comparison arbitration, structured catalog-alignment verdicts, representative alignment regression guard, catalog-alignment reason-code telemetry, and explicit `alignment_status` propagation validated locally, but live replay for the new bridge is currently blocked by missing active 1C polling and broader unfamiliar 1C asks still need replay-backed growth -- graph snapshot after latest rebuild: `5943 nodes`, `12915 edges`, `136 communities` +- Planner Autonomy Consolidation progress: `~88%` for the declared module, with catalog-fabric, value-flow arbitration, lifecycle bounded inference, broad-evaluation bridge, inventory catalog templates, inventory runtime-boundary honesty, exact inventory recipe bridging, unambiguous metadata-surface lane inference, catalog chain-template scoring, structured chain-match contract exposure, runtime/debug propagation, subject-aware bidirectional comparison arbitration, structured catalog-alignment verdicts, representative alignment regression guard, catalog-alignment reason-code telemetry, explicit `alignment_status` propagation, and truth-harness/acceptance-matrix surfacing validated locally, but live replay for the new bridge is currently blocked by missing active 1C polling and broader unfamiliar 1C asks still need replay-backed growth +- graph snapshot after latest rebuild: `5946 nodes`, `12918 edges`, `136 communities` - current breakpoint: - the validated hot paths are no longer structurally broken; - flagship continuity collapse is no longer the primary risk; @@ -150,6 +151,7 @@ Latest live proof now includes: - representative catalog-alignment regression guard accepted locally: planner slice passed `37/37`; full MCP-discovery slice passed `283/283` with `9` skipped; build passed; graphify rebuilt to `5942 nodes`, `12912 edges`, `140 communities` - catalog-alignment reason-code telemetry accepted locally: planner/runtime slice passed `53/53`; full MCP-discovery suite passed `283/283` with `9` skipped; build passed; graphify rebuilt to `5943 nodes`, `12915 edges`, `136 communities` - catalog-alignment status verdict accepted locally: planner/runtime/debug slice passed `55/55`; full MCP-discovery suite passed `283/283` with `9` skipped; build passed; graphify rebuilt to `5943 nodes`, `12915 edges`, `136 communities` +- catalog-alignment replay artifact surfacing accepted locally: Python truth-harness/acceptance tests passed `4/4`; graphify rebuilt to `5946 nodes`, `12918 edges`, `136 communities` Current architectural reading: diff --git a/scripts/domain_case_loop.py b/scripts/domain_case_loop.py index 774fb1f..cc082a8 100644 --- a/scripts/domain_case_loop.py +++ b/scripts/domain_case_loop.py @@ -1727,6 +1727,9 @@ def build_scenario_step_state( "selected_recipe": debug.get("selected_recipe"), "capability_id": debug.get("capability_id"), "capability_route_mode": debug.get("capability_route_mode"), + "mcp_discovery_catalog_chain_alignment_status": debug.get("mcp_discovery_catalog_chain_alignment_status"), + "mcp_discovery_catalog_chain_top_match": debug.get("mcp_discovery_catalog_chain_top_match"), + "mcp_discovery_catalog_chain_selected_matches_top": debug.get("mcp_discovery_catalog_chain_selected_matches_top"), "route_expectation_status": debug.get("route_expectation_status"), "result_mode": debug.get("result_mode"), "response_type": debug.get("response_type"), diff --git a/scripts/domain_truth_harness.py b/scripts/domain_truth_harness.py index 2825b5d..2315f0b 100644 --- a/scripts/domain_truth_harness.py +++ b/scripts/domain_truth_harness.py @@ -679,6 +679,9 @@ def build_truth_review_markdown(spec: dict[str, Any], scenario_state: dict[str, f"intent: `{step_state.get('detected_intent') or 'n/a'}`", f"recipe: `{step_state.get('selected_recipe') or 'n/a'}`", f"capability: `{step_state.get('capability_id') or 'n/a'}`", + f"catalog_alignment_status: `{step_state.get('mcp_discovery_catalog_chain_alignment_status') or 'n/a'}`", + f"catalog_top_match: `{step_state.get('mcp_discovery_catalog_chain_top_match') or 'n/a'}`", + f"catalog_selected_matches_top: `{step_state.get('mcp_discovery_catalog_chain_selected_matches_top')}`", f"limited_reason_category: `{step_state.get('limited_reason_category') or 'n/a'}`", f"filters: `{dump_json(step_state.get('extracted_filters') or {})}`", f"direct_answer: {step_state.get('actual_direct_answer') or 'n/a'}", diff --git a/scripts/scenario_acceptance_policy.py b/scripts/scenario_acceptance_policy.py index 69c519d..6cbf1da 100644 --- a/scripts/scenario_acceptance_policy.py +++ b/scripts/scenario_acceptance_policy.py @@ -198,6 +198,9 @@ def build_scenario_acceptance_matrix( "reply_type": step_state.get("reply_type"), "detected_intent": step_state.get("detected_intent"), "capability_id": step_state.get("capability_id"), + "mcp_discovery_catalog_chain_alignment_status": step_state.get("mcp_discovery_catalog_chain_alignment_status"), + "mcp_discovery_catalog_chain_top_match": step_state.get("mcp_discovery_catalog_chain_top_match"), + "mcp_discovery_catalog_chain_selected_matches_top": step_state.get("mcp_discovery_catalog_chain_selected_matches_top"), "selected_object_step": _has_selected_object_signal(step), "meta_context_step": _has_meta_context_signal(step), "highest_unresolved_priority": highest_priority, @@ -330,6 +333,9 @@ def build_scenario_acceptance_matrix_markdown(acceptance_matrix: dict[str, Any]) f" review_status: `{row.get('review_status')}`", f" criticality: `{row.get('criticality')}`", f" semantic_tags: {', '.join(row.get('semantic_tags') or []) or 'none'}", + f" catalog_alignment_status: `{row.get('mcp_discovery_catalog_chain_alignment_status') or 'n/a'}`", + f" catalog_top_match: `{row.get('mcp_discovery_catalog_chain_top_match') or 'n/a'}`", + f" catalog_selected_matches_top: `{row.get('mcp_discovery_catalog_chain_selected_matches_top')}`", f" highest_unresolved_priority: `{row.get('highest_unresolved_priority')}`", f" selected_object_step: `{row.get('selected_object_step')}`", f" meta_context_step: `{row.get('meta_context_step')}`", diff --git a/scripts/test_domain_case_loop_step_state.py b/scripts/test_domain_case_loop_step_state.py new file mode 100644 index 0000000..d2ed7a2 --- /dev/null +++ b/scripts/test_domain_case_loop_step_state.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +import sys +import unittest +from pathlib import Path + + +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +import domain_case_loop as dcl + + +class DomainCaseLoopStepStateTests(unittest.TestCase): + def test_preserves_mcp_catalog_alignment_debug_fields(self) -> None: + step_state = dcl.build_scenario_step_state( + scenario_id="planner_alignment_demo", + domain="planner_autonomy", + step={ + "step_id": "step_01", + "title": "Alignment visibility", + "depends_on": [], + "question_template": "show planner alignment", + }, + step_index=1, + question_resolved="show planner alignment", + analysis_context={}, + turn_artifact={ + "assistant_message": { + "reply_type": "factual", + "text": "Confirmed answer", + "message_id": "msg-1", + "trace_id": "trace-1", + }, + "technical_debug_payload": { + "detected_mode": "address_query", + "detected_intent": "counterparty_turnover", + "selected_recipe": "counterparty_turnover_by_period", + "capability_id": "confirmed_counterparty_turnover", + "mcp_discovery_catalog_chain_alignment_status": "selected_matches_top", + "mcp_discovery_catalog_chain_top_match": "value_flow", + "mcp_discovery_catalog_chain_selected_matches_top": True, + }, + "session_summary": {}, + }, + entries=[], + ) + + self.assertEqual(step_state["mcp_discovery_catalog_chain_alignment_status"], "selected_matches_top") + self.assertEqual(step_state["mcp_discovery_catalog_chain_top_match"], "value_flow") + self.assertTrue(step_state["mcp_discovery_catalog_chain_selected_matches_top"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/scripts/test_scenario_acceptance_policy.py b/scripts/test_scenario_acceptance_policy.py index 63a9d52..0cb5cb4 100644 --- a/scripts/test_scenario_acceptance_policy.py +++ b/scripts/test_scenario_acceptance_policy.py @@ -84,6 +84,9 @@ class ScenarioAcceptancePolicyTests(unittest.TestCase): "reply_type": "factual", "detected_intent": "inventory_on_hand_as_of_date", "capability_id": "confirmed_inventory_on_hand_as_of_date", + "mcp_discovery_catalog_chain_alignment_status": "selected_matches_top", + "mcp_discovery_catalog_chain_top_match": "inventory_stock_snapshot", + "mcp_discovery_catalog_chain_selected_matches_top": True, "review_findings": [], } }, @@ -104,6 +107,15 @@ class ScenarioAcceptancePolicyTests(unittest.TestCase): self.assertTrue(pack_state["acceptance_gate_passed"]) self.assertTrue(pack_state["critical_path_green"]) self.assertTrue(all(pack_state["invariants"].values())) + self.assertEqual( + acceptance_matrix["rows"][0]["mcp_discovery_catalog_chain_alignment_status"], + "selected_matches_top", + ) + self.assertEqual( + acceptance_matrix["rows"][0]["mcp_discovery_catalog_chain_top_match"], + "inventory_stock_snapshot", + ) + self.assertTrue(acceptance_matrix["rows"][0]["mcp_discovery_catalog_chain_selected_matches_top"]) def test_flags_meta_context_integrity_when_meta_step_leaks_technical_answer_shape(self) -> None: spec = {