From f7edf6aacb9715bbfcafc2f0c7a1bc7a53ff7d30 Mon Sep 17 00:00:00 2001 From: dctouch Date: Fri, 17 Apr 2026 12:17:47 +0300 Subject: [PATCH] =?UTF-8?q?=D0=90=D0=A0=D0=A7=20=D0=90=D0=9F11=20-=20?= =?UTF-8?q?=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=B8=D1=82=D1=8C=20scenario=20a?= =?UTF-8?q?cceptance=20gate=20=D0=B4=D0=BB=D1=8F=20truth=20harness=20?= =?UTF-8?q?=D0=B8=20=D0=B7=D0=B0=D0=BA=D1=80=D1=8B=D1=82=D1=8C=20Phase=207?= =?UTF-8?q?=20=D0=B0=D0=B3=D0=B5=D0=BD=D1=82=D0=BD=D1=8B=D0=BC=20=D0=BF?= =?UTF-8?q?=D1=80=D0=BE=D0=B3=D0=BE=D0=BD=D0=BE=D0=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...th_harness_phase7_acceptance_gate_mix.json | 126 ++++++++ .../data/autorun_generators/history.json | 34 ++ ..._20260417091127_gen-ag04170911-ff51e1.json | 93 ++++++ ..._20260417091127_gen-ag04170911-ff51e1.json | 43 +++ ..._saved_session_runtime_job-Z-vWMI8lw_.json | 39 +++ ..._saved_session_runtime_job-otA7X9BRT5.json | 36 +++ scripts/domain_truth_harness.py | 35 +- scripts/scenario_acceptance_policy.py | 306 ++++++++++++++++++ scripts/test_scenario_acceptance_policy.py | 102 ++++++ 9 files changed, 811 insertions(+), 3 deletions(-) create mode 100644 docs/orchestration/address_truth_harness_phase7_acceptance_gate_mix.json create mode 100644 llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260417091127_gen-ag04170911-ff51e1.json create mode 100644 llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260417091127_gen-ag04170911-ff51e1.json create mode 100644 llm_normalizer/data/eval_cases/assistant_saved_session_runtime_job-Z-vWMI8lw_.json create mode 100644 llm_normalizer/data/eval_cases/assistant_saved_session_runtime_job-otA7X9BRT5.json create mode 100644 scripts/scenario_acceptance_policy.py create mode 100644 scripts/test_scenario_acceptance_policy.py diff --git a/docs/orchestration/address_truth_harness_phase7_acceptance_gate_mix.json b/docs/orchestration/address_truth_harness_phase7_acceptance_gate_mix.json new file mode 100644 index 0000000..78c5d74 --- /dev/null +++ b/docs/orchestration/address_truth_harness_phase7_acceptance_gate_mix.json @@ -0,0 +1,126 @@ +{ + "schema_version": "domain_truth_harness_spec_v1", + "scenario_id": "address_truth_harness_phase7_acceptance_gate_mix", + "domain": "address_phase7_acceptance_gate_mix", + "title": "Phase 7 acceptance replay for inventory root, selected-object continuity, and human meta boundaries", + "description": "Primary acceptance scenario-tree for turnaround 11: root inventory snapshot, selected-object supplier, selected-object documents, same-date restore, plus human meta and historical capability follow-ups.", + "bindings": {}, + "steps": [ + { + "step_id": "step_01_inventory_march_2021", + "title": "Inventory root snapshot at March 2021", + "criticality": "critical", + "question": "какие остатки на складе на март 2021", + "allowed_reply_types": [ + "factual" + ], + "expected_intents": [ + "inventory_on_hand_as_of_date" + ], + "required_filters": { + "as_of_date": "2021-03-31", + "period_from": "2021-03-01", + "period_to": "2021-03-31" + }, + "required_direct_answer_patterns_any": [ + "31\\.03\\.2021", + "(?i)на складе", + "(?i)столешница 600\\*3050\\*26 альмандин" + ] + }, + { + "step_id": "step_02_selected_item_supplier", + "title": "Selected-object supplier provenance", + "criticality": "critical", + "question": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": кто нам это поставил?", + "allowed_reply_types": [ + "factual" + ], + "expected_intents": [ + "inventory_purchase_provenance_for_item" + ], + "required_direct_answer_patterns_any": [ + "(?i)столешница 600\\*3050\\*26 альмандин", + "(?i)поставщик|поставил|куплен", + "(?i)союз|торговый дом" + ], + "forbidden_direct_answer_patterns": [ + "(?i)^на 31\\.03\\.2021 на складе", + "(?i)^сейчас не дам прямой адресный ответ" + ] + }, + { + "step_id": "step_03_selected_item_documents", + "title": "Selected-object documents stay in the same contour", + "criticality": "critical", + "question": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": покажи документы по этой позиции", + "allowed_reply_types": [ + "factual" + ], + "expected_intents": [ + "inventory_purchase_documents_for_item" + ], + "required_direct_answer_patterns_any": [ + "(?i)столешница 600\\*3050\\*26 альмандин", + "(?i)документ", + "(?i)союз|торговый дом" + ] + }, + { + "step_id": "step_04_inventory_same_date_restore", + "title": "Same-date restore returns to the March root snapshot", + "criticality": "critical", + "question": "покажи еще раз остатки на эту же дату", + "allowed_reply_types": [ + "factual" + ], + "expected_intents": [ + "inventory_on_hand_as_of_date" + ], + "required_filters": { + "as_of_date": "2021-03-31", + "period_from": "2021-03-01", + "period_to": "2021-03-31" + }, + "required_direct_answer_patterns_any": [ + "31\\.03\\.2021", + "(?i)на складе" + ], + "forbidden_direct_answer_patterns": [ + "(?i)^сейчас не дам прямой адресный ответ", + "(?i)transition_not_supported_by_capability" + ] + }, + { + "step_id": "step_05_data_scope_meta_interrupt", + "title": "Data-scope meta question remains human and non-technical", + "criticality": "warning", + "question": "по какой компании мы сейчас работаем?", + "required_answer_patterns_any": [ + "(?i)компан|организац|контур", + "(?i)работ" + ], + "forbidden_answer_patterns": [ + "(?i)tool_gate_reason", + "(?i)hard_meta_mode", + "(?i)living_reason" + ] + }, + { + "step_id": "step_06_historical_capability_followup", + "title": "Historical capability follow-up stays human", + "criticality": "warning", + "question": "а исторические остатки тоже можешь?", + "required_answer_patterns_any": [ + "(?i)историческ|история", + "(?i)могу|умею" + ], + "forbidden_answer_patterns": [ + "(?i)^сейчас не дам прямой адресный ответ", + "(?i)^в текущем адресном контуре этот запрос лучше не закрывать в лоб", + "(?i)tool_gate_reason", + "(?i)hard_meta_mode" + ] + } + ] +} diff --git a/llm_normalizer/data/autorun_generators/history.json b/llm_normalizer/data/autorun_generators/history.json index a4af91c..b19ab8a 100644 --- a/llm_normalizer/data/autorun_generators/history.json +++ b/llm_normalizer/data/autorun_generators/history.json @@ -1,4 +1,38 @@ [ + { + "generation_id": "gen-ag04170911-ff51e1", + "created_at": "2026-04-17T09:11:27+00:00", + "mode": "saved_user_sessions", + "title": "AGENT | Phase 7 acceptance replay for inventory root, selected-object continuity, and human meta boundaries", + "count": 6, + "domain": "address_phase7_acceptance_gate_mix", + "questions": [ + "какие остатки на складе на март 2021", + "По выбранному объекту \"Столешница 600*3050*26 альмандин\": кто нам это поставил?", + "По выбранному объекту \"Столешница 600*3050*26 альмандин\": покажи документы по этой позиции", + "покажи еще раз остатки на эту же дату", + "по какой компании мы сейчас работаем?", + "а исторические остатки тоже можешь?" + ], + "generated_by": "codex_agent", + "saved_case_set_file": "assistant_autogen_saved_user_sessions_20260417091127_gen-ag04170911-ff51e1.json", + "context": { + "llm_provider": null, + "model": null, + "assistant_prompt_version": null, + "decomposition_prompt_version": null, + "prompt_fingerprint": null, + "autogen_personality_id": null, + "autogen_personality_prompt": null, + "source_session_id": null, + "saved_session_file": "assistant_saved_session_20260417091127_gen-ag04170911-ff51e1.json", + "saved_case_set_kind": "agent_semantic_scenario", + "agent_run": true, + "agent_focus": "scenario acceptance gate over root selected-object restore and human meta", + "architecture_phase": "turnaround_11_phase7", + "source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\address_truth_harness_phase7_acceptance_gate_mix.json" + } + }, { "generation_id": "gen-ag04170855-d13dd3", "created_at": "2026-04-17T08:55:50+00:00", diff --git a/llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260417091127_gen-ag04170911-ff51e1.json b/llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260417091127_gen-ag04170911-ff51e1.json new file mode 100644 index 0000000..c9b6897 --- /dev/null +++ b/llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260417091127_gen-ag04170911-ff51e1.json @@ -0,0 +1,93 @@ +{ + "saved_at": "2026-04-17T09:11:27+00:00", + "generation_id": "gen-ag04170911-ff51e1", + "mode": "saved_user_sessions", + "title": "AGENT | Phase 7 acceptance replay for inventory root, selected-object continuity, and human meta boundaries", + "agent_run": true, + "questions": [ + "какие остатки на складе на март 2021", + "По выбранному объекту \"Столешница 600*3050*26 альмандин\": кто нам это поставил?", + "По выбранному объекту \"Столешница 600*3050*26 альмандин\": покажи документы по этой позиции", + "покажи еще раз остатки на эту же дату", + "по какой компании мы сейчас работаем?", + "а исторические остатки тоже можешь?" + ], + "metadata": { + "assistant_prompt_version": null, + "decomposition_prompt_version": null, + "prompt_fingerprint": null, + "agent_focus": "scenario acceptance gate over root selected-object restore and human meta", + "architecture_phase": "turnaround_11_phase7", + "source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\address_truth_harness_phase7_acceptance_gate_mix.json" + }, + "source_session_id": null, + "session": { + "session_id": null, + "mode": "agent_semantic_run", + "items": [ + { + "message_id": "agent-user-001", + "role": "user", + "text": "какие остатки на складе на март 2021", + "created_at": "2026-04-17T09:11:27+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-002", + "role": "user", + "text": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": кто нам это поставил?", + "created_at": "2026-04-17T09:11:27+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-003", + "role": "user", + "text": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": покажи документы по этой позиции", + "created_at": "2026-04-17T09:11:27+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-004", + "role": "user", + "text": "покажи еще раз остатки на эту же дату", + "created_at": "2026-04-17T09:11:27+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-005", + "role": "user", + "text": "по какой компании мы сейчас работаем?", + "created_at": "2026-04-17T09:11:27+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-006", + "role": "user", + "text": "а исторические остатки тоже можешь?", + "created_at": "2026-04-17T09:11:27+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + } + ], + "agent_run": true, + "metadata": { + "assistant_prompt_version": null, + "decomposition_prompt_version": null, + "prompt_fingerprint": null, + "agent_focus": "scenario acceptance gate over root selected-object restore and human meta", + "architecture_phase": "turnaround_11_phase7", + "source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\address_truth_harness_phase7_acceptance_gate_mix.json" + } + } +} diff --git a/llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260417091127_gen-ag04170911-ff51e1.json b/llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260417091127_gen-ag04170911-ff51e1.json new file mode 100644 index 0000000..92f8418 --- /dev/null +++ b/llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260417091127_gen-ag04170911-ff51e1.json @@ -0,0 +1,43 @@ +{ + "suite_id": "assistant_saved_session_gen-ag04170911-ff51e1", + "suite_version": "0.1.0", + "schema_version": "assistant_saved_session_suite_v0_1", + "generated_at": "2026-04-17T09:11:27+00:00", + "generation_id": "gen-ag04170911-ff51e1", + "mode": "saved_user_sessions", + "title": "AGENT | Phase 7 acceptance replay for inventory root, selected-object continuity, and human meta boundaries", + "domain": "address_phase7_acceptance_gate_mix", + "scenario_count": 1, + "case_ids": [ + "SAVED-001" + ], + "cases": [ + { + "case_id": "SAVED-001", + "scenario_tag": "agent_saved_user_sessions", + "title": "AGENT | Phase 7 acceptance replay for inventory root, selected-object continuity, and human meta boundaries", + "question_type": "followup", + "broadness_level": "medium", + "turns": [ + { + "user_message": "какие остатки на складе на март 2021" + }, + { + "user_message": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": кто нам это поставил?" + }, + { + "user_message": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": покажи документы по этой позиции" + }, + { + "user_message": "покажи еще раз остатки на эту же дату" + }, + { + "user_message": "по какой компании мы сейчас работаем?" + }, + { + "user_message": "а исторические остатки тоже можешь?" + } + ] + } + ] +} diff --git a/llm_normalizer/data/eval_cases/assistant_saved_session_runtime_job-Z-vWMI8lw_.json b/llm_normalizer/data/eval_cases/assistant_saved_session_runtime_job-Z-vWMI8lw_.json new file mode 100644 index 0000000..d6cba25 --- /dev/null +++ b/llm_normalizer/data/eval_cases/assistant_saved_session_runtime_job-Z-vWMI8lw_.json @@ -0,0 +1,39 @@ +{ + "suite_id": "assistant_saved_session_runtime_job-Z-vWMI8lw_", + "suite_version": "0.1.0", + "schema_version": "assistant_saved_session_runtime_v0_1", + "title": "AGENT | Phase 7 acceptance replay for inventory root, selected-object continuity, and human meta boundaries", + "scenario_count": 1, + "case_ids": [ + "SAVED-001" + ], + "cases": [ + { + "case_id": "SAVED-001", + "scenario_tag": "saved_user_sessions_runtime", + "title": "AGENT | Phase 7 acceptance replay for inventory root, selected-object continuity, and human meta boundaries", + "question_type": "followup", + "broadness_level": "medium", + "turns": [ + { + "user_message": "какие остатки на складе на март 2021" + }, + { + "user_message": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": кто нам это поставил?" + }, + { + "user_message": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": покажи документы по этой позиции" + }, + { + "user_message": "покажи еще раз остатки на эту же дату" + }, + { + "user_message": "по какой компании мы сейчас работаем?" + }, + { + "user_message": "а исторические остатки тоже можешь?" + } + ] + } + ] +} \ No newline at end of file diff --git a/llm_normalizer/data/eval_cases/assistant_saved_session_runtime_job-otA7X9BRT5.json b/llm_normalizer/data/eval_cases/assistant_saved_session_runtime_job-otA7X9BRT5.json new file mode 100644 index 0000000..94e0c85 --- /dev/null +++ b/llm_normalizer/data/eval_cases/assistant_saved_session_runtime_job-otA7X9BRT5.json @@ -0,0 +1,36 @@ +{ + "suite_id": "assistant_saved_session_runtime_job-otA7X9BRT5", + "suite_version": "0.1.0", + "schema_version": "assistant_saved_session_runtime_v0_1", + "title": "AGENT | Phase 6 provider/runtime replay across chat, meta, and address boundaries", + "scenario_count": 1, + "case_ids": [ + "SAVED-001" + ], + "cases": [ + { + "case_id": "SAVED-001", + "scenario_tag": "saved_user_sessions_runtime", + "title": "AGENT | Phase 6 provider/runtime replay across chat, meta, and address boundaries", + "question_type": "followup", + "broadness_level": "medium", + "turns": [ + { + "user_message": "привет, как дела?" + }, + { + "user_message": "по какой компании мы сейчас работаем?" + }, + { + "user_message": "что ты можешь по 1С?" + }, + { + "user_message": "какие остатки на складе на март 2021" + }, + { + "user_message": "а исторические остатки тоже можешь?" + } + ] + } + ] +} \ No newline at end of file diff --git a/scripts/domain_truth_harness.py b/scripts/domain_truth_harness.py index 6a1ef9a..54fca34 100644 --- a/scripts/domain_truth_harness.py +++ b/scripts/domain_truth_harness.py @@ -9,6 +9,7 @@ from types import SimpleNamespace from typing import Any import domain_case_loop as dcl +import scenario_acceptance_policy as sap REPO_ROOT = Path(__file__).resolve().parent.parent @@ -696,6 +697,21 @@ def build_truth_review_markdown(spec: dict[str, Any], scenario_state: dict[str, return "\n".join(lines).strip() + "\n" +def write_acceptance_artifacts( + output_dir: Path, + spec: dict[str, Any], + scenario_state: dict[str, Any], + review_summary: dict[str, Any], +) -> dict[str, Any]: + acceptance_matrix = sap.build_scenario_acceptance_matrix(spec, scenario_state, review_summary) + pack_state = sap.derive_truth_harness_pack_state(spec, scenario_state, review_summary, acceptance_matrix) + write_json(output_dir / "scenario_acceptance_matrix.json", acceptance_matrix) + write_text(output_dir / "scenario_acceptance_matrix.md", sap.build_scenario_acceptance_matrix_markdown(acceptance_matrix)) + write_json(output_dir / "pack_state.json", pack_state) + write_text(output_dir / "final_status.md", sap.build_truth_harness_final_status_markdown(pack_state)) + return {"acceptance_matrix": acceptance_matrix, "pack_state": pack_state} + + def save_step_bundle( *, step_dir: Path, @@ -845,7 +861,13 @@ def review_export(spec: dict[str, Any], export_path: Path, output_dir: Path) -> write_json(output_dir / "scenario_state.json", scenario_state) write_json(output_dir / "truth_review.json", {"summary": review_summary, "steps": scenario_state["step_outputs"]}) write_text(output_dir / "truth_review.md", review_markdown) - return {"scenario_state": scenario_state, "review_summary": review_summary} + acceptance_bundle = write_acceptance_artifacts(output_dir, spec, scenario_state, review_summary) + return { + "scenario_state": scenario_state, + "review_summary": review_summary, + "acceptance_matrix": acceptance_bundle["acceptance_matrix"], + "pack_state": acceptance_bundle["pack_state"], + } def run_live(spec: dict[str, Any], output_dir: Path, args: argparse.Namespace) -> dict[str, Any]: @@ -932,10 +954,15 @@ def run_live(spec: dict[str, Any], output_dir: Path, args: argparse.Namespace) - write_json(output_dir / "scenario_state.json", scenario_state) write_json(output_dir / "truth_review.json", {"summary": review_summary, "steps": scenario_state["step_outputs"]}) write_text(output_dir / "truth_review.md", review_markdown) - write_text(output_dir / "final_status.md", f"# Final status\n\n- status: `{review_summary['overall_status']}`\n") + acceptance_bundle = write_acceptance_artifacts(output_dir, spec, scenario_state, review_summary) print(f"[truth-harness] saved artifacts to {output_dir}") print(f"[truth-harness] overall_status={review_summary['overall_status']}") - return {"scenario_state": scenario_state, "review_summary": review_summary} + return { + "scenario_state": scenario_state, + "review_summary": review_summary, + "acceptance_matrix": acceptance_bundle["acceptance_matrix"], + "pack_state": acceptance_bundle["pack_state"], + } def build_bootstrap_spec(export_path: Path, scenario_id: str, domain: str, title: str | None) -> dict[str, Any]: @@ -994,6 +1021,7 @@ def handle_review_export(args: argparse.Namespace) -> int: ) result = review_export(spec, export_path, output_dir) print(f"[truth-harness] review-export overall_status={result['review_summary']['overall_status']}") + print(f"[truth-harness] review-export final_status={result['pack_state']['final_status']}") print(f"[truth-harness] artifacts={output_dir}") return 0 @@ -1006,6 +1034,7 @@ def handle_run_live(args: argparse.Namespace) -> int: ) result = run_live(spec, output_dir, args) print(f"[truth-harness] run-live overall_status={result['review_summary']['overall_status']}") + print(f"[truth-harness] run-live final_status={result['pack_state']['final_status']}") print(f"[truth-harness] artifacts={output_dir}") return 0 diff --git a/scripts/scenario_acceptance_policy.py b/scripts/scenario_acceptance_policy.py new file mode 100644 index 0000000..23c5b33 --- /dev/null +++ b/scripts/scenario_acceptance_policy.py @@ -0,0 +1,306 @@ +from __future__ import annotations + +from datetime import datetime, timezone +from typing import Any + + +SCENARIO_ACCEPTANCE_MATRIX_SCHEMA_VERSION = "scenario_acceptance_matrix_v1" +TRUTH_HARNESS_PACK_STATE_SCHEMA_VERSION = "truth_harness_pack_state_v1" + +SEVERITY_TO_PRIORITY = { + "critical": "P0", + "warning": "P1", + "info": "P2", +} +PRIORITY_RANK = {"P0": 0, "P1": 1, "P2": 2, "none": 3} + +SELECTED_OBJECT_INTENTS = { + "inventory_purchase_provenance_for_item", + "inventory_purchase_documents_for_item", + "inventory_sale_trace_for_item", + "inventory_profitability_for_item", + "inventory_purchase_to_sale_chain", +} + + +def _now_iso() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat() + + +def _normalize_step_outputs(scenario_state: dict[str, Any]) -> dict[str, dict[str, Any]]: + raw = scenario_state.get("step_outputs") + return raw if isinstance(raw, dict) else {} + + +def _normalize_findings(step_state: dict[str, Any]) -> list[dict[str, Any]]: + raw = step_state.get("review_findings") + return [item for item in raw if isinstance(item, dict)] if isinstance(raw, list) else [] + + +def _priority_from_finding(finding: dict[str, Any]) -> str: + severity = str(finding.get("severity") or "").strip().lower() + return SEVERITY_TO_PRIORITY.get(severity, "P2") + + +def _highest_priority(findings: list[dict[str, Any]]) -> str: + if not findings: + return "none" + priorities = [_priority_from_finding(item) for item in findings] + return sorted(priorities, key=lambda item: PRIORITY_RANK.get(item, 99))[0] + + +def _has_selected_object_signal(step: dict[str, Any]) -> bool: + question = str(step.get("question_template") or "").lower() + expected_intents = { + str(item).strip() + for item in (step.get("expected_intents") or []) + if str(item).strip() + } + if expected_intents & SELECTED_OBJECT_INTENTS: + return True + return any( + marker in question + for marker in ( + "выбранному объекту", + "по этой позиции", + "по ней", + "по нему", + "\"", + ) + ) + + +def _is_direct_answer_code(code: str) -> bool: + return code.startswith("required_direct_answer_") or code.startswith("forbidden_direct_answer_") + + +def _is_temporal_code(code: str) -> bool: + return ( + code.startswith("missing_filter:") + or code.startswith("wrong_filter:") + or code.startswith("forbidden_filter_key:") + or code.startswith("forbidden_filter_value:") + or code.startswith("period_carryover_") + or code.startswith("previous_step_missing:") + ) + + +def _is_truth_gate_code(code: str) -> bool: + return code in { + "unexpected_reply_type", + "unexpected_limited_reason_category", + "wrong_result_mode", + } + + +def _is_route_code(code: str) -> bool: + return code in {"wrong_intent", "wrong_capability", "wrong_recipe", "question_sequence_mismatch"} + + +def _is_human_answer_quality_code(code: str) -> bool: + return code in { + "required_answer_patterns_any_missing", + "required_answer_patterns_all_missing", + "forbidden_answer_pattern_hit", + } + + +def _derive_step_invariant_failures(step: dict[str, Any], findings: list[dict[str, Any]]) -> dict[str, bool]: + codes = [str(item.get("code") or "").strip() for item in findings] + selected_object_step = _has_selected_object_signal(step) + return { + "direct_answer": any(_is_direct_answer_code(code) for code in codes), + "temporal_honesty": any(_is_temporal_code(code) for code in codes), + "selected_object_continuity": selected_object_step and any(_is_route_code(code) for code in codes), + "truth_gate": any(_is_truth_gate_code(code) for code in codes), + "human_answer_quality": any(_is_human_answer_quality_code(code) for code in codes), + } + + +def build_scenario_acceptance_matrix( + spec: dict[str, Any], scenario_state: dict[str, Any], review_summary: dict[str, Any] +) -> dict[str, Any]: + step_outputs = _normalize_step_outputs(scenario_state) + rows: list[dict[str, Any]] = [] + unresolved_priority_counts = {"P0": 0, "P1": 0, "P2": 0} + invariant_failure_counts = { + "direct_answer": 0, + "temporal_honesty": 0, + "selected_object_continuity": 0, + "truth_gate": 0, + "human_answer_quality": 0, + } + + for index, step in enumerate(spec.get("steps") or [], start=1): + step_id = str(step.get("step_id") or "").strip() + step_state = step_outputs.get(step_id, {}) if step_id else {} + findings = _normalize_findings(step_state) + invariant_failures = _derive_step_invariant_failures(step, findings) + for invariant_name, failed in invariant_failures.items(): + if failed: + invariant_failure_counts[invariant_name] += 1 + highest_priority = _highest_priority(findings) + if highest_priority in unresolved_priority_counts: + unresolved_priority_counts[highest_priority] += len( + [item for item in findings if _priority_from_finding(item) == highest_priority] + ) + rows.append( + { + "index": index, + "step_id": step_id, + "title": step.get("title"), + "question": step.get("question_template"), + "criticality": str(step.get("criticality") or "critical"), + "review_status": str(step_state.get("review_status") or "unknown"), + "reply_type": step_state.get("reply_type"), + "detected_intent": step_state.get("detected_intent"), + "capability_id": step_state.get("capability_id"), + "selected_object_step": _has_selected_object_signal(step), + "highest_unresolved_priority": highest_priority, + "unresolved_findings_count": len(findings), + "invariant_failures": [name for name, failed in invariant_failures.items() if failed], + "findings": findings, + } + ) + + invariants = { + "direct_answer_ok": invariant_failure_counts["direct_answer"] == 0, + "temporal_honesty_ok": invariant_failure_counts["temporal_honesty"] == 0, + "selected_object_continuity_ok": invariant_failure_counts["selected_object_continuity"] == 0, + "truth_gate_ok": invariant_failure_counts["truth_gate"] == 0, + "human_answer_quality_ok": invariant_failure_counts["human_answer_quality"] == 0, + } + critical_rows = [row for row in rows if row["criticality"] == "critical"] + critical_path_green = bool(critical_rows) and all(row["review_status"] == "pass" for row in critical_rows) + + return { + "schema_version": SCENARIO_ACCEPTANCE_MATRIX_SCHEMA_VERSION, + "scenario_id": spec.get("scenario_id"), + "domain": spec.get("domain"), + "title": spec.get("title"), + "review_source": review_summary.get("review_source"), + "session_id": scenario_state.get("session_id"), + "rows": rows, + "summary": { + "steps_total": len(rows), + "critical_steps_total": len(critical_rows), + "critical_steps_passed": sum(1 for row in critical_rows if row["review_status"] == "pass"), + "critical_path_green": critical_path_green, + "unresolved_p0_count": unresolved_priority_counts["P0"], + "unresolved_p1_count": unresolved_priority_counts["P1"], + "unresolved_p2_count": unresolved_priority_counts["P2"], + "invariant_failure_counts": invariant_failure_counts, + "invariants": invariants, + }, + "updated_at": _now_iso(), + } + + +def derive_truth_harness_pack_state( + spec: dict[str, Any], + scenario_state: dict[str, Any], + review_summary: dict[str, Any], + acceptance_matrix: dict[str, Any], +) -> dict[str, Any]: + summary = acceptance_matrix.get("summary") if isinstance(acceptance_matrix.get("summary"), dict) else {} + invariants = summary.get("invariants") if isinstance(summary.get("invariants"), dict) else {} + unresolved_p0_count = int(summary.get("unresolved_p0_count") or 0) + review_overall_status = str(review_summary.get("overall_status") or "unknown") + step_outputs = _normalize_step_outputs(scenario_state) + + if not step_outputs: + final_status = "blocked" + final_status_reason = "no_step_outputs" + elif review_overall_status == "pass" and unresolved_p0_count == 0 and all(bool(value) for value in invariants.values()): + final_status = "accepted" + final_status_reason = "scenario_acceptance_gate_passed" + else: + final_status = "partial" + if unresolved_p0_count > 0: + final_status_reason = "unresolved_p0" + elif review_overall_status == "warning": + final_status_reason = "review_warning_remaining" + elif review_overall_status == "fail": + final_status_reason = "review_failures_remaining" + else: + final_status_reason = "acceptance_invariants_not_green" + + return { + "schema_version": TRUTH_HARNESS_PACK_STATE_SCHEMA_VERSION, + "pack_id": spec.get("scenario_id"), + "scenario_id": spec.get("scenario_id"), + "domain": spec.get("domain"), + "title": spec.get("title"), + "review_source": review_summary.get("review_source"), + "session_id": scenario_state.get("session_id"), + "steps_total": review_summary.get("steps_total"), + "steps_passed": review_summary.get("steps_passed"), + "steps_with_warning": review_summary.get("steps_with_warning"), + "steps_failed": review_summary.get("steps_failed"), + "review_overall_status": review_overall_status, + "execution_status": "exact" if review_overall_status == "pass" else "partial", + "final_status": final_status, + "final_status_reason": final_status_reason, + "acceptance_gate_passed": final_status == "accepted", + "no_unresolved_p0": unresolved_p0_count == 0, + "unresolved_p0_count": unresolved_p0_count, + "unresolved_p1_count": int(summary.get("unresolved_p1_count") or 0), + "unresolved_p2_count": int(summary.get("unresolved_p2_count") or 0), + "critical_path_green": bool(summary.get("critical_path_green")), + "invariants": invariants, + "updated_at": _now_iso(), + } + + +def build_scenario_acceptance_matrix_markdown(acceptance_matrix: dict[str, Any]) -> str: + summary = acceptance_matrix.get("summary") if isinstance(acceptance_matrix.get("summary"), dict) else {} + invariants = summary.get("invariants") if isinstance(summary.get("invariants"), dict) else {} + lines = [ + "# Scenario acceptance matrix", + "", + f"- scenario_id: `{acceptance_matrix.get('scenario_id') or 'n/a'}`", + f"- domain: `{acceptance_matrix.get('domain') or 'n/a'}`", + f"- title: {acceptance_matrix.get('title') or 'n/a'}", + f"- review_source: `{acceptance_matrix.get('review_source') or 'n/a'}`", + f"- session_id: `{acceptance_matrix.get('session_id') or 'n/a'}`", + f"- critical_path_green: `{summary.get('critical_path_green')}`", + f"- unresolved_p0_count: `{summary.get('unresolved_p0_count')}`", + f"- unresolved_p1_count: `{summary.get('unresolved_p1_count')}`", + f"- unresolved_p2_count: `{summary.get('unresolved_p2_count')}`", + "", + "## Acceptance invariants", + f"- direct_answer_ok: `{invariants.get('direct_answer_ok')}`", + f"- temporal_honesty_ok: `{invariants.get('temporal_honesty_ok')}`", + f"- selected_object_continuity_ok: `{invariants.get('selected_object_continuity_ok')}`", + f"- truth_gate_ok: `{invariants.get('truth_gate_ok')}`", + f"- human_answer_quality_ok: `{invariants.get('human_answer_quality_ok')}`", + "", + "## Steps", + ] + for row in acceptance_matrix.get("rows") or []: + lines.extend( + [ + f"- `{row.get('step_id')}`", + f" review_status: `{row.get('review_status')}`", + f" criticality: `{row.get('criticality')}`", + f" highest_unresolved_priority: `{row.get('highest_unresolved_priority')}`", + f" selected_object_step: `{row.get('selected_object_step')}`", + f" invariant_failures: {', '.join(row.get('invariant_failures') or []) or 'none'}", + ] + ) + return "\n".join(lines).strip() + "\n" + + +def build_truth_harness_final_status_markdown(pack_state: dict[str, Any]) -> str: + invariants = pack_state.get("invariants") if isinstance(pack_state.get("invariants"), dict) else {} + return ( + "# Final status\n\n" + f"- status: `{pack_state.get('final_status') or 'n/a'}`\n" + f"- reason: `{pack_state.get('final_status_reason') or 'n/a'}`\n" + f"- review_overall_status: `{pack_state.get('review_overall_status') or 'n/a'}`\n" + f"- no_unresolved_p0: `{pack_state.get('no_unresolved_p0')}`\n" + f"- direct_answer_ok: `{invariants.get('direct_answer_ok')}`\n" + f"- temporal_honesty_ok: `{invariants.get('temporal_honesty_ok')}`\n" + f"- selected_object_continuity_ok: `{invariants.get('selected_object_continuity_ok')}`\n" + f"- truth_gate_ok: `{invariants.get('truth_gate_ok')}`\n" + ) diff --git a/scripts/test_scenario_acceptance_policy.py b/scripts/test_scenario_acceptance_policy.py new file mode 100644 index 0000000..dcb684c --- /dev/null +++ b/scripts/test_scenario_acceptance_policy.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +import unittest + +import scenario_acceptance_policy as sap + + +class ScenarioAcceptancePolicyTests(unittest.TestCase): + def test_marks_partial_when_selected_object_and_temporal_p0_findings_exist(self) -> None: + spec = { + "scenario_id": "demo_phase7", + "domain": "inventory_demo", + "title": "Demo", + "steps": [ + { + "step_id": "step_01", + "title": "Selected object supplier", + "question_template": 'По выбранному объекту "Стол": кто поставил?', + "criticality": "critical", + "expected_intents": ["inventory_purchase_provenance_for_item"], + } + ], + } + scenario_state = { + "session_id": "asst-demo", + "step_outputs": { + "step_01": { + "review_status": "fail", + "reply_type": "factual", + "detected_intent": "inventory_on_hand_as_of_date", + "capability_id": "confirmed_inventory_on_hand_as_of_date", + "review_findings": [ + {"code": "wrong_intent", "severity": "critical"}, + {"code": "wrong_filter:as_of_date", "severity": "critical"}, + ], + } + }, + } + review_summary = { + "review_source": "live_strict_replay", + "overall_status": "fail", + "steps_total": 1, + "steps_passed": 0, + "steps_with_warning": 0, + "steps_failed": 1, + } + + acceptance_matrix = sap.build_scenario_acceptance_matrix(spec, scenario_state, review_summary) + pack_state = sap.derive_truth_harness_pack_state(spec, scenario_state, review_summary, acceptance_matrix) + + self.assertEqual(pack_state["final_status"], "partial") + self.assertFalse(pack_state["invariants"]["selected_object_continuity_ok"]) + self.assertFalse(pack_state["invariants"]["temporal_honesty_ok"]) + self.assertEqual(pack_state["unresolved_p0_count"], 2) + + def test_accepts_when_all_review_and_acceptance_invariants_are_green(self) -> None: + spec = { + "scenario_id": "demo_phase7_green", + "domain": "inventory_demo", + "title": "Demo green", + "steps": [ + { + "step_id": "step_01", + "title": "Inventory root", + "question_template": "какие остатки на складе на март 2021", + "criticality": "critical", + "expected_intents": ["inventory_on_hand_as_of_date"], + } + ], + } + scenario_state = { + "session_id": "asst-green", + "step_outputs": { + "step_01": { + "review_status": "pass", + "reply_type": "factual", + "detected_intent": "inventory_on_hand_as_of_date", + "capability_id": "confirmed_inventory_on_hand_as_of_date", + "review_findings": [], + } + }, + } + review_summary = { + "review_source": "live_strict_replay", + "overall_status": "pass", + "steps_total": 1, + "steps_passed": 1, + "steps_with_warning": 0, + "steps_failed": 0, + } + + acceptance_matrix = sap.build_scenario_acceptance_matrix(spec, scenario_state, review_summary) + pack_state = sap.derive_truth_harness_pack_state(spec, scenario_state, review_summary, acceptance_matrix) + + self.assertEqual(pack_state["final_status"], "accepted") + self.assertTrue(pack_state["acceptance_gate_passed"]) + self.assertTrue(pack_state["critical_path_green"]) + self.assertTrue(all(pack_state["invariants"].values())) + + +if __name__ == "__main__": + unittest.main()