АРЧ АП11 - Добавить scenario acceptance gate для truth harness и закрыть Phase 7 агентным прогоном
This commit is contained in:
parent
f5ff844105
commit
f7edf6aacb
|
|
@ -0,0 +1,126 @@
|
|||
{
|
||||
"schema_version": "domain_truth_harness_spec_v1",
|
||||
"scenario_id": "address_truth_harness_phase7_acceptance_gate_mix",
|
||||
"domain": "address_phase7_acceptance_gate_mix",
|
||||
"title": "Phase 7 acceptance replay for inventory root, selected-object continuity, and human meta boundaries",
|
||||
"description": "Primary acceptance scenario-tree for turnaround 11: root inventory snapshot, selected-object supplier, selected-object documents, same-date restore, plus human meta and historical capability follow-ups.",
|
||||
"bindings": {},
|
||||
"steps": [
|
||||
{
|
||||
"step_id": "step_01_inventory_march_2021",
|
||||
"title": "Inventory root snapshot at March 2021",
|
||||
"criticality": "critical",
|
||||
"question": "какие остатки на складе на март 2021",
|
||||
"allowed_reply_types": [
|
||||
"factual"
|
||||
],
|
||||
"expected_intents": [
|
||||
"inventory_on_hand_as_of_date"
|
||||
],
|
||||
"required_filters": {
|
||||
"as_of_date": "2021-03-31",
|
||||
"period_from": "2021-03-01",
|
||||
"period_to": "2021-03-31"
|
||||
},
|
||||
"required_direct_answer_patterns_any": [
|
||||
"31\\.03\\.2021",
|
||||
"(?i)на складе",
|
||||
"(?i)столешница 600\\*3050\\*26 альмандин"
|
||||
]
|
||||
},
|
||||
{
|
||||
"step_id": "step_02_selected_item_supplier",
|
||||
"title": "Selected-object supplier provenance",
|
||||
"criticality": "critical",
|
||||
"question": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": кто нам это поставил?",
|
||||
"allowed_reply_types": [
|
||||
"factual"
|
||||
],
|
||||
"expected_intents": [
|
||||
"inventory_purchase_provenance_for_item"
|
||||
],
|
||||
"required_direct_answer_patterns_any": [
|
||||
"(?i)столешница 600\\*3050\\*26 альмандин",
|
||||
"(?i)поставщик|поставил|куплен",
|
||||
"(?i)союз|торговый дом"
|
||||
],
|
||||
"forbidden_direct_answer_patterns": [
|
||||
"(?i)^на 31\\.03\\.2021 на складе",
|
||||
"(?i)^сейчас не дам прямой адресный ответ"
|
||||
]
|
||||
},
|
||||
{
|
||||
"step_id": "step_03_selected_item_documents",
|
||||
"title": "Selected-object documents stay in the same contour",
|
||||
"criticality": "critical",
|
||||
"question": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": покажи документы по этой позиции",
|
||||
"allowed_reply_types": [
|
||||
"factual"
|
||||
],
|
||||
"expected_intents": [
|
||||
"inventory_purchase_documents_for_item"
|
||||
],
|
||||
"required_direct_answer_patterns_any": [
|
||||
"(?i)столешница 600\\*3050\\*26 альмандин",
|
||||
"(?i)документ",
|
||||
"(?i)союз|торговый дом"
|
||||
]
|
||||
},
|
||||
{
|
||||
"step_id": "step_04_inventory_same_date_restore",
|
||||
"title": "Same-date restore returns to the March root snapshot",
|
||||
"criticality": "critical",
|
||||
"question": "покажи еще раз остатки на эту же дату",
|
||||
"allowed_reply_types": [
|
||||
"factual"
|
||||
],
|
||||
"expected_intents": [
|
||||
"inventory_on_hand_as_of_date"
|
||||
],
|
||||
"required_filters": {
|
||||
"as_of_date": "2021-03-31",
|
||||
"period_from": "2021-03-01",
|
||||
"period_to": "2021-03-31"
|
||||
},
|
||||
"required_direct_answer_patterns_any": [
|
||||
"31\\.03\\.2021",
|
||||
"(?i)на складе"
|
||||
],
|
||||
"forbidden_direct_answer_patterns": [
|
||||
"(?i)^сейчас не дам прямой адресный ответ",
|
||||
"(?i)transition_not_supported_by_capability"
|
||||
]
|
||||
},
|
||||
{
|
||||
"step_id": "step_05_data_scope_meta_interrupt",
|
||||
"title": "Data-scope meta question remains human and non-technical",
|
||||
"criticality": "warning",
|
||||
"question": "по какой компании мы сейчас работаем?",
|
||||
"required_answer_patterns_any": [
|
||||
"(?i)компан|организац|контур",
|
||||
"(?i)работ"
|
||||
],
|
||||
"forbidden_answer_patterns": [
|
||||
"(?i)tool_gate_reason",
|
||||
"(?i)hard_meta_mode",
|
||||
"(?i)living_reason"
|
||||
]
|
||||
},
|
||||
{
|
||||
"step_id": "step_06_historical_capability_followup",
|
||||
"title": "Historical capability follow-up stays human",
|
||||
"criticality": "warning",
|
||||
"question": "а исторические остатки тоже можешь?",
|
||||
"required_answer_patterns_any": [
|
||||
"(?i)историческ|история",
|
||||
"(?i)могу|умею"
|
||||
],
|
||||
"forbidden_answer_patterns": [
|
||||
"(?i)^сейчас не дам прямой адресный ответ",
|
||||
"(?i)^в текущем адресном контуре этот запрос лучше не закрывать в лоб",
|
||||
"(?i)tool_gate_reason",
|
||||
"(?i)hard_meta_mode"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,4 +1,38 @@
|
|||
[
|
||||
{
|
||||
"generation_id": "gen-ag04170911-ff51e1",
|
||||
"created_at": "2026-04-17T09:11:27+00:00",
|
||||
"mode": "saved_user_sessions",
|
||||
"title": "AGENT | Phase 7 acceptance replay for inventory root, selected-object continuity, and human meta boundaries",
|
||||
"count": 6,
|
||||
"domain": "address_phase7_acceptance_gate_mix",
|
||||
"questions": [
|
||||
"какие остатки на складе на март 2021",
|
||||
"По выбранному объекту \"Столешница 600*3050*26 альмандин\": кто нам это поставил?",
|
||||
"По выбранному объекту \"Столешница 600*3050*26 альмандин\": покажи документы по этой позиции",
|
||||
"покажи еще раз остатки на эту же дату",
|
||||
"по какой компании мы сейчас работаем?",
|
||||
"а исторические остатки тоже можешь?"
|
||||
],
|
||||
"generated_by": "codex_agent",
|
||||
"saved_case_set_file": "assistant_autogen_saved_user_sessions_20260417091127_gen-ag04170911-ff51e1.json",
|
||||
"context": {
|
||||
"llm_provider": null,
|
||||
"model": null,
|
||||
"assistant_prompt_version": null,
|
||||
"decomposition_prompt_version": null,
|
||||
"prompt_fingerprint": null,
|
||||
"autogen_personality_id": null,
|
||||
"autogen_personality_prompt": null,
|
||||
"source_session_id": null,
|
||||
"saved_session_file": "assistant_saved_session_20260417091127_gen-ag04170911-ff51e1.json",
|
||||
"saved_case_set_kind": "agent_semantic_scenario",
|
||||
"agent_run": true,
|
||||
"agent_focus": "scenario acceptance gate over root selected-object restore and human meta",
|
||||
"architecture_phase": "turnaround_11_phase7",
|
||||
"source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\address_truth_harness_phase7_acceptance_gate_mix.json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"generation_id": "gen-ag04170855-d13dd3",
|
||||
"created_at": "2026-04-17T08:55:50+00:00",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,93 @@
|
|||
{
|
||||
"saved_at": "2026-04-17T09:11:27+00:00",
|
||||
"generation_id": "gen-ag04170911-ff51e1",
|
||||
"mode": "saved_user_sessions",
|
||||
"title": "AGENT | Phase 7 acceptance replay for inventory root, selected-object continuity, and human meta boundaries",
|
||||
"agent_run": true,
|
||||
"questions": [
|
||||
"какие остатки на складе на март 2021",
|
||||
"По выбранному объекту \"Столешница 600*3050*26 альмандин\": кто нам это поставил?",
|
||||
"По выбранному объекту \"Столешница 600*3050*26 альмандин\": покажи документы по этой позиции",
|
||||
"покажи еще раз остатки на эту же дату",
|
||||
"по какой компании мы сейчас работаем?",
|
||||
"а исторические остатки тоже можешь?"
|
||||
],
|
||||
"metadata": {
|
||||
"assistant_prompt_version": null,
|
||||
"decomposition_prompt_version": null,
|
||||
"prompt_fingerprint": null,
|
||||
"agent_focus": "scenario acceptance gate over root selected-object restore and human meta",
|
||||
"architecture_phase": "turnaround_11_phase7",
|
||||
"source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\address_truth_harness_phase7_acceptance_gate_mix.json"
|
||||
},
|
||||
"source_session_id": null,
|
||||
"session": {
|
||||
"session_id": null,
|
||||
"mode": "agent_semantic_run",
|
||||
"items": [
|
||||
{
|
||||
"message_id": "agent-user-001",
|
||||
"role": "user",
|
||||
"text": "какие остатки на складе на март 2021",
|
||||
"created_at": "2026-04-17T09:11:27+00:00",
|
||||
"reply_type": null,
|
||||
"trace_id": null,
|
||||
"debug": null
|
||||
},
|
||||
{
|
||||
"message_id": "agent-user-002",
|
||||
"role": "user",
|
||||
"text": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": кто нам это поставил?",
|
||||
"created_at": "2026-04-17T09:11:27+00:00",
|
||||
"reply_type": null,
|
||||
"trace_id": null,
|
||||
"debug": null
|
||||
},
|
||||
{
|
||||
"message_id": "agent-user-003",
|
||||
"role": "user",
|
||||
"text": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": покажи документы по этой позиции",
|
||||
"created_at": "2026-04-17T09:11:27+00:00",
|
||||
"reply_type": null,
|
||||
"trace_id": null,
|
||||
"debug": null
|
||||
},
|
||||
{
|
||||
"message_id": "agent-user-004",
|
||||
"role": "user",
|
||||
"text": "покажи еще раз остатки на эту же дату",
|
||||
"created_at": "2026-04-17T09:11:27+00:00",
|
||||
"reply_type": null,
|
||||
"trace_id": null,
|
||||
"debug": null
|
||||
},
|
||||
{
|
||||
"message_id": "agent-user-005",
|
||||
"role": "user",
|
||||
"text": "по какой компании мы сейчас работаем?",
|
||||
"created_at": "2026-04-17T09:11:27+00:00",
|
||||
"reply_type": null,
|
||||
"trace_id": null,
|
||||
"debug": null
|
||||
},
|
||||
{
|
||||
"message_id": "agent-user-006",
|
||||
"role": "user",
|
||||
"text": "а исторические остатки тоже можешь?",
|
||||
"created_at": "2026-04-17T09:11:27+00:00",
|
||||
"reply_type": null,
|
||||
"trace_id": null,
|
||||
"debug": null
|
||||
}
|
||||
],
|
||||
"agent_run": true,
|
||||
"metadata": {
|
||||
"assistant_prompt_version": null,
|
||||
"decomposition_prompt_version": null,
|
||||
"prompt_fingerprint": null,
|
||||
"agent_focus": "scenario acceptance gate over root selected-object restore and human meta",
|
||||
"architecture_phase": "turnaround_11_phase7",
|
||||
"source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\address_truth_harness_phase7_acceptance_gate_mix.json"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
{
|
||||
"suite_id": "assistant_saved_session_gen-ag04170911-ff51e1",
|
||||
"suite_version": "0.1.0",
|
||||
"schema_version": "assistant_saved_session_suite_v0_1",
|
||||
"generated_at": "2026-04-17T09:11:27+00:00",
|
||||
"generation_id": "gen-ag04170911-ff51e1",
|
||||
"mode": "saved_user_sessions",
|
||||
"title": "AGENT | Phase 7 acceptance replay for inventory root, selected-object continuity, and human meta boundaries",
|
||||
"domain": "address_phase7_acceptance_gate_mix",
|
||||
"scenario_count": 1,
|
||||
"case_ids": [
|
||||
"SAVED-001"
|
||||
],
|
||||
"cases": [
|
||||
{
|
||||
"case_id": "SAVED-001",
|
||||
"scenario_tag": "agent_saved_user_sessions",
|
||||
"title": "AGENT | Phase 7 acceptance replay for inventory root, selected-object continuity, and human meta boundaries",
|
||||
"question_type": "followup",
|
||||
"broadness_level": "medium",
|
||||
"turns": [
|
||||
{
|
||||
"user_message": "какие остатки на складе на март 2021"
|
||||
},
|
||||
{
|
||||
"user_message": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": кто нам это поставил?"
|
||||
},
|
||||
{
|
||||
"user_message": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": покажи документы по этой позиции"
|
||||
},
|
||||
{
|
||||
"user_message": "покажи еще раз остатки на эту же дату"
|
||||
},
|
||||
{
|
||||
"user_message": "по какой компании мы сейчас работаем?"
|
||||
},
|
||||
{
|
||||
"user_message": "а исторические остатки тоже можешь?"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
{
|
||||
"suite_id": "assistant_saved_session_runtime_job-Z-vWMI8lw_",
|
||||
"suite_version": "0.1.0",
|
||||
"schema_version": "assistant_saved_session_runtime_v0_1",
|
||||
"title": "AGENT | Phase 7 acceptance replay for inventory root, selected-object continuity, and human meta boundaries",
|
||||
"scenario_count": 1,
|
||||
"case_ids": [
|
||||
"SAVED-001"
|
||||
],
|
||||
"cases": [
|
||||
{
|
||||
"case_id": "SAVED-001",
|
||||
"scenario_tag": "saved_user_sessions_runtime",
|
||||
"title": "AGENT | Phase 7 acceptance replay for inventory root, selected-object continuity, and human meta boundaries",
|
||||
"question_type": "followup",
|
||||
"broadness_level": "medium",
|
||||
"turns": [
|
||||
{
|
||||
"user_message": "какие остатки на складе на март 2021"
|
||||
},
|
||||
{
|
||||
"user_message": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": кто нам это поставил?"
|
||||
},
|
||||
{
|
||||
"user_message": "По выбранному объекту \"Столешница 600*3050*26 альмандин\": покажи документы по этой позиции"
|
||||
},
|
||||
{
|
||||
"user_message": "покажи еще раз остатки на эту же дату"
|
||||
},
|
||||
{
|
||||
"user_message": "по какой компании мы сейчас работаем?"
|
||||
},
|
||||
{
|
||||
"user_message": "а исторические остатки тоже можешь?"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
{
|
||||
"suite_id": "assistant_saved_session_runtime_job-otA7X9BRT5",
|
||||
"suite_version": "0.1.0",
|
||||
"schema_version": "assistant_saved_session_runtime_v0_1",
|
||||
"title": "AGENT | Phase 6 provider/runtime replay across chat, meta, and address boundaries",
|
||||
"scenario_count": 1,
|
||||
"case_ids": [
|
||||
"SAVED-001"
|
||||
],
|
||||
"cases": [
|
||||
{
|
||||
"case_id": "SAVED-001",
|
||||
"scenario_tag": "saved_user_sessions_runtime",
|
||||
"title": "AGENT | Phase 6 provider/runtime replay across chat, meta, and address boundaries",
|
||||
"question_type": "followup",
|
||||
"broadness_level": "medium",
|
||||
"turns": [
|
||||
{
|
||||
"user_message": "привет, как дела?"
|
||||
},
|
||||
{
|
||||
"user_message": "по какой компании мы сейчас работаем?"
|
||||
},
|
||||
{
|
||||
"user_message": "что ты можешь по 1С?"
|
||||
},
|
||||
{
|
||||
"user_message": "какие остатки на складе на март 2021"
|
||||
},
|
||||
{
|
||||
"user_message": "а исторические остатки тоже можешь?"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -9,6 +9,7 @@ from types import SimpleNamespace
|
|||
from typing import Any
|
||||
|
||||
import domain_case_loop as dcl
|
||||
import scenario_acceptance_policy as sap
|
||||
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||
|
|
@ -696,6 +697,21 @@ def build_truth_review_markdown(spec: dict[str, Any], scenario_state: dict[str,
|
|||
return "\n".join(lines).strip() + "\n"
|
||||
|
||||
|
||||
def write_acceptance_artifacts(
    output_dir: Path,
    spec: dict[str, Any],
    scenario_state: dict[str, Any],
    review_summary: dict[str, Any],
) -> dict[str, Any]:
    """Derive the acceptance matrix and pack state and persist them.

    Four files are written under ``output_dir``:
    ``scenario_acceptance_matrix.json``, ``scenario_acceptance_matrix.md``,
    ``pack_state.json`` and ``final_status.md``.

    Returns:
        A dict with the keys ``"acceptance_matrix"`` and ``"pack_state"``
        holding the two derived structures.
    """
    matrix = sap.build_scenario_acceptance_matrix(spec, scenario_state, review_summary)
    state = sap.derive_truth_harness_pack_state(spec, scenario_state, review_summary, matrix)

    # Matrix artifacts first, then the pack state and its human-readable status.
    matrix_json_path = output_dir / "scenario_acceptance_matrix.json"
    write_json(matrix_json_path, matrix)
    matrix_md_path = output_dir / "scenario_acceptance_matrix.md"
    write_text(matrix_md_path, sap.build_scenario_acceptance_matrix_markdown(matrix))

    pack_state_path = output_dir / "pack_state.json"
    write_json(pack_state_path, state)
    final_status_path = output_dir / "final_status.md"
    write_text(final_status_path, sap.build_truth_harness_final_status_markdown(state))

    bundle: dict[str, Any] = {}
    bundle["acceptance_matrix"] = matrix
    bundle["pack_state"] = state
    return bundle
|
||||
|
||||
|
||||
def save_step_bundle(
|
||||
*,
|
||||
step_dir: Path,
|
||||
|
|
@ -845,7 +861,13 @@ def review_export(spec: dict[str, Any], export_path: Path, output_dir: Path) ->
|
|||
write_json(output_dir / "scenario_state.json", scenario_state)
|
||||
write_json(output_dir / "truth_review.json", {"summary": review_summary, "steps": scenario_state["step_outputs"]})
|
||||
write_text(output_dir / "truth_review.md", review_markdown)
|
||||
return {"scenario_state": scenario_state, "review_summary": review_summary}
|
||||
acceptance_bundle = write_acceptance_artifacts(output_dir, spec, scenario_state, review_summary)
|
||||
return {
|
||||
"scenario_state": scenario_state,
|
||||
"review_summary": review_summary,
|
||||
"acceptance_matrix": acceptance_bundle["acceptance_matrix"],
|
||||
"pack_state": acceptance_bundle["pack_state"],
|
||||
}
|
||||
|
||||
|
||||
def run_live(spec: dict[str, Any], output_dir: Path, args: argparse.Namespace) -> dict[str, Any]:
|
||||
|
|
@ -932,10 +954,15 @@ def run_live(spec: dict[str, Any], output_dir: Path, args: argparse.Namespace) -
|
|||
write_json(output_dir / "scenario_state.json", scenario_state)
|
||||
write_json(output_dir / "truth_review.json", {"summary": review_summary, "steps": scenario_state["step_outputs"]})
|
||||
write_text(output_dir / "truth_review.md", review_markdown)
|
||||
write_text(output_dir / "final_status.md", f"# Final status\n\n- status: `{review_summary['overall_status']}`\n")
|
||||
acceptance_bundle = write_acceptance_artifacts(output_dir, spec, scenario_state, review_summary)
|
||||
print(f"[truth-harness] saved artifacts to {output_dir}")
|
||||
print(f"[truth-harness] overall_status={review_summary['overall_status']}")
|
||||
return {"scenario_state": scenario_state, "review_summary": review_summary}
|
||||
return {
|
||||
"scenario_state": scenario_state,
|
||||
"review_summary": review_summary,
|
||||
"acceptance_matrix": acceptance_bundle["acceptance_matrix"],
|
||||
"pack_state": acceptance_bundle["pack_state"],
|
||||
}
|
||||
|
||||
|
||||
def build_bootstrap_spec(export_path: Path, scenario_id: str, domain: str, title: str | None) -> dict[str, Any]:
|
||||
|
|
@ -994,6 +1021,7 @@ def handle_review_export(args: argparse.Namespace) -> int:
|
|||
)
|
||||
result = review_export(spec, export_path, output_dir)
|
||||
print(f"[truth-harness] review-export overall_status={result['review_summary']['overall_status']}")
|
||||
print(f"[truth-harness] review-export final_status={result['pack_state']['final_status']}")
|
||||
print(f"[truth-harness] artifacts={output_dir}")
|
||||
return 0
|
||||
|
||||
|
|
@ -1006,6 +1034,7 @@ def handle_run_live(args: argparse.Namespace) -> int:
|
|||
)
|
||||
result = run_live(spec, output_dir, args)
|
||||
print(f"[truth-harness] run-live overall_status={result['review_summary']['overall_status']}")
|
||||
print(f"[truth-harness] run-live final_status={result['pack_state']['final_status']}")
|
||||
print(f"[truth-harness] artifacts={output_dir}")
|
||||
return 0
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,306 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Schema identifiers stamped into the generated acceptance artifacts.
SCENARIO_ACCEPTANCE_MATRIX_SCHEMA_VERSION = "scenario_acceptance_matrix_v1"
TRUTH_HARNESS_PACK_STATE_SCHEMA_VERSION = "truth_harness_pack_state_v1"

# Review-finding severity -> acceptance priority bucket.
SEVERITY_TO_PRIORITY = dict(critical="P0", warning="P1", info="P2")

# Ordering used to pick the most urgent priority (lower rank = more urgent).
PRIORITY_RANK = {
    label: rank for rank, label in enumerate(("P0", "P1", "P2", "none"))
}

# Expected intents that mark a step as operating on a previously selected item.
SELECTED_OBJECT_INTENTS = {
    "inventory_purchase_provenance_for_item",
    "inventory_purchase_documents_for_item",
    "inventory_sale_trace_for_item",
    "inventory_profitability_for_item",
    "inventory_purchase_to_sale_chain",
}
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.now(timezone.utc).replace(microsecond=0).isoformat()
|
||||
|
||||
|
||||
def _normalize_step_outputs(scenario_state: dict[str, Any]) -> dict[str, dict[str, Any]]:
|
||||
raw = scenario_state.get("step_outputs")
|
||||
return raw if isinstance(raw, dict) else {}
|
||||
|
||||
|
||||
def _normalize_findings(step_state: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
raw = step_state.get("review_findings")
|
||||
return [item for item in raw if isinstance(item, dict)] if isinstance(raw, list) else []
|
||||
|
||||
|
||||
def _priority_from_finding(finding: dict[str, Any]) -> str:
    """Map a finding's ``severity`` to its priority label.

    The severity is stringified, trimmed and lower-cased before the lookup in
    ``SEVERITY_TO_PRIORITY``; missing or unrecognised severities map to "P2".
    """
    raw_severity = finding.get("severity") or ""
    normalized = str(raw_severity).strip().lower()
    if normalized in SEVERITY_TO_PRIORITY:
        return SEVERITY_TO_PRIORITY[normalized]
    return "P2"
|
||||
|
||||
|
||||
def _highest_priority(findings: list[dict[str, Any]]) -> str:
|
||||
if not findings:
|
||||
return "none"
|
||||
priorities = [_priority_from_finding(item) for item in findings]
|
||||
return sorted(priorities, key=lambda item: PRIORITY_RANK.get(item, 99))[0]
|
||||
|
||||
|
||||
def _has_selected_object_signal(step: dict[str, Any]) -> bool:
    """Tell whether a spec step targets a previously selected object.

    A step qualifies when any of its ``expected_intents`` is listed in
    ``SELECTED_OBJECT_INTENTS``, or when its question text (lower-cased)
    contains one of the textual markers below.

    The question text is read from ``question_template`` and, when that key is
    missing or empty, from ``question``. Without the fallback the marker check
    never sees the text of a spec that stores it under ``question``.
    NOTE(review): whether the harness renames ``question`` to
    ``question_template`` before calling this is not visible here — confirm.
    """
    expected_intents = {
        str(item).strip()
        for item in (step.get("expected_intents") or [])
        if str(item).strip()
    }
    if expected_intents & SELECTED_OBJECT_INTENTS:
        return True

    question = str(step.get("question_template") or step.get("question") or "").lower()
    markers = (
        "выбранному объекту",
        "по этой позиции",
        "по ней",
        "по нему",
        # Any double quote in the question counts as a marker.
        "\"",
    )
    return any(marker in question for marker in markers)
|
||||
|
||||
|
||||
def _is_direct_answer_code(code: str) -> bool:
|
||||
return code.startswith("required_direct_answer_") or code.startswith("forbidden_direct_answer_")
|
||||
|
||||
|
||||
def _is_temporal_code(code: str) -> bool:
|
||||
return (
|
||||
code.startswith("missing_filter:")
|
||||
or code.startswith("wrong_filter:")
|
||||
or code.startswith("forbidden_filter_key:")
|
||||
or code.startswith("forbidden_filter_value:")
|
||||
or code.startswith("period_carryover_")
|
||||
or code.startswith("previous_step_missing:")
|
||||
)
|
||||
|
||||
|
||||
def _is_truth_gate_code(code: str) -> bool:
|
||||
return code in {
|
||||
"unexpected_reply_type",
|
||||
"unexpected_limited_reason_category",
|
||||
"wrong_result_mode",
|
||||
}
|
||||
|
||||
|
||||
def _is_route_code(code: str) -> bool:
|
||||
return code in {"wrong_intent", "wrong_capability", "wrong_recipe", "question_sequence_mismatch"}
|
||||
|
||||
|
||||
def _is_human_answer_quality_code(code: str) -> bool:
|
||||
return code in {
|
||||
"required_answer_patterns_any_missing",
|
||||
"required_answer_patterns_all_missing",
|
||||
"forbidden_answer_pattern_hit",
|
||||
}
|
||||
|
||||
|
||||
def _derive_step_invariant_failures(step: dict[str, Any], findings: list[dict[str, Any]]) -> dict[str, bool]:
    """Classify a step's findings into the five acceptance invariants.

    Each value is True when at least one finding code violates that invariant.
    ``selected_object_continuity`` is only flagged for steps that
    ``_has_selected_object_signal`` recognises, and only by routing codes.
    """
    finding_codes = [str(entry.get("code") or "").strip() for entry in findings]
    is_selected_object_step = _has_selected_object_signal(step)

    def _any_code(predicate) -> bool:
        # True when the predicate accepts at least one of the step's codes.
        return any(predicate(code) for code in finding_codes)

    failures: dict[str, bool] = {}
    failures["direct_answer"] = _any_code(_is_direct_answer_code)
    failures["temporal_honesty"] = _any_code(_is_temporal_code)
    failures["selected_object_continuity"] = is_selected_object_step and _any_code(_is_route_code)
    failures["truth_gate"] = _any_code(_is_truth_gate_code)
    failures["human_answer_quality"] = _any_code(_is_human_answer_quality_code)
    return failures
|
||||
|
||||
|
||||
def build_scenario_acceptance_matrix(
    spec: dict[str, Any], scenario_state: dict[str, Any], review_summary: dict[str, Any]
) -> dict[str, Any]:
    """Build the per-step acceptance matrix for a reviewed scenario.

    For every step in ``spec["steps"]`` the matching entry of
    ``scenario_state["step_outputs"]`` (looked up by ``step_id``) is turned
    into a row holding its review status, routing fields, findings and the
    acceptance invariants those findings violate.

    The summary aggregates:
      * ``unresolved_p0/p1/p2_count`` - every finding is counted in the bucket
        of its own priority. Counting only the findings at a step's highest
        priority would hide P1/P2 findings that share a step with a more
        urgent one; P0 totals are the same under either rule.
      * ``invariant_failure_counts`` - number of steps violating each invariant.
      * ``invariants`` - ``*_ok`` flags, True when no step violates the invariant.
      * ``critical_path_green`` - True when there is at least one critical
        step and every critical step has ``review_status == "pass"``.

    Args:
        spec: Scenario spec; ``steps``, ``scenario_id``, ``domain`` and
            ``title`` are read.
        scenario_state: Run state; ``step_outputs`` and ``session_id`` are read.
        review_summary: Review summary; only ``review_source`` is read.

    Returns:
        The matrix dict (``schema_version``, identifying fields, ``rows``,
        ``summary``, ``updated_at``).
    """
    step_outputs = _normalize_step_outputs(scenario_state)
    rows: list[dict[str, Any]] = []
    unresolved_priority_counts = {"P0": 0, "P1": 0, "P2": 0}
    invariant_failure_counts = {
        "direct_answer": 0,
        "temporal_honesty": 0,
        "selected_object_continuity": 0,
        "truth_gate": 0,
        "human_answer_quality": 0,
    }

    for index, step in enumerate(spec.get("steps") or [], start=1):
        step_id = str(step.get("step_id") or "").strip()
        step_state = step_outputs.get(step_id, {}) if step_id else {}
        if not isinstance(step_state, dict):
            # A malformed step output is treated like a missing one.
            step_state = {}
        findings = _normalize_findings(step_state)

        invariant_failures = _derive_step_invariant_failures(step, findings)
        failed_invariants = [name for name, failed in invariant_failures.items() if failed]
        for invariant_name in failed_invariants:
            invariant_failure_counts[invariant_name] += 1

        # Count each finding under its own priority so lower-priority findings
        # are not masked by a more urgent finding in the same step.
        for finding in findings:
            priority = _priority_from_finding(finding)
            if priority in unresolved_priority_counts:
                unresolved_priority_counts[priority] += 1

        # Specs may store the question under "question" instead of
        # "question_template"; fall back so the row is not left without text.
        question = step.get("question_template")
        if not question:
            question = step.get("question", question)

        rows.append(
            {
                "index": index,
                "step_id": step_id,
                "title": step.get("title"),
                "question": question,
                "criticality": str(step.get("criticality") or "critical"),
                "review_status": str(step_state.get("review_status") or "unknown"),
                "reply_type": step_state.get("reply_type"),
                "detected_intent": step_state.get("detected_intent"),
                "capability_id": step_state.get("capability_id"),
                "selected_object_step": _has_selected_object_signal(step),
                "highest_unresolved_priority": _highest_priority(findings),
                "unresolved_findings_count": len(findings),
                "invariant_failures": failed_invariants,
                "findings": findings,
            }
        )

    invariants = {
        "direct_answer_ok": invariant_failure_counts["direct_answer"] == 0,
        "temporal_honesty_ok": invariant_failure_counts["temporal_honesty"] == 0,
        "selected_object_continuity_ok": invariant_failure_counts["selected_object_continuity"] == 0,
        "truth_gate_ok": invariant_failure_counts["truth_gate"] == 0,
        "human_answer_quality_ok": invariant_failure_counts["human_answer_quality"] == 0,
    }
    critical_rows = [row for row in rows if row["criticality"] == "critical"]
    critical_steps_passed = sum(1 for row in critical_rows if row["review_status"] == "pass")
    # An empty critical path is never green.
    critical_path_green = bool(critical_rows) and critical_steps_passed == len(critical_rows)

    return {
        "schema_version": SCENARIO_ACCEPTANCE_MATRIX_SCHEMA_VERSION,
        "scenario_id": spec.get("scenario_id"),
        "domain": spec.get("domain"),
        "title": spec.get("title"),
        "review_source": review_summary.get("review_source"),
        "session_id": scenario_state.get("session_id"),
        "rows": rows,
        "summary": {
            "steps_total": len(rows),
            "critical_steps_total": len(critical_rows),
            "critical_steps_passed": critical_steps_passed,
            "critical_path_green": critical_path_green,
            "unresolved_p0_count": unresolved_priority_counts["P0"],
            "unresolved_p1_count": unresolved_priority_counts["P1"],
            "unresolved_p2_count": unresolved_priority_counts["P2"],
            "invariant_failure_counts": invariant_failure_counts,
            "invariants": invariants,
        },
        "updated_at": _now_iso(),
    }
|
||||
|
||||
|
||||
def derive_truth_harness_pack_state(
    spec: dict[str, Any],
    scenario_state: dict[str, Any],
    review_summary: dict[str, Any],
    acceptance_matrix: dict[str, Any],
) -> dict[str, Any]:
    """Collapse the review summary and acceptance matrix into one pack verdict.

    ``final_status`` is:
      * "blocked" when the scenario state has no step outputs;
      * "accepted" when the review passed, no P0 finding is unresolved and
        every invariant flag in the matrix summary is truthy;
      * "partial" otherwise, with ``final_status_reason`` naming the first
        applicable cause (unresolved P0, review warning, review failure,
        invariants not green).
    """
    raw_summary = acceptance_matrix.get("summary")
    summary = raw_summary if isinstance(raw_summary, dict) else {}
    raw_invariants = summary.get("invariants")
    invariants = raw_invariants if isinstance(raw_invariants, dict) else {}

    p0_count = int(summary.get("unresolved_p0_count") or 0)
    review_status = str(review_summary.get("overall_status") or "unknown")
    has_step_outputs = bool(_normalize_step_outputs(scenario_state))
    invariants_green = all(bool(flag) for flag in invariants.values())

    if not has_step_outputs:
        verdict, reason = "blocked", "no_step_outputs"
    elif review_status == "pass" and p0_count == 0 and invariants_green:
        verdict, reason = "accepted", "scenario_acceptance_gate_passed"
    elif p0_count > 0:
        verdict, reason = "partial", "unresolved_p0"
    else:
        verdict = "partial"
        reason_by_review_status = {
            "warning": "review_warning_remaining",
            "fail": "review_failures_remaining",
        }
        reason = reason_by_review_status.get(review_status, "acceptance_invariants_not_green")

    pack_state: dict[str, Any] = {
        "schema_version": TRUTH_HARNESS_PACK_STATE_SCHEMA_VERSION,
        "pack_id": spec.get("scenario_id"),
        "scenario_id": spec.get("scenario_id"),
        "domain": spec.get("domain"),
        "title": spec.get("title"),
        "review_source": review_summary.get("review_source"),
        "session_id": scenario_state.get("session_id"),
    }
    # Step counters are copied from the review summary unchanged.
    for counter in ("steps_total", "steps_passed", "steps_with_warning", "steps_failed"):
        pack_state[counter] = review_summary.get(counter)
    pack_state.update(
        {
            "review_overall_status": review_status,
            "execution_status": "exact" if review_status == "pass" else "partial",
            "final_status": verdict,
            "final_status_reason": reason,
            "acceptance_gate_passed": verdict == "accepted",
            "no_unresolved_p0": p0_count == 0,
            "unresolved_p0_count": p0_count,
            "unresolved_p1_count": int(summary.get("unresolved_p1_count") or 0),
            "unresolved_p2_count": int(summary.get("unresolved_p2_count") or 0),
            "critical_path_green": bool(summary.get("critical_path_green")),
            "invariants": invariants,
            "updated_at": _now_iso(),
        }
    )
    return pack_state
|
||||
|
||||
|
||||
def build_scenario_acceptance_matrix_markdown(acceptance_matrix: dict[str, Any]) -> str:
    """Render the acceptance matrix as a Markdown report.

    The report has a header with identifying fields and unresolved counts, an
    "Acceptance invariants" section and a "Steps" section with one bullet per
    row. Missing identifying fields are shown as ``n/a``. The text always ends
    with exactly one newline.
    """
    raw_summary = acceptance_matrix.get("summary")
    summary = raw_summary if isinstance(raw_summary, dict) else {}
    raw_invariants = summary.get("invariants")
    invariants = raw_invariants if isinstance(raw_invariants, dict) else {}

    out = ["# Scenario acceptance matrix", ""]
    for key in ("scenario_id", "domain"):
        out.append(f"- {key}: `{acceptance_matrix.get(key) or 'n/a'}`")
    # The title is the only header field rendered without backticks.
    out.append(f"- title: {acceptance_matrix.get('title') or 'n/a'}")
    for key in ("review_source", "session_id"):
        out.append(f"- {key}: `{acceptance_matrix.get(key) or 'n/a'}`")
    for key in (
        "critical_path_green",
        "unresolved_p0_count",
        "unresolved_p1_count",
        "unresolved_p2_count",
    ):
        out.append(f"- {key}: `{summary.get(key)}`")

    out.append("")
    out.append("## Acceptance invariants")
    for key in (
        "direct_answer_ok",
        "temporal_honesty_ok",
        "selected_object_continuity_ok",
        "truth_gate_ok",
        "human_answer_quality_ok",
    ):
        out.append(f"- {key}: `{invariants.get(key)}`")

    out.append("")
    out.append("## Steps")
    for row in acceptance_matrix.get("rows") or []:
        failed_names = ", ".join(row.get("invariant_failures") or []) or "none"
        out.append(f"- `{row.get('step_id')}`")
        for key in (
            "review_status",
            "criticality",
            "highest_unresolved_priority",
            "selected_object_step",
        ):
            out.append(f" {key}: `{row.get(key)}`")
        out.append(f" invariant_failures: {failed_names}")

    return "\n".join(out).strip() + "\n"
|
||||
|
||||
|
||||
def build_truth_harness_final_status_markdown(pack_state: dict[str, Any]) -> str:
    """Render the pack state as the short ``final_status.md`` report.

    The report lists the final status, its reason, the review status, the
    no-unresolved-P0 flag and the acceptance invariant flags. The list of
    invariants matches the one printed by the acceptance-matrix Markdown,
    including ``human_answer_quality_ok``, so the report shows every
    invariant that report shows.

    Missing status fields are rendered as ``n/a``; missing invariant flags
    are rendered as ``None``.
    """
    raw_invariants = pack_state.get("invariants")
    invariants = raw_invariants if isinstance(raw_invariants, dict) else {}

    lines = [
        "# Final status",
        "",
        f"- status: `{pack_state.get('final_status') or 'n/a'}`",
        f"- reason: `{pack_state.get('final_status_reason') or 'n/a'}`",
        f"- review_overall_status: `{pack_state.get('review_overall_status') or 'n/a'}`",
        f"- no_unresolved_p0: `{pack_state.get('no_unresolved_p0')}`",
    ]
    for invariant_name in (
        "direct_answer_ok",
        "temporal_honesty_ok",
        "selected_object_continuity_ok",
        "truth_gate_ok",
        "human_answer_quality_ok",
    ):
        lines.append(f"- {invariant_name}: `{invariants.get(invariant_name)}`")
    return "\n".join(lines) + "\n"
|
||||
|
|
@ -0,0 +1,102 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import unittest
|
||||
|
||||
import scenario_acceptance_policy as sap
|
||||
|
||||
|
||||
class ScenarioAcceptancePolicyTests(unittest.TestCase):
    """End-to-end checks of the acceptance matrix -> pack state derivation."""

    @staticmethod
    def _derive_pack_state(spec: dict, scenario_state: dict, review_summary: dict) -> dict:
        # Build the matrix, then derive the pack state from it.
        matrix = sap.build_scenario_acceptance_matrix(spec, scenario_state, review_summary)
        return sap.derive_truth_harness_pack_state(spec, scenario_state, review_summary, matrix)

    def test_marks_partial_when_selected_object_and_temporal_p0_findings_exist(self) -> None:
        """A failed selected-object step with two critical findings yields "partial"."""
        supplier_step = {
            "step_id": "step_01",
            "title": "Selected object supplier",
            "question_template": 'По выбранному объекту "Стол": кто поставил?',
            "criticality": "critical",
            "expected_intents": ["inventory_purchase_provenance_for_item"],
        }
        spec = {
            "scenario_id": "demo_phase7",
            "domain": "inventory_demo",
            "title": "Demo",
            "steps": [supplier_step],
        }
        failed_output = {
            "review_status": "fail",
            "reply_type": "factual",
            "detected_intent": "inventory_on_hand_as_of_date",
            "capability_id": "confirmed_inventory_on_hand_as_of_date",
            "review_findings": [
                {"code": "wrong_intent", "severity": "critical"},
                {"code": "wrong_filter:as_of_date", "severity": "critical"},
            ],
        }
        scenario_state = {
            "session_id": "asst-demo",
            "step_outputs": {"step_01": failed_output},
        }
        review_summary = {
            "review_source": "live_strict_replay",
            "overall_status": "fail",
            "steps_total": 1,
            "steps_passed": 0,
            "steps_with_warning": 0,
            "steps_failed": 1,
        }

        pack_state = self._derive_pack_state(spec, scenario_state, review_summary)

        self.assertEqual(pack_state["final_status"], "partial")
        self.assertFalse(pack_state["invariants"]["selected_object_continuity_ok"])
        self.assertFalse(pack_state["invariants"]["temporal_honesty_ok"])
        self.assertEqual(pack_state["unresolved_p0_count"], 2)

    def test_accepts_when_all_review_and_acceptance_invariants_are_green(self) -> None:
        """A passing critical step without findings yields "accepted"."""
        inventory_step = {
            "step_id": "step_01",
            "title": "Inventory root",
            "question_template": "какие остатки на складе на март 2021",
            "criticality": "critical",
            "expected_intents": ["inventory_on_hand_as_of_date"],
        }
        spec = {
            "scenario_id": "demo_phase7_green",
            "domain": "inventory_demo",
            "title": "Demo green",
            "steps": [inventory_step],
        }
        passed_output = {
            "review_status": "pass",
            "reply_type": "factual",
            "detected_intent": "inventory_on_hand_as_of_date",
            "capability_id": "confirmed_inventory_on_hand_as_of_date",
            "review_findings": [],
        }
        scenario_state = {
            "session_id": "asst-green",
            "step_outputs": {"step_01": passed_output},
        }
        review_summary = {
            "review_source": "live_strict_replay",
            "overall_status": "pass",
            "steps_total": 1,
            "steps_passed": 1,
            "steps_with_warning": 0,
            "steps_failed": 0,
        }

        pack_state = self._derive_pack_state(spec, scenario_state, review_summary)

        self.assertEqual(pack_state["final_status"], "accepted")
        self.assertTrue(pack_state["acceptance_gate_passed"])
        self.assertTrue(pack_state["critical_path_green"])
        self.assertTrue(all(pack_state["invariants"].values()))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Loading…
Reference in New Issue