From 0bd631c160a3ab268ca8b14474f5ab53670b0ed2 Mon Sep 17 00:00:00 2001
From: dctouch <support@dctouch.ru>
Date: Fri, 22 May 2026 16:21:59 +0300
Subject: [PATCH] =?UTF-8?q?=D0=9F=D1=80=D0=B8=D0=BD=D1=8F=D1=82=D1=8C=20ag?=
 =?UTF-8?q?ent=20replay=20=D0=BC=D0=B0=D1=80=D0=B6=D0=B8=D0=BD=D0=B0=D0=BB?=
 =?UTF-8?q?=D1=8C=D0=BD=D0=BE=D1=81=D1=82=D0=B8=20=D0=BD=D0=BE=D0=BC=D0=B5?=
 =?UTF-8?q?=D0=BD=D0=BA=D0=BB=D0=B0=D1=82=D1=83=D1=80=D1=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../data/autorun_generators/history.json      |  38 ++++++
 ..._20260522131931_gen-ag05221319-4035f5.json | 123 ++++++++++++++++++
 ..._20260522131931_gen-ag05221319-4035f5.json |  40 ++++++
 scripts/domain_truth_harness.py               |  98 +++++++++++++-
 4 files changed, 298 insertions(+), 1 deletion(-)
 create mode 100644 llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260522131931_gen-ag05221319-4035f5.json
 create mode 100644 llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260522131931_gen-ag05221319-4035f5.json

diff --git a/llm_normalizer/data/autorun_generators/history.json b/llm_normalizer/data/autorun_generators/history.json
index e6f4f6b..a2964c0 100644
--- a/llm_normalizer/data/autorun_generators/history.json
+++ b/llm_normalizer/data/autorun_generators/history.json
@@ -1,4 +1,42 @@
 [
+  {
+    "generation_id": "gen-ag05221319-4035f5",
+    "created_at": "2026-05-22T13:19:31+00:00",
+    "mode": "saved_user_sessions",
+    "title": "AGENT | Inventory margin-ranking accepted replay 2026-05-22",
+    "count": 5,
+    "domain": "inventory_margin_ranking",
+    "questions": [
+      "Какая номеклатура товара реализована с высокой прибылью какая с низкой",
+      "сентябрь 2017",
+      "покажи найденные строки себестоимостной базы",
+      "расширь до 2017 года",
+      "анализ по 41 счету а не 01"
+    ],
+    "generated_by": "codex_agent",
+    "saved_case_set_file": "assistant_autogen_saved_user_sessions_20260522131931_gen-ag05221319-4035f5.json",
+    "context": {
+      "llm_provider": null,
+      "model": null,
+      "assistant_prompt_version": null,
+      "decomposition_prompt_version": null,
+      "prompt_fingerprint": null,
+      "autogen_personality_id": null,
+      "autogen_personality_prompt": null,
+      "source_session_id": null,
+      "saved_session_file": "assistant_saved_session_20260522131931_gen-ag05221319-4035f5.json",
+      "saved_case_set_kind": "agent_semantic_scenario",
+      "agent_run": true,
+      "agent_focus": "inventory_margin_ranking_followup_and_business_modes",
+      "architecture_phase": "Business Answer Contract / margin-ranking safety gate",
+      "source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\inventory_margin_ranking_agent_loop_20260522.json",
+      "scenario_id": "inventory_margin_ranking_agent_loop_20260522",
+      "semantic_tags": [],
+      "validation_status": "accepted_live_replay",
+      "validated_run_dir": "artifacts\\domain_runs\\inventory_margin_ranking_agent_loop_live7",
+      "saved_after_validated_replay": true
+    }
+  },
   {
     "generation_id": "gen-ag05131312-2d0445",
     "created_at": "2026-05-13T13:12:37+00:00",
diff --git a/llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260522131931_gen-ag05221319-4035f5.json b/llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260522131931_gen-ag05221319-4035f5.json
new file mode 100644
index 0000000..f70b1ba
--- /dev/null
+++ b/llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260522131931_gen-ag05221319-4035f5.json
@@ -0,0 +1,123 @@
+{
+  "saved_at": "2026-05-22T13:19:31+00:00",
+  "generation_id": "gen-ag05221319-4035f5",
+  "mode": "saved_user_sessions",
+  "title": "AGENT | Inventory margin-ranking accepted replay 2026-05-22",
+  "agent_run": true,
+  "questions": [
+    "Какая номеклатура товара реализована с высокой прибылью какая с низкой",
+    "сентябрь 2017",
+    "покажи найденные строки себестоимостной базы",
+    "расширь до 2017 года",
+    "анализ по 41 счету а не 01"
+  ],
+  "metadata": {
+    "assistant_prompt_version": null,
+    "decomposition_prompt_version": null,
+    "prompt_fingerprint": null,
+    "agent_focus": "inventory_margin_ranking_followup_and_business_modes",
+    "architecture_phase": "Business Answer Contract / margin-ranking safety gate",
+    "source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\inventory_margin_ranking_agent_loop_20260522.json",
+    "scenario_id": "inventory_margin_ranking_agent_loop_20260522",
+    "semantic_tags": [],
+    "validation_status": "accepted_live_replay",
+    "validated_run_dir": "artifacts\\domain_runs\\inventory_margin_ranking_agent_loop_live7",
+    "saved_after_validated_replay": true,
+    "save_gate": {
+      "schema_version": "agent_semantic_save_gate_v1",
+      "validation_status": "accepted_live_replay",
+      "validated_run_dir": "artifacts\\domain_runs\\inventory_margin_ranking_agent_loop_live7",
+      "final_status": "accepted",
+      "review_overall_status": "pass",
+      "business_overall_status": "pass",
+      "steps_total": 5,
+      "steps_passed": 5,
+      "steps_failed": 0,
+      "steps_with_business_failures": 0,
+      "steps_with_business_warnings": 0,
+      "acceptance_gate_passed": true,
+      "saved_after_validated_replay": true
+    }
+  },
+  "source_session_id": null,
+  "session": {
+    "session_id": null,
+    "mode": "agent_semantic_run",
+    "items": [
+      {
+        "message_id": "agent-user-001",
+        "role": "user",
+        "text": "Какая номеклатура товара реализована с высокой прибылью какая с низкой",
+        "created_at": "2026-05-22T13:19:31+00:00",
+        "reply_type": null,
+        "trace_id": null,
+        "debug": null
+      },
+      {
+        "message_id": "agent-user-002",
+        "role": "user",
+        "text": "сентябрь 2017",
+        "created_at": "2026-05-22T13:19:31+00:00",
+        "reply_type": null,
+        "trace_id": null,
+        "debug": null
+      },
+      {
+        "message_id": "agent-user-003",
+        "role": "user",
+        "text": "покажи найденные строки себестоимостной базы",
+        "created_at": "2026-05-22T13:19:31+00:00",
+        "reply_type": null,
+        "trace_id": null,
+        "debug": null
+      },
+      {
+        "message_id": "agent-user-004",
+        "role": "user",
+        "text": "расширь до 2017 года",
+        "created_at": "2026-05-22T13:19:31+00:00",
+        "reply_type": null,
+        "trace_id": null,
+        "debug": null
+      },
+      {
+        "message_id": "agent-user-005",
+        "role": "user",
+        "text": "анализ по 41 счету а не 01",
+        "created_at": "2026-05-22T13:19:31+00:00",
+        "reply_type": null,
+        "trace_id": null,
+        "debug": null
+      }
+    ],
+    "agent_run": true,
+    "metadata": {
+      "assistant_prompt_version": null,
+      "decomposition_prompt_version": null,
+      "prompt_fingerprint": null,
+      "agent_focus": "inventory_margin_ranking_followup_and_business_modes",
+      "architecture_phase": "Business Answer Contract / margin-ranking safety gate",
+      "source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\inventory_margin_ranking_agent_loop_20260522.json",
+      "scenario_id": "inventory_margin_ranking_agent_loop_20260522",
+      "semantic_tags": [],
+      "validation_status": "accepted_live_replay",
+      "validated_run_dir": "artifacts\\domain_runs\\inventory_margin_ranking_agent_loop_live7",
+      "saved_after_validated_replay": true,
+      "save_gate": {
+        "schema_version": "agent_semantic_save_gate_v1",
+        "validation_status": "accepted_live_replay",
+        "validated_run_dir": "artifacts\\domain_runs\\inventory_margin_ranking_agent_loop_live7",
+        "final_status": "accepted",
+        "review_overall_status": "pass",
+        "business_overall_status": "pass",
+        "steps_total": 5,
+        "steps_passed": 5,
+        "steps_failed": 0,
+        "steps_with_business_failures": 0,
+        "steps_with_business_warnings": 0,
+        "acceptance_gate_passed": true,
+        "saved_after_validated_replay": true
+      }
+    }
+  }
+}
diff --git a/llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260522131931_gen-ag05221319-4035f5.json b/llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260522131931_gen-ag05221319-4035f5.json
new file mode 100644
index 0000000..2d6f751
--- /dev/null
+++ b/llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260522131931_gen-ag05221319-4035f5.json
@@ -0,0 +1,40 @@
+{
+  "suite_id": "assistant_saved_session_gen-ag05221319-4035f5",
+  "suite_version": "0.1.0",
+  "schema_version": "assistant_saved_session_suite_v0_1",
+  "generated_at": "2026-05-22T13:19:31+00:00",
+  "generation_id": "gen-ag05221319-4035f5",
+  "mode": "saved_user_sessions",
+  "title": "AGENT | Inventory margin-ranking accepted replay 2026-05-22",
+  "domain": "inventory_margin_ranking",
+  "scenario_count": 1,
+  "case_ids": [
+    "SAVED-001"
+  ],
+  "cases": [
+    {
+      "case_id": "SAVED-001",
+      "scenario_tag": "agent_saved_user_sessions",
+      "title": "AGENT | Inventory margin-ranking accepted replay 2026-05-22",
+      "question_type": "followup",
+      "broadness_level": "medium",
+      "turns": [
+        {
+          "user_message": "Какая номеклатура товара реализована с высокой прибылью какая с низкой"
+        },
+        {
+          "user_message": "сентябрь 2017"
+        },
+        {
+          "user_message": "покажи найденные строки себестоимостной базы"
+        },
+        {
+          "user_message": "расширь до 2017 года"
+        },
+        {
+          "user_message": "анализ по 41 счету а не 01"
+        }
+      ]
+    }
+  ]
+}
diff --git a/scripts/domain_truth_harness.py b/scripts/domain_truth_harness.py
index 14f6e81..b4a240a 100644
--- a/scripts/domain_truth_harness.py
+++ b/scripts/domain_truth_harness.py
@@ -386,6 +386,85 @@ def normalize_optional_bool(value: Any) -> bool | None:
     return None
 
 
+LEGACY_DEBUG_RESULT_MODES = {"confirmed_balance", "heuristic_candidates"}  # NOTE(review): not referenced in the hunks of this patch - confirm it is used elsewhere
+BUSINESS_EXPECTED_RESULT_MODES = {  # expected_result_mode values that evaluate_truth_step checks via business_expected_result_mode_matches
+    "clarification_required",
+    "limited_accounting_answer",
+    "evidence_or_honest_boundary",
+    "ranking_or_limited_accounting_answer",
+    "same_inventory_margin_context_or_clarification",
+}
+
+
+def _business_review_is_clean(step_state: dict[str, Any]) -> bool:  # True when the step's business-first review lists no issue codes
+    business_review = step_state.get("business_first_review")
+    if not isinstance(business_review, dict):  # missing or malformed review
+        return True  # nothing to inspect, so the step is treated as clean
+    return len(dcl.normalize_string_list(business_review.get("issue_codes"))) == 0  # clean only if the normalized issue_codes list is empty
+
+
+def business_expected_result_mode_matches(expected_result_mode: str, step_state: dict[str, Any]) -> bool:  # does step_state satisfy the named business answer contract?
+    reply_type = str(step_state.get("reply_type") or "").strip()  # every field is coerced to a stripped string; missing/None becomes ""
+    response_type = str(step_state.get("response_type") or "").strip()
+    truth_mode = str(step_state.get("truth_mode") or "").strip()
+    answer_shape = str(step_state.get("answer_shape") or "").strip()
+    detected_intent = str(step_state.get("detected_intent") or "").strip()
+    capability_id = str(step_state.get("capability_id") or "").strip()
+    assistant_text = str(step_state.get("assistant_text") or "").strip()
+    clean_business_review = _business_review_is_clean(step_state)  # required by every contract below
+    in_margin_context = (  # either the detected intent or the capability id names the margin-ranking scenario
+        detected_intent == "inventory_margin_ranking_for_nomenclature"
+        or capability_id == "inventory_inventory_margin_ranking_for_nomenclature"
+    )
+    # Clarification: clean review plus a clarification signal (truth_mode / answer_shape) or a limited partial-coverage reply.
+    if expected_result_mode == "clarification_required":
+        return (
+            clean_business_review
+            and (
+                truth_mode == "clarification_required"
+                or answer_shape == "clarification_required"
+                or (reply_type == "partial_coverage" and response_type == "LIMITED_WITH_REASON")
+            )
+        )
+    # Limited accounting answer: clean review, margin-ranking context, non-empty text and an allowed reply_type.
+    if expected_result_mode == "limited_accounting_answer":
+        return (
+            clean_business_review
+            and in_margin_context
+            and bool(assistant_text)
+            and reply_type in {"partial_coverage", "factual", "factual_with_explanation"}
+        )
+    # Evidence or honest boundary: clean review, non-empty text and an allowed reply_type; margin context is not required.
+    if expected_result_mode == "evidence_or_honest_boundary":
+        return (
+            clean_business_review
+            and bool(assistant_text)
+            and reply_type in {"partial_coverage", "factual", "factual_with_explanation"}
+        )
+    # Ranking or limited accounting answer: currently the same predicate as "limited_accounting_answer".
+    if expected_result_mode == "ranking_or_limited_accounting_answer":
+        return (
+            clean_business_review
+            and in_margin_context
+            and bool(assistant_text)
+            and reply_type in {"partial_coverage", "factual", "factual_with_explanation"}
+        )
+    # Same context or clarification: margin context OR a clarification signal, still with text and an allowed reply_type.
+    if expected_result_mode == "same_inventory_margin_context_or_clarification":
+        return (
+            clean_business_review
+            and bool(assistant_text)
+            and (
+                in_margin_context
+                or truth_mode == "clarification_required"
+                or answer_shape == "clarification_required"
+            )
+            and reply_type in {"partial_coverage", "factual", "factual_with_explanation"}  # NOTE(review): also applies on the clarification path - confirm intended
+        )
+    # Any other expected_result_mode is not handled by this helper and never matches.
+    return False
+
+
 def evaluate_truth_step(
     *,
     step: dict[str, Any],
@@ -627,7 +706,24 @@ def evaluate_truth_step(
         resolve_nested_placeholders(step.get("expected_result_mode"), step_results, bindings, runtime_bindings) or ""
     ).strip()
     actual_result_mode = str(step_state.get("result_mode") or "").strip()
-    if expected_result_mode and actual_result_mode and not dcl.identifiers_match(actual_result_mode, expected_result_mode):
+    if expected_result_mode in BUSINESS_EXPECTED_RESULT_MODES:
+        if not business_expected_result_mode_matches(expected_result_mode, step_state):
+            append_finding(
+                findings,
+                step,
+                "wrong_result_mode",
+                "Business answer mode does not match the expected semantic answer contract.",
+                actual={
+                    "result_mode": actual_result_mode or None,
+                    "reply_type": reply_type or None,
+                    "truth_mode": step_state.get("truth_mode"),
+                    "answer_shape": step_state.get("answer_shape"),
+                    "intent": detected_intent or None,
+                    "capability": capability_id or None,
+                },
+                expected=expected_result_mode,
+            )
+    elif expected_result_mode and actual_result_mode and not dcl.identifiers_match(actual_result_mode, expected_result_mode):
         append_finding(
             findings,
             step,