From 0bd631c160a3ab268ca8b14474f5ab53670b0ed2 Mon Sep 17 00:00:00 2001 From: dctouch Date: Fri, 22 May 2026 16:21:59 +0300 Subject: [PATCH] =?UTF-8?q?=D0=9F=D1=80=D0=B8=D0=BD=D1=8F=D1=82=D1=8C=20ag?= =?UTF-8?q?ent=20replay=20=D0=BC=D0=B0=D1=80=D0=B6=D0=B8=D0=BD=D0=B0=D0=BB?= =?UTF-8?q?=D1=8C=D0=BD=D0=BE=D1=81=D1=82=D0=B8=20=D0=BD=D0=BE=D0=BC=D0=B5?= =?UTF-8?q?=D0=BD=D0=BA=D0=BB=D0=B0=D1=82=D1=83=D1=80=D1=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../data/autorun_generators/history.json | 38 ++++++ ..._20260522131931_gen-ag05221319-4035f5.json | 123 ++++++++++++++++++ ..._20260522131931_gen-ag05221319-4035f5.json | 40 ++++++ scripts/domain_truth_harness.py | 98 +++++++++++++- 4 files changed, 298 insertions(+), 1 deletion(-) create mode 100644 llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260522131931_gen-ag05221319-4035f5.json create mode 100644 llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260522131931_gen-ag05221319-4035f5.json diff --git a/llm_normalizer/data/autorun_generators/history.json b/llm_normalizer/data/autorun_generators/history.json index e6f4f6b..a2964c0 100644 --- a/llm_normalizer/data/autorun_generators/history.json +++ b/llm_normalizer/data/autorun_generators/history.json @@ -1,4 +1,42 @@ [ + { + "generation_id": "gen-ag05221319-4035f5", + "created_at": "2026-05-22T13:19:31+00:00", + "mode": "saved_user_sessions", + "title": "AGENT | Inventory margin-ranking accepted replay 2026-05-22", + "count": 5, + "domain": "inventory_margin_ranking", + "questions": [ + "Какая номеклатура товара реализована с высокой прибылью какая с низкой", + "сентябрь 2017", + "покажи найденные строки себестоимостной базы", + "расширь до 2017 года", + "анализ по 41 счету а не 01" + ], + "generated_by": "codex_agent", + "saved_case_set_file": "assistant_autogen_saved_user_sessions_20260522131931_gen-ag05221319-4035f5.json", + "context": { + "llm_provider": null, + "model": null, + "assistant_prompt_version": null, + "decomposition_prompt_version": null, + "prompt_fingerprint": null, + "autogen_personality_id": null, + "autogen_personality_prompt": null, + "source_session_id": null, + "saved_session_file": "assistant_saved_session_20260522131931_gen-ag05221319-4035f5.json", + "saved_case_set_kind": "agent_semantic_scenario", + "agent_run": true, + "agent_focus": "inventory_margin_ranking_followup_and_business_modes", + "architecture_phase": "Business Answer Contract / margin-ranking safety gate", + "source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\inventory_margin_ranking_agent_loop_20260522.json", + "scenario_id": "inventory_margin_ranking_agent_loop_20260522", + "semantic_tags": [], + "validation_status": "accepted_live_replay", + "validated_run_dir": "artifacts\\domain_runs\\inventory_margin_ranking_agent_loop_live7", + "saved_after_validated_replay": true + } + }, { "generation_id": "gen-ag05131312-2d0445", "created_at": "2026-05-13T13:12:37+00:00", diff --git a/llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260522131931_gen-ag05221319-4035f5.json b/llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260522131931_gen-ag05221319-4035f5.json new file mode 100644 index 0000000..f70b1ba --- /dev/null +++ b/llm_normalizer/data/autorun_generators/saved_sessions/assistant_saved_session_20260522131931_gen-ag05221319-4035f5.json @@ -0,0 +1,123 @@ +{ + "saved_at": "2026-05-22T13:19:31+00:00", + "generation_id": "gen-ag05221319-4035f5", + "mode": "saved_user_sessions", + "title": "AGENT | Inventory margin-ranking accepted replay 2026-05-22", + "agent_run": true, + "questions": [ + "Какая номеклатура товара реализована с высокой прибылью какая с низкой", + "сентябрь 2017", + "покажи найденные строки себестоимостной базы", + "расширь до 2017 года", + "анализ по 41 счету а не 01" + ], + "metadata": { + "assistant_prompt_version": null, + "decomposition_prompt_version": null, + "prompt_fingerprint": null, + "agent_focus": "inventory_margin_ranking_followup_and_business_modes", + "architecture_phase": "Business Answer Contract / margin-ranking safety gate", + "source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\inventory_margin_ranking_agent_loop_20260522.json", + "scenario_id": "inventory_margin_ranking_agent_loop_20260522", + "semantic_tags": [], + "validation_status": "accepted_live_replay", + "validated_run_dir": "artifacts\\domain_runs\\inventory_margin_ranking_agent_loop_live7", + "saved_after_validated_replay": true, + "save_gate": { + "schema_version": "agent_semantic_save_gate_v1", + "validation_status": "accepted_live_replay", + "validated_run_dir": "artifacts\\domain_runs\\inventory_margin_ranking_agent_loop_live7", + "final_status": "accepted", + "review_overall_status": "pass", + "business_overall_status": "pass", + "steps_total": 5, + "steps_passed": 5, + "steps_failed": 0, + "steps_with_business_failures": 0, + "steps_with_business_warnings": 0, + "acceptance_gate_passed": true, + "saved_after_validated_replay": true + } + }, + "source_session_id": null, + "session": { + "session_id": null, + "mode": "agent_semantic_run", + "items": [ + { + "message_id": "agent-user-001", + "role": "user", + "text": "Какая номеклатура товара реализована с высокой прибылью какая с низкой", + "created_at": "2026-05-22T13:19:31+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-002", + "role": "user", + "text": "сентябрь 2017", + "created_at": "2026-05-22T13:19:31+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-003", + "role": "user", + "text": "покажи найденные строки себестоимостной базы", + "created_at": "2026-05-22T13:19:31+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-004", + "role": "user", + "text": "расширь до 2017 года", + "created_at": "2026-05-22T13:19:31+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + }, + { + "message_id": "agent-user-005", + "role": "user", + "text": "анализ по 41 счету а не 01", + "created_at": "2026-05-22T13:19:31+00:00", + "reply_type": null, + "trace_id": null, + "debug": null + } + ], + "agent_run": true, + "metadata": { + "assistant_prompt_version": null, + "decomposition_prompt_version": null, + "prompt_fingerprint": null, + "agent_focus": "inventory_margin_ranking_followup_and_business_modes", + "architecture_phase": "Business Answer Contract / margin-ranking safety gate", + "source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\inventory_margin_ranking_agent_loop_20260522.json", + "scenario_id": "inventory_margin_ranking_agent_loop_20260522", + "semantic_tags": [], + "validation_status": "accepted_live_replay", + "validated_run_dir": "artifacts\\domain_runs\\inventory_margin_ranking_agent_loop_live7", + "saved_after_validated_replay": true, + "save_gate": { + "schema_version": "agent_semantic_save_gate_v1", + "validation_status": "accepted_live_replay", + "validated_run_dir": "artifacts\\domain_runs\\inventory_margin_ranking_agent_loop_live7", + "final_status": "accepted", + "review_overall_status": "pass", + "business_overall_status": "pass", + "steps_total": 5, + "steps_passed": 5, + "steps_failed": 0, + "steps_with_business_failures": 0, + "steps_with_business_warnings": 0, + "acceptance_gate_passed": true, + "saved_after_validated_replay": true + } + } + } +} diff --git a/llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260522131931_gen-ag05221319-4035f5.json b/llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260522131931_gen-ag05221319-4035f5.json new file mode 100644 index 0000000..2d6f751 --- /dev/null +++ b/llm_normalizer/data/eval_cases/assistant_autogen_saved_user_sessions_20260522131931_gen-ag05221319-4035f5.json @@ -0,0 +1,40 @@ +{ + "suite_id": "assistant_saved_session_gen-ag05221319-4035f5", + "suite_version": "0.1.0", + "schema_version": "assistant_saved_session_suite_v0_1", + "generated_at": "2026-05-22T13:19:31+00:00", + "generation_id": "gen-ag05221319-4035f5", + "mode": "saved_user_sessions", + "title": "AGENT | Inventory margin-ranking accepted replay 2026-05-22", + "domain": "inventory_margin_ranking", + "scenario_count": 1, + "case_ids": [ + "SAVED-001" + ], + "cases": [ + { + "case_id": "SAVED-001", + "scenario_tag": "agent_saved_user_sessions", + "title": "AGENT | Inventory margin-ranking accepted replay 2026-05-22", + "question_type": "followup", + "broadness_level": "medium", + "turns": [ + { + "user_message": "Какая номеклатура товара реализована с высокой прибылью какая с низкой" + }, + { + "user_message": "сентябрь 2017" + }, + { + "user_message": "покажи найденные строки себестоимостной базы" + }, + { + "user_message": "расширь до 2017 года" + }, + { + "user_message": "анализ по 41 счету а не 01" + } + ] + } + ] +} diff --git a/scripts/domain_truth_harness.py b/scripts/domain_truth_harness.py index 14f6e81..b4a240a 100644 --- a/scripts/domain_truth_harness.py +++ b/scripts/domain_truth_harness.py @@ -386,6 +386,85 @@ def normalize_optional_bool(value: Any) -> bool | None: return None +LEGACY_DEBUG_RESULT_MODES = {"confirmed_balance", "heuristic_candidates"} +BUSINESS_EXPECTED_RESULT_MODES = { + "clarification_required", + "limited_accounting_answer", + "evidence_or_honest_boundary", + "ranking_or_limited_accounting_answer", + "same_inventory_margin_context_or_clarification", +} + + +def _business_review_is_clean(step_state: dict[str, Any]) -> bool: + business_review = step_state.get("business_first_review") + if not isinstance(business_review, dict): + return True + return len(dcl.normalize_string_list(business_review.get("issue_codes"))) == 0 + + +def business_expected_result_mode_matches(expected_result_mode: str, step_state: dict[str, Any]) -> bool: + reply_type = str(step_state.get("reply_type") or "").strip() + response_type = str(step_state.get("response_type") or "").strip() + truth_mode = str(step_state.get("truth_mode") or "").strip() + answer_shape = str(step_state.get("answer_shape") or "").strip() + detected_intent = str(step_state.get("detected_intent") or "").strip() + capability_id = str(step_state.get("capability_id") or "").strip() + assistant_text = str(step_state.get("assistant_text") or "").strip() + clean_business_review = _business_review_is_clean(step_state) + in_margin_context = ( + detected_intent == "inventory_margin_ranking_for_nomenclature" + or capability_id == "inventory_inventory_margin_ranking_for_nomenclature" + ) + + if expected_result_mode == "clarification_required": + return ( + clean_business_review + and ( + truth_mode == "clarification_required" + or answer_shape == "clarification_required" + or (reply_type == "partial_coverage" and response_type == "LIMITED_WITH_REASON") + ) + ) + + if expected_result_mode == "limited_accounting_answer": + return ( + clean_business_review + and in_margin_context + and bool(assistant_text) + and reply_type in {"partial_coverage", "factual", "factual_with_explanation"} + ) + + if expected_result_mode == "evidence_or_honest_boundary": + return ( + clean_business_review + and bool(assistant_text) + and reply_type in {"partial_coverage", "factual", "factual_with_explanation"} + ) + + if expected_result_mode == "ranking_or_limited_accounting_answer": + return ( + clean_business_review + and in_margin_context + and bool(assistant_text) + and reply_type in {"partial_coverage", "factual", "factual_with_explanation"} + ) + + if expected_result_mode == "same_inventory_margin_context_or_clarification": + return ( + clean_business_review + and bool(assistant_text) + and ( + in_margin_context + or truth_mode == "clarification_required" + or answer_shape == "clarification_required" + ) + and reply_type in {"partial_coverage", "factual", "factual_with_explanation"} + ) + + return False + + def evaluate_truth_step( *, step: dict[str, Any], @@ -627,7 +706,24 @@ def evaluate_truth_step( resolve_nested_placeholders(step.get("expected_result_mode"), step_results, bindings, runtime_bindings) or "" ).strip() actual_result_mode = str(step_state.get("result_mode") or "").strip() - if expected_result_mode and actual_result_mode and not dcl.identifiers_match(actual_result_mode, expected_result_mode): + if expected_result_mode in BUSINESS_EXPECTED_RESULT_MODES: + if not business_expected_result_mode_matches(expected_result_mode, step_state): + append_finding( + findings, + step, + "wrong_result_mode", + "Business answer mode does not match the expected semantic answer contract.", + actual={ + "result_mode": actual_result_mode or None, + "reply_type": reply_type or None, + "truth_mode": step_state.get("truth_mode"), + "answer_shape": step_state.get("answer_shape"), + "intent": detected_intent or None, + "capability": capability_id or None, + }, + expected=expected_result_mode, + ) + elif expected_result_mode and actual_result_mode and not dcl.identifiers_match(actual_result_mode, expected_result_mode): append_finding( findings, step,