Принять agent replay маржинальности номенклатуры
This commit is contained in:
parent
09c6d1aa0e
commit
0bd631c160
|
|
@ -1,4 +1,42 @@
|
||||||
[
|
[
|
||||||
|
{
|
||||||
|
"generation_id": "gen-ag05221319-4035f5",
|
||||||
|
"created_at": "2026-05-22T13:19:31+00:00",
|
||||||
|
"mode": "saved_user_sessions",
|
||||||
|
"title": "AGENT | Inventory margin-ranking accepted replay 2026-05-22",
|
||||||
|
"count": 5,
|
||||||
|
"domain": "inventory_margin_ranking",
|
||||||
|
"questions": [
|
||||||
|
"Какая номеклатура товара реализована с высокой прибылью какая с низкой",
|
||||||
|
"сентябрь 2017",
|
||||||
|
"покажи найденные строки себестоимостной базы",
|
||||||
|
"расширь до 2017 года",
|
||||||
|
"анализ по 41 счету а не 01"
|
||||||
|
],
|
||||||
|
"generated_by": "codex_agent",
|
||||||
|
"saved_case_set_file": "assistant_autogen_saved_user_sessions_20260522131931_gen-ag05221319-4035f5.json",
|
||||||
|
"context": {
|
||||||
|
"llm_provider": null,
|
||||||
|
"model": null,
|
||||||
|
"assistant_prompt_version": null,
|
||||||
|
"decomposition_prompt_version": null,
|
||||||
|
"prompt_fingerprint": null,
|
||||||
|
"autogen_personality_id": null,
|
||||||
|
"autogen_personality_prompt": null,
|
||||||
|
"source_session_id": null,
|
||||||
|
"saved_session_file": "assistant_saved_session_20260522131931_gen-ag05221319-4035f5.json",
|
||||||
|
"saved_case_set_kind": "agent_semantic_scenario",
|
||||||
|
"agent_run": true,
|
||||||
|
"agent_focus": "inventory_margin_ranking_followup_and_business_modes",
|
||||||
|
"architecture_phase": "Business Answer Contract / margin-ranking safety gate",
|
||||||
|
"source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\inventory_margin_ranking_agent_loop_20260522.json",
|
||||||
|
"scenario_id": "inventory_margin_ranking_agent_loop_20260522",
|
||||||
|
"semantic_tags": [],
|
||||||
|
"validation_status": "accepted_live_replay",
|
||||||
|
"validated_run_dir": "artifacts\\domain_runs\\inventory_margin_ranking_agent_loop_live7",
|
||||||
|
"saved_after_validated_replay": true
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"generation_id": "gen-ag05131312-2d0445",
|
"generation_id": "gen-ag05131312-2d0445",
|
||||||
"created_at": "2026-05-13T13:12:37+00:00",
|
"created_at": "2026-05-13T13:12:37+00:00",
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,123 @@
|
||||||
|
{
|
||||||
|
"saved_at": "2026-05-22T13:19:31+00:00",
|
||||||
|
"generation_id": "gen-ag05221319-4035f5",
|
||||||
|
"mode": "saved_user_sessions",
|
||||||
|
"title": "AGENT | Inventory margin-ranking accepted replay 2026-05-22",
|
||||||
|
"agent_run": true,
|
||||||
|
"questions": [
|
||||||
|
"Какая номеклатура товара реализована с высокой прибылью какая с низкой",
|
||||||
|
"сентябрь 2017",
|
||||||
|
"покажи найденные строки себестоимостной базы",
|
||||||
|
"расширь до 2017 года",
|
||||||
|
"анализ по 41 счету а не 01"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"assistant_prompt_version": null,
|
||||||
|
"decomposition_prompt_version": null,
|
||||||
|
"prompt_fingerprint": null,
|
||||||
|
"agent_focus": "inventory_margin_ranking_followup_and_business_modes",
|
||||||
|
"architecture_phase": "Business Answer Contract / margin-ranking safety gate",
|
||||||
|
"source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\inventory_margin_ranking_agent_loop_20260522.json",
|
||||||
|
"scenario_id": "inventory_margin_ranking_agent_loop_20260522",
|
||||||
|
"semantic_tags": [],
|
||||||
|
"validation_status": "accepted_live_replay",
|
||||||
|
"validated_run_dir": "artifacts\\domain_runs\\inventory_margin_ranking_agent_loop_live7",
|
||||||
|
"saved_after_validated_replay": true,
|
||||||
|
"save_gate": {
|
||||||
|
"schema_version": "agent_semantic_save_gate_v1",
|
||||||
|
"validation_status": "accepted_live_replay",
|
||||||
|
"validated_run_dir": "artifacts\\domain_runs\\inventory_margin_ranking_agent_loop_live7",
|
||||||
|
"final_status": "accepted",
|
||||||
|
"review_overall_status": "pass",
|
||||||
|
"business_overall_status": "pass",
|
||||||
|
"steps_total": 5,
|
||||||
|
"steps_passed": 5,
|
||||||
|
"steps_failed": 0,
|
||||||
|
"steps_with_business_failures": 0,
|
||||||
|
"steps_with_business_warnings": 0,
|
||||||
|
"acceptance_gate_passed": true,
|
||||||
|
"saved_after_validated_replay": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"source_session_id": null,
|
||||||
|
"session": {
|
||||||
|
"session_id": null,
|
||||||
|
"mode": "agent_semantic_run",
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"message_id": "agent-user-001",
|
||||||
|
"role": "user",
|
||||||
|
"text": "Какая номеклатура товара реализована с высокой прибылью какая с низкой",
|
||||||
|
"created_at": "2026-05-22T13:19:31+00:00",
|
||||||
|
"reply_type": null,
|
||||||
|
"trace_id": null,
|
||||||
|
"debug": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"message_id": "agent-user-002",
|
||||||
|
"role": "user",
|
||||||
|
"text": "сентябрь 2017",
|
||||||
|
"created_at": "2026-05-22T13:19:31+00:00",
|
||||||
|
"reply_type": null,
|
||||||
|
"trace_id": null,
|
||||||
|
"debug": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"message_id": "agent-user-003",
|
||||||
|
"role": "user",
|
||||||
|
"text": "покажи найденные строки себестоимостной базы",
|
||||||
|
"created_at": "2026-05-22T13:19:31+00:00",
|
||||||
|
"reply_type": null,
|
||||||
|
"trace_id": null,
|
||||||
|
"debug": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"message_id": "agent-user-004",
|
||||||
|
"role": "user",
|
||||||
|
"text": "расширь до 2017 года",
|
||||||
|
"created_at": "2026-05-22T13:19:31+00:00",
|
||||||
|
"reply_type": null,
|
||||||
|
"trace_id": null,
|
||||||
|
"debug": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"message_id": "agent-user-005",
|
||||||
|
"role": "user",
|
||||||
|
"text": "анализ по 41 счету а не 01",
|
||||||
|
"created_at": "2026-05-22T13:19:31+00:00",
|
||||||
|
"reply_type": null,
|
||||||
|
"trace_id": null,
|
||||||
|
"debug": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"agent_run": true,
|
||||||
|
"metadata": {
|
||||||
|
"assistant_prompt_version": null,
|
||||||
|
"decomposition_prompt_version": null,
|
||||||
|
"prompt_fingerprint": null,
|
||||||
|
"agent_focus": "inventory_margin_ranking_followup_and_business_modes",
|
||||||
|
"architecture_phase": "Business Answer Contract / margin-ranking safety gate",
|
||||||
|
"source_spec_file": "X:\\1C\\NDC_1C\\docs\\orchestration\\inventory_margin_ranking_agent_loop_20260522.json",
|
||||||
|
"scenario_id": "inventory_margin_ranking_agent_loop_20260522",
|
||||||
|
"semantic_tags": [],
|
||||||
|
"validation_status": "accepted_live_replay",
|
||||||
|
"validated_run_dir": "artifacts\\domain_runs\\inventory_margin_ranking_agent_loop_live7",
|
||||||
|
"saved_after_validated_replay": true,
|
||||||
|
"save_gate": {
|
||||||
|
"schema_version": "agent_semantic_save_gate_v1",
|
||||||
|
"validation_status": "accepted_live_replay",
|
||||||
|
"validated_run_dir": "artifacts\\domain_runs\\inventory_margin_ranking_agent_loop_live7",
|
||||||
|
"final_status": "accepted",
|
||||||
|
"review_overall_status": "pass",
|
||||||
|
"business_overall_status": "pass",
|
||||||
|
"steps_total": 5,
|
||||||
|
"steps_passed": 5,
|
||||||
|
"steps_failed": 0,
|
||||||
|
"steps_with_business_failures": 0,
|
||||||
|
"steps_with_business_warnings": 0,
|
||||||
|
"acceptance_gate_passed": true,
|
||||||
|
"saved_after_validated_replay": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,40 @@
|
||||||
|
{
|
||||||
|
"suite_id": "assistant_saved_session_gen-ag05221319-4035f5",
|
||||||
|
"suite_version": "0.1.0",
|
||||||
|
"schema_version": "assistant_saved_session_suite_v0_1",
|
||||||
|
"generated_at": "2026-05-22T13:19:31+00:00",
|
||||||
|
"generation_id": "gen-ag05221319-4035f5",
|
||||||
|
"mode": "saved_user_sessions",
|
||||||
|
"title": "AGENT | Inventory margin-ranking accepted replay 2026-05-22",
|
||||||
|
"domain": "inventory_margin_ranking",
|
||||||
|
"scenario_count": 1,
|
||||||
|
"case_ids": [
|
||||||
|
"SAVED-001"
|
||||||
|
],
|
||||||
|
"cases": [
|
||||||
|
{
|
||||||
|
"case_id": "SAVED-001",
|
||||||
|
"scenario_tag": "agent_saved_user_sessions",
|
||||||
|
"title": "AGENT | Inventory margin-ranking accepted replay 2026-05-22",
|
||||||
|
"question_type": "followup",
|
||||||
|
"broadness_level": "medium",
|
||||||
|
"turns": [
|
||||||
|
{
|
||||||
|
"user_message": "Какая номеклатура товара реализована с высокой прибылью какая с низкой"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"user_message": "сентябрь 2017"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"user_message": "покажи найденные строки себестоимостной базы"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"user_message": "расширь до 2017 года"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"user_message": "анализ по 41 счету а не 01"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -386,6 +386,85 @@ def normalize_optional_bool(value: Any) -> bool | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# Result-mode identifiers kept under the "legacy debug" label.
# NOTE(review): the consumer of this set is not visible in this hunk — confirm usage.
LEGACY_DEBUG_RESULT_MODES = {
    "confirmed_balance",
    "heuristic_candidates",
}

# Expected-result-mode values that evaluate_truth_step() checks through
# business_expected_result_mode_matches() instead of comparing identifiers
# against the step's raw ``result_mode``.
BUSINESS_EXPECTED_RESULT_MODES = {
    "clarification_required",
    "evidence_or_honest_boundary",
    "limited_accounting_answer",
    "ranking_or_limited_accounting_answer",
    "same_inventory_margin_context_or_clarification",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _business_review_is_clean(step_state: dict[str, Any]) -> bool:
|
||||||
|
business_review = step_state.get("business_first_review")
|
||||||
|
if not isinstance(business_review, dict):
|
||||||
|
return True
|
||||||
|
return len(dcl.normalize_string_list(business_review.get("issue_codes"))) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def business_expected_result_mode_matches(expected_result_mode: str, step_state: dict[str, Any]) -> bool:
    """Check a step's observed answer against a business-level expected mode.

    Every recognized mode requires a clean business review (see
    ``_business_review_is_clean``). On top of that:

    * ``clarification_required``: the truth mode or answer shape is
      ``clarification_required``, or the reply is ``partial_coverage`` with
      response type ``LIMITED_WITH_REASON``.
    * ``limited_accounting_answer`` / ``ranking_or_limited_accounting_answer``:
      margin-ranking context, non-empty assistant text and an answer-like
      reply type.
    * ``evidence_or_honest_boundary``: non-empty assistant text and an
      answer-like reply type.
    * ``same_inventory_margin_context_or_clarification``: non-empty assistant
      text, an answer-like reply type, and either margin-ranking context or a
      clarification signal.

    Args:
        expected_result_mode: Mode name taken from the scenario step.
        step_state: Observed state of the executed step; missing or falsy
            fields are treated as empty strings.

    Returns:
        True when the state satisfies the expected mode; False otherwise,
        including for any mode name not listed above.
    """

    def _text(key: str) -> str:
        # Missing/falsy values collapse to "", everything else is stringified and trimmed.
        return str(step_state.get(key) or "").strip()

    reply_type = _text("reply_type")
    response_type = _text("response_type")
    truth_mode = _text("truth_mode")
    answer_shape = _text("answer_shape")
    detected_intent = _text("detected_intent")
    capability_id = _text("capability_id")
    has_answer_text = bool(_text("assistant_text"))

    if not _business_review_is_clean(step_state):
        return False

    # Predicates shared by several modes.
    margin_context = (
        detected_intent == "inventory_margin_ranking_for_nomenclature"
        or capability_id == "inventory_inventory_margin_ranking_for_nomenclature"
    )
    clarification_signal = "clarification_required" in (truth_mode, answer_shape)
    answer_like_reply = reply_type in ("partial_coverage", "factual", "factual_with_explanation")

    if expected_result_mode == "clarification_required":
        if clarification_signal:
            return True
        return reply_type == "partial_coverage" and response_type == "LIMITED_WITH_REASON"

    if expected_result_mode in ("limited_accounting_answer", "ranking_or_limited_accounting_answer"):
        return margin_context and has_answer_text and answer_like_reply

    if expected_result_mode == "evidence_or_honest_boundary":
        return has_answer_text and answer_like_reply

    if expected_result_mode == "same_inventory_margin_context_or_clarification":
        return has_answer_text and (margin_context or clarification_signal) and answer_like_reply

    # Unknown business mode: never a match.
    return False
|
||||||
|
|
||||||
|
|
||||||
def evaluate_truth_step(
|
def evaluate_truth_step(
|
||||||
*,
|
*,
|
||||||
step: dict[str, Any],
|
step: dict[str, Any],
|
||||||
|
|
@ -627,7 +706,24 @@ def evaluate_truth_step(
|
||||||
resolve_nested_placeholders(step.get("expected_result_mode"), step_results, bindings, runtime_bindings) or ""
|
resolve_nested_placeholders(step.get("expected_result_mode"), step_results, bindings, runtime_bindings) or ""
|
||||||
).strip()
|
).strip()
|
||||||
actual_result_mode = str(step_state.get("result_mode") or "").strip()
|
actual_result_mode = str(step_state.get("result_mode") or "").strip()
|
||||||
if expected_result_mode and actual_result_mode and not dcl.identifiers_match(actual_result_mode, expected_result_mode):
|
if expected_result_mode in BUSINESS_EXPECTED_RESULT_MODES:
|
||||||
|
if not business_expected_result_mode_matches(expected_result_mode, step_state):
|
||||||
|
append_finding(
|
||||||
|
findings,
|
||||||
|
step,
|
||||||
|
"wrong_result_mode",
|
||||||
|
"Business answer mode does not match the expected semantic answer contract.",
|
||||||
|
actual={
|
||||||
|
"result_mode": actual_result_mode or None,
|
||||||
|
"reply_type": reply_type or None,
|
||||||
|
"truth_mode": step_state.get("truth_mode"),
|
||||||
|
"answer_shape": step_state.get("answer_shape"),
|
||||||
|
"intent": detected_intent or None,
|
||||||
|
"capability": capability_id or None,
|
||||||
|
},
|
||||||
|
expected=expected_result_mode,
|
||||||
|
)
|
||||||
|
elif expected_result_mode and actual_result_mode and not dcl.identifiers_match(actual_result_mode, expected_result_mode):
|
||||||
append_finding(
|
append_finding(
|
||||||
findings,
|
findings,
|
||||||
step,
|
step,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue