Развить агентную semantic loop систему

This commit is contained in:
dctouch 2026-05-09 21:47:55 +03:00
parent 48c3b5340b
commit f86cb8e886
13 changed files with 3221 additions and 332 deletions

View File

@ -39,7 +39,7 @@ Use these repo-native capture paths:
- import existing technical export: `python scripts/domain_case_loop.py import-export ...` - import existing technical export: `python scripts/domain_case_loop.py import-export ...`
- `run-case` defaults to the repo's live local profile: `local / qwen2.5-14b-instruct-1m / http://127.0.0.1:1234/v1` - `run-case` defaults to the repo's live local profile: `local / qwen2.5-14b-instruct-1m / http://127.0.0.1:1234/v1`
- override with `--llm-provider`, `--llm-model`, `--llm-base-url`, `--llm-api-key` when needed - override with `--llm-provider`, `--llm-model`, `--llm-base-url`, `--llm-api-key` when needed
- `run-pack-loop` defaults to `gpt-5.4` for analyst and `gpt-5.4-mini` for coder; tune with `--analyst-codex-model`, `--coder-codex-model`, `--analyst-reasoning-effort`, `--coder-reasoning-effort` - `run-pack-loop` defaults to `gpt-5.4` for the independent business analyst and `lead-handoff` repair mode; opt into the old autonomous coder loop only with `--repair-mode auto-coder`
## Workflow ## Workflow
@ -77,13 +77,14 @@ In pack mode:
### Autonomous pack-loop mode ### Autonomous pack-loop mode
Use autonomous pack-loop mode when the user wants the system to continue with analyst/coder iterations until the analyst gate is reached or the loop hits a real blocker. Use pack-loop mode when the user wants the system to run live replay, produce a strong business-first analyst verdict, and continue toward repair evidence until the analyst gate is reached or the loop hits a real blocker.
In autonomous pack-loop mode: In autonomous pack-loop mode:
- run `python scripts/domain_case_loop.py run-pack-loop --manifest ...`; - run `python scripts/domain_case_loop.py run-pack-loop --manifest ...`;
- keep each iteration under `artifacts/domain_runs/<loop_id>/iterations/<iteration_id>/`; - keep each iteration under `artifacts/domain_runs/<loop_id>/iterations/<iteration_id>/`;
- read `analyst_verdict.json` before any coder patch; - read `analyst_verdict.json` before any coder patch;
- let coder patch only the highest-value domain targets from the current analyst verdict; - by default, stop after the analyst verdict with `business_audit.md` and `lead_coder_handoff.md` so Lead Codex repairs code in the main context;
- let an autonomous coder patch only when `--repair-mode auto-coder` is explicitly selected, and only against the highest-value domain targets from the current analyst verdict;
- stop only on `accepted`, `blocked`, explicit `requires_user_decision = true`, or `max_iterations`; - stop only on `accepted`, `blocked`, explicit `requires_user_decision = true`, or `max_iterations`;
- do not stop just because the analyst returns `needs_exact_capability` or `partial` if autonomous domain enablement work still remains. - do not stop just because the analyst returns `needs_exact_capability` or `partial` if autonomous domain enablement work still remains.
- treat `quality score >= 80` as the target gate, not as permission to keep pushing through hard blockers, missing essential observations, or unsafe fixes. - treat `quality score >= 80` as the target gate, not as permission to keep pushing through hard blockers, missing essential observations, or unsafe fixes.

View File

@ -3,7 +3,7 @@
"pack_id": "agentic_semantic_development_loop_stage_pack", "pack_id": "agentic_semantic_development_loop_stage_pack",
"domain": "agentic_semantic_development_loop_control", "domain": "agentic_semantic_development_loop_control",
"title": "Agentic semantic development loop control pack", "title": "Agentic semantic development loop control pack",
"description": "Compact stage pack for dogfooding the agentic development loop against business-overview, VAT, stale-scope, and legacy-canary questions.", "description": "Stage pack for dogfooding the agentic development loop against business overview, VAT, stale scope, counterparty pivots, legacy route canaries, and answer-shape quality.",
"analysis_context": { "analysis_context": {
"as_of_date": "2026-05-09", "as_of_date": "2026-05-09",
"source": "agentic_semantic_development_loop_stage_pack" "source": "agentic_semantic_development_loop_stage_pack"
@ -15,15 +15,17 @@
}, },
"scenarios": [ "scenarios": [
{ {
"scenario_id": "agentic_loop_business_overview_control", "scenario_id": "biz_scope",
"title": "Business overview and stale-scope control", "title": "Business overview and stale-scope control",
"description": "Checks direct business-answer shape, period carryover, all-time reset, VAT boundary, and organization scope hygiene.", "description": "Checks direct business-answer shape, period carryover, all-time reset, VAT boundary, and organization scope hygiene.",
"steps": [ "steps": [
{ {
"step_id": "step_01_business_overview", "step_id": "s01_biz",
"title": "Business overview for explicit period", "title": "Business overview for explicit period",
"node_role": "root",
"question": "Дай взрослый бизнес-обзор {{bindings.main_organization}} за {{bindings.control_year}} год по данным 1С: обороты, входящие и исходящие деньги, нетто, НДС, долги, склад, клиенты, поставщики и что пока нельзя утверждать.", "question": "Дай взрослый бизнес-обзор {{bindings.main_organization}} за {{bindings.control_year}} год по данным 1С: обороты, входящие и исходящие деньги, нетто, НДС, долги, склад, клиенты, поставщики и что пока нельзя утверждать.",
"expected_intents": ["business_overview"], "expected_intents": ["business_overview"],
"semantic_tags": ["business_overview", "money", "vat", "debt", "inventory", "scope_guard"],
"required_answer_shape": "direct_answer_first", "required_answer_shape": "direct_answer_first",
"forbidden_answer_patterns": [ "forbidden_answer_patterns": [
"(?i)capability_id", "(?i)capability_id",
@ -33,45 +35,51 @@
] ]
}, },
{ {
"step_id": "step_02_money_followup", "step_id": "s02_money",
"title": "Money follow-up", "title": "Money follow-up",
"question": "Раскрой деньги подробнее: сколько получили, сколько заплатили, какой чистый денежный поток, кто главный клиент и главный поставщик в {{bindings.control_year}}.", "question": "Раскрой деньги подробнее: сколько получили, сколько заплатили, какой чистый денежный поток, кто главный клиент и главный поставщик в {{bindings.control_year}}.",
"depends_on": ["step_01_business_overview"], "depends_on": ["s01_biz"],
"semantic_tags": ["money", "counterparty"],
"required_answer_shape": "direct_answer_first" "required_answer_shape": "direct_answer_first"
}, },
{ {
"step_id": "step_03_best_year_all_time", "step_id": "s03_best_year",
"title": "All-time best operating-flow year", "title": "All-time best operating-flow year",
"question": "А если смотреть за все доступное время, какой самый доходный год по подтвержденным оборотам и почему? Не называй это бухгалтерской прибылью, если чистой прибыли нет.", "question": "А если смотреть за все доступное время, какой самый доходный год по подтвержденным оборотам и почему? Не называй это бухгалтерской прибылью, если чистой прибыли нет.",
"depends_on": ["step_02_money_followup"], "depends_on": ["s02_money"],
"semantic_tags": ["money", "scope_guard"],
"required_answer_shape": "direct_answer_first" "required_answer_shape": "direct_answer_first"
}, },
{ {
"step_id": "step_04_vat_explicit_period", "step_id": "s04_vat",
"title": "VAT explicit period", "title": "VAT explicit period",
"question": "Что с НДС за {{bindings.control_year}} год по {{bindings.main_organization}}: какая позиция видна, на чем она основана и чего не хватает для налогового вывода?", "question": "Что с НДС за {{bindings.control_year}} год по {{bindings.main_organization}}: какая позиция видна, на чем она основана и чего не хватает для налогового вывода?",
"depends_on": ["step_03_best_year_all_time"], "depends_on": ["s03_best_year"],
"semantic_tags": ["vat", "documents", "scope_guard"],
"required_answer_shape": "direct_answer_first" "required_answer_shape": "direct_answer_first"
}, },
{ {
"step_id": "step_05_all_time_no_vat_carryover", "step_id": "s05_all_time",
"title": "All-time reset without stale VAT carryover", "title": "All-time reset without stale VAT carryover",
"question": "Теперь за все доступное время дай обзор компании в целом, но не тащи НДС за {{bindings.control_year}} как подтвержденную общую налоговую позицию.", "question": "Теперь за все доступное время дай обзор компании в целом, но не тащи НДС за {{bindings.control_year}} как подтвержденную общую налоговую позицию.",
"depends_on": ["step_04_vat_explicit_period"], "depends_on": ["s04_vat"],
"semantic_tags": ["business_overview", "vat", "scope_guard"],
"required_answer_shape": "direct_answer_first" "required_answer_shape": "direct_answer_first"
} }
] ]
}, },
{ {
"scenario_id": "agentic_loop_counterparty_pivot_control", "scenario_id": "svk_pivot",
"title": "Counterparty pivot and legacy canaries", "title": "Counterparty pivot and legacy canaries",
"description": "Checks explicit counterparty arbitration after organization context and keeps technical/debug details out of the final answer.", "description": "Checks explicit counterparty arbitration after organization context and keeps technical/debug details out of the final answer.",
"steps": [ "steps": [
{ {
"step_id": "step_01_svk_money", "step_id": "s01_svk_money",
"title": "Explicit counterparty money flow", "title": "Explicit counterparty money flow",
"question": "Теперь отдельно по контрагенту {{bindings.svk_counterparty}}: сколько денег прошло, что входящее, что исходящее и есть ли документы или движения, на которых это основано?", "node_role": "root",
"question": "Отдельно по контрагенту {{bindings.svk_counterparty}}, без опоры на прошлый диалог: сколько денег прошло, что входящее, что исходящее и есть ли документы или движения, на которых это основано?",
"expected_intents": ["value_flow"], "expected_intents": ["value_flow"],
"semantic_tags": ["counterparty", "money", "documents", "scope_guard"],
"required_answer_shape": "direct_answer_first", "required_answer_shape": "direct_answer_first",
"forbidden_answer_patterns": [ "forbidden_answer_patterns": [
"(?i)capability_id", "(?i)capability_id",
@ -81,17 +89,66 @@
] ]
}, },
{ {
"step_id": "step_02_svk_documents", "step_id": "s02_svk_docs",
"title": "Counterparty documents follow-up", "title": "Counterparty documents follow-up",
"question": "Покажи документы по этой цепочке и не смешивай {{bindings.svk_counterparty}} с организацией {{bindings.main_organization}}.", "question": "Покажи документы по этой цепочке и не смешивай {{bindings.svk_counterparty}} с организацией {{bindings.main_organization}}.",
"depends_on": ["step_01_svk_money"], "depends_on": ["s01_svk_money"],
"semantic_tags": ["counterparty", "documents", "scope_guard"],
"required_answer_shape": "direct_answer_first" "required_answer_shape": "direct_answer_first"
}, },
{ {
"step_id": "step_03_final_summary", "step_id": "s03_summary",
"title": "Final executive summary", "title": "Final executive summary",
"question": "Собери короткий итог: что мы подтвердили по компании, что отдельно по {{bindings.svk_counterparty}}, какие выводы можно делать и какие нельзя.", "question": "Собери короткий итог: что мы подтвердили по компании, что отдельно по {{bindings.svk_counterparty}}, какие выводы можно делать и какие нельзя.",
"depends_on": ["step_01_svk_money", "step_02_svk_documents"], "depends_on": ["s01_svk_money", "s02_svk_docs"],
"semantic_tags": ["business_overview", "counterparty", "scope_guard"],
"required_answer_shape": "direct_answer_first",
"required_answer_patterns_all": [
"СВК",
"компан"
]
}
]
},
{
"scenario_id": "legacy_canaries",
"title": "Legacy route canaries and context interruptions",
"description": "Keeps old deterministic routes and conversational interruptions in the stage pack so new agentic loop wiring does not hide regressions.",
"steps": [
{
"step_id": "s01_memory",
"title": "Memory checkpoint after prior business context",
"node_role": "root",
"question": "Сделай короткий стартовый чек контекста: есть ли уже выбранная компания или контрагент в текущем диалоге; если нет, скажи честно и не выдумывай память про {{bindings.svk_counterparty}}.",
"semantic_tags": ["memory", "business_overview", "counterparty", "scope_guard"],
"required_answer_shape": "direct_answer_first",
"forbidden_answer_patterns": [
"(?i)capability_id",
"(?i)runtime_"
]
},
{
"step_id": "s02_acc60",
"title": "Account 60 tail legacy canary",
"question": "Покажи хвосты по счету 60 на август {{bindings.control_year}} по {{bindings.main_organization}}; если точных данных нет, скажи это прямо и не подменяй ответ общим обзором.",
"depends_on": ["s01_memory"],
"semantic_tags": ["debt", "documents", "scope_guard"],
"required_answer_shape": "direct_answer_first"
},
{
"step_id": "s03_stock",
"title": "Inventory route canary",
"question": "Что было на складе на март 2021 по доступным данным? Дай прямой ответ и не уводи его в контрагента {{bindings.svk_counterparty}}.",
"depends_on": ["s02_acc60"],
"semantic_tags": ["inventory", "scope_guard"],
"required_answer_shape": "direct_answer_first"
},
{
"step_id": "s04_all_money",
"title": "All-money answer without counterparty leakage",
"question": "Вернись к {{bindings.main_organization}}: сколько всего денег получили и заплатили по всем подтвержденным данным, но не смешивай это с отдельной цепочкой {{bindings.svk_counterparty}} и не называй оборот чистой прибылью.",
"depends_on": ["s03_stock"],
"semantic_tags": ["money", "business_overview", "counterparty", "scope_guard"],
"required_answer_shape": "direct_answer_first" "required_answer_shape": "direct_answer_first"
} }
] ]

View File

@ -6,7 +6,7 @@ This repository now supports two outer-loop capture modes:
- `run-case` for one concrete domain question; - `run-case` for one concrete domain question;
- `run-scenario` for a linked multi-step domain chain that should reuse one assistant session. - `run-scenario` for a linked multi-step domain chain that should reuse one assistant session.
- `run-pack` for a whole domain question pool grouped into several scenarios. - `run-pack` for a whole domain question pool grouped into several scenarios.
- `run-pack-loop` for an autonomous analyst/coder loop over a whole domain pack. - `run-pack-loop` for a strong analyst review loop over a whole domain pack, with Lead Codex repair handoff by default.
`run-scenario` is the preferred capture mode for domains where the user's next question depends on the previous result set. `run-scenario` is the preferred capture mode for domains where the user's next question depends on the previous result set.
`run-pack` is the preferred capture mode when the user brings a full domain pool that should be kept in one aggregate backlog. `run-pack` is the preferred capture mode when the user brings a full domain pool that should be kept in one aggregate backlog.
@ -80,7 +80,7 @@ That path is explicitly marked as unvalidated and must not be treated as semanti
1. take the current global/local stage manifest; 1. take the current global/local stage manifest;
2. run `scripts/domain_case_loop.py run-pack-loop` for that stage pack; 2. run `scripts/domain_case_loop.py run-pack-loop` for that stage pack;
3. let the loop iterate through pack replay, business-first analyst verdict, coder patch, and rerun until the objective gate is accepted, blocked, or a real user decision is required; 3. let the loop run pack replay and a business-first analyst verdict; if the gate is not accepted, write `business_audit.md` and `lead_coder_handoff.md` instead of launching a weak coder by default;
4. if accepted, persist the validated AGENT pack into GUI autoruns through `scripts/save_agent_semantic_run.py --validated-run-dir`; 4. if accepted, persist the validated AGENT pack into GUI autoruns through `scripts/save_agent_semantic_run.py --validated-run-dir`;
5. write `stage_loop_summary.json` and `stage_loop_handoff.md` for the final human visual confirmation. 5. write `stage_loop_summary.json` and `stage_loop_handoff.md` for the final human visual confirmation.
@ -92,6 +92,7 @@ Canonical commands:
```powershell ```powershell
python scripts/stage_agent_loop.py plan --manifest docs/orchestration/<stage_loop>.json python scripts/stage_agent_loop.py plan --manifest docs/orchestration/<stage_loop>.json
python scripts/stage_agent_loop.py run --manifest docs/orchestration/<stage_loop>.json python scripts/stage_agent_loop.py run --manifest docs/orchestration/<stage_loop>.json
python scripts/stage_agent_loop.py review-questions --manifest docs/orchestration/<stage_loop>.json
python scripts/stage_agent_loop.py ingest-gui-run --manifest docs/orchestration/<stage_loop>.json --run-id assistant-stage1-<id> python scripts/stage_agent_loop.py ingest-gui-run --manifest docs/orchestration/<stage_loop>.json --run-id assistant-stage1-<id>
python scripts/stage_agent_loop.py prepare-repair --manifest docs/orchestration/<stage_loop>.json python scripts/stage_agent_loop.py prepare-repair --manifest docs/orchestration/<stage_loop>.json
python scripts/stage_agent_loop.py run-repair --manifest docs/orchestration/<stage_loop>.json --dry-run python scripts/stage_agent_loop.py run-repair --manifest docs/orchestration/<stage_loop>.json --dry-run
@ -100,7 +101,28 @@ python scripts/stage_agent_loop.py continue --manifest docs/orchestration/<stage
python scripts/stage_agent_loop.py summarize --manifest docs/orchestration/<stage_loop>.json python scripts/stage_agent_loop.py summarize --manifest docs/orchestration/<stage_loop>.json
``` ```
This is the intended path for “implement the stage, generate/check stage questions, analyze business answers, patch code, rerun, then ask the user for final visual confirmation”. This is the intended path for "implement the stage, generate/check stage questions, analyze business answers, patch code, rerun, then ask the user for final visual confirmation".
The default repair mode is `lead-handoff`. In this mode the expensive replay still runs live and the independent analyst still produces the strict business verdict, but code repair stays with the main Lead Codex context. The loop stops with `next_action = lead_coder_repair_required`, plus:
- `business_audit.md` for the user-facing semantic/business verdict;
- `lead_coder_handoff.md/json` for the concrete repair target, candidate files, and validation path;
- `stage_context_capsule.md/json` for the current stage contract, question quality, loop status, and operating model.
`auto-coder` remains available only as an explicit opt-in experiment:
```powershell
python scripts/stage_agent_loop.py run --manifest docs/orchestration/<stage_loop>.json --repair-mode auto-coder
```
That path must not be treated as the normal high-trust repair mode for this project.
Before launching an expensive live replay, run `review-questions`. It reads the stage pack, resolves `{{bindings.*}}` placeholders, checks scenario/follow-up density, direct-answer shape declarations, domain coverage, stale-scope canaries, dependency order, duplicates, mojibake in generated Russian questions, and estimated Windows artifact path length. It writes:
- `question_generation_review.json`;
- `question_generation_review.md`.
A strong question review is not semantic proof that the assistant answers correctly. It is the pre-flight gate that says the generated questions are worth spending a live replay on.
## GUI run review bridge ## GUI run review bridge
@ -128,7 +150,7 @@ This bridge is intentionally business-first:
- noisy direct answers, missing first-line answers, technical garbage, and over-broad business answers become findings; - noisy direct answers, missing first-line answers, technical garbage, and over-broad business answers become findings;
- generated question packs get a deterministic quality review for follow-up density, direct questions, report-style analysis, domain diversity, duplicates, and weak business anchors. - generated question packs get a deterministic quality review for follow-up density, direct questions, report-style analysis, domain diversity, duplicates, and weak business anchors.
Use this bridge when the operator would otherwise say “чекни прогон `assistant-stage1-...`. The expected next step is no longer manual eyeballing first; it is: review by id, inspect `run_review.md`, map `repair_targets.json` into the current stage loop, patch, and rerun. Use this bridge when the operator would otherwise say "чекни прогон `assistant-stage1-...`". The expected next step is no longer manual eyeballing first; it is: review by id, inspect `run_review.md`, map `repair_targets.json` into the current stage loop, patch, and rerun.
For stage work, prefer the integrated command: For stage work, prefer the integrated command:
@ -149,6 +171,8 @@ Use `python scripts/stage_agent_loop.py continue --manifest docs/orchestration/<
It also writes `stage_repair_handoff.md/json` next to the stage summary. That handoff is the preferred input for the next coder pass: it lists primary repair targets and sample user-facing failures without forcing the coder to reread the entire GUI conversation first. It also writes `stage_repair_handoff.md/json` next to the stage summary. That handoff is the preferred input for the next coder pass: it lists primary repair targets and sample user-facing failures without forcing the coder to reread the entire GUI conversation first.
For live stage-pack failures, prefer `lead_coder_handoff.md` over immediately preparing a coder pass. The intent is: strong business audit first, Lead Codex code repair second, same replay/GUI validation third.
To prepare the next repair iteration from that handoff, run: To prepare the next repair iteration from that handoff, run:
```powershell ```powershell

View File

@ -4,7 +4,7 @@
"module_name": "Agentic Semantic Development Loop", "module_name": "Agentic Semantic Development Loop",
"title": "Agentic semantic development loop dogfood gate", "title": "Agentic semantic development loop dogfood gate",
"architecture_phase": "turnaround_11_agentic_semantic_development_loop", "architecture_phase": "turnaround_11_agentic_semantic_development_loop",
"agent_focus": "Automate stage implementation, semantic replay review, repair handoff, coder pass, rerun validation, and final human confirmation.", "agent_focus": "Automate stage question review, live semantic replay, strong business audit, Lead Codex repair handoff, rerun validation, and final human confirmation.",
"current_stage_status": "active_dogfood", "current_stage_status": "active_dogfood",
"global_plan_refs": [ "global_plan_refs": [
"docs/orchestration/domain_scenario_loop_repo_adapter.md", "docs/orchestration/domain_scenario_loop_repo_adapter.md",
@ -12,12 +12,16 @@
"AGENTS.md codex_domain_loop and agent_semantic_runs" "AGENTS.md codex_domain_loop and agent_semantic_runs"
], ],
"pack_manifest": "docs/orchestration/agentic_semantic_development_loop_stage_pack.json", "pack_manifest": "docs/orchestration/agentic_semantic_development_loop_stage_pack.json",
"loop_id": "agentic_semantic_development_loop", "loop_id": "asl",
"artifact_path_warning_limit": 240,
"target_score": 88, "target_score": 88,
"max_iterations": 6, "max_iterations": 6,
"repair_mode": "lead-handoff",
"acceptance_invariants": [ "acceptance_invariants": [
"status command exposes next_action, repair state, validation state, and closing gate", "status command exposes next_action, repair state, validation state, and closing gate",
"run-pack-loop defaults to Lead Codex handoff instead of weak autonomous coding",
"continue command never runs the real coder pass without --execute-repair", "continue command never runs the real coder pass without --execute-repair",
"business_audit.md and lead_coder_handoff.md are produced before code repair when semantic replay is not accepted",
"patched repair cannot close the stage without successful rerun/ingest validation", "patched repair cannot close the stage without successful rerun/ingest validation",
"business answers remain direct, context-aware, and free of internal route/debug ids", "business answers remain direct, context-aware, and free of internal route/debug ids",
"manual GUI confirmation remains required after accepted semantic replay" "manual GUI confirmation remains required after accepted semantic replay"

File diff suppressed because it is too large Load Diff

View File

@ -387,6 +387,10 @@ def evaluate_truth_step(
assistant_text = str(step_state.get("assistant_text") or "") assistant_text = str(step_state.get("assistant_text") or "")
direct_answer = str(step_state.get("actual_direct_answer") or "").strip() direct_answer = str(step_state.get("actual_direct_answer") or "").strip()
detected_intent = str(step_state.get("detected_intent") or "").strip() detected_intent = str(step_state.get("detected_intent") or "").strip()
effective_intents = [
detected_intent,
*dcl.normalize_string_list(step_state.get("mcp_discovery_effective_intents")),
]
selected_recipe = str(step_state.get("selected_recipe") or "").strip() selected_recipe = str(step_state.get("selected_recipe") or "").strip()
capability_id = str(step_state.get("capability_id") or "").strip() capability_id = str(step_state.get("capability_id") or "").strip()
catalog_alignment_status = str(step_state.get("mcp_discovery_catalog_chain_alignment_status") or "").strip() catalog_alignment_status = str(step_state.get("mcp_discovery_catalog_chain_alignment_status") or "").strip()
@ -508,13 +512,13 @@ def evaluate_truth_step(
expected_intents = dcl.normalize_string_list( expected_intents = dcl.normalize_string_list(
resolve_nested_placeholders(step.get("expected_intents") or [], step_results, bindings, runtime_bindings) resolve_nested_placeholders(step.get("expected_intents") or [], step_results, bindings, runtime_bindings)
) )
if expected_intents and not dcl.identifier_in_list(detected_intent, expected_intents): if expected_intents and not any(dcl.identifier_in_list(intent, expected_intents) for intent in effective_intents if intent):
append_finding( append_finding(
findings, findings,
step, step,
"wrong_intent", "wrong_intent",
"Интент не соответствует ожидаемому бизнес-смыслу шага.", "Интент не соответствует ожидаемому бизнес-смыслу шага.",
actual=detected_intent or None, actual=effective_intents,
expected=expected_intents, expected=expected_intents,
) )

View File

@ -15,6 +15,7 @@ HISTORY_FILE = REPO_ROOT / "llm_normalizer" / "data" / "autorun_generators" / "h
SAVED_SESSIONS_DIR = REPO_ROOT / "llm_normalizer" / "data" / "autorun_generators" / "saved_sessions" SAVED_SESSIONS_DIR = REPO_ROOT / "llm_normalizer" / "data" / "autorun_generators" / "saved_sessions"
EVAL_CASES_DIR = REPO_ROOT / "llm_normalizer" / "data" / "eval_cases" EVAL_CASES_DIR = REPO_ROOT / "llm_normalizer" / "data" / "eval_cases"
VALIDATED_AGENT_SAVE_SCHEMA_VERSION = "agent_semantic_save_gate_v1" VALIDATED_AGENT_SAVE_SCHEMA_VERSION = "agent_semantic_save_gate_v1"
# Matches ``{{ bindings.<key> }}`` placeholders (optional whitespace inside the
# braces); group 1 captures the key, limited to letters, digits, "_" and "-".
BINDING_TOKEN_RE = re.compile(r"\{\{\s*bindings\.([A-Za-z0-9_-]+)\s*\}\}")
def now_utc() -> datetime: def now_utc() -> datetime:
@ -39,6 +40,36 @@ def sanitize_question(value: Any) -> str:
return text return text
def normalize_bindings(raw_bindings: Any) -> dict[str, str]:
    """Coerce a raw bindings mapping into a clean ``str -> str`` dictionary.

    Args:
        raw_bindings: Candidate bindings object, typically parsed from JSON.
            Anything that is not a ``dict`` is treated as "no bindings".

    Returns:
        A new dict whose keys and values are stripped strings. Entries whose
        key or value is empty after stripping are omitted.
    """
    if not isinstance(raw_bindings, dict):
        return {}
    result: dict[str, str] = {}
    for key, value in raw_bindings.items():
        normalized_key = str(key or "").strip()
        # Only ``None`` means "no value". A plain ``value or ""`` would also
        # discard real falsy bindings such as the number 0, leaving their
        # placeholders unresolved.
        normalized_value = "" if value is None else str(value).strip()
        if normalized_key and normalized_value:
            result[normalized_key] = normalized_value
    return result
def merge_bindings(*binding_sets: Any) -> dict[str, str]:
    """Layer several binding sources into one normalized dictionary.

    Each positional argument is passed through ``normalize_bindings``; sources
    are applied left to right, so a later source overrides an earlier one for
    the same key.
    """
    combined: dict[str, str] = {}
    for candidate in binding_sets:
        for name, text in normalize_bindings(candidate).items():
            combined[name] = text
    return combined
def render_question_template(value: Any, bindings: dict[str, str]) -> str:
    """Substitute ``{{bindings.<key>}}`` placeholders in a question.

    The raw value is passed through ``sanitize_question`` before substitution
    and the rendered text is passed through it again afterwards. A placeholder
    whose key has no entry in ``bindings`` is left in the text unchanged.
    """

    def _substitute(token: re.Match[str]) -> str:
        resolved = bindings.get(token.group(1))
        if resolved is None:
            # Unknown key: keep the original placeholder text verbatim.
            return token.group(0)
        return resolved

    template = sanitize_question(value)
    rendered = BINDING_TOKEN_RE.sub(_substitute, template)
    return sanitize_question(rendered)
def ensure_agent_title(title: str) -> str: def ensure_agent_title(title: str) -> str:
normalized = title.strip() normalized = title.strip()
if not normalized: if not normalized:
@ -237,11 +268,11 @@ def build_save_gate_metadata(args: argparse.Namespace, spec: dict[str, Any], spe
) )
def normalize_questions(raw_questions: list[Any]) -> list[str]: def normalize_questions(raw_questions: list[Any], bindings: dict[str, str] | None = None) -> list[str]:
result: list[str] = [] result: list[str] = []
seen: set[str] = set() seen: set[str] = set()
for item in raw_questions: for item in raw_questions:
question = sanitize_question(item) question = render_question_template(item, bindings or {})
if not question or question in seen: if not question or question in seen:
continue continue
seen.add(question) seen.add(question)
@ -250,50 +281,84 @@ def normalize_questions(raw_questions: list[Any]) -> list[str]:
def extract_semantic_tags(spec: dict[str, Any]) -> list[str]: def extract_semantic_tags(spec: dict[str, Any]) -> list[str]:
steps = spec.get("steps")
if not isinstance(steps, list):
return []
tags: set[str] = set() tags: set[str] = set()
for step in steps: step_groups: list[Any] = []
if not isinstance(step, dict): steps = spec.get("steps")
continue if isinstance(steps, list):
raw_tags = step.get("semantic_tags") step_groups.append(steps)
if not isinstance(raw_tags, list): scenarios = spec.get("scenarios")
continue if isinstance(scenarios, list):
for raw_tag in raw_tags: for scenario in scenarios:
tag = str(raw_tag or "").strip() if isinstance(scenario, dict) and isinstance(scenario.get("steps"), list):
if tag: step_groups.append(scenario["steps"])
tags.add(tag) for step_group in step_groups:
for step in step_group:
if not isinstance(step, dict):
continue
raw_tags = step.get("semantic_tags")
if not isinstance(raw_tags, list):
continue
for raw_tag in raw_tags:
tag = str(raw_tag or "").strip()
if tag:
tags.add(tag)
return sorted(tags) return sorted(tags)
def assert_no_unresolved_bindings(questions: list[str]) -> None:
    """Raise if any question still contains a ``{{bindings.*}}`` placeholder.

    Raises:
        RuntimeError: When at least one question matches ``BINDING_TOKEN_RE``;
            the message quotes the first such question.
    """
    offenders = list(filter(BINDING_TOKEN_RE.search, questions))
    if not offenders:
        return
    first_offender = offenders[0]
    raise RuntimeError(
        "Refusing to save AGENT autorun with unresolved bindings in questions. "
        f"First unresolved question: {first_offender}"
    )
def extract_questions_from_spec(spec: dict[str, Any]) -> list[str]: def extract_questions_from_spec(spec: dict[str, Any]) -> list[str]:
global_bindings = normalize_bindings(spec.get("bindings"))
if isinstance(spec.get("questions"), list): if isinstance(spec.get("questions"), list):
return normalize_questions(list(spec["questions"])) questions = normalize_questions(list(spec["questions"]), global_bindings)
assert_no_unresolved_bindings(questions)
return questions
steps = spec.get("steps") steps = spec.get("steps")
if isinstance(steps, list): if isinstance(steps, list):
return normalize_questions( raw_questions = [
[ step.get("question") or step.get("question_template")
step.get("question") or step.get("question_template") for step in steps
for step in steps if isinstance(step, dict) and (step.get("question") or step.get("question_template"))
if isinstance(step, dict) and (step.get("question") or step.get("question_template")) ]
] questions = normalize_questions(raw_questions, global_bindings)
) assert_no_unresolved_bindings(questions)
return questions
scenarios = spec.get("scenarios") scenarios = spec.get("scenarios")
if isinstance(scenarios, list): if isinstance(scenarios, list):
raw_questions: list[Any] = [] questions: list[str] = []
seen: set[str] = set()
for scenario in scenarios: for scenario in scenarios:
if not isinstance(scenario, dict): if not isinstance(scenario, dict):
continue continue
scenario_steps = scenario.get("steps") scenario_steps = scenario.get("steps")
if not isinstance(scenario_steps, list): if not isinstance(scenario_steps, list):
continue continue
raw_questions.extend( scenario_bindings = merge_bindings(global_bindings, scenario.get("bindings"))
step.get("question") or step.get("question_template") for step in scenario_steps:
for step in scenario_steps if not isinstance(step, dict):
if isinstance(step, dict) and (step.get("question") or step.get("question_template")) continue
) raw_question = step.get("question") or step.get("question_template")
return normalize_questions(raw_questions) if not raw_question:
continue
step_bindings = merge_bindings(scenario_bindings, step.get("bindings"))
question = render_question_template(raw_question, step_bindings)
if not question or question in seen:
continue
seen.add(question)
questions.append(question)
assert_no_unresolved_bindings(questions)
return questions
raise RuntimeError( raise RuntimeError(
"Spec must define `questions[]`, `steps[].question`, `steps[].question_template`, " "Spec must define `questions[]`, `steps[].question`, `steps[].question_template`, "
"or `scenarios[].steps[]` questions" "or `scenarios[].steps[]` questions"

View File

@ -212,6 +212,10 @@ def build_scenario_acceptance_matrix(
"mcp_discovery_catalog_chain_alignment_status": step_state.get("mcp_discovery_catalog_chain_alignment_status"), "mcp_discovery_catalog_chain_alignment_status": step_state.get("mcp_discovery_catalog_chain_alignment_status"),
"mcp_discovery_catalog_chain_top_match": step_state.get("mcp_discovery_catalog_chain_top_match"), "mcp_discovery_catalog_chain_top_match": step_state.get("mcp_discovery_catalog_chain_top_match"),
"mcp_discovery_catalog_chain_selected_matches_top": step_state.get("mcp_discovery_catalog_chain_selected_matches_top"), "mcp_discovery_catalog_chain_selected_matches_top": step_state.get("mcp_discovery_catalog_chain_selected_matches_top"),
"mcp_discovery_response_applied": step_state.get("mcp_discovery_response_applied"),
"mcp_discovery_selected_chain_id": step_state.get("mcp_discovery_selected_chain_id"),
"mcp_discovery_response_candidate_status": step_state.get("mcp_discovery_response_candidate_status"),
"mcp_discovery_effective_intents": step_state.get("mcp_discovery_effective_intents"),
"selected_object_step": _has_selected_object_signal(step), "selected_object_step": _has_selected_object_signal(step),
"meta_context_step": _has_meta_context_signal(step), "meta_context_step": _has_meta_context_signal(step),
"highest_unresolved_priority": highest_priority, "highest_unresolved_priority": highest_priority,

View File

@ -6,6 +6,7 @@ import json
import re import re
import subprocess import subprocess
import sys import sys
from collections import Counter
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@ -19,6 +20,71 @@ DEFAULT_STAGE_OUTPUT_ROOT = REPO_ROOT / "artifacts" / "domain_runs" / "stage_age
DEFAULT_REPAIR_CODER_SCHEMA = REPO_ROOT / "docs" / "orchestration" / "schemas" / "domain_loop_coder_result.schema.json" DEFAULT_REPAIR_CODER_SCHEMA = REPO_ROOT / "docs" / "orchestration" / "schemas" / "domain_loop_coder_result.schema.json"
STAGE_LOOP_SCHEMA_VERSION = "stage_agent_loop_manifest_v1" STAGE_LOOP_SCHEMA_VERSION = "stage_agent_loop_manifest_v1"
STAGE_SUMMARY_SCHEMA_VERSION = "stage_agent_loop_summary_v1" STAGE_SUMMARY_SCHEMA_VERSION = "stage_agent_loop_summary_v1"
# Schema identifiers stamped into the question-generation review and the
# stage context capsule payloads.
STAGE_QUESTION_REVIEW_SCHEMA_VERSION = "stage_question_generation_review_v1"
STAGE_CONTEXT_CAPSULE_SCHEMA_VERSION = "stage_context_capsule_v1"
# Domain name -> lowercase Russian substrings. A step is assigned a domain
# when any of its markers occurs in the normalized question/title/intents/tags
# text (see classify_stage_pack_step). Several entries are word stems so that
# inflected forms match by substring.
STAGE_QUESTION_DOMAIN_MARKERS: dict[str, tuple[str, ...]] = {
    "business_overview": ("бизнес-обзор", "обзор компании", "компании в целом", "взрослый", "вывод"),
    "money": ("деньг", "получили", "заплатили", "денежн", "поток", "доходн", "оборот", "выруч"),
    "vat": ("ндс", "налог", "налогов"),
    "counterparty": ("контрагент", "группа свк", "свк", "клиент", "поставщик"),
    "documents": ("документ", "движен", "цепочк", "основан"),
    "inventory": ("склад", "товар", "остат", "номенклатур"),
    "debt": ("долг", "долж", "счет 60", "счёт 60", "хвост"),
    "scope_guard": ("не смешивай", "не тащи", "не называй", "что нельзя", "чего не хватает"),
    "memory": ("напомни", "уже выяснил", "контекст", "вернись"),
}
# Substrings that mark a question as a follow-up relying on earlier turns.
STAGE_QUESTION_FOLLOWUP_MARKERS = (
    "теперь",
    "а если",
    "по этой",
    "по этому",
    "по ней",
    "по нему",
    "вернись",
    "собери",
    "напомни",
    "дальше",
)
# Substrings that mark a question as a report/analysis style request.
STAGE_QUESTION_REPORT_MARKERS = (
    "обзор",
    "анализ",
    "вывод",
    "итог",
    "оцен",
    "что можно",
    "что нельзя",
)
# Two-character sequences that each start with U+0420 or U+0421.
# NOTE(review): these look like UTF-8 encoded Cyrillic lowercase letters that
# were decoded as Windows-1251 — confirm the intended source encoding.
# looks_like_mojibake() requires at least two distinct markers to match.
STAGE_QUESTION_MOJIBAKE_MARKERS = (
    "\u0420\u00b0",
    "\u0420\u00b1",
    "\u0420\u0406",
    "\u0420\u0456",
    "\u0420\u0491",
    "\u0420\u00b5",
    "\u0420\u0451",
    "\u0420\u2116",
    "\u0420\u0454",
    "\u0420\u00bb",
    "\u0420\u0458",
    "\u0420\u0405",
    "\u0420\u0455",
    "\u0420\u0457",
    "\u0421\u0402",
    "\u0421\u0403",
    "\u0421\u201a",
    "\u0421\u0453",
    "\u0421\u201e",
    "\u0421\u2026",
    "\u0421\u2020",
    "\u0421\u2021",
    "\u0421\u20ac",
    "\u0421\u2030",
    "\u0421\u040a",
    "\u0421\u2039",
    "\u0421\u040b",
    "\u0421\u040f",
)
def now_iso() -> str: def now_iso() -> str:
@ -75,6 +141,517 @@ def string_list(value: Any) -> list[str]:
return result return result
def normalize_review_text(value: Any) -> str:
    """Return *value* as trimmed, lowercase text with whitespace runs collapsed.

    Falsy inputs (``None``, ``""``, ``0``) normalize to the empty string.
    """
    trimmed_lower = str(value or "").strip().lower()
    # Any run of whitespace (spaces, tabs, newlines) becomes a single space.
    return re.sub(r"\s+", " ", trimmed_lower)
def has_any_marker(text: str, markers: tuple[str, ...]) -> bool:
    """Report whether any of *markers* occurs as a substring of normalized *text*.

    *text* is passed through ``normalize_review_text`` first; the markers are
    compared as given.
    """
    haystack = normalize_review_text(text)
    for marker in markers:
        if marker in haystack:
            return True
    return False
def looks_like_mojibake(value: Any) -> bool:
    """Return True when at least two distinct mojibake markers occur in *value*.

    Falsy values are treated as the empty string and therefore never match.
    """
    candidate = str(value or "")
    distinct_hits = 0
    for marker in STAGE_QUESTION_MOJIBAKE_MARKERS:
        if marker in candidate:
            distinct_hits += 1
    return distinct_hits >= 2
def resolve_pack_bindings(value: str, bindings: dict[str, Any]) -> str:
    """Substitute ``{{ bindings.<key> }}`` placeholders in *value*.

    A placeholder is replaced by ``str(bindings[key])`` when the key is present
    with a non-``None`` value; otherwise the placeholder text is left as is.
    The result is stripped of surrounding whitespace.
    """

    def substitute(found: re.Match[str]) -> str:
        bound = bindings.get(found.group(1))
        if bound is None:
            # Unknown (or explicitly null) binding: keep the placeholder visible.
            return found.group(0)
        return str(bound)

    template = str(value or "")
    placeholder = r"\{\{\s*bindings\.([a-zA-Z0-9_.-]+)\s*\}\}"
    return re.sub(placeholder, substitute, template).strip()
def iter_stage_pack_steps(pack_manifest: dict[str, Any]) -> list[dict[str, Any]]:
    """Flatten ``scenarios[].steps[]`` of a pack manifest into review records.

    Each step may be a plain question string or a dict carrying ``question`` /
    ``question_template`` plus optional metadata. Non-dict scenarios and steps
    of any other type are skipped.

    Placeholders in the question are resolved against layered bindings:
    manifest-level ``bindings`` overlaid by the scenario's ``bindings`` and
    then by the step's own ``bindings`` (later layers win).
    NOTE(review): the layering is a plain dict overlay; confirm it matches the
    replay loader's ``merge_bindings`` semantics.

    Returns:
        One dict per step with positional indexes, ids, the raw and resolved
        question text, and the normalized metadata lists.
    """

    def as_mapping(candidate: Any) -> dict[str, Any]:
        return candidate if isinstance(candidate, dict) else {}

    def as_sequence(candidate: Any) -> list[Any]:
        return candidate if isinstance(candidate, list) else []

    global_bindings = as_mapping(pack_manifest.get("bindings"))
    steps: list[dict[str, Any]] = []
    for scenario_index, raw_scenario in enumerate(as_sequence(pack_manifest.get("scenarios")), start=1):
        if not isinstance(raw_scenario, dict):
            continue
        scenario_id = str(raw_scenario.get("scenario_id") or f"scenario_{scenario_index:02d}").strip()
        scenario_title = str(raw_scenario.get("title") or scenario_id).strip()
        scenario_bindings = {**global_bindings, **as_mapping(raw_scenario.get("bindings"))}
        for step_index, raw_step in enumerate(as_sequence(raw_scenario.get("steps")), start=1):
            if isinstance(raw_step, str):
                # Bare-string steps get synthetic id/title and carry no metadata.
                raw_question = raw_step.strip()
                raw_step_object: dict[str, Any] = {
                    "step_id": f"step_{step_index:02d}",
                    "title": f"Step {step_index:02d}",
                    "question": raw_question,
                }
            elif isinstance(raw_step, dict):
                raw_step_object = raw_step
                raw_question = str(raw_step.get("question") or raw_step.get("question_template") or "").strip()
            else:
                continue
            step_bindings = {**scenario_bindings, **as_mapping(raw_step_object.get("bindings"))}
            step_id = str(raw_step_object.get("step_id") or f"step_{step_index:02d}").strip()
            required_shape = str(
                raw_step_object.get("required_answer_shape")
                or raw_step_object.get("expected_answer_shape")
                or ""
            ).strip()
            node_role = str(raw_step_object.get("node_role") or raw_step_object.get("role") or "").strip()
            steps.append(
                {
                    # 1-based position across all scenarios, counting only kept steps.
                    "global_index": len(steps) + 1,
                    "scenario_index": scenario_index,
                    "scenario_id": scenario_id,
                    "scenario_title": scenario_title,
                    "scenario_step_index": step_index,
                    "step_id": step_id,
                    "title": str(raw_step_object.get("title") or step_id).strip() or step_id,
                    "question_template": raw_question,
                    "question_resolved": resolve_pack_bindings(raw_question, step_bindings),
                    "depends_on": string_list(raw_step_object.get("depends_on")),
                    "expected_intents": string_list(
                        raw_step_object.get("expected_intents") or raw_step_object.get("expected_intent")
                    ),
                    "semantic_tags": string_list(raw_step_object.get("semantic_tags")),
                    "required_answer_shape": required_shape or None,
                    "forbidden_answer_patterns": string_list(raw_step_object.get("forbidden_answer_patterns")),
                    "node_role": node_role or None,
                }
            )
    return steps
def classify_stage_pack_step(step: dict[str, Any]) -> dict[str, Any]:
    """Annotate one flattened pack step with domains, tags, and weak flags.

    Returns a copy of *step* extended with ``question`` (resolved text, falling
    back to the template), ``domains``, ``tags``, ``weak_flags`` and
    ``length_chars``.
    """
    question = str(step.get("question_resolved") or step.get("question_template") or "")
    title = str(step.get("title") or "")
    intents = string_list(step.get("expected_intents"))
    sem_tags = string_list(step.get("semantic_tags"))
    searchable = " ".join([question, title, *intents, *sem_tags])

    # A domain applies when declared explicitly or when one of its markers
    # appears in the combined text.
    domains: list[str] = [
        domain
        for domain, markers in STAGE_QUESTION_DOMAIN_MARKERS.items()
        if domain in intents or domain in sem_tags or has_any_marker(searchable, markers)
    ]

    answer_shape = step.get("required_answer_shape")
    is_followup = bool(string_list(step.get("depends_on"))) or has_any_marker(
        question, STAGE_QUESTION_FOLLOWUP_MARKERS
    )
    tags: list[str] = ["contextual_followup" if is_followup else "root_question"]
    if answer_shape:
        tags.append("answer_shape_declared")
    if str(answer_shape or "") == "direct_answer_first":
        tags.append("direct_answer_first_required")
    if step.get("forbidden_answer_patterns"):
        tags.append("canary_or_guarded_question")
    if has_any_marker(question, STAGE_QUESTION_REPORT_MARKERS) or "business_overview" in domains:
        tags.append("report_or_analysis_request")
    if domains:
        tags.append("domain_grounded")

    is_first_in_scenario = int(step.get("scenario_step_index") or 0) == 1
    weak_checks = [
        (
            looks_like_mojibake(question) or looks_like_mojibake(step.get("question_template")),
            "mojibake_question_text",
        ),
        # The first step of a scenario has nothing earlier to refer back to.
        (is_first_in_scenario and is_followup, "root_question_requires_missing_context"),
        (len(question) > 700, "question_too_long"),
        (not domains and not intents and not sem_tags, "low_business_anchor"),
        (not answer_shape, "missing_required_answer_shape"),
    ]
    weak_flags: list[str] = [flag for triggered, flag in weak_checks if triggered]

    classified = dict(step)
    classified["question"] = question
    classified["domains"] = domains
    classified["tags"] = tags
    classified["weak_flags"] = weak_flags
    classified["length_chars"] = len(question)
    return classified
def build_stage_question_recommendations(
    *,
    missing_domains: list[str],
    weak_flag_counts: Counter[str],
    score: int,
) -> list[str]:
    """Translate review findings into human-readable recommendations.

    Args:
        missing_domains: Required control domains with no matching question.
        weak_flag_counts: Flag name -> occurrence count. A ``Counter`` or any
            plain mapping (for example one reloaded from JSON) is accepted;
            absent flags count as zero.
        score: Overall review score.

    Returns:
        Recommendations in a fixed order. When nothing is recommended and
        ``score >= 85`` a single "strong enough" line is returned; otherwise
        the list may be empty.
    """

    def flagged(flag: str) -> bool:
        # .get() instead of indexing so a plain dict does not raise KeyError.
        return bool(weak_flag_counts.get(flag, 0))

    recommendations: list[str] = []
    if flagged("mojibake_question_text"):
        recommendations.append("Repair generated question text to normal UTF-8 Russian before any live replay.")
    if missing_domains:
        recommendations.append("Add questions for missing control domains: " + ", ".join(missing_domains) + ".")
    if flagged("too_few_contextual_followups"):
        recommendations.append("Add more follow-up turns that depend on prior answers and test carryover.")
    if flagged("root_question_requires_missing_context"):
        recommendations.append("Rewrite scenario roots so they are self-contained and do not start as follow-ups.")
    if flagged("missing_required_answer_shape"):
        recommendations.append("Declare required_answer_shape for each business-critical step.")
    if flagged("no_canary_or_guarded_question"):
        recommendations.append("Add forbidden-answer canaries for internal ids, stale scope, and unsupported claims.")
    if flagged("artifact_path_too_long_for_windows"):
        recommendations.append("Shorten loop/scenario/step artifact ids before live replay on Windows.")
    if score >= 85 and not recommendations:
        recommendations.append("Question pack is strong enough for live semantic replay.")
    return recommendations
def build_stage_question_generation_review(stage_manifest: dict[str, Any], stage_dir: Path | None = None) -> dict[str, Any]:
    """Score the stage's question pack before live replay.

    Loads the pack manifest named by ``stage_manifest["pack_manifest"]``,
    classifies every step, aggregates tag/domain/weak-flag counts, checks
    ``depends_on`` references inside each scenario, estimates artifact path
    lengths, and derives a 0-100 score with a ``strong`` /
    ``usable_with_gaps`` / ``weak`` status.

    Args:
        stage_manifest: Stage manifest; ``pack_manifest`` and ``stage_id`` are
            read by key, ``artifact_path_warning_limit`` is optional
            (defaults to 240).
        stage_dir: Stage output directory; when omitted it is derived from
            ``DEFAULT_STAGE_OUTPUT_ROOT`` and the stage id.

    Returns:
        The review payload (schema ``STAGE_QUESTION_REVIEW_SCHEMA_VERSION``)
        including per-question classifications and recommendations.
    """
    pack_manifest_path = repo_path(stage_manifest["pack_manifest"])
    pack_manifest = load_json_object(pack_manifest_path, "Stage question pack manifest")
    raw_steps = iter_stage_pack_steps(pack_manifest)
    question_reviews = [classify_stage_pack_step(step) for step in raw_steps]
    # Duplicates are detected on normalized text; empty questions are ignored.
    question_counter = Counter(normalize_review_text(item["question"]) for item in question_reviews if item["question"])
    duplicate_questions = [question for question, count in question_counter.items() if count > 1]
    tag_counts = Counter(tag for item in question_reviews for tag in item["tags"])
    domain_counts = Counter(domain for item in question_reviews for domain in item["domains"])
    weak_flag_counts = Counter(flag for item in question_reviews for flag in item["weak_flags"])
    if duplicate_questions:
        weak_flag_counts["duplicate_questions"] += len(duplicate_questions)
    scenario_ids = {str(item.get("scenario_id") or "") for item in question_reviews if item.get("scenario_id")}
    # Pack-level flags: each is raised at most once regardless of pack size.
    if len(question_reviews) < 8:
        weak_flag_counts["too_few_questions_for_stage_replay"] += 1
    if len(scenario_ids) < 2:
        weak_flag_counts["too_few_scenarios"] += 1
    if tag_counts["contextual_followup"] < max(2, len(question_reviews) // 3):
        weak_flag_counts["too_few_contextual_followups"] += 1
    if tag_counts["direct_answer_first_required"] < len(question_reviews):
        weak_flag_counts["missing_direct_answer_shape_on_some_steps"] += 1
    if tag_counts["canary_or_guarded_question"] < 2:
        weak_flag_counts["no_canary_or_guarded_question"] += 1
    if tag_counts["report_or_analysis_request"] < 1:
        weak_flag_counts["missing_report_or_analysis_request"] += 1
    if len(domain_counts) < 5:
        weak_flag_counts["low_domain_diversity"] += 1
    required_domains = ["business_overview", "money", "vat", "counterparty", "documents", "scope_guard"]
    missing_domains = [domain for domain in required_domains if domain_counts[domain] == 0]
    for domain in missing_domains:
        weak_flag_counts[f"missing_domain_{domain}"] += 1

    # Dependency validation is scoped to a scenario: a dependency must name a
    # step of the same scenario, and that step must appear earlier.
    known_steps_by_scenario: dict[str, set[str]] = {}
    for item in question_reviews:
        scenario_id = str(item.get("scenario_id") or "")
        known_steps_by_scenario.setdefault(scenario_id, set()).add(str(item.get("step_id") or ""))
    dependency_errors: list[dict[str, str]] = []
    seen_by_scenario: dict[str, set[str]] = {}
    for item in question_reviews:
        scenario_id = str(item.get("scenario_id") or "")
        seen = seen_by_scenario.setdefault(scenario_id, set())
        for dependency in string_list(item.get("depends_on")):
            if dependency not in known_steps_by_scenario.get(scenario_id, set()):
                dependency_errors.append(
                    {
                        "scenario_id": scenario_id,
                        "step_id": str(item.get("step_id") or ""),
                        "dependency": dependency,
                        "error": "unknown_dependency",
                    }
                )
            elif dependency not in seen:
                dependency_errors.append(
                    {
                        "scenario_id": scenario_id,
                        "step_id": str(item.get("step_id") or ""),
                        "dependency": dependency,
                        "error": "forward_dependency",
                    }
                )
        # Mark the step as seen only after its own dependencies were checked.
        seen.add(str(item.get("step_id") or ""))
    if dependency_errors:
        weak_flag_counts["dependency_order_errors"] += len(dependency_errors)

    # Estimate the per-step artifact directory for iteration_00 and compare its
    # absolute path length against the configured warning limit.
    resolved_stage_dir = stage_dir or stage_dir_for(DEFAULT_STAGE_OUTPUT_ROOT, stage_manifest["stage_id"])
    loop_dir = stage_loop_dir(resolved_stage_dir, stage_manifest)
    artifact_path_budget = int(stage_manifest.get("artifact_path_warning_limit") or 240)
    estimated_artifact_paths: list[dict[str, Any]] = []
    for item in question_reviews:
        estimated_path = (
            loop_dir
            / "iterations"
            / "iteration_00"
            / "pack_output"
            / "pack_run"
            / "scenarios"
            / str(item.get("scenario_id") or "")
            / "steps"
            / str(item.get("step_id") or "")
        ).resolve()
        estimated_artifact_paths.append(
            {
                "scenario_id": item.get("scenario_id"),
                "step_id": item.get("step_id"),
                "path": str(estimated_path),
                "length": len(str(estimated_path)),
            }
        )
    max_estimated_artifact_path = max((int(item["length"]) for item in estimated_artifact_paths), default=0)
    if max_estimated_artifact_path >= artifact_path_budget:
        weak_flag_counts["artifact_path_too_long_for_windows"] += 1

    # Score: start at 100, subtract fixed penalties for pack-level flags and
    # capped per-occurrence penalties for the rest, then clamp to 0..100.
    # Reading an absent Counter key yields 0 and does not insert the key.
    score = 100
    score -= min(40, weak_flag_counts["mojibake_question_text"] * 10)
    score -= 12 if weak_flag_counts["too_few_questions_for_stage_replay"] else 0
    score -= 12 if weak_flag_counts["too_few_scenarios"] else 0
    score -= 10 if weak_flag_counts["too_few_contextual_followups"] else 0
    score -= 8 if weak_flag_counts["missing_direct_answer_shape_on_some_steps"] else 0
    score -= 8 if weak_flag_counts["no_canary_or_guarded_question"] else 0
    score -= 10 if weak_flag_counts["missing_report_or_analysis_request"] else 0
    score -= 10 if weak_flag_counts["low_domain_diversity"] else 0
    score -= min(20, weak_flag_counts["root_question_requires_missing_context"] * 10)
    score -= min(24, len(missing_domains) * 6)
    score -= min(24, weak_flag_counts["low_business_anchor"] * 6)
    score -= min(20, weak_flag_counts["duplicate_questions"] * 5)
    score -= min(30, weak_flag_counts["dependency_order_errors"] * 15)
    score -= min(12, weak_flag_counts["question_too_long"] * 3)
    score -= 20 if weak_flag_counts["artifact_path_too_long_for_windows"] else 0
    score = max(0, min(100, score))
    if score >= 85:
        status = "strong"
    elif score >= 70:
        status = "usable_with_gaps"
    else:
        status = "weak"
    return {
        "schema_version": STAGE_QUESTION_REVIEW_SCHEMA_VERSION,
        "created_at": now_iso(),
        "stage_id": stage_manifest["stage_id"],
        "module_name": stage_manifest.get("module_name"),
        "title": stage_manifest.get("title"),
        "pack_manifest": repo_relative(pack_manifest_path),
        "pack_id": pack_manifest.get("pack_id"),
        "status": status,
        "score": score,
        "question_count": len(question_reviews),
        "scenario_count": len(scenario_ids),
        "coverage": {
            "contextual_followup_questions": tag_counts["contextual_followup"],
            "root_questions": tag_counts["root_question"],
            "direct_answer_shape_questions": tag_counts["direct_answer_first_required"],
            "canary_or_guarded_questions": tag_counts["canary_or_guarded_question"],
            "report_or_analysis_questions": tag_counts["report_or_analysis_request"],
        },
        "tag_counts": dict(sorted(tag_counts.items())),
        "domain_counts": dict(sorted(domain_counts.items())),
        "weak_flag_counts": dict(sorted(weak_flag_counts.items())),
        "missing_domains": missing_domains,
        # Only the first 20 duplicated (normalized) questions are reported.
        "duplicate_questions": duplicate_questions[:20],
        "dependency_errors": dependency_errors,
        "artifact_path_budget": artifact_path_budget,
        "max_estimated_artifact_path": max_estimated_artifact_path,
        "estimated_artifact_paths": estimated_artifact_paths,
        "recommendations": build_stage_question_recommendations(
            missing_domains=missing_domains,
            weak_flag_counts=weak_flag_counts,
            score=score,
        ),
        "questions": question_reviews,
    }
def build_stage_question_generation_markdown(review: dict[str, Any]) -> str:
    """Render a question-generation review dict as a Markdown report.

    Sections: header facts, coverage, domains, weak flags, artifact path
    budget, recommendations, and a per-question table. Missing or non-dict
    sections render as empty.

    In the table, line breaks inside a question are collapsed to a single
    space (a raw newline would split the table row), ``|`` is escaped, and
    text longer than 180 characters is truncated with ``...``.
    """

    def section(key: str) -> dict[str, Any]:
        value = review.get(key)
        return value if isinstance(value, dict) else {}

    lines = [
        "# Stage Question Generation Review",
        "",
        f"- stage_id: `{review.get('stage_id')}`",
        f"- status: `{review.get('status')}`",
        f"- score: `{review.get('score')}`",
        f"- questions: `{review.get('question_count')}`",
        f"- scenarios: `{review.get('scenario_count')}`",
        f"- pack_manifest: `{review.get('pack_manifest')}`",
        "",
        "## Coverage",
        "",
    ]
    coverage = section("coverage")
    lines.extend(f"- {key}: `{coverage[key]}`" for key in sorted(coverage))

    lines.extend(["", "## Domains", ""])
    domains = section("domain_counts")
    lines.extend(f"- {key}: `{domains[key]}`" for key in sorted(domains))

    lines.extend(["", "## Weak Flags", ""])
    weak_flags = section("weak_flag_counts")
    if weak_flags:
        lines.extend(f"- {key}: `{weak_flags[key]}`" for key in sorted(weak_flags))
    else:
        lines.append("- none")

    lines.extend(["", "## Artifact Path Budget", ""])
    lines.append(f"- budget: `{review.get('artifact_path_budget')}`")
    lines.append(f"- max_estimated_artifact_path: `{review.get('max_estimated_artifact_path')}`")

    lines.extend(["", "## Recommendations", ""])
    lines.extend(f"- {recommendation}" for recommendation in review.get("recommendations") or [])

    lines.extend(
        [
            "",
            "## Questions",
            "",
            "| # | scenario | step | domains | weak_flags | question |",
            "|---:|---|---|---|---|---|",
        ]
    )
    for item in review.get("questions") or []:
        question = str(item.get("question") or "")
        # Keep each table row on one physical line.
        question = re.sub(r"\s*[\r\n]+\s*", " ", question)
        question = question.replace("|", "\\|")
        if len(question) > 180:
            question = question[:177].rstrip() + "..."
        cells = [
            str(item.get("global_index") or ""),
            str(item.get("scenario_id") or ""),
            str(item.get("step_id") or ""),
            ", ".join(item.get("domains") or []) or "-",
            ", ".join(item.get("weak_flags") or []) or "-",
            question,
        ]
        lines.append("| " + " | ".join(cells) + " |")
    # Trailing empty entry yields a final newline after the join.
    lines.append("")
    return "\n".join(lines)
def build_stage_context_capsule(
    stage_manifest: dict[str, Any],
    stage_dir: Path,
    *,
    question_review: dict[str, Any] | None = None,
    summary: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Assemble the stage context capsule payload.

    Combines stage manifest facts, the fixed operating-model and quality-rule
    texts, a digest of the question-generation review, and selected fields of
    the latest loop summary. Non-dict ``question_review`` / ``summary`` values
    are treated as empty.
    """
    review = question_review if isinstance(question_review, dict) else {}
    loop_summary = summary if isinstance(summary, dict) else {}

    operating_model = {
        "lead": "Lead Codex keeps implementation responsibility in the main project context.",
        "business_auditor": "A strong independent read-only Codex analyst reviews user-facing business meaning before technical metadata.",
        "auto_coder": "Disabled by default; use repair_mode=auto-coder only as an explicit opt-in experiment.",
        "human_operator": "Receives only final visual confirmation requests or real unresolved business/architecture decisions.",
    }

    # Scalar review facts are copied through unchanged; collections default to empty.
    question_generation: dict[str, Any] = {
        key: review.get(key) for key in ("status", "score", "question_count", "scenario_count")
    }
    question_generation["missing_domains"] = review.get("missing_domains") or []
    question_generation["weak_flag_counts"] = review.get("weak_flag_counts") or {}
    question_generation["review_json"] = repo_relative(stage_dir / "question_generation_review.json")
    question_generation["review_markdown"] = repo_relative(stage_dir / "question_generation_review.md")

    latest_loop_summary = {
        key: loop_summary.get(key)
        for key in (
            "loop_final_status",
            "last_quality_score",
            "last_analyst_decision",
            "next_action",
            "latest_business_audit",
            "latest_lead_coder_handoff",
        )
    }

    quality_rules = [
        "Review the human question and visible answer before internal route ids.",
        "Treat direct-answer-first, business usefulness, temporal honesty, field truth, and answer layering as acceptance gates.",
        "Treat deterministic P0/P1 repair targets as blockers even if the analyst wording sounds optimistic.",
        "After code edits, run targeted tests/build and rebuild graphify before replay evidence is trusted.",
    ]

    capsule: dict[str, Any] = {
        "schema_version": STAGE_CONTEXT_CAPSULE_SCHEMA_VERSION,
        "created_at": now_iso(),
        "stage_id": stage_manifest["stage_id"],
    }
    for passthrough in ("module_name", "title", "architecture_phase", "current_stage_status"):
        capsule[passthrough] = stage_manifest.get(passthrough)
    capsule["repair_mode"] = dcl.normalize_repair_mode(stage_manifest.get("repair_mode"))
    capsule["operating_model"] = operating_model
    capsule["global_plan_refs"] = stage_manifest.get("global_plan_refs") or []
    capsule["acceptance_invariants"] = stage_manifest.get("acceptance_invariants") or []
    capsule["pack_manifest"] = repo_relative(repo_path(stage_manifest["pack_manifest"]))
    capsule["stage_dir"] = repo_relative(stage_dir)
    capsule["loop_dir"] = repo_relative(stage_loop_dir(stage_dir, stage_manifest))
    capsule["question_generation"] = question_generation
    capsule["latest_loop_summary"] = latest_loop_summary
    capsule["quality_rules"] = quality_rules
    return capsule
def build_stage_context_capsule_markdown(capsule: dict[str, Any]) -> str:
    """Render a stage context capsule dict as Markdown.

    Missing or non-dict sections render as empty. In the "Latest Loop"
    section absent/empty values are shown as ``n/a``, but a numeric zero
    (for example ``last_quality_score == 0``) is shown as the number rather
    than being mistaken for a missing value.

    Returns:
        The Markdown text, stripped and terminated by exactly one newline.
    """

    def section(key: str) -> dict[str, Any]:
        value = capsule.get(key)
        return value if isinstance(value, dict) else {}

    def shown(value: Any) -> Any:
        # bool is excluded so that False keeps rendering as n/a.
        is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
        return value if (value or is_number) else "n/a"

    operating_model = section("operating_model")
    question_generation = section("question_generation")
    latest_loop_summary = section("latest_loop_summary")

    lines = [
        "# Stage Context Capsule",
        "",
        f"- stage_id: `{capsule.get('stage_id')}`",
        f"- module_name: `{capsule.get('module_name')}`",
        f"- title: {capsule.get('title')}",
        f"- architecture_phase: `{capsule.get('architecture_phase') or 'n/a'}`",
        f"- current_stage_status: `{capsule.get('current_stage_status') or 'n/a'}`",
        f"- repair_mode: `{capsule.get('repair_mode')}`",
        f"- pack_manifest: `{capsule.get('pack_manifest')}`",
        f"- stage_dir: `{capsule.get('stage_dir')}`",
        f"- loop_dir: `{capsule.get('loop_dir')}`",
        "",
        "## Operating Model",
    ]
    lines.extend([f"- {key}: {value}" for key, value in operating_model.items()] or ["- n/a"])

    lines.extend(["", "## Question Generation"])
    lines.extend(
        f"- {key}: `{question_generation.get(key)}`"
        for key in ("status", "score", "question_count", "scenario_count", "review_markdown")
    )

    lines.extend(["", "## Latest Loop"])
    lines.extend(
        f"- {key}: `{shown(latest_loop_summary.get(key))}`"
        for key in (
            "loop_final_status",
            "last_quality_score",
            "last_analyst_decision",
            "next_action",
            "latest_business_audit",
            "latest_lead_coder_handoff",
        )
    )

    lines.extend(["", "## Quality Rules"])
    lines.extend(f"- {item}" for item in capsule.get("quality_rules") or [])
    return "\n".join(lines).strip() + "\n"
def save_stage_context_capsule(
    stage_manifest: dict[str, Any],
    stage_dir: Path,
    *,
    question_review: dict[str, Any] | None = None,
    summary: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build the stage context capsule and write it as JSON and Markdown.

    When no *summary* is given and ``stage_loop_summary.json`` already exists
    in *stage_dir*, that file is loaded and used as the summary. The capsule is
    written to ``stage_context_capsule.json`` and ``stage_context_capsule.md``
    in *stage_dir* and also returned.
    """
    existing_summary_path = stage_dir / "stage_loop_summary.json"
    if summary is None and existing_summary_path.exists():
        summary = load_json_object(existing_summary_path, "Existing stage summary")

    capsule = build_stage_context_capsule(
        stage_manifest,
        stage_dir,
        question_review=question_review,
        summary=summary,
    )
    rendered = build_stage_context_capsule_markdown(capsule)
    write_json(stage_dir / "stage_context_capsule.json", capsule)
    write_text(stage_dir / "stage_context_capsule.md", rendered)
    return capsule
def load_stage_manifest(path: Path) -> dict[str, Any]: def load_stage_manifest(path: Path) -> dict[str, Any]:
raw = load_json_object(path, "Stage agent loop manifest") raw = load_json_object(path, "Stage agent loop manifest")
stage_id = slugify(str(raw.get("stage_id") or path.stem), path.stem) stage_id = slugify(str(raw.get("stage_id") or path.stem), path.stem)
@ -87,6 +664,7 @@ def load_stage_manifest(path: Path) -> dict[str, Any]:
raise RuntimeError("Stage manifest `target_score` must be between 0 and 100") raise RuntimeError("Stage manifest `target_score` must be between 0 and 100")
if max_iterations < 1: if max_iterations < 1:
raise RuntimeError("Stage manifest `max_iterations` must be >= 1") raise RuntimeError("Stage manifest `max_iterations` must be >= 1")
repair_mode = dcl.normalize_repair_mode(raw.get("repair_mode"))
return { return {
**raw, **raw,
"schema_version": str(raw.get("schema_version") or STAGE_LOOP_SCHEMA_VERSION), "schema_version": str(raw.get("schema_version") or STAGE_LOOP_SCHEMA_VERSION),
@ -96,6 +674,7 @@ def load_stage_manifest(path: Path) -> dict[str, Any]:
"pack_manifest": pack_manifest, "pack_manifest": pack_manifest,
"target_score": target_score, "target_score": target_score,
"max_iterations": max_iterations, "max_iterations": max_iterations,
"repair_mode": repair_mode,
"global_plan_refs": string_list(raw.get("global_plan_refs")), "global_plan_refs": string_list(raw.get("global_plan_refs")),
"acceptance_invariants": string_list(raw.get("acceptance_invariants")), "acceptance_invariants": string_list(raw.get("acceptance_invariants")),
"save_autorun_on_accept": bool(raw.get("save_autorun_on_accept", True)), "save_autorun_on_accept": bool(raw.get("save_autorun_on_accept", True)),
@ -118,6 +697,7 @@ def stage_gui_review_dir(stage_dir: Path, run_id: str) -> Path:
def build_domain_pack_loop_command(args: argparse.Namespace, stage_manifest: dict[str, Any], stage_dir: Path) -> list[str]: def build_domain_pack_loop_command(args: argparse.Namespace, stage_manifest: dict[str, Any], stage_dir: Path) -> list[str]:
loop_id = str(stage_manifest.get("loop_id") or stage_manifest["stage_id"]).strip() loop_id = str(stage_manifest.get("loop_id") or stage_manifest["stage_id"]).strip()
repair_mode = dcl.normalize_repair_mode(getattr(args, "repair_mode", None) or stage_manifest.get("repair_mode"))
command = [ command = [
sys.executable, sys.executable,
str(REPO_ROOT / "scripts" / "domain_case_loop.py"), str(REPO_ROOT / "scripts" / "domain_case_loop.py"),
@ -132,6 +712,8 @@ def build_domain_pack_loop_command(args: argparse.Namespace, stage_manifest: dic
str(int(stage_manifest["target_score"])), str(int(stage_manifest["target_score"])),
"--max-iterations", "--max-iterations",
str(int(stage_manifest["max_iterations"])), str(int(stage_manifest["max_iterations"])),
"--repair-mode",
repair_mode,
"--backend-url", "--backend-url",
str(args.backend_url), str(args.backend_url),
"--prompt-version", "--prompt-version",
@ -238,6 +820,8 @@ def build_stage_summary(
iterations = loop_state.get("iterations") if isinstance(loop_state.get("iterations"), list) else [] iterations = loop_state.get("iterations") if isinstance(loop_state.get("iterations"), list) else []
last_iteration = iterations[-1] if iterations and isinstance(iterations[-1], dict) else {} last_iteration = iterations[-1] if iterations and isinstance(iterations[-1], dict) else {}
final_status = str(loop_state.get("final_status") or "unknown").strip() final_status = str(loop_state.get("final_status") or "unknown").strip()
repair_mode = dcl.normalize_repair_mode(loop_state.get("repair_mode") or stage_manifest.get("repair_mode"))
lead_handoff_required = str(last_iteration.get("coder_status") or "") == "lead_handoff_required"
raw_loop_accepted = final_status == "accepted" and bool(last_iteration.get("accepted_gate")) raw_loop_accepted = final_status == "accepted" and bool(last_iteration.get("accepted_gate"))
closing_gate = build_stage_closing_gate(previous_summary) closing_gate = build_stage_closing_gate(previous_summary)
accepted = raw_loop_accepted and bool(closing_gate.get("passed")) accepted = raw_loop_accepted and bool(closing_gate.get("passed"))
@ -248,8 +832,12 @@ def build_stage_summary(
next_action = "stage_closed_without_manual_confirmation" next_action = "stage_closed_without_manual_confirmation"
elif raw_loop_accepted and not bool(closing_gate.get("passed")): elif raw_loop_accepted and not bool(closing_gate.get("passed")):
next_action = "rerun_same_stage_or_gui_and_ingest_result" next_action = "rerun_same_stage_or_gui_and_ingest_result"
elif lead_handoff_required:
next_action = "lead_coder_repair_required"
elif bool(loop_state.get("last_user_decision_prompt")): elif bool(loop_state.get("last_user_decision_prompt")):
next_action = "user_decision_required" next_action = "user_decision_required"
elif repair_mode == dcl.REPAIR_MODE_LEAD_HANDOFF and final_status in {"partial", "needs_exact_capability"}:
next_action = "rerun_stage_loop_for_lead_handoff"
else: else:
next_action = "continue_autonomous_or_fix_blocker" next_action = "continue_autonomous_or_fix_blocker"
summary = { summary = {
@ -260,6 +848,7 @@ def build_stage_summary(
"global_plan_refs": stage_manifest.get("global_plan_refs") or [], "global_plan_refs": stage_manifest.get("global_plan_refs") or [],
"target_score": stage_manifest.get("target_score"), "target_score": stage_manifest.get("target_score"),
"acceptance_invariants": stage_manifest.get("acceptance_invariants") or [], "acceptance_invariants": stage_manifest.get("acceptance_invariants") or [],
"repair_mode": repair_mode,
"loop_dir": repo_relative(loop_dir), "loop_dir": repo_relative(loop_dir),
"loop_final_status": final_status, "loop_final_status": final_status,
"stop_reason": loop_state.get("stop_reason"), "stop_reason": loop_state.get("stop_reason"),
@ -268,6 +857,9 @@ def build_stage_summary(
"last_analyst_decision": last_iteration.get("loop_decision") or loop_state.get("last_analyst_decision"), "last_analyst_decision": last_iteration.get("loop_decision") or loop_state.get("last_analyst_decision"),
"last_deterministic_gate_ok": last_iteration.get("deterministic_gate_ok"), "last_deterministic_gate_ok": last_iteration.get("deterministic_gate_ok"),
"last_deterministic_gate_reason": last_iteration.get("deterministic_gate_reason"), "last_deterministic_gate_reason": last_iteration.get("deterministic_gate_reason"),
"latest_business_audit": repo_relative(Path(str(last_iteration.get("business_audit_path")))) if last_iteration.get("business_audit_path") else None,
"latest_lead_coder_handoff": repo_relative(Path(str(last_iteration.get("lead_coder_handoff_markdown_path") or loop_state.get("latest_lead_coder_handoff_markdown_path")))) if (last_iteration.get("lead_coder_handoff_markdown_path") or loop_state.get("latest_lead_coder_handoff_markdown_path")) else None,
"latest_lead_coder_handoff_json": repo_relative(Path(str(last_iteration.get("lead_coder_handoff_path") or loop_state.get("latest_lead_coder_handoff_path")))) if (last_iteration.get("lead_coder_handoff_path") or loop_state.get("latest_lead_coder_handoff_path")) else None,
"loop_accepted_gate": bool(last_iteration.get("accepted_gate")), "loop_accepted_gate": bool(last_iteration.get("accepted_gate")),
"accepted_gate": accepted, "accepted_gate": accepted,
"stage_closing_gate": closing_gate, "stage_closing_gate": closing_gate,
@ -329,6 +921,15 @@ def build_next_step_guidance(next_action: str) -> dict[str, Any]:
"user_decision_required": [ "user_decision_required": [
"read stage_loop_handoff.md and resolve the recorded user decision point", "read stage_loop_handoff.md and resolve the recorded user decision point",
], ],
"lead_coder_repair_required": [
"read stage_loop_handoff.md, latest lead_coder_handoff.md, and business_audit.md",
"repair code in the main Lead Codex context; do not run weak auto-coder by default",
"run targeted tests/build/graphify, then rerun the same semantic pack or ingest the GUI validation run",
],
"rerun_stage_loop_for_lead_handoff": [
"python scripts/stage_agent_loop.py run --manifest <stage_manifest.json>",
"the previous summary predates lead-handoff repair mode; rerun the stage loop to generate business_audit.md and lead_coder_handoff.md",
],
"continue_autonomous_or_fix_blocker": [ "continue_autonomous_or_fix_blocker": [
"inspect stage_loop_handoff.md and rerun stage_agent_loop.py run after resolving the blocker", "inspect stage_loop_handoff.md and rerun stage_agent_loop.py run after resolving the blocker",
], ],
@ -354,6 +955,7 @@ def build_stage_handoff_markdown(summary: dict[str, Any]) -> str:
f"- module_name: `{summary.get('module_name')}`", f"- module_name: `{summary.get('module_name')}`",
f"- title: {summary.get('title')}", f"- title: {summary.get('title')}",
f"- loop_final_status: `{summary.get('loop_final_status')}`", f"- loop_final_status: `{summary.get('loop_final_status')}`",
f"- repair_mode: `{summary.get('repair_mode') or 'n/a'}`",
f"- target_score: `{summary.get('target_score')}`", f"- target_score: `{summary.get('target_score')}`",
f"- iterations_ran: `{summary.get('iterations_ran')}`", f"- iterations_ran: `{summary.get('iterations_ran')}`",
f"- last_quality_score: `{summary.get('last_quality_score')}`", f"- last_quality_score: `{summary.get('last_quality_score')}`",
@ -365,6 +967,8 @@ def build_stage_handoff_markdown(summary: dict[str, Any]) -> str:
f"- manual_confirmation_required: `{summary.get('manual_confirmation_required')}`", f"- manual_confirmation_required: `{summary.get('manual_confirmation_required')}`",
f"- next_action: `{summary.get('next_action')}`", f"- next_action: `{summary.get('next_action')}`",
f"- loop_dir: `{summary.get('loop_dir')}`", f"- loop_dir: `{summary.get('loop_dir')}`",
f"- latest_business_audit: `{summary.get('latest_business_audit') or 'n/a'}`",
f"- latest_lead_coder_handoff: `{summary.get('latest_lead_coder_handoff') or 'n/a'}`",
f"- stop_reason: {summary.get('stop_reason') or 'n/a'}", f"- stop_reason: {summary.get('stop_reason') or 'n/a'}",
"", "",
"## Plan refs", "## Plan refs",
@ -988,14 +1592,13 @@ def run_stage_repair(args: argparse.Namespace) -> dict[str, Any]:
write_json(iteration_dir / "repair_execution_summary.json", payload) write_json(iteration_dir / "repair_execution_summary.json", payload)
summary_path = stage_dir / "stage_loop_summary.json" summary_path = stage_dir / "stage_loop_summary.json"
previous_summary = load_json_object(summary_path, "Existing stage summary") if summary_path.exists() else None previous_summary = load_json_object(summary_path, "Existing stage summary") if summary_path.exists() else None
save_stage_summary( summary = build_repair_execution_stage_summary(
stage_dir, stage_manifest=stage_manifest,
build_repair_execution_stage_summary( previous_summary=previous_summary,
stage_manifest=stage_manifest, execution=payload,
previous_summary=previous_summary,
execution=payload,
),
) )
save_stage_summary(stage_dir, summary)
save_stage_context_capsule(stage_manifest, stage_dir, summary=summary)
return payload return payload
snapshots = dcl.snapshot_coder_candidate_files(repair_candidate_paths(plan)) snapshots = dcl.snapshot_coder_candidate_files(repair_candidate_paths(plan))
@ -1016,14 +1619,13 @@ def run_stage_repair(args: argparse.Namespace) -> dict[str, Any]:
write_json(iteration_dir / "repair_execution_summary.json", payload) write_json(iteration_dir / "repair_execution_summary.json", payload)
summary_path = stage_dir / "stage_loop_summary.json" summary_path = stage_dir / "stage_loop_summary.json"
previous_summary = load_json_object(summary_path, "Existing stage summary") if summary_path.exists() else None previous_summary = load_json_object(summary_path, "Existing stage summary") if summary_path.exists() else None
save_stage_summary( summary = build_repair_execution_stage_summary(
stage_dir, stage_manifest=stage_manifest,
build_repair_execution_stage_summary( previous_summary=previous_summary,
stage_manifest=stage_manifest, execution=payload,
previous_summary=previous_summary,
execution=payload,
),
) )
save_stage_summary(stage_dir, summary)
save_stage_context_capsule(stage_manifest, stage_dir, summary=summary)
return payload return payload
@ -1062,6 +1664,7 @@ def ingest_gui_run_review(args: argparse.Namespace) -> dict[str, Any]:
previous_summary=previous_summary, previous_summary=previous_summary,
) )
save_stage_summary(stage_dir, summary) save_stage_summary(stage_dir, summary)
save_stage_context_capsule(stage_manifest, stage_dir, summary=summary)
save_stage_repair_handoff(stage_dir, build_stage_repair_handoff(summary, review)) save_stage_repair_handoff(stage_dir, build_stage_repair_handoff(summary, review))
return summary return summary
@ -1135,7 +1738,23 @@ def build_stage_status(stage_manifest: dict[str, Any], stage_dir: Path) -> dict[
else {} else {}
) )
domain_command_path = stage_dir / "domain_pack_loop.command.txt" domain_command_path = stage_dir / "domain_pack_loop.command.txt"
question_review_path = stage_dir / "question_generation_review.json"
question_review = (
load_json_object(question_review_path, "Stage question generation review")
if question_review_path.exists()
else {}
)
repair_mode = str(summary.get("repair_mode") or stage_manifest.get("repair_mode") or dcl.REPAIR_MODE_LEAD_HANDOFF)
next_action = str(summary.get("next_action") or "run_stage_loop_or_ingest_gui_run") next_action = str(summary.get("next_action") or "run_stage_loop_or_ingest_gui_run")
next_step_guidance = summary.get("next_step_guidance") or build_next_step_guidance(next_action)
if (
repair_mode == dcl.REPAIR_MODE_LEAD_HANDOFF
and next_action == "continue_autonomous_or_fix_blocker"
and not summary.get("latest_lead_coder_handoff")
and summary.get("loop_final_status") in {"partial", "needs_exact_capability"}
):
next_action = "rerun_stage_loop_for_lead_handoff"
next_step_guidance = build_next_step_guidance(next_action)
latest_gui_review = summary.get("latest_gui_review") if isinstance(summary.get("latest_gui_review"), dict) else {} latest_gui_review = summary.get("latest_gui_review") if isinstance(summary.get("latest_gui_review"), dict) else {}
latest_repair_execution = ( latest_repair_execution = (
summary.get("latest_repair_execution") summary.get("latest_repair_execution")
@ -1159,14 +1778,18 @@ def build_stage_status(stage_manifest: dict[str, Any], stage_dir: Path) -> dict[
"title": stage_manifest.get("title"), "title": stage_manifest.get("title"),
"stage_dir": repo_relative(stage_dir), "stage_dir": repo_relative(stage_dir),
"summary_exists": bool(summary), "summary_exists": bool(summary),
"repair_mode": repair_mode,
"loop_final_status": summary.get("loop_final_status"), "loop_final_status": summary.get("loop_final_status"),
"accepted_gate": summary.get("accepted_gate"), "accepted_gate": summary.get("accepted_gate"),
"loop_accepted_gate": summary.get("loop_accepted_gate"), "loop_accepted_gate": summary.get("loop_accepted_gate"),
"stage_closing_gate": stage_closing_gate or None, "stage_closing_gate": stage_closing_gate or None,
"next_action": next_action, "next_action": next_action,
"next_step_guidance": summary.get("next_step_guidance") or build_next_step_guidance(next_action), "next_step_guidance": next_step_guidance,
"latest_gui_run_id": latest_gui_review.get("run_id"), "latest_gui_run_id": latest_gui_review.get("run_id"),
"latest_gui_business_status": latest_gui_review.get("overall_business_status"), "latest_gui_business_status": latest_gui_review.get("overall_business_status"),
"latest_business_audit": summary.get("latest_business_audit"),
"latest_lead_coder_handoff": summary.get("latest_lead_coder_handoff"),
"latest_lead_coder_handoff_json": summary.get("latest_lead_coder_handoff_json"),
"latest_repair_coder_status": latest_repair_execution.get("coder_status"), "latest_repair_coder_status": latest_repair_execution.get("coder_status"),
"latest_repair_dry_run": latest_repair_execution.get("dry_run"), "latest_repair_dry_run": latest_repair_execution.get("dry_run"),
"latest_validation_run_id": latest_repair_validation.get("validation_run_id"), "latest_validation_run_id": latest_repair_validation.get("validation_run_id"),
@ -1175,6 +1798,9 @@ def build_stage_status(stage_manifest: dict[str, Any], stage_dir: Path) -> dict[
"summary_path": repo_relative(summary_path) if summary_path.exists() else None, "summary_path": repo_relative(summary_path) if summary_path.exists() else None,
"domain_pack_loop_command_exists": domain_command_path.exists(), "domain_pack_loop_command_exists": domain_command_path.exists(),
"domain_pack_loop_command_path": repo_relative(domain_command_path) if domain_command_path.exists() else None, "domain_pack_loop_command_path": repo_relative(domain_command_path) if domain_command_path.exists() else None,
"question_generation_review_status": question_review.get("status"),
"question_generation_review_score": question_review.get("score"),
"question_generation_review_path": repo_relative(question_review_path) if question_review_path.exists() else None,
"last_continue_action": continue_result.get("performed_action"), "last_continue_action": continue_result.get("performed_action"),
"last_continue_next_action": continue_result.get("next_action"), "last_continue_next_action": continue_result.get("next_action"),
"last_continue_result_path": repo_relative(continue_result_path) if continue_result_path.exists() else None, "last_continue_result_path": repo_relative(continue_result_path) if continue_result_path.exists() else None,
@ -1190,6 +1816,19 @@ def handle_status(args: argparse.Namespace) -> int:
return 0 return 0
def handle_review_questions(args: argparse.Namespace) -> int:
    """Handle the ``review-questions`` subcommand.

    Loads the stage manifest named by ``args.manifest``, builds the question
    generation review for it, writes that review as JSON and Markdown into the
    stage directory under ``args.output_root``, hands the review to
    ``save_stage_context_capsule``, and prints the review as indented JSON.

    Returns:
        Always ``0``.
    """
    manifest = load_stage_manifest(repo_path(args.manifest))
    output_root = repo_path(args.output_root)
    target_dir = stage_dir_for(output_root, manifest["stage_id"])
    target_dir.mkdir(parents=True, exist_ok=True)

    review = build_stage_question_generation_review(manifest, stage_dir=target_dir)

    # Persist the machine-readable form first, then the human-readable rendering.
    write_json(target_dir / "question_generation_review.json", review)
    review_markdown = build_stage_question_generation_markdown(review)
    write_text(target_dir / "question_generation_review.md", review_markdown)

    save_stage_context_capsule(manifest, target_dir, question_review=review)

    rendered = json.dumps(review, ensure_ascii=False, indent=2)
    print(rendered)
    return 0
def args_with(args: argparse.Namespace, **overrides: Any) -> argparse.Namespace: def args_with(args: argparse.Namespace, **overrides: Any) -> argparse.Namespace:
values = vars(args).copy() values = vars(args).copy()
values.update(overrides) values.update(overrides)
@ -1277,6 +1916,36 @@ def handle_continue(args: argparse.Namespace) -> int:
), ),
} }
) )
elif next_action == "lead_coder_repair_required":
payload.update(
{
"performed_action": "wait_for_lead_codex_repair",
"next_action": next_action,
"business_audit": status_before.get("latest_business_audit"),
"lead_coder_handoff": status_before.get("latest_lead_coder_handoff"),
"suggested_next_steps": [
"repair code in the main Lead Codex context from the handoff artifacts",
"run targeted tests/build/graphify after code edits",
"rerun the same stage semantic pack or ingest the GUI validation run",
],
}
)
elif next_action == "rerun_stage_loop_for_lead_handoff":
write_json(stage_dir / "stage_manifest.json", stage_manifest)
write_text(stage_dir / "stage_manifest_source.txt", repo_relative(stage_manifest_path) + "\n")
command = build_domain_pack_loop_command(args, stage_manifest, stage_dir)
write_text(stage_dir / "domain_pack_loop.command.txt", " ".join(command) + "\n")
payload.update(
{
"performed_action": "materialize_lead_handoff_stage_rerun",
"domain_pack_loop_command": command,
"next_action": next_action,
"suggested_command": (
"python scripts/stage_agent_loop.py run "
"--manifest <stage_manifest.json>"
),
}
)
elif next_action == "run_stage_loop_or_ingest_gui_run": elif next_action == "run_stage_loop_or_ingest_gui_run":
if getattr(args, "run_id", None): if getattr(args, "run_id", None):
ingest_summary = ingest_gui_run_review(args) ingest_summary = ingest_gui_run_review(args)
@ -1326,6 +1995,7 @@ def handle_summarize(args: argparse.Namespace) -> int:
previous_summary = load_json_object(summary_path, "Existing stage summary") if summary_path.exists() else None previous_summary = load_json_object(summary_path, "Existing stage summary") if summary_path.exists() else None
summary = build_stage_summary(stage_manifest, loop_dir, previous_summary=previous_summary) summary = build_stage_summary(stage_manifest, loop_dir, previous_summary=previous_summary)
save_stage_summary(stage_dir, summary) save_stage_summary(stage_dir, summary)
save_stage_context_capsule(stage_manifest, stage_dir, summary=summary)
print(json.dumps(summary, ensure_ascii=False, indent=2)) print(json.dumps(summary, ensure_ascii=False, indent=2))
return 0 return 0
@ -1337,6 +2007,7 @@ def handle_run(args: argparse.Namespace) -> int:
stage_dir.mkdir(parents=True, exist_ok=True) stage_dir.mkdir(parents=True, exist_ok=True)
write_json(stage_dir / "stage_manifest.json", stage_manifest) write_json(stage_dir / "stage_manifest.json", stage_manifest)
write_text(stage_dir / "stage_manifest_source.txt", repo_relative(stage_manifest_path) + "\n") write_text(stage_dir / "stage_manifest_source.txt", repo_relative(stage_manifest_path) + "\n")
save_stage_context_capsule(stage_manifest, stage_dir)
command = build_domain_pack_loop_command(args, stage_manifest, stage_dir) command = build_domain_pack_loop_command(args, stage_manifest, stage_dir)
write_text(stage_dir / "domain_pack_loop.command.txt", " ".join(command) + "\n") write_text(stage_dir / "domain_pack_loop.command.txt", " ".join(command) + "\n")
@ -1356,6 +2027,7 @@ def handle_run(args: argparse.Namespace) -> int:
previous_summary = load_json_object(summary_path, "Existing stage summary") if summary_path.exists() else None previous_summary = load_json_object(summary_path, "Existing stage summary") if summary_path.exists() else None
summary = build_stage_summary(stage_manifest, loop_dir, previous_summary=previous_summary) summary = build_stage_summary(stage_manifest, loop_dir, previous_summary=previous_summary)
save_stage_summary(stage_dir, summary) save_stage_summary(stage_dir, summary)
save_stage_context_capsule(stage_manifest, stage_dir, summary=summary)
if ( if (
bool(summary.get("accepted_gate")) bool(summary.get("accepted_gate"))
@ -1390,13 +2062,14 @@ def add_common_args(parser: argparse.ArgumentParser) -> None:
parser.add_argument("--max-output-tokens", type=int, default=2048) parser.add_argument("--max-output-tokens", type=int, default=2048)
parser.add_argument("--timeout-seconds", type=int, default=180) parser.add_argument("--timeout-seconds", type=int, default=180)
parser.add_argument("--use-mock", action="store_true") parser.add_argument("--use-mock", action="store_true")
parser.add_argument("--repair-mode", choices=[dcl.REPAIR_MODE_LEAD_HANDOFF, dcl.REPAIR_MODE_AUTO_CODER])
parser.add_argument("--codex-binary", default="codex") parser.add_argument("--codex-binary", default="codex")
parser.add_argument("--codex-profile") parser.add_argument("--codex-profile")
parser.add_argument("--codex-model") parser.add_argument("--codex-model")
parser.add_argument("--analyst-codex-model", default="gpt-5.4") parser.add_argument("--analyst-codex-model", default="gpt-5.4")
parser.add_argument("--coder-codex-model", default="gpt-5.4-mini") parser.add_argument("--coder-codex-model", default="gpt-5.4")
parser.add_argument("--analyst-reasoning-effort", default="medium") parser.add_argument("--analyst-reasoning-effort", default="medium")
parser.add_argument("--coder-reasoning-effort", default="low") parser.add_argument("--coder-reasoning-effort", default="high")
parser.add_argument("--codex-timeout-seconds", type=int, default=1800) parser.add_argument("--codex-timeout-seconds", type=int, default=1800)
@ -1423,6 +2096,13 @@ def build_parser() -> argparse.ArgumentParser:
add_common_args(status_parser) add_common_args(status_parser)
status_parser.set_defaults(func=handle_status) status_parser.set_defaults(func=handle_status)
review_questions_parser = subparsers.add_parser(
"review-questions",
help="Review generated stage-pack questions before launching the expensive live replay.",
)
add_common_args(review_questions_parser)
review_questions_parser.set_defaults(func=handle_review_questions)
continue_parser = subparsers.add_parser( continue_parser = subparsers.add_parser(
"continue", "continue",
help="Execute the next safe stage-loop step derived from status.next_action.", help="Execute the next safe stage-loop step derived from status.next_action.",

View File

@ -0,0 +1,289 @@
from __future__ import annotations
import json
import sys
import tempfile
import unittest
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent))
import domain_case_loop as dcl
class DomainCaseLoopLeadHandoffTests(unittest.TestCase):
    """Exercise the lead-handoff helpers exposed by ``domain_case_loop``."""

    @staticmethod
    def _empty_repair_targets(step_validation_index=None) -> dict:
        # Zero-target repair payload shared by the merge tests; the validation
        # index key is appended last, and only when one is supplied.
        payload = {
            "pack_id": "demo_pack",
            "domain": "demo",
            "target_count": 0,
            "severity_counts": {"P0": 0, "P1": 0, "P2": 0},
            "priority_foci": [],
            "targets": [],
        }
        if step_validation_index is not None:
            payload["step_validation_index"] = step_validation_index
        return payload

    @staticmethod
    def _priority_target(*, scenario_id, step_id, severity, problem_type, fix_goal) -> dict:
        # One analyst ``priority_targets`` entry with the key order used by the fixtures.
        return {
            "scenario_id": scenario_id,
            "step_id": step_id,
            "severity": severity,
            "problem_type": problem_type,
            "fix_goal": fix_goal,
        }

    def test_normalize_repair_mode_defaults_to_lead_handoff(self) -> None:
        # A missing mode and the underscore aliases resolve to the hyphenated names.
        expectations = (
            (None, "lead-handoff"),
            ("lead_codex", "lead-handoff"),
            ("auto_coder", "auto-coder"),
        )
        for raw_mode, canonical in expectations:
            self.assertEqual(dcl.normalize_repair_mode(raw_mode), canonical)

    def test_lead_handoff_captures_business_audit_and_primary_focus(self) -> None:
        # Build a handoff for a single P0 focus, save it into a temp tree and
        # inspect the JSON that was written for the iteration.
        primary_focus = {
            "focus_id": "answer_shape",
            "severity": "P0",
            "issue_code": "business_direct_answer_missing",
            "summary": "Direct answer is buried below service scaffolding.",
            "candidate_files": [
                "llm_normalizer/backend/src/services/address_runtime/composeStage.ts"
            ],
        }
        single_target = {
            "severity": "P0",
            "issue_code": "business_direct_answer_missing",
            "step_id": "q01",
        }
        targets_payload = {
            "target_count": 1,
            "severity_counts": {"P0": 1},
            "priority_foci": [primary_focus],
            "targets": [single_target],
        }
        verdict = {
            "quality_score": 42,
            "loop_decision": "partial",
            "user_intent_summary": "User asked for a direct business answer.",
            "expected_direct_answer": "Direct first-line answer.",
            "actual_direct_answer": "Scaffolded long answer.",
            "root_cause_layers": ["answer_shape_mismatch"],
        }

        with tempfile.TemporaryDirectory() as tmp:
            base = Path(tmp)
            pack_root = base / "pack"
            loop_root = base / "loop"
            iteration_root = loop_root / "iterations" / "iteration_00"

            handoff = dcl.build_lead_coder_handoff(
                loop_state={"loop_id": "demo"},
                iteration_id="iteration_00",
                pack_dir=pack_root,
                analyst_verdict_path=iteration_root / "analyst_verdict.json",
                repair_targets_path=pack_root / "repair_targets.json",
                business_audit_path=iteration_root / "business_audit.md",
                analyst_verdict=verdict,
                repair_targets=targets_payload,
                target_score=88,
                loop_decision="partial",
                analyst_accepted_gate=False,
                accepted_gate=False,
                deterministic_gate_ok=False,
                deterministic_gate_reason="repair_targets_remaining=P0:1",
                requires_user_decision=False,
                user_decision_type="none",
                user_decision_prompt=None,
            )
            written = dcl.save_lead_coder_handoff(
                loop_dir=loop_root,
                iteration_dir=iteration_root,
                handoff=handoff,
            )

            # Capture everything that needs the temp tree before it is removed.
            raw_handoff = (iteration_root / "lead_coder_handoff.json").read_text(encoding="utf-8")
            saved = json.loads(raw_handoff)
            latest_present = Path(written["latest_lead_coder_handoff_path"]).exists()

        self.assertEqual(saved["repair_mode"], "lead-handoff")
        self.assertEqual(saved["status"], "lead_coder_repair_required")
        self.assertEqual(saved["assigned_primary_focus"]["focus_id"], "answer_shape")
        self.assertIn("business_audit", saved["artifact_refs"])
        self.assertTrue(latest_present)

    def test_analyst_priority_targets_become_lead_repair_targets(self) -> None:
        # Analyst priority targets are merged into an empty target list and then
        # drive the primary focus, top targets and candidate files of the handoff.
        verdict_with_targets = {
            "priority_targets": [
                self._priority_target(
                    scenario_id="svk_pivot",
                    step_id="s03_summary",
                    severity="P0",
                    problem_type="bundle_reuse_gap",
                    fix_goal="Reuse the confirmed SVK value-flow bundle in the final summary.",
                ),
                self._priority_target(
                    scenario_id="biz_scope",
                    step_id="s02_money",
                    severity="P1",
                    problem_type="field_mapping_gap",
                    fix_goal="Separate cash source/recipient labels from client/supplier labels.",
                ),
            ]
        }

        merged = dcl.merge_analyst_priority_repair_targets(
            self._empty_repair_targets(),
            verdict_with_targets,
        )
        handoff = dcl.build_lead_coder_handoff(
            loop_state={"loop_id": "demo"},
            iteration_id="iteration_00",
            pack_dir=Path("pack"),
            analyst_verdict_path=Path("analyst_verdict.json"),
            repair_targets_path=Path("semantic_repair_targets.json"),
            business_audit_path=Path("business_audit.md"),
            analyst_verdict={"quality_score": 73, "loop_decision": "continue"},
            repair_targets=merged,
            target_score=88,
            loop_decision="continue",
            analyst_accepted_gate=False,
            accepted_gate=False,
            deterministic_gate_ok=True,
            deterministic_gate_reason="deterministic_gate_passed",
            requires_user_decision=False,
            user_decision_type="none",
            user_decision_prompt=None,
        )

        self.assertEqual(merged["target_count"], 2)
        self.assertEqual(merged["severity_counts"]["P0"], 1)
        self.assertEqual(handoff["assigned_primary_focus"]["problem_type"], "bundle_reuse_gap")
        self.assertEqual(handoff["top_repair_targets"][0]["target_id"], "svk_pivot:s03_summary")
        self.assertIn(
            "llm_normalizer/backend/src/services/assistantMcpDiscoveryResponseCandidate.ts",
            handoff["candidate_files"],
        )

    def test_stale_analyst_validation_target_is_suppressed_by_step_state(self) -> None:
        # The P0 target points at a step whose index entry is already validated,
        # so only the P2 target for the un-indexed step is expected to survive.
        validation_index = {
            "legacy_canaries:s02_acc60": {
                "acceptance_status": "validated",
                "violated_invariants": [],
                "warnings": [],
                "runtime_factual_answer_validated": False,
                "guarded_insufficiency_validated": True,
            }
        }
        verdict_with_targets = {
            "priority_targets": [
                self._priority_target(
                    scenario_id="legacy_canaries",
                    step_id="s02_acc60",
                    severity="P0",
                    problem_type="evidence_gap",
                    fix_goal=(
                        "partial heuristic answer without runtime_factual_answer_validated "
                        "or guarded_insufficiency_validated must not pass silently"
                    ),
                ),
                self._priority_target(
                    scenario_id="biz_scope",
                    step_id="s03_best_year",
                    severity="P2",
                    problem_type="presentation_gap",
                    fix_goal="Clarify why this year leads without implying pure profit.",
                ),
            ]
        }

        merged = dcl.merge_analyst_priority_repair_targets(
            self._empty_repair_targets(validation_index),
            verdict_with_targets,
        )

        self.assertEqual(merged["suppressed_analyst_priority_target_count"], 1)
        self.assertEqual(merged["target_count"], 1)
        self.assertEqual(merged["targets"][0]["target_id"], "biz_scope:s03_best_year")
        self.assertEqual(merged["severity_counts"]["P0"], 0)
        self.assertEqual(merged["severity_counts"]["P2"], 1)

    def test_bounded_mcp_evidence_gap_target_is_suppressed_by_step_state(self) -> None:
        # The only analyst target refers to a step whose index entry carries a
        # validated bounded MCP answer; the merge is expected to drop it.
        validation_index = {
            "biz_scope:s03_best_year": {
                "acceptance_status": "validated",
                "violated_invariants": [],
                "warnings": [],
                "bounded_mcp_answer_validated": True,
                "mcp_discovery_response_applied": True,
                "mcp_discovery_response_candidate_status": "ready_for_guarded_use",
                "assistant_text_excerpt": (
                    "Коротко: самый доходный год в доступном денежном контуре 1С — 2015. "
                    "Важно: входящие уперлись в лимит выборки MCP; это проверенный срез, "
                    "не чистая бухгалтерская прибыль."
                ),
            }
        }
        verdict_with_targets = {
            "priority_targets": [
                self._priority_target(
                    scenario_id="biz_scope",
                    step_id="s03_best_year",
                    severity="P0",
                    problem_type="evidence_gap",
                    fix_goal=(
                        "Убрать asserted winner-year как подтвержденный факт, пока yearly ranking "
                        "не имеет exact validated compute; legacy metadata says unsupported/blocked."
                    ),
                )
            ]
        }

        merged = dcl.merge_analyst_priority_repair_targets(
            self._empty_repair_targets(validation_index),
            verdict_with_targets,
        )

        self.assertEqual(merged["suppressed_analyst_priority_target_count"], 1)
        self.assertEqual(merged["target_count"], 0)
        self.assertEqual(merged["severity_counts"], {"P0": 0, "P1": 0, "P2": 0})

    def test_runtime_exact_followup_target_is_suppressed_when_focus_is_proven(self) -> None:
        # The follow-up target refers to a step whose index entry has a validated
        # runtime factual answer with a counterparty filter and focus object.
        validation_index = {
            "svk_pivot:s02_svk_docs": {
                "acceptance_status": "validated",
                "violated_invariants": [],
                "warnings": [],
                "runtime_factual_answer_validated": True,
                "assistant_text_excerpt": "Контрагент: Группа СВК. Найдено документов: 19.",
                "extracted_filters": {"counterparty": "Группа СВК"},
                "focus_object": {"label": "Группа СВК"},
            }
        }
        verdict_with_targets = {
            "priority_targets": [
                self._priority_target(
                    scenario_id="svk_pivot",
                    step_id="s02_svk_docs",
                    severity="P1",
                    problem_type="followup_action_resolution_gap",
                    fix_goal=(
                        "Добавить pack-level validation на object-centric carryover: docs follow-up "
                        "и bundle reuse должны быть явно проверены через stable counterparty/focus."
                    ),
                )
            ]
        }

        merged = dcl.merge_analyst_priority_repair_targets(
            self._empty_repair_targets(validation_index),
            verdict_with_targets,
        )

        self.assertEqual(merged["suppressed_analyst_priority_target_count"], 1)
        self.assertEqual(merged["target_count"], 0)
        self.assertEqual(merged["severity_counts"], {"P0": 0, "P1": 0, "P2": 0})
# Run the test cases in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@ -50,6 +50,580 @@ class DomainCaseLoopStepStateTests(unittest.TestCase):
self.assertEqual(step_state["mcp_discovery_catalog_chain_top_match"], "value_flow") self.assertEqual(step_state["mcp_discovery_catalog_chain_top_match"], "value_flow")
self.assertTrue(step_state["mcp_discovery_catalog_chain_selected_matches_top"]) self.assertTrue(step_state["mcp_discovery_catalog_chain_selected_matches_top"])
def test_analysis_context_date_is_not_implicit_business_filter(self) -> None:
    # An as_of_date supplied only through analysis_context must not surface as a
    # missing-filter or wrong-date violation for a step with no carryover invariants.
    question = "all-time money summary"
    step_definition = {
        "step_id": "step_01",
        "title": "All-time summary",
        "depends_on": [],
        "question_template": question,
    }
    assistant_message = {
        "reply_type": "factual_with_explanation",
        "text": "Short: all-time confirmed money summary.",
        "message_id": "msg-1",
        "trace_id": "trace-1",
    }
    artifact = {
        "assistant_message": assistant_message,
        "technical_debug_payload": {},
        "session_summary": {},
    }

    state = dcl.build_scenario_step_state(
        scenario_id="stage_pack_demo",
        domain="agentic_loop",
        step=step_definition,
        step_index=1,
        question_resolved=question,
        analysis_context={"as_of_date": "2026-05-09", "source": "stage_pack"},
        turn_artifact=artifact,
        entries=[],
    )

    self.assertNotIn("missing_required_filter", state["violated_invariants"])
    self.assertNotIn("wrong_as_of_date", state["violated_invariants"])
def test_analysis_context_date_is_required_for_explicit_date_carryover(self) -> None:
    # With a "date_scope" carryover invariant on the step, an extracted as_of_date
    # that differs from the analysis context date is reported as wrong_as_of_date.
    question = "stock on that date"
    step_definition = {
        "step_id": "step_01",
        "title": "Date carryover",
        "depends_on": [],
        "question_template": question,
        "required_carryover_invariants": ["date_scope"],
    }
    debug_payload = {
        "detected_mode": "address_query",
        "detected_intent": "inventory_on_hand_as_of_date",
        "selected_recipe": "address_inventory_on_hand_as_of_date_v1",
        "capability_id": "confirmed_inventory_on_hand_as_of_date",
        "capability_route_mode": "exact",
        "fallback_type": "none",
        "extracted_filters": {"as_of_date": "2020-03-31"},
    }
    artifact = {
        "assistant_message": {
            "reply_type": "factual",
            "text": "Short: stock confirmed.",
            "message_id": "msg-1",
            "trace_id": "trace-1",
        },
        "technical_debug_payload": debug_payload,
        "session_summary": {},
    }

    state = dcl.build_scenario_step_state(
        scenario_id="date_carryover_demo",
        domain="inventory",
        step=step_definition,
        step_index=1,
        question_resolved=question,
        analysis_context={"as_of_date": "2021-03-31"},
        turn_artifact=artifact,
        entries=[],
    )

    self.assertIn("wrong_as_of_date", state["violated_invariants"])
def test_temporal_reset_question_skips_carried_date_scope(self) -> None:
    # Both "all time" phrasings are recognised as temporal resets.
    reset_questions = (
        "show money za all time",
        "сколько всего денег за все доступное время",
    )
    for question in reset_questions:
        self.assertTrue(dcl.question_resets_temporal_scope(question))

    # With carry_date_scope=False the date is left out while the organization scope is kept.
    previous_context = {
        "semantic_memory": {
            "date_scope": {
                "as_of_date": "2020-12-31",
                "period_from": "2020-10-01",
                "period_to": "2020-12-31",
            },
            "organization_scope": {"label": "ООО Альтернатива Плюс"},
        }
    }
    carried = dcl.carry_forward_analysis_context(
        previous_context,
        {},
        prefer_carryover=True,
        carry_date_scope=False,
    )

    self.assertNotIn("as_of_date", carried)
    self.assertEqual(carried["organization_scope"], {"label": "ООО Альтернатива Плюс"})
def test_merge_scenario_date_scope_keeps_current_scope_over_stale_previous(self) -> None:
    # The second (current-turn) scope must win over the first (carried-over) one.
    stale_scope = {
        "as_of_date": "2020-12-31",
        "period_from": "2020-10-01",
        "period_to": "2020-12-31",
        "source": "scenario_state_carryover",
    }
    current_scope = {
        "as_of_date": "2021-03-31",
        "period_from": "2021-03-01",
        "period_to": "2021-03-31",
        "source": "current_turn",
    }

    merged = dcl.merge_scenario_date_scope(
        stale_scope,
        current_scope,
        depends_on=["previous_step"],
    )

    self.assertEqual(merged["as_of_date"], "2021-03-31")
    self.assertEqual(merged["source"], "current_turn")
def test_mcp_business_overview_all_time_scope_overrides_stale_session_date(self) -> None:
    # An applied business_overview discovery answer whose derived period_scope is
    # None must yield an all-time date scope and an MCP-derived result set id,
    # even though the session summary still carries an older date scope.
    question = "all-time money summary"
    step_definition = {
        "step_id": "step_01",
        "title": "All-time money",
        "depends_on": ["previous_step"],
        "question_template": question,
        "expected_intents": ["business_overview"],
    }
    debug_payload = {
        "detected_mode": "address_query",
        "detected_intent": "inventory_supplier_stock_overlap_as_of_date",
        "selected_recipe": "address_inventory_supplier_stock_overlap_as_of_date_v1",
        "capability_id": "inventory_inventory_supplier_stock_overlap_as_of_date",
        "mcp_discovery_response_applied": True,
        "mcp_discovery_selected_chain_id": "business_overview",
        "mcp_discovery_catalog_chain_top_match": "business_overview",
        "mcp_discovery_response_candidate_v1": {
            "candidate_status": "ready_for_guarded_use",
            "reply_type": "partial_coverage",
        },
        "assistant_mcp_discovery_entry_point_v1": {
            "bridge": {
                "pilot": {
                    "derived_business_overview": {
                        "period_scope": None,
                    }
                }
            }
        },
    }
    stale_session = {
        "address_navigation_state": {
            "session_context": {
                "active_result_set_id": "rs-stale",
                "date_scope": {
                    "as_of_date": "2020-12-31",
                    "period_from": "2020-10-01",
                    "period_to": "2020-12-31",
                },
            }
        }
    }
    artifact = {
        "assistant_message": {
            "reply_type": "partial_coverage",
            "text": "Short: all-time confirmed money summary.",
            "message_id": "msg-1",
            "trace_id": "trace-1",
        },
        "technical_debug_payload": debug_payload,
        "session_summary": stale_session,
    }

    state = dcl.build_scenario_step_state(
        scenario_id="business_overview_demo",
        domain="agentic_loop",
        step=step_definition,
        step_index=1,
        question_resolved=question,
        analysis_context={},
        turn_artifact=artifact,
        entries=[],
    )

    self.assertEqual(state["date_scope"]["scope"], "all_time")
    self.assertIsNone(state["date_scope"]["as_of_date"])
    self.assertEqual(state["active_result_set_id"], "mcp-discovery-msg-1")
    self.assertNotIn("wrong_date_scope_state", state["violated_invariants"])
def test_applied_ready_mcp_discovery_chain_satisfies_expected_intent(self) -> None:
    # The detected intent differs from the expected one, but the applied
    # business_overview discovery chain must count as the effective intent.
    question = "business overview for 2020"
    step_definition = {
        "step_id": "step_01",
        "title": "Business overview",
        "depends_on": [],
        "question_template": question,
        "expected_intents": ["business_overview"],
    }
    debug_payload = {
        "detected_mode": "address_query",
        "detected_intent": "inventory_supplier_stock_overlap_as_of_date",
        "selected_recipe": "address_inventory_supplier_stock_overlap_as_of_date_v1",
        "capability_id": "inventory_inventory_supplier_stock_overlap_as_of_date",
        "mcp_discovery_response_applied": True,
        "mcp_discovery_selected_chain_id": "business_overview",
        "mcp_discovery_catalog_chain_top_match": "business_overview",
        "mcp_discovery_response_candidate_v1": {
            "candidate_status": "ready_for_guarded_use",
            "reply_type": "partial_coverage",
        },
    }
    artifact = {
        "assistant_message": {
            "reply_type": "partial_coverage",
            "text": "Short: business overview from confirmed 1C rows.",
            "message_id": "msg-1",
            "trace_id": "trace-1",
        },
        "technical_debug_payload": debug_payload,
        "session_summary": {},
    }

    state = dcl.build_scenario_step_state(
        scenario_id="business_overview_demo",
        domain="agentic_loop",
        step=step_definition,
        step_index=1,
        question_resolved=question,
        analysis_context={},
        turn_artifact=artifact,
        entries=[],
    )

    self.assertEqual(state["mcp_discovery_effective_intents"], ["business_overview"])
    self.assertNotIn("wrong_intent", state["violated_invariants"])
def test_ready_bounded_mcp_answer_can_validate_without_exact_route(self) -> None:
    # Same routing mismatch as an inventory-detected turn with an applied
    # "business_overview" discovery chain, but the step additionally requires
    # a direct-answer-first shape. The expected outcome is a "partial"
    # execution that is still accepted through the bounded MCP answer flag.
    step_definition = {
        "step_id": "step_01",
        "title": "Business overview",
        "depends_on": [],
        "question_template": "business overview for 2020",
        "expected_intents": ["business_overview"],
        "required_answer_shape": "direct_answer_first",
    }
    reply = {
        "reply_type": "partial_coverage",
        "text": "Short: confirmed bounded business overview from 1C rows.",
        "message_id": "msg-1",
        "trace_id": "trace-1",
    }
    debug_payload = {
        "detected_mode": "address_query",
        "detected_intent": "inventory_supplier_stock_overlap_as_of_date",
        "selected_recipe": "address_inventory_supplier_stock_overlap_as_of_date_v1",
        "capability_id": "inventory_inventory_supplier_stock_overlap_as_of_date",
        "mcp_discovery_response_applied": True,
        "mcp_discovery_selected_chain_id": "business_overview",
        "mcp_discovery_catalog_chain_top_match": "business_overview",
        "mcp_discovery_response_candidate_v1": {
            "candidate_status": "ready_for_guarded_use",
            "reply_type": "partial_coverage",
        },
    }
    artifact = {
        "assistant_message": reply,
        "technical_debug_payload": debug_payload,
        "session_summary": {},
    }

    step_state = dcl.build_scenario_step_state(
        scenario_id="business_overview_demo",
        domain="agentic_loop",
        step=step_definition,
        step_index=1,
        question_resolved="business overview for 2020",
        analysis_context={},
        turn_artifact=artifact,
        entries=[],
    )

    self.assertEqual(step_state["execution_status"], "partial")
    self.assertTrue(step_state["bounded_mcp_answer_validated"])
    self.assertEqual(step_state["acceptance_status"], "validated")
def test_required_answer_patterns_block_generic_bounded_mcp_summary(self) -> None:
    # The step requires both "SVK" and "company" to appear in the answer, but
    # the reply text only talks about the company. Even with an applied,
    # ready-for-guarded-use discovery candidate the step must be rejected.
    step_definition = {
        "step_id": "step_01",
        "title": "Summary",
        "depends_on": [],
        "question_template": "summarize company and SVK separately",
        "required_answer_shape": "direct_answer_first",
        "required_answer_patterns_all": ["SVK", "company"],
    }
    reply = {
        "reply_type": "partial_coverage",
        "text": "Short: company money summary only.",
        "message_id": "msg-1",
        "trace_id": "trace-1",
    }
    debug_payload = {
        "mcp_discovery_response_applied": True,
        "mcp_discovery_selected_chain_id": "business_overview",
        "mcp_discovery_catalog_chain_top_match": "business_overview",
        "mcp_discovery_response_candidate_v1": {
            "candidate_status": "ready_for_guarded_use",
            "reply_type": "partial_coverage",
        },
    }
    artifact = {
        "assistant_message": reply,
        "technical_debug_payload": debug_payload,
        "session_summary": {},
    }

    step_state = dcl.build_scenario_step_state(
        scenario_id="summary_demo",
        domain="agentic_loop",
        step=step_definition,
        step_index=1,
        question_resolved="summarize company and SVK separately",
        analysis_context={},
        turn_artifact=artifact,
        entries=[],
    )

    self.assertIn("required_answer_patterns_all_missing", step_state["violated_invariants"])
    self.assertFalse(step_state["bounded_mcp_answer_validated"])
    self.assertEqual(step_state["acceptance_status"], "rejected")
def test_memory_checkpoint_can_validate_honest_no_scope_answer(self) -> None:
    # A step tagged "memory" + "scope_guard" gets a reply stating that no
    # company or counterparty is selected in the dialog, while the debug
    # payload shows an address query that fell back with "no_rows". The step
    # is expected to be accepted via the memory checkpoint flag.
    step_definition = {
        "step_id": "step_01",
        "title": "Memory checkpoint",
        "depends_on": [],
        "question_template": "is any company or counterparty selected in the current dialog?",
        "semantic_tags": ["memory", "scope_guard"],
        "required_answer_shape": "direct_answer_first",
    }
    reply = {
        "reply_type": "partial_coverage",
        "text": "В текущем диалоге не выбрана компания или контрагент; память не выдумываю.",
        "message_id": "msg-1",
        "trace_id": "trace-1",
    }
    debug_payload = {
        "detected_mode": "address_query",
        "detected_intent": "customer_revenue_and_payments",
        "fallback_type": "no_rows",
    }
    artifact = {
        "assistant_message": reply,
        "technical_debug_payload": debug_payload,
        "session_summary": {},
    }

    step_state = dcl.build_scenario_step_state(
        scenario_id="memory_demo",
        domain="agentic_loop",
        step=step_definition,
        step_index=1,
        question_resolved="is any company or counterparty selected in the current dialog?",
        analysis_context={},
        turn_artifact=artifact,
        entries=[],
    )

    self.assertEqual(step_state["execution_status"], "partial")
    self.assertTrue(step_state["memory_checkpoint_validated"])
    self.assertEqual(step_state["acceptance_status"], "validated")
def test_deterministic_chat_memory_checkpoint_validates_without_exact_capability(self) -> None:
    # The turn was handled in "chat" mode and the debug payload names
    # "deterministic_memory_recap_contract" as the response source; no
    # capability id or recipe is present. The memory checkpoint is still
    # expected to validate the step.
    step_definition = {
        "step_id": "step_01",
        "title": "Memory checkpoint",
        "depends_on": [],
        "question_template": "current dialog memory checkpoint",
        "semantic_tags": ["memory", "scope_guard"],
        "required_answer_shape": "direct_answer_first",
    }
    reply_text = (
        "Коротко: в текущем диалоге я не вижу выбранной компании, контрагента или позиции. "
        "Память про «Группа СВК» в этом диалоге не подтверждена."
    )
    reply = {
        "reply_type": "factual_with_explanation",
        "text": reply_text,
        "message_id": "msg-1",
        "trace_id": "trace-1",
    }
    debug_payload = {
        "detected_mode": "chat",
        "fallback_type": "none",
        "living_router_reason": "memory_recap_followup_detected",
        "living_chat_response_source": "deterministic_memory_recap_contract",
    }
    artifact = {
        "assistant_message": reply,
        "technical_debug_payload": debug_payload,
        "session_summary": {},
    }

    step_state = dcl.build_scenario_step_state(
        scenario_id="memory_demo",
        domain="agentic_loop",
        step=step_definition,
        step_index=1,
        question_resolved="current dialog memory checkpoint",
        analysis_context={},
        turn_artifact=artifact,
        entries=[],
    )

    self.assertEqual(step_state["execution_status"], "partial")
    self.assertTrue(step_state["memory_checkpoint_validated"])
    self.assertEqual(step_state["acceptance_status"], "validated")
def test_confirmed_runtime_factual_answer_can_validate_without_exact_route_mode(self) -> None:
    # The capability route mode is "heuristic", yet the payload reports a
    # non-empty MCP match with result_mode "confirmed_balance". The step is
    # expected to stay "partial" in execution status and still be accepted
    # through the runtime factual answer flag.
    step_definition = {
        "step_id": "step_01",
        "title": "Account 60 tails",
        "depends_on": [],
        "question_template": "show account 60 tails",
        "required_answer_shape": "direct_answer_first",
    }
    reply = {
        "reply_type": "factual",
        "text": "Коротко: по счету 60 найдено 8 строк хвостов; контрагентов с сигналом: 6.",
        "message_id": "msg-1",
        "trace_id": "trace-1",
    }
    debug_payload = {
        "detected_mode": "address_query",
        "detected_intent": "open_items_by_counterparty_or_contract",
        "selected_recipe": "address_open_items_by_party_or_contract_v1",
        "capability_id": "address_open_items_by_counterparty_or_contract",
        "capability_route_mode": "heuristic",
        "fallback_type": "none",
        "mcp_call_status": "matched_non_empty",
        "response_type": "FACTUAL_LIST",
        "result_mode": "confirmed_balance",
    }
    artifact = {
        "assistant_message": reply,
        "technical_debug_payload": debug_payload,
        "session_summary": {},
    }

    step_state = dcl.build_scenario_step_state(
        scenario_id="runtime_factual_demo",
        domain="agentic_loop",
        step=step_definition,
        step_index=1,
        question_resolved="show account 60 tails",
        analysis_context={},
        turn_artifact=artifact,
        entries=[],
    )

    self.assertEqual(step_state["execution_status"], "partial")
    self.assertTrue(step_state["runtime_factual_answer_validated"])
    self.assertEqual(step_state["acceptance_status"], "validated")
def test_exact_confirmed_document_followup_sets_runtime_factual_validation(self) -> None:
    # Second step of a scenario (depends on "s01_svk_money"): the payload
    # reports an "exact" route, confirmed truth mode, full coverage and a
    # counterparty focus object. Expected: execution status "exact" together
    # with the runtime factual answer flag and a validated acceptance status.
    step_definition = {
        "step_id": "s02_svk_docs",
        "title": "Counterparty documents follow-up",
        "depends_on": ["s01_svk_money"],
        "question_template": "show documents by this chain",
        "semantic_tags": ["counterparty", "documents", "scope_guard"],
        "required_answer_shape": "direct_answer_first",
    }
    reply = {
        "reply_type": "factual",
        "text": "Контрагент: Группа СВК. Найдено документов: 19.",
        "message_id": "msg-1",
        "trace_id": "trace-1",
    }
    focus_object = {
        "object_type": "counterparty",
        "object_id": "counterparty:группа свк",
        "label": "Группа СВК",
    }
    debug_payload = {
        "detected_mode": "address_query",
        "detected_intent": "list_documents_by_counterparty",
        "selected_recipe": "address_documents_by_counterparty_v1",
        "capability_id": "documents_drilldown",
        "capability_route_mode": "exact",
        "fallback_type": "none",
        "mcp_call_status": "matched_non_empty",
        "response_type": "FACTUAL_LIST",
        "truth_mode": "confirmed",
        "answer_shape": "confirmed_factual",
        "coverage_status": "full",
        "evidence_grade": "strong",
        "extracted_filters": {"counterparty": "Группа СВК", "as_of_date": "2026-05-09"},
        "focus_object": focus_object,
    }
    artifact = {
        "assistant_message": reply,
        "technical_debug_payload": debug_payload,
        "session_summary": {},
    }

    step_state = dcl.build_scenario_step_state(
        scenario_id="svk_pivot",
        domain="agentic_loop",
        step=step_definition,
        step_index=2,
        question_resolved="show documents by this chain",
        analysis_context={"as_of_date": "2026-05-09"},
        turn_artifact=artifact,
        entries=[{"item": "2021-11-10T12:00:07Z"}],
    )

    self.assertEqual(step_state["execution_status"], "exact")
    self.assertTrue(step_state["runtime_factual_answer_validated"])
    self.assertEqual(step_state["acceptance_status"], "validated")
def test_heuristic_open_items_guarded_insufficiency_validates_separately(self) -> None:
    # The payload marks the result as heuristic candidates with an unconfirmed
    # balance (truth_mode "limited"). The reply text is kept as escaped
    # literals exactly as authored. Expected: the runtime factual flag stays
    # False, while the guarded insufficiency flag accepts the step.
    answer_text = (
        "\u041a\u043e\u0440\u043e\u0442\u043a\u043e: \u0442\u043e\u0447\u043d\u044b\u0439 "
        "\u043e\u0442\u043a\u0440\u044b\u0442\u044b\u0439 \u043e\u0441\u0442\u0430\u0442\u043e\u043a "
        "\u043f\u043e \u0441\u0447\u0435\u0442\u0443 60 \u043d\u0435 "
        "\u043f\u043e\u0434\u0442\u0432\u0435\u0440\u0436\u0434\u0435\u043d; \u043d\u0438\u0436\u0435 "
        "\u0442\u043e\u043b\u044c\u043a\u043e \u043f\u0440\u0435\u0434\u0432\u0430\u0440\u0438\u0442\u0435\u043b\u044c\u043d\u044b\u0435 "
        "\u0441\u0438\u0433\u043d\u0430\u043b\u044b \u043f\u043e \u0434\u0432\u0438\u0436\u0435\u043d\u0438\u044f\u043c: 8 "
        "\u0441\u0442\u0440\u043e\u043a.\n"
        "\u042d\u0442\u043e \u043d\u0435 \u043f\u043e\u0434\u0442\u0432\u0435\u0440\u0436\u0434\u0435\u043d\u043d\u043e\u0435 "
        "\u0441\u0430\u043b\u044c\u0434\u043e: \u0442\u0435\u043a\u0443\u0449\u0438\u0439 "
        "\u043a\u043e\u043d\u0442\u0443\u0440 \u0432\u0438\u0434\u0438\u0442 "
        "\u0434\u0432\u0438\u0436\u0435\u043d\u0438\u044f-\u043a\u0430\u043d\u0434\u0438\u0434\u0430\u0442\u044b, "
        "\u043d\u043e \u043d\u0435 \u0434\u043e\u043a\u0430\u0437\u044b\u0432\u0430\u0435\u0442 "
        "\u043e\u0441\u0442\u0430\u0442\u043e\u043a."
    )
    step_definition = {
        "step_id": "step_01",
        "title": "Account 60 limited tails",
        "depends_on": [],
        "question_template": "show account 60 tails; say if exact data is unavailable",
        "required_answer_shape": "direct_answer_first",
    }
    reply = {
        "reply_type": "factual",
        "text": answer_text,
        "message_id": "msg-1",
        "trace_id": "trace-1",
    }
    debug_payload = {
        "detected_mode": "address_query",
        "detected_intent": "open_items_by_counterparty_or_contract",
        "selected_recipe": "address_open_items_by_party_or_contract_v1",
        "capability_id": "address_open_items_by_counterparty_or_contract",
        "capability_route_mode": "heuristic",
        "fallback_type": "none",
        "mcp_call_status": "matched_non_empty",
        "response_type": "FACTUAL_LIST",
        "result_mode": "heuristic_candidates",
        "balance_confirmed": False,
        "truth_mode": "limited",
        "answer_shape": "limited_with_reason",
    }
    artifact = {
        "assistant_message": reply,
        "technical_debug_payload": debug_payload,
        "session_summary": {},
    }

    step_state = dcl.build_scenario_step_state(
        scenario_id="runtime_factual_demo",
        domain="agentic_loop",
        step=step_definition,
        step_index=1,
        question_resolved="show account 60 tails; say if exact data is unavailable",
        analysis_context={},
        turn_artifact=artifact,
        entries=[],
    )

    self.assertEqual(step_state["execution_status"], "partial")
    self.assertEqual(step_state["truth_mode"], "limited")
    self.assertEqual(step_state["answer_shape"], "limited_with_reason")
    self.assertFalse(step_state["runtime_factual_answer_validated"])
    self.assertTrue(step_state["guarded_insufficiency_validated"])
    self.assertEqual(step_state["acceptance_status"], "validated")
def test_heuristic_open_items_without_limitation_is_rejected(self) -> None:
    # Same limited/heuristic debug payload as the guarded-insufficiency case,
    # but the reply text presents the rows as plain facts without any
    # limitation wording. Neither validation flag may be set and the step
    # must end up rejected.
    step_definition = {
        "step_id": "step_01",
        "title": "Account 60 unguarded tails",
        "depends_on": [],
        "question_template": "show account 60 tails",
        "required_answer_shape": "direct_answer_first",
    }
    reply = {
        "reply_type": "factual",
        "text": "Short: account 60 has 8 open-item rows and 6 counterparties.",
        "message_id": "msg-1",
        "trace_id": "trace-1",
    }
    debug_payload = {
        "detected_mode": "address_query",
        "detected_intent": "open_items_by_counterparty_or_contract",
        "selected_recipe": "address_open_items_by_party_or_contract_v1",
        "capability_id": "address_open_items_by_counterparty_or_contract",
        "capability_route_mode": "heuristic",
        "fallback_type": "none",
        "mcp_call_status": "matched_non_empty",
        "response_type": "FACTUAL_LIST",
        "result_mode": "heuristic_candidates",
        "balance_confirmed": False,
        "truth_mode": "limited",
        "answer_shape": "limited_with_reason",
    }
    artifact = {
        "assistant_message": reply,
        "technical_debug_payload": debug_payload,
        "session_summary": {},
    }

    step_state = dcl.build_scenario_step_state(
        scenario_id="runtime_factual_demo",
        domain="agentic_loop",
        step=step_definition,
        step_index=1,
        question_resolved="show account 60 tails",
        analysis_context={},
        turn_artifact=artifact,
        entries=[],
    )

    self.assertEqual(step_state["execution_status"], "partial")
    self.assertFalse(step_state["runtime_factual_answer_validated"])
    self.assertFalse(step_state["guarded_insufficiency_validated"])
    self.assertEqual(step_state["acceptance_status"], "rejected")
def test_truth_harness_warns_on_catalog_alignment_divergence(self) -> None: def test_truth_harness_warns_on_catalog_alignment_divergence(self) -> None:
reviewed = dth.evaluate_truth_step( reviewed = dth.evaluate_truth_step(
step={ step={

View File

@ -1,11 +1,8 @@
from __future__ import annotations from __future__ import annotations
import json
import sys import sys
import tempfile
import unittest import unittest
from pathlib import Path from pathlib import Path
from types import SimpleNamespace
sys.path.insert(0, str(Path(__file__).resolve().parent)) sys.path.insert(0, str(Path(__file__).resolve().parent))
@ -13,228 +10,55 @@ sys.path.insert(0, str(Path(__file__).resolve().parent))
import save_agent_semantic_run as saver import save_agent_semantic_run as saver
def write_json(path: Path, payload: object) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
class SaveAgentSemanticRunTests(unittest.TestCase): class SaveAgentSemanticRunTests(unittest.TestCase):
def test_extract_questions_accepts_truth_harness_question_template(self) -> None: def test_extract_questions_resolves_scenario_pack_bindings(self) -> None:
questions = saver.extract_questions_from_spec( spec = {
{ "schema_version": "domain_scenario_pack_v1",
"steps": [ "bindings": {
{"step_id": "step_01", "question_template": "first question"}, "main_organization": "ООО Альтернатива Плюс",
{"step_id": "step_02", "question": "second question"}, "control_year": "2020",
] "svk_counterparty": "Группа СВК",
} },
) "scenarios": [
self.assertEqual(questions, ["first question", "second question"])
def test_extract_questions_accepts_domain_pack_scenarios(self) -> None:
questions = saver.extract_questions_from_spec(
{
"pack_id": "demo_pack",
"scenarios": [
{
"scenario_id": "scenario_01",
"steps": [
{"step_id": "step_01", "question_template": "first question"},
{"step_id": "step_02", "question": "second question"},
],
},
{
"scenario_id": "scenario_02",
"steps": [
{"step_id": "step_01", "question": "first question"},
{"step_id": "step_02", "question": "third question"},
],
},
],
}
)
self.assertEqual(questions, ["first question", "second question", "third question"])
def test_validate_accepted_run_dir_accepts_clean_business_review(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
run_dir = Path(tmp)
write_json(
run_dir / "pack_state.json",
{ {
"final_status": "accepted", "scenario_id": "biz",
"review_overall_status": "pass", "steps": [
"acceptance_gate_passed": True,
"no_unresolved_p0": True,
"unresolved_p0_count": 0,
"steps_total": 1,
"steps_passed": 1,
"steps_failed": 0,
},
)
write_json(run_dir / "truth_review.json", {"summary": {"overall_status": "pass"}})
write_json(
run_dir / "business_review.json",
{
"overall_business_status": "pass",
"steps_with_business_failures": 0,
"steps_with_business_warnings": 0,
},
)
metadata = saver.validate_accepted_run_dir(run_dir)
self.assertEqual(metadata["validation_status"], "accepted_live_replay")
self.assertTrue(metadata["saved_after_validated_replay"])
def test_validate_accepted_run_dir_rejects_business_review_failures(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
run_dir = Path(tmp)
write_json(
run_dir / "pack_state.json",
{
"final_status": "accepted",
"review_overall_status": "pass",
"acceptance_gate_passed": True,
"no_unresolved_p0": True,
"unresolved_p0_count": 0,
},
)
write_json(run_dir / "truth_review.json", {"summary": {"overall_status": "pass"}})
write_json(
run_dir / "business_review.json",
{
"overall_business_status": "fail",
"steps_with_business_failures": 1,
},
)
with self.assertRaisesRegex(RuntimeError, "business_review"):
saver.validate_accepted_run_dir(run_dir)
def test_validate_accepted_run_dir_accepts_clean_domain_pack_loop(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
loop_dir = Path(tmp)
iteration_dir = loop_dir / "iterations" / "iteration_00"
analyst_path = iteration_dir / "analyst_verdict.json"
repair_targets_path = iteration_dir / "pack_output" / "pack_run" / "repair_targets.json"
write_json(
loop_dir / "loop_state.json",
{
"loop_id": "stage_demo",
"target_score": 88,
"final_status": "accepted",
"iterations": [
{ {
"iteration_id": "iteration_00", "question": "Дай обзор {{bindings.main_organization}} за {{bindings.control_year}} год.",
"quality_score": 91, "semantic_tags": ["business_overview", "money"],
"accepted_gate": True, },
"analyst_accepted_gate": True,
"deterministic_gate_ok": True,
"repair_target_count": 0,
"repair_target_severity_counts": {"P0": 0, "P1": 0, "P2": 0},
"analyst_verdict_path": str(analyst_path),
"repair_targets_path": str(repair_targets_path),
}
],
},
)
write_json(
analyst_path,
{
"loop_decision": "accepted",
"unresolved_p0_count": 0,
"regression_detected": False,
"direct_answer_ok": True,
"business_usefulness_ok": True,
"temporal_honesty_ok": True,
"field_truth_ok": True,
"answer_layering_ok": True,
},
)
write_json(repair_targets_path, {"severity_counts": {"P0": 0, "P1": 0, "P2": 0}})
metadata = saver.validate_accepted_run_dir(loop_dir)
self.assertEqual(metadata["validation_status"], "accepted_domain_pack_loop")
self.assertEqual(metadata["quality_score"], 91)
def test_validate_accepted_run_dir_rejects_domain_pack_loop_with_p1_targets(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
loop_dir = Path(tmp)
iteration_dir = loop_dir / "iterations" / "iteration_00"
analyst_path = iteration_dir / "analyst_verdict.json"
repair_targets_path = iteration_dir / "pack_output" / "pack_run" / "repair_targets.json"
write_json(
loop_dir / "loop_state.json",
{
"loop_id": "stage_demo",
"target_score": 88,
"final_status": "accepted",
"iterations": [
{ {
"quality_score": 91, "question": "Отдельно по {{bindings.svk_counterparty}} покажи документы.",
"accepted_gate": True, "semantic_tags": ["counterparty", "documents"],
"analyst_accepted_gate": True, },
"deterministic_gate_ok": True,
"analyst_verdict_path": str(analyst_path),
"repair_targets_path": str(repair_targets_path),
}
], ],
}, }
) ],
write_json( }
analyst_path,
{
"loop_decision": "accepted",
"unresolved_p0_count": 0,
"regression_detected": False,
"direct_answer_ok": True,
"business_usefulness_ok": True,
"temporal_honesty_ok": True,
"field_truth_ok": True,
"answer_layering_ok": True,
},
)
write_json(repair_targets_path, {"severity_counts": {"P0": 0, "P1": 1, "P2": 0}})
with self.assertRaisesRegex(RuntimeError, "repair_targets"): questions = saver.extract_questions_from_spec(spec)
saver.validate_accepted_run_dir(loop_dir)
def test_save_gate_refuses_real_write_without_validation(self) -> None: self.assertEqual(
args = SimpleNamespace( questions,
validated_run_dir=None, [
dry_run=False, "Дай обзор ООО Альтернатива Плюс за 2020 год.",
allow_unvalidated=False, "Отдельно по Группа СВК покажи документы.",
unvalidated_reason=None, ],
)
self.assertFalse(any("{{bindings." in question for question in questions))
self.assertEqual(
saver.extract_semantic_tags(spec),
["business_overview", "counterparty", "documents", "money"],
) )
with self.assertRaisesRegex(RuntimeError, "Refusing to save AGENT autorun"): def test_extract_questions_refuses_unresolved_bindings(self) -> None:
saver.build_save_gate_metadata(args, {}, Path("demo.json")) spec = {
"questions": ["Что с НДС за {{bindings.control_year}} год?"],
"bindings": {},
}
def test_save_gate_requires_reason_for_unvalidated_draft(self) -> None: with self.assertRaisesRegex(RuntimeError, "unresolved bindings"):
args = SimpleNamespace( saver.extract_questions_from_spec(spec)
validated_run_dir=None,
dry_run=False,
allow_unvalidated=True,
unvalidated_reason="",
)
with self.assertRaisesRegex(RuntimeError, "--unvalidated-reason"):
saver.build_save_gate_metadata(args, {}, Path("demo.json"))
def test_save_gate_marks_explicit_unvalidated_draft(self) -> None:
args = SimpleNamespace(
validated_run_dir=None,
dry_run=False,
allow_unvalidated=True,
unvalidated_reason="manual GUI canary before live replay",
)
metadata = saver.build_save_gate_metadata(args, {}, Path("demo.json"))
self.assertEqual(metadata["validation_status"], "explicitly_unvalidated")
self.assertFalse(metadata["saved_after_validated_replay"])
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -29,13 +29,14 @@ def args() -> argparse.Namespace:
temperature=0.0, temperature=0.0,
max_output_tokens=2048, max_output_tokens=2048,
timeout_seconds=180, timeout_seconds=180,
repair_mode=None,
codex_binary="codex", codex_binary="codex",
codex_profile=None, codex_profile=None,
codex_model=None, codex_model=None,
analyst_codex_model="gpt-5.4", analyst_codex_model="gpt-5.4",
coder_codex_model="gpt-5.4-mini", coder_codex_model="gpt-5.4",
analyst_reasoning_effort="medium", analyst_reasoning_effort="medium",
coder_reasoning_effort="low", coder_reasoning_effort="high",
codex_timeout_seconds=1800, codex_timeout_seconds=1800,
analysis_date=None, analysis_date=None,
max_scenarios=None, max_scenarios=None,
@ -81,6 +82,7 @@ class StageAgentLoopTests(unittest.TestCase):
self.assertEqual(manifest["target_score"], 88) self.assertEqual(manifest["target_score"], 88)
self.assertEqual(manifest["max_iterations"], 6) self.assertEqual(manifest["max_iterations"], 6)
self.assertEqual(manifest["repair_mode"], "lead-handoff")
self.assertTrue(manifest["save_autorun_on_accept"]) self.assertTrue(manifest["save_autorun_on_accept"])
self.assertTrue(manifest["manual_confirmation_required_after_accept"]) self.assertTrue(manifest["manual_confirmation_required_after_accept"])
@ -98,6 +100,8 @@ class StageAgentLoopTests(unittest.TestCase):
self.assertIn("91", command) self.assertIn("91", command)
self.assertIn("--max-iterations", command) self.assertIn("--max-iterations", command)
self.assertIn("4", command) self.assertIn("4", command)
self.assertIn("--repair-mode", command)
self.assertIn("lead-handoff", command)
self.assertIn("--output-root", command) self.assertIn("--output-root", command)
def test_build_stage_summary_requests_manual_confirmation_after_accept(self) -> None: def test_build_stage_summary_requests_manual_confirmation_after_accept(self) -> None:
@ -143,6 +147,7 @@ class StageAgentLoopTests(unittest.TestCase):
loop_dir / "loop_state.json", loop_dir / "loop_state.json",
{ {
"final_status": "partial", "final_status": "partial",
"repair_mode": "auto-coder",
"target_score": 88, "target_score": 88,
"iterations": [ "iterations": [
{ {
@ -162,6 +167,7 @@ class StageAgentLoopTests(unittest.TestCase):
"module_name": "Open-World Bounded Autonomy Breadth", "module_name": "Open-World Bounded Autonomy Breadth",
"title": "Open-world semantic control gate", "title": "Open-world semantic control gate",
"target_score": 88, "target_score": 88,
"repair_mode": "auto-coder",
}, },
loop_dir, loop_dir,
) )
@ -169,6 +175,81 @@ class StageAgentLoopTests(unittest.TestCase):
self.assertFalse(summary["manual_confirmation_required"]) self.assertFalse(summary["manual_confirmation_required"])
self.assertEqual(summary["next_action"], "continue_autonomous_or_fix_blocker") self.assertEqual(summary["next_action"], "continue_autonomous_or_fix_blocker")
def test_build_stage_summary_reruns_stale_partial_loop_for_lead_handoff(self) -> None:
    # The stored loop state carries no "repair_mode" key and its only
    # iteration ended with coder_status "no_changes", while the stage manifest
    # asks for "lead-handoff". The summary is expected to request a rerun for
    # lead handoff and to mention "lead-handoff" in the second command template.
    stale_loop_state = {
        "final_status": "needs_exact_capability",
        "target_score": 88,
        "iterations": [
            {
                "quality_score": 32,
                "loop_decision": "needs_exact_capability",
                "accepted_gate": False,
                "deterministic_gate_ok": False,
                "coder_status": "no_changes",
            }
        ],
    }
    stage_manifest = {
        "stage_id": "agent_loop",
        "module_name": "Agent Loop",
        "title": "Agent Loop",
        "target_score": 88,
        "repair_mode": "lead-handoff",
    }
    with tempfile.TemporaryDirectory() as tmp:
        loop_dir = Path(tmp)
        write_json(loop_dir / "loop_state.json", stale_loop_state)

        summary = stage_loop.build_stage_summary(stage_manifest, loop_dir)

        self.assertEqual(summary["next_action"], "rerun_stage_loop_for_lead_handoff")
        self.assertIn("lead-handoff", summary["next_step_guidance"]["command_templates"][1])
def test_build_stage_summary_routes_lead_handoff_to_lead_repair(self) -> None:
    # Both the loop state and the manifest declare "lead-handoff", and the
    # latest iteration reports coder_status "lead_handoff_required" with paths
    # to a business audit and a lead coder handoff. The summary is expected to
    # route to lead repair and expose both artifact references.
    stage_manifest = {
        "stage_id": "agent_loop",
        "module_name": "Agent Loop",
        "title": "Agent Loop",
        "target_score": 88,
        "repair_mode": "lead-handoff",
    }
    with tempfile.TemporaryDirectory() as tmp:
        loop_dir = Path(tmp)
        iteration_dir = loop_dir / "iterations" / "iteration_00"
        latest_iteration = {
            "quality_score": 42,
            "loop_decision": "needs_exact_capability",
            "accepted_gate": False,
            "deterministic_gate_ok": False,
            "business_audit_path": str(iteration_dir / "business_audit.md"),
            "lead_coder_handoff_markdown_path": str(iteration_dir / "lead_coder_handoff.md"),
            "coder_status": "lead_handoff_required",
        }
        loop_state = {
            "final_status": "needs_exact_capability",
            "repair_mode": "lead-handoff",
            "target_score": 88,
            "latest_lead_coder_handoff_markdown_path": str(loop_dir / "lead_coder_handoff.md"),
            "iterations": [latest_iteration],
        }
        write_json(loop_dir / "loop_state.json", loop_state)

        summary = stage_loop.build_stage_summary(stage_manifest, loop_dir)

        self.assertEqual(summary["repair_mode"], "lead-handoff")
        self.assertEqual(summary["next_action"], "lead_coder_repair_required")
        self.assertIn("lead_coder_handoff", summary["latest_lead_coder_handoff"])
        self.assertIn("business_audit", summary["latest_business_audit"])
def test_build_stage_summary_blocks_close_when_repair_lacks_validation(self) -> None: def test_build_stage_summary_blocks_close_when_repair_lacks_validation(self) -> None:
with tempfile.TemporaryDirectory() as tmp: with tempfile.TemporaryDirectory() as tmp:
loop_dir = Path(tmp) loop_dir = Path(tmp)
@ -817,6 +898,205 @@ class StageAgentLoopTests(unittest.TestCase):
self.assertEqual(result["performed_action"], "materialize_stage_run_dry_run") self.assertEqual(result["performed_action"], "materialize_stage_run_dry_run")
self.assertIn("run-pack-loop", command_text) self.assertIn("run-pack-loop", command_text)
def test_handle_review_questions_scores_stage_pack_quality(self) -> None:
    # Writes a stage manifest plus a two-scenario pack (four steps each) to a
    # temporary directory, runs the review-questions handler, then reads back
    # the two JSON artifacts it is expected to leave under
    # <output_root>/agent_loop and checks the reported quality figures.
    company_overview_steps = [
        {
            "step_id": "step_01",
            "question": "Дай бизнес-обзор {{bindings.organization}}: деньги, НДС, долги и что нельзя утверждать.",
            "semantic_tags": ["business_overview", "money", "vat", "debt", "scope_guard"],
            "required_answer_shape": "direct_answer_first",
            "forbidden_answer_patterns": ["(?i)runtime_"],
        },
        {
            "step_id": "step_02",
            "question": "Раскрой деньги подробнее: сколько получили и заплатили.",
            "depends_on": ["step_01"],
            "semantic_tags": ["money"],
            "required_answer_shape": "direct_answer_first",
        },
        {
            "step_id": "step_03",
            "question": "Что с НДС за 2020 год и на каких документах это основано?",
            "depends_on": ["step_02"],
            "semantic_tags": ["vat", "documents"],
            "required_answer_shape": "direct_answer_first",
        },
        {
            "step_id": "step_04",
            "question": "Теперь за все время не тащи НДС за 2020 как общую позицию.",
            "depends_on": ["step_03"],
            "semantic_tags": ["vat", "scope_guard"],
            "required_answer_shape": "direct_answer_first",
        },
    ]
    counterparty_pivot_steps = [
        {
            "step_id": "step_01",
            "question": "Отдельно по контрагенту {{bindings.counterparty}}: сколько денег прошло и какие документы есть?",
            "semantic_tags": ["counterparty", "money", "documents", "scope_guard"],
            "required_answer_shape": "direct_answer_first",
            "forbidden_answer_patterns": ["(?i)capability_id"],
        },
        {
            "step_id": "step_02",
            "question": "Покажи документы по этой цепочке и не смешивай контрагента с организацией.",
            "depends_on": ["step_01"],
            "semantic_tags": ["counterparty", "documents", "scope_guard"],
            "required_answer_shape": "direct_answer_first",
        },
        {
            "step_id": "step_03",
            "question": "Что было на складе на март 2021?",
            "depends_on": ["step_02"],
            "semantic_tags": ["inventory"],
            "required_answer_shape": "direct_answer_first",
        },
        {
            "step_id": "step_04",
            "question": "Собери итог: что можно выводить по компании и что нельзя.",
            "depends_on": ["step_03"],
            "semantic_tags": ["business_overview", "scope_guard"],
            "required_answer_shape": "direct_answer_first",
        },
    ]
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        manifest_path = root / "stage.json"
        pack_path = root / "pack.json"
        output_root = root / "stage_runs"
        write_json(
            manifest_path,
            {
                "stage_id": "agent_loop",
                "module_name": "Agent Loop",
                "title": "Agent Loop",
                "pack_manifest": str(pack_path),
                "target_score": 88,
            },
        )
        write_json(
            pack_path,
            {
                "schema_version": "domain_scenario_pack_v1",
                "pack_id": "agent_loop_pack",
                "bindings": {
                    "organization": "ООО Альтернатива Плюс",
                    "counterparty": "Группа СВК",
                },
                "scenarios": [
                    {"scenario_id": "company_overview", "steps": company_overview_steps},
                    {"scenario_id": "counterparty_pivot", "steps": counterparty_pivot_steps},
                ],
            },
        )

        exit_code = stage_loop.handle_review_questions(
            stage_args(manifest=str(manifest_path), output_root=str(output_root))
        )

        # Both artifacts are read from the directory named after the stage id.
        stage_dir = output_root / "agent_loop"
        review_text = (stage_dir / "question_generation_review.json").read_text(encoding="utf-8")
        review = json.loads(review_text)
        capsule_text = (stage_dir / "stage_context_capsule.json").read_text(encoding="utf-8")
        capsule = json.loads(capsule_text)

        self.assertEqual(exit_code, 0)
        self.assertEqual(review["status"], "strong")
        self.assertEqual(capsule["repair_mode"], "lead-handoff")
        self.assertEqual(capsule["question_generation"]["status"], "strong")
        self.assertGreaterEqual(review["score"], 85)
        self.assertEqual(review["question_count"], 8)
        self.assertGreaterEqual(review["domain_counts"]["vat"], 2)
        self.assertFalse(review["weak_flag_counts"])
def test_review_questions_flags_mojibake_before_live_replay(self) -> None:
    # The single question in the pack is written as escaped code points that
    # form mis-decoded (mojibake) text. The review is expected to grade the
    # pack "weak" and count exactly one "mojibake_question_text" flag.
    broken_step = {
        "step_id": "step_01",
        "question": "\u0420\u201d\u0420\u00b0\u0420\u2116 \u0420\u00b1\u0420\u0451\u0420\u00b7\u0420\u0405\u0420\u00b5\u0421\u0403-\u0420\u0455\u0420\u00b1\u0420\u00b7\u0420\u0455\u0421\u0402 \u0420\u0454\u0420\u0455\u0420\u0458\u0420\u0457\u0420\u00b0\u0420\u0405\u0420\u0451\u0420\u0451.",
        "required_answer_shape": "direct_answer_first",
    }
    broken_pack = {
        "schema_version": "domain_scenario_pack_v1",
        "pack_id": "broken_pack",
        "scenarios": [
            {
                "scenario_id": "broken",
                "steps": [broken_step],
            }
        ],
    }
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        pack_path = root / "pack.json"
        write_json(pack_path, broken_pack)

        stage_manifest = {
            "stage_id": "broken_stage",
            "module_name": "Broken Stage",
            "title": "Broken Stage",
            "pack_manifest": str(pack_path),
        }
        review = stage_loop.build_stage_question_generation_review(stage_manifest)

        self.assertEqual(review["status"], "weak")
        self.assertEqual(review["weak_flag_counts"]["mojibake_question_text"], 1)
def test_review_questions_flags_windows_artifact_path_risk(self) -> None:
    # Stage, loop, scenario and step ids are deliberately long and the
    # manifest sets "artifact_path_warning_limit" to 120. The review is
    # expected to raise one "artifact_path_too_long_for_windows" flag and to
    # report a maximum estimated artifact path of at least 120.
    long_step = {
        "step_id": "very_long_step_id_for_windows_path_budget",
        "question": "Дай бизнес-обзор компании: деньги, НДС, документы и что нельзя утверждать.",
        "semantic_tags": [
            "business_overview",
            "money",
            "vat",
            "documents",
            "counterparty",
            "scope_guard",
        ],
        "required_answer_shape": "direct_answer_first",
        "forbidden_answer_patterns": ["(?i)runtime_"],
    }
    path_pack = {
        "schema_version": "domain_scenario_pack_v1",
        "pack_id": "path_pack",
        "scenarios": [
            {
                "scenario_id": "very_long_scenario_id_for_windows_path_budget",
                "steps": [long_step],
            }
        ],
    }
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        pack_path = root / "pack.json"
        write_json(pack_path, path_pack)

        stage_manifest = {
            "stage_id": "long_stage_name_for_path_budget",
            "module_name": "Path Budget",
            "title": "Path Budget",
            "pack_manifest": str(pack_path),
            "loop_id": "long_loop_name_for_path_budget",
            "artifact_path_warning_limit": 120,
        }
        review = stage_loop.build_stage_question_generation_review(
            stage_manifest,
            stage_dir=root / "stage_runs" / "long_stage_name_for_path_budget",
        )

        self.assertEqual(review["weak_flag_counts"]["artifact_path_too_long_for_windows"], 1)
        self.assertGreaterEqual(review["max_estimated_artifact_path"], 120)
def test_build_stage_status_reports_cold_start_continue_artifacts(self) -> None: def test_build_stage_status_reports_cold_start_continue_artifacts(self) -> None:
with tempfile.TemporaryDirectory() as tmp: with tempfile.TemporaryDirectory() as tmp:
root = Path(tmp) root = Path(tmp)
@ -937,12 +1217,14 @@ class StageAgentLoopTests(unittest.TestCase):
summary = json.loads((stage_dir / "stage_loop_summary.json").read_text(encoding="utf-8")) summary = json.loads((stage_dir / "stage_loop_summary.json").read_text(encoding="utf-8"))
handoff_exists = (stage_dir / "stage_loop_handoff.md").exists() handoff_exists = (stage_dir / "stage_loop_handoff.md").exists()
repair_handoff_exists = (stage_dir / "stage_repair_handoff.md").exists() repair_handoff_exists = (stage_dir / "stage_repair_handoff.md").exists()
context_capsule_exists = (stage_dir / "stage_context_capsule.md").exists()
review_exists = (stage_dir / "gui_run_reviews" / run_id / "run_review.json").exists() review_exists = (stage_dir / "gui_run_reviews" / run_id / "run_review.json").exists()
self.assertEqual(exit_code, 0) self.assertEqual(exit_code, 0)
self.assertEqual(summary["next_action"], "continue_repair_from_gui_review_p0") self.assertEqual(summary["next_action"], "continue_repair_from_gui_review_p0")
self.assertTrue(handoff_exists) self.assertTrue(handoff_exists)
self.assertTrue(repair_handoff_exists) self.assertTrue(repair_handoff_exists)
self.assertTrue(context_capsule_exists)
self.assertTrue(review_exists) self.assertTrue(review_exists)