Развить агентную semantic loop систему
This commit is contained in:
parent
48c3b5340b
commit
f86cb8e886
|
|
@ -39,7 +39,7 @@ Use these repo-native capture paths:
|
|||
- import existing technical export: `python scripts/domain_case_loop.py import-export ...`
|
||||
- `run-case` defaults to the repo's live local profile: `local / qwen2.5-14b-instruct-1m / http://127.0.0.1:1234/v1`
|
||||
- override with `--llm-provider`, `--llm-model`, `--llm-base-url`, `--llm-api-key` when needed
|
||||
- `run-pack-loop` defaults to `gpt-5.4` for analyst and `gpt-5.4-mini` for coder; tune with `--analyst-codex-model`, `--coder-codex-model`, `--analyst-reasoning-effort`, `--coder-reasoning-effort`
|
||||
- `run-pack-loop` defaults to `gpt-5.4` for the independent business analyst and `lead-handoff` repair mode; opt into the old autonomous coder loop only with `--repair-mode auto-coder`
|
||||
|
||||
## Workflow
|
||||
|
||||
|
|
@ -77,13 +77,14 @@ In pack mode:
|
|||
|
||||
### Autonomous pack-loop mode
|
||||
|
||||
Use autonomous pack-loop mode when the user wants the system to continue with analyst/coder iterations until the analyst gate is reached or the loop hits a real blocker.
|
||||
Use pack-loop mode when the user wants the system to run live replay, produce a strong business-first analyst verdict, and continue toward repair evidence until the analyst gate is reached or the loop hits a real blocker.
|
||||
|
||||
In autonomous pack-loop mode:
|
||||
- run `python scripts/domain_case_loop.py run-pack-loop --manifest ...`;
|
||||
- keep each iteration under `artifacts/domain_runs/<loop_id>/iterations/<iteration_id>/`;
|
||||
- read `analyst_verdict.json` before any coder patch;
|
||||
- let coder patch only the highest-value domain targets from the current analyst verdict;
|
||||
- by default, stop after the analyst verdict with `business_audit.md` and `lead_coder_handoff.md` so Lead Codex repairs code in the main context;
|
||||
- let an autonomous coder patch only when `--repair-mode auto-coder` is explicitly selected, and only against the highest-value domain targets from the current analyst verdict;
|
||||
- stop only on `accepted`, `blocked`, explicit `requires_user_decision = true`, or `max_iterations`;
|
||||
- do not stop just because the analyst returns `needs_exact_capability` or `partial` if autonomous domain enablement work still remains;
|
||||
- treat `quality score >= 80` as the target gate, not as permission to keep pushing through hard blockers, missing essential observations, or unsafe fixes.
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
"pack_id": "agentic_semantic_development_loop_stage_pack",
|
||||
"domain": "agentic_semantic_development_loop_control",
|
||||
"title": "Agentic semantic development loop control pack",
|
||||
"description": "Compact stage pack for dogfooding the agentic development loop against business-overview, VAT, stale-scope, and legacy-canary questions.",
|
||||
"description": "Stage pack for dogfooding the agentic development loop against business overview, VAT, stale scope, counterparty pivots, legacy route canaries, and answer-shape quality.",
|
||||
"analysis_context": {
|
||||
"as_of_date": "2026-05-09",
|
||||
"source": "agentic_semantic_development_loop_stage_pack"
|
||||
|
|
@ -15,15 +15,17 @@
|
|||
},
|
||||
"scenarios": [
|
||||
{
|
||||
"scenario_id": "agentic_loop_business_overview_control",
|
||||
"scenario_id": "biz_scope",
|
||||
"title": "Business overview and stale-scope control",
|
||||
"description": "Checks direct business-answer shape, period carryover, all-time reset, VAT boundary, and organization scope hygiene.",
|
||||
"steps": [
|
||||
{
|
||||
"step_id": "step_01_business_overview",
|
||||
"step_id": "s01_biz",
|
||||
"title": "Business overview for explicit period",
|
||||
"node_role": "root",
|
||||
"question": "Дай взрослый бизнес-обзор {{bindings.main_organization}} за {{bindings.control_year}} год по данным 1С: обороты, входящие и исходящие деньги, нетто, НДС, долги, склад, клиенты, поставщики и что пока нельзя утверждать.",
|
||||
"expected_intents": ["business_overview"],
|
||||
"semantic_tags": ["business_overview", "money", "vat", "debt", "inventory", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
"forbidden_answer_patterns": [
|
||||
"(?i)capability_id",
|
||||
|
|
@ -33,45 +35,51 @@
|
|||
]
|
||||
},
|
||||
{
|
||||
"step_id": "step_02_money_followup",
|
||||
"step_id": "s02_money",
|
||||
"title": "Money follow-up",
|
||||
"question": "Раскрой деньги подробнее: сколько получили, сколько заплатили, какой чистый денежный поток, кто главный клиент и главный поставщик в {{bindings.control_year}}.",
|
||||
"depends_on": ["step_01_business_overview"],
|
||||
"depends_on": ["s01_biz"],
|
||||
"semantic_tags": ["money", "counterparty"],
|
||||
"required_answer_shape": "direct_answer_first"
|
||||
},
|
||||
{
|
||||
"step_id": "step_03_best_year_all_time",
|
||||
"step_id": "s03_best_year",
|
||||
"title": "All-time best operating-flow year",
|
||||
"question": "А если смотреть за все доступное время, какой самый доходный год по подтвержденным оборотам и почему? Не называй это бухгалтерской прибылью, если чистой прибыли нет.",
|
||||
"depends_on": ["step_02_money_followup"],
|
||||
"depends_on": ["s02_money"],
|
||||
"semantic_tags": ["money", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first"
|
||||
},
|
||||
{
|
||||
"step_id": "step_04_vat_explicit_period",
|
||||
"step_id": "s04_vat",
|
||||
"title": "VAT explicit period",
|
||||
"question": "Что с НДС за {{bindings.control_year}} год по {{bindings.main_organization}}: какая позиция видна, на чем она основана и чего не хватает для налогового вывода?",
|
||||
"depends_on": ["step_03_best_year_all_time"],
|
||||
"depends_on": ["s03_best_year"],
|
||||
"semantic_tags": ["vat", "documents", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first"
|
||||
},
|
||||
{
|
||||
"step_id": "step_05_all_time_no_vat_carryover",
|
||||
"step_id": "s05_all_time",
|
||||
"title": "All-time reset without stale VAT carryover",
|
||||
"question": "Теперь за все доступное время дай обзор компании в целом, но не тащи НДС за {{bindings.control_year}} как подтвержденную общую налоговую позицию.",
|
||||
"depends_on": ["step_04_vat_explicit_period"],
|
||||
"depends_on": ["s04_vat"],
|
||||
"semantic_tags": ["business_overview", "vat", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"scenario_id": "agentic_loop_counterparty_pivot_control",
|
||||
"scenario_id": "svk_pivot",
|
||||
"title": "Counterparty pivot and legacy canaries",
|
||||
"description": "Checks explicit counterparty arbitration after organization context and keeps technical/debug details out of the final answer.",
|
||||
"steps": [
|
||||
{
|
||||
"step_id": "step_01_svk_money",
|
||||
"step_id": "s01_svk_money",
|
||||
"title": "Explicit counterparty money flow",
|
||||
"question": "Теперь отдельно по контрагенту {{bindings.svk_counterparty}}: сколько денег прошло, что входящее, что исходящее и есть ли документы или движения, на которых это основано?",
|
||||
"node_role": "root",
|
||||
"question": "Отдельно по контрагенту {{bindings.svk_counterparty}}, без опоры на прошлый диалог: сколько денег прошло, что входящее, что исходящее и есть ли документы или движения, на которых это основано?",
|
||||
"expected_intents": ["value_flow"],
|
||||
"semantic_tags": ["counterparty", "money", "documents", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
"forbidden_answer_patterns": [
|
||||
"(?i)capability_id",
|
||||
|
|
@ -81,17 +89,66 @@
|
|||
]
|
||||
},
|
||||
{
|
||||
"step_id": "step_02_svk_documents",
|
||||
"step_id": "s02_svk_docs",
|
||||
"title": "Counterparty documents follow-up",
|
||||
"question": "Покажи документы по этой цепочке и не смешивай {{bindings.svk_counterparty}} с организацией {{bindings.main_organization}}.",
|
||||
"depends_on": ["step_01_svk_money"],
|
||||
"depends_on": ["s01_svk_money"],
|
||||
"semantic_tags": ["counterparty", "documents", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first"
|
||||
},
|
||||
{
|
||||
"step_id": "step_03_final_summary",
|
||||
"step_id": "s03_summary",
|
||||
"title": "Final executive summary",
|
||||
"question": "Собери короткий итог: что мы подтвердили по компании, что отдельно по {{bindings.svk_counterparty}}, какие выводы можно делать и какие нельзя.",
|
||||
"depends_on": ["step_01_svk_money", "step_02_svk_documents"],
|
||||
"depends_on": ["s01_svk_money", "s02_svk_docs"],
|
||||
"semantic_tags": ["business_overview", "counterparty", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
"required_answer_patterns_all": [
|
||||
"СВК",
|
||||
"компан"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"scenario_id": "legacy_canaries",
|
||||
"title": "Legacy route canaries and context interruptions",
|
||||
"description": "Keeps old deterministic routes and conversational interruptions in the stage pack so new agentic loop wiring does not hide regressions.",
|
||||
"steps": [
|
||||
{
|
||||
"step_id": "s01_memory",
|
||||
"title": "Memory checkpoint after prior business context",
|
||||
"node_role": "root",
|
||||
"question": "Сделай короткий стартовый чек контекста: есть ли уже выбранная компания или контрагент в текущем диалоге; если нет, скажи честно и не выдумывай память про {{bindings.svk_counterparty}}.",
|
||||
"semantic_tags": ["memory", "business_overview", "counterparty", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
"forbidden_answer_patterns": [
|
||||
"(?i)capability_id",
|
||||
"(?i)runtime_"
|
||||
]
|
||||
},
|
||||
{
|
||||
"step_id": "s02_acc60",
|
||||
"title": "Account 60 tail legacy canary",
|
||||
"question": "Покажи хвосты по счету 60 на август {{bindings.control_year}} по {{bindings.main_organization}}; если точных данных нет, скажи это прямо и не подменяй ответ общим обзором.",
|
||||
"depends_on": ["s01_memory"],
|
||||
"semantic_tags": ["debt", "documents", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first"
|
||||
},
|
||||
{
|
||||
"step_id": "s03_stock",
|
||||
"title": "Inventory route canary",
|
||||
"question": "Что было на складе на март 2021 по доступным данным? Дай прямой ответ и не уводи его в контрагента {{bindings.svk_counterparty}}.",
|
||||
"depends_on": ["s02_acc60"],
|
||||
"semantic_tags": ["inventory", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first"
|
||||
},
|
||||
{
|
||||
"step_id": "s04_all_money",
|
||||
"title": "All-money answer without counterparty leakage",
|
||||
"question": "Вернись к {{bindings.main_organization}}: сколько всего денег получили и заплатили по всем подтвержденным данным, но не смешивай это с отдельной цепочкой {{bindings.svk_counterparty}} и не называй оборот чистой прибылью.",
|
||||
"depends_on": ["s03_stock"],
|
||||
"semantic_tags": ["money", "business_overview", "counterparty", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first"
|
||||
}
|
||||
]
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ This repository now supports two outer-loop capture modes:
|
|||
- `run-case` for one concrete domain question;
|
||||
- `run-scenario` for a linked multi-step domain chain that should reuse one assistant session.
|
||||
- `run-pack` for a whole domain question pool grouped into several scenarios.
|
||||
- `run-pack-loop` for an autonomous analyst/coder loop over a whole domain pack.
|
||||
- `run-pack-loop` for a strong analyst review loop over a whole domain pack, with Lead Codex repair handoff by default.
|
||||
|
||||
`run-scenario` is the preferred capture mode for domains where the user's next question depends on the previous result set.
|
||||
`run-pack` is the preferred capture mode when the user brings a full domain pool that should be kept in one aggregate backlog.
|
||||
|
|
@ -80,7 +80,7 @@ That path is explicitly marked as unvalidated and must not be treated as semanti
|
|||
|
||||
1. take the current global/local stage manifest;
|
||||
2. run `scripts/domain_case_loop.py run-pack-loop` for that stage pack;
|
||||
3. let the loop iterate through pack replay, business-first analyst verdict, coder patch, and rerun until the objective gate is accepted, blocked, or a real user decision is required;
|
||||
3. let the loop run pack replay and a business-first analyst verdict; if the gate is not accepted, write `business_audit.md` and `lead_coder_handoff.md` instead of launching a weak coder by default;
|
||||
4. if accepted, persist the validated AGENT pack into GUI autoruns through `scripts/save_agent_semantic_run.py --validated-run-dir`;
|
||||
5. write `stage_loop_summary.json` and `stage_loop_handoff.md` for the final human visual confirmation.
|
||||
|
||||
|
|
@ -92,6 +92,7 @@ Canonical commands:
|
|||
```powershell
|
||||
python scripts/stage_agent_loop.py plan --manifest docs/orchestration/<stage_loop>.json
|
||||
python scripts/stage_agent_loop.py run --manifest docs/orchestration/<stage_loop>.json
|
||||
python scripts/stage_agent_loop.py review-questions --manifest docs/orchestration/<stage_loop>.json
|
||||
python scripts/stage_agent_loop.py ingest-gui-run --manifest docs/orchestration/<stage_loop>.json --run-id assistant-stage1-<id>
|
||||
python scripts/stage_agent_loop.py prepare-repair --manifest docs/orchestration/<stage_loop>.json
|
||||
python scripts/stage_agent_loop.py run-repair --manifest docs/orchestration/<stage_loop>.json --dry-run
|
||||
|
|
@ -100,7 +101,28 @@ python scripts/stage_agent_loop.py continue --manifest docs/orchestration/<stage
|
|||
python scripts/stage_agent_loop.py summarize --manifest docs/orchestration/<stage_loop>.json
|
||||
```
|
||||
|
||||
This is the intended path for “implement the stage, generate/check stage questions, analyze business answers, patch code, rerun, then ask the user for final visual confirmation”.
|
||||
This is the intended path for "implement the stage, generate/check stage questions, analyze business answers, patch code, rerun, then ask the user for final visual confirmation".
|
||||
|
||||
The default repair mode is `lead-handoff`. In this mode the expensive replay still runs live and the independent analyst still produces the strict business verdict, but code repair stays with the main Lead Codex context. The loop stops with `next_action = lead_coder_repair_required`, plus:
|
||||
|
||||
- `business_audit.md` for the user-facing semantic/business verdict;
|
||||
- `lead_coder_handoff.md/json` for the concrete repair target, candidate files, and validation path;
|
||||
- `stage_context_capsule.md/json` for the current stage contract, question quality, loop status, and operating model.
|
||||
|
||||
`auto-coder` remains available only as an explicit opt-in experiment:
|
||||
|
||||
```powershell
|
||||
python scripts/stage_agent_loop.py run --manifest docs/orchestration/<stage_loop>.json --repair-mode auto-coder
|
||||
```
|
||||
|
||||
That path must not be treated as the normal high-trust repair mode for this project.
|
||||
|
||||
Before launching an expensive live replay, run `review-questions`. It reads the stage pack, resolves `{{bindings.*}}` placeholders, checks scenario/follow-up density, direct-answer shape declarations, domain coverage, stale-scope canaries, dependency order, duplicates, mojibake in generated Russian questions, and estimated Windows artifact path length. It writes:
|
||||
|
||||
- `question_generation_review.json`;
|
||||
- `question_generation_review.md`.
|
||||
|
||||
A strong question review is not semantic proof that the assistant answers correctly. It is the pre-flight gate that says the generated questions are worth spending a live replay on.
|
||||
|
||||
## GUI run review bridge
|
||||
|
||||
|
|
@ -128,7 +150,7 @@ This bridge is intentionally business-first:
|
|||
- noisy direct answers, missing first-line answers, technical garbage, and over-broad business answers become findings;
|
||||
- generated question packs get a deterministic quality review for follow-up density, direct questions, report-style analysis, domain diversity, duplicates, and weak business anchors.
|
||||
|
||||
Use this bridge when the operator would otherwise say “чекни прогон `assistant-stage1-...`”. The expected next step is no longer manual eyeballing first; it is: review by id, inspect `run_review.md`, map `repair_targets.json` into the current stage loop, patch, and rerun.
|
||||
Use this bridge when the operator would otherwise say "чекни прогон `assistant-stage1-...`". The expected next step is no longer manual eyeballing first; it is: review by id, inspect `run_review.md`, map `repair_targets.json` into the current stage loop, patch, and rerun.
|
||||
|
||||
For stage work, prefer the integrated command:
|
||||
|
||||
|
|
@ -149,6 +171,8 @@ Use `python scripts/stage_agent_loop.py continue --manifest docs/orchestration/<
|
|||
|
||||
It also writes `stage_repair_handoff.md/json` next to the stage summary. That handoff is the preferred input for the next coder pass: it lists primary repair targets and sample user-facing failures without forcing the coder to reread the entire GUI conversation first.
|
||||
|
||||
For live stage-pack failures, prefer `lead_coder_handoff.md` over immediately preparing a coder pass. The intent is: strong business audit first, Lead Codex code repair second, same replay/GUI validation third.
|
||||
|
||||
To prepare the next repair iteration from that handoff, run:
|
||||
|
||||
```powershell
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@
|
|||
"module_name": "Agentic Semantic Development Loop",
|
||||
"title": "Agentic semantic development loop dogfood gate",
|
||||
"architecture_phase": "turnaround_11_agentic_semantic_development_loop",
|
||||
"agent_focus": "Automate stage implementation, semantic replay review, repair handoff, coder pass, rerun validation, and final human confirmation.",
|
||||
"agent_focus": "Automate stage question review, live semantic replay, strong business audit, Lead Codex repair handoff, rerun validation, and final human confirmation.",
|
||||
"current_stage_status": "active_dogfood",
|
||||
"global_plan_refs": [
|
||||
"docs/orchestration/domain_scenario_loop_repo_adapter.md",
|
||||
|
|
@ -12,12 +12,16 @@
|
|||
"AGENTS.md codex_domain_loop and agent_semantic_runs"
|
||||
],
|
||||
"pack_manifest": "docs/orchestration/agentic_semantic_development_loop_stage_pack.json",
|
||||
"loop_id": "agentic_semantic_development_loop",
|
||||
"loop_id": "asl",
|
||||
"artifact_path_warning_limit": 240,
|
||||
"target_score": 88,
|
||||
"max_iterations": 6,
|
||||
"repair_mode": "lead-handoff",
|
||||
"acceptance_invariants": [
|
||||
"status command exposes next_action, repair state, validation state, and closing gate",
|
||||
"run-pack-loop defaults to Lead Codex handoff instead of weak autonomous coding",
|
||||
"continue command never runs the real coder pass without --execute-repair",
|
||||
"business_audit.md and lead_coder_handoff.md are produced before code repair when semantic replay is not accepted",
|
||||
"patched repair cannot close the stage without successful rerun/ingest validation",
|
||||
"business answers remain direct, context-aware, and free of internal route/debug ids",
|
||||
"manual GUI confirmation remains required after accepted semantic replay"
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -387,6 +387,10 @@ def evaluate_truth_step(
|
|||
assistant_text = str(step_state.get("assistant_text") or "")
|
||||
direct_answer = str(step_state.get("actual_direct_answer") or "").strip()
|
||||
detected_intent = str(step_state.get("detected_intent") or "").strip()
|
||||
effective_intents = [
|
||||
detected_intent,
|
||||
*dcl.normalize_string_list(step_state.get("mcp_discovery_effective_intents")),
|
||||
]
|
||||
selected_recipe = str(step_state.get("selected_recipe") or "").strip()
|
||||
capability_id = str(step_state.get("capability_id") or "").strip()
|
||||
catalog_alignment_status = str(step_state.get("mcp_discovery_catalog_chain_alignment_status") or "").strip()
|
||||
|
|
@ -508,13 +512,13 @@ def evaluate_truth_step(
|
|||
expected_intents = dcl.normalize_string_list(
|
||||
resolve_nested_placeholders(step.get("expected_intents") or [], step_results, bindings, runtime_bindings)
|
||||
)
|
||||
if expected_intents and not dcl.identifier_in_list(detected_intent, expected_intents):
|
||||
if expected_intents and not any(dcl.identifier_in_list(intent, expected_intents) for intent in effective_intents if intent):
|
||||
append_finding(
|
||||
findings,
|
||||
step,
|
||||
"wrong_intent",
|
||||
"Интент не соответствует ожидаемому бизнес-смыслу шага.",
|
||||
actual=detected_intent or None,
|
||||
actual=effective_intents,
|
||||
expected=expected_intents,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ HISTORY_FILE = REPO_ROOT / "llm_normalizer" / "data" / "autorun_generators" / "h
|
|||
SAVED_SESSIONS_DIR = REPO_ROOT / "llm_normalizer" / "data" / "autorun_generators" / "saved_sessions"
|
||||
EVAL_CASES_DIR = REPO_ROOT / "llm_normalizer" / "data" / "eval_cases"
|
||||
VALIDATED_AGENT_SAVE_SCHEMA_VERSION = "agent_semantic_save_gate_v1"
|
||||
BINDING_TOKEN_RE = re.compile(r"\{\{\s*bindings\.([A-Za-z0-9_-]+)\s*\}\}")
|
||||
|
||||
|
||||
def now_utc() -> datetime:
|
||||
|
|
@ -39,6 +40,36 @@ def sanitize_question(value: Any) -> str:
|
|||
return text
|
||||
|
||||
|
||||
def normalize_bindings(raw_bindings: Any) -> dict[str, str]:
    """Return a clean ``{name: value}`` mapping of string bindings.

    Args:
        raw_bindings: Candidate bindings; anything that is not a ``dict``
            is treated as "no bindings".

    Returns:
        A new dict whose keys and values are stripped strings. Entries with
        a missing (``None``) value, or whose key or value is empty after
        stripping, are omitted.
    """
    if not isinstance(raw_bindings, dict):
        return {}
    result: dict[str, str] = {}
    for key, value in raw_bindings.items():
        normalized_key = "" if key is None else str(key).strip()
        # Only ``None`` means "no value". Falsy-but-real values such as ``0``
        # must survive, otherwise the placeholder stays unresolved downstream.
        normalized_value = "" if value is None else str(value).strip()
        if normalized_key and normalized_value:
            result[normalized_key] = normalized_value
    return result
|
||||
|
||||
|
||||
def merge_bindings(*binding_sets: Any) -> dict[str, str]:
    """Layer several binding sets into one mapping.

    Each argument is passed through ``normalize_bindings`` first; later
    arguments override keys supplied by earlier ones.
    """
    combined: dict[str, str] = {}
    for candidate in binding_sets:
        for name, bound_value in normalize_bindings(candidate).items():
            combined[name] = bound_value
    return combined
|
||||
|
||||
|
||||
def render_question_template(value: Any, bindings: dict[str, str]) -> str:
    """Substitute ``{{bindings.<key>}}`` tokens in a question template.

    The input is passed through ``sanitize_question`` before substitution and
    the rendered text is passed through it again afterwards. Tokens whose key
    is absent from ``bindings`` are left in place verbatim.
    """
    template = sanitize_question(value)

    def substitute(token: re.Match[str]) -> str:
        known = bindings.get(token.group(1))
        if known is None:
            # Leave the placeholder untouched so callers can detect it later.
            return token.group(0)
        return known

    rendered = BINDING_TOKEN_RE.sub(substitute, template)
    return sanitize_question(rendered)
|
||||
|
||||
|
||||
def ensure_agent_title(title: str) -> str:
|
||||
normalized = title.strip()
|
||||
if not normalized:
|
||||
|
|
@ -237,11 +268,11 @@ def build_save_gate_metadata(args: argparse.Namespace, spec: dict[str, Any], spe
|
|||
)
|
||||
|
||||
|
||||
def normalize_questions(raw_questions: list[Any]) -> list[str]:
|
||||
def normalize_questions(raw_questions: list[Any], bindings: dict[str, str] | None = None) -> list[str]:
|
||||
result: list[str] = []
|
||||
seen: set[str] = set()
|
||||
for item in raw_questions:
|
||||
question = sanitize_question(item)
|
||||
question = render_question_template(item, bindings or {})
|
||||
if not question or question in seen:
|
||||
continue
|
||||
seen.add(question)
|
||||
|
|
@ -250,50 +281,84 @@ def normalize_questions(raw_questions: list[Any]) -> list[str]:
|
|||
|
||||
|
||||
def extract_semantic_tags(spec: dict[str, Any]) -> list[str]:
    """Collect the distinct ``semantic_tags`` declared by a spec's steps.

    Steps are read from the top-level ``steps`` list and from the ``steps``
    list of every dict entry in ``scenarios``. Non-dict steps, non-list
    ``semantic_tags`` values, and tags that are empty after stripping are
    ignored.

    Returns:
        The unique, stripped tags in sorted order.
    """
    step_groups: list[Any] = []
    steps = spec.get("steps")
    if isinstance(steps, list):
        step_groups.append(steps)
    scenarios = spec.get("scenarios")
    if isinstance(scenarios, list):
        for scenario in scenarios:
            if isinstance(scenario, dict) and isinstance(scenario.get("steps"), list):
                step_groups.append(scenario["steps"])

    tags: set[str] = set()
    for step_group in step_groups:
        for step in step_group:
            if not isinstance(step, dict):
                continue
            raw_tags = step.get("semantic_tags")
            if not isinstance(raw_tags, list):
                continue
            for raw_tag in raw_tags:
                tag = str(raw_tag or "").strip()
                if tag:
                    tags.add(tag)
    return sorted(tags)
|
||||
|
||||
|
||||
def assert_no_unresolved_bindings(questions: list[str]) -> None:
    """Reject question lists that still contain ``{{bindings.*}}`` tokens.

    Raises:
        RuntimeError: If any question matches ``BINDING_TOKEN_RE``; the
            message quotes the first offending question.
    """
    offenders = list(filter(BINDING_TOKEN_RE.search, questions))
    if not offenders:
        return
    first_offender = offenders[0]
    raise RuntimeError(
        "Refusing to save AGENT autorun with unresolved bindings in questions. "
        f"First unresolved question: {first_offender}"
    )
|
||||
|
||||
|
||||
def extract_questions_from_spec(spec: dict[str, Any]) -> list[str]:
|
||||
global_bindings = normalize_bindings(spec.get("bindings"))
|
||||
if isinstance(spec.get("questions"), list):
|
||||
return normalize_questions(list(spec["questions"]))
|
||||
questions = normalize_questions(list(spec["questions"]), global_bindings)
|
||||
assert_no_unresolved_bindings(questions)
|
||||
return questions
|
||||
|
||||
steps = spec.get("steps")
|
||||
if isinstance(steps, list):
|
||||
return normalize_questions(
|
||||
[
|
||||
step.get("question") or step.get("question_template")
|
||||
for step in steps
|
||||
if isinstance(step, dict) and (step.get("question") or step.get("question_template"))
|
||||
]
|
||||
)
|
||||
raw_questions = [
|
||||
step.get("question") or step.get("question_template")
|
||||
for step in steps
|
||||
if isinstance(step, dict) and (step.get("question") or step.get("question_template"))
|
||||
]
|
||||
questions = normalize_questions(raw_questions, global_bindings)
|
||||
assert_no_unresolved_bindings(questions)
|
||||
return questions
|
||||
|
||||
scenarios = spec.get("scenarios")
|
||||
if isinstance(scenarios, list):
|
||||
raw_questions: list[Any] = []
|
||||
questions: list[str] = []
|
||||
seen: set[str] = set()
|
||||
for scenario in scenarios:
|
||||
if not isinstance(scenario, dict):
|
||||
continue
|
||||
scenario_steps = scenario.get("steps")
|
||||
if not isinstance(scenario_steps, list):
|
||||
continue
|
||||
raw_questions.extend(
|
||||
step.get("question") or step.get("question_template")
|
||||
for step in scenario_steps
|
||||
if isinstance(step, dict) and (step.get("question") or step.get("question_template"))
|
||||
)
|
||||
return normalize_questions(raw_questions)
|
||||
scenario_bindings = merge_bindings(global_bindings, scenario.get("bindings"))
|
||||
for step in scenario_steps:
|
||||
if not isinstance(step, dict):
|
||||
continue
|
||||
raw_question = step.get("question") or step.get("question_template")
|
||||
if not raw_question:
|
||||
continue
|
||||
step_bindings = merge_bindings(scenario_bindings, step.get("bindings"))
|
||||
question = render_question_template(raw_question, step_bindings)
|
||||
if not question or question in seen:
|
||||
continue
|
||||
seen.add(question)
|
||||
questions.append(question)
|
||||
assert_no_unresolved_bindings(questions)
|
||||
return questions
|
||||
|
||||
raise RuntimeError(
|
||||
"Spec must define `questions[]`, `steps[].question`, `steps[].question_template`, "
|
||||
"or `scenarios[].steps[]` questions"
|
||||
|
|
|
|||
|
|
@ -212,6 +212,10 @@ def build_scenario_acceptance_matrix(
|
|||
"mcp_discovery_catalog_chain_alignment_status": step_state.get("mcp_discovery_catalog_chain_alignment_status"),
|
||||
"mcp_discovery_catalog_chain_top_match": step_state.get("mcp_discovery_catalog_chain_top_match"),
|
||||
"mcp_discovery_catalog_chain_selected_matches_top": step_state.get("mcp_discovery_catalog_chain_selected_matches_top"),
|
||||
"mcp_discovery_response_applied": step_state.get("mcp_discovery_response_applied"),
|
||||
"mcp_discovery_selected_chain_id": step_state.get("mcp_discovery_selected_chain_id"),
|
||||
"mcp_discovery_response_candidate_status": step_state.get("mcp_discovery_response_candidate_status"),
|
||||
"mcp_discovery_effective_intents": step_state.get("mcp_discovery_effective_intents"),
|
||||
"selected_object_step": _has_selected_object_signal(step),
|
||||
"meta_context_step": _has_meta_context_signal(step),
|
||||
"highest_unresolved_priority": highest_priority,
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import json
|
|||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from collections import Counter
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
|
@ -19,6 +20,71 @@ DEFAULT_STAGE_OUTPUT_ROOT = REPO_ROOT / "artifacts" / "domain_runs" / "stage_age
|
|||
DEFAULT_REPAIR_CODER_SCHEMA = REPO_ROOT / "docs" / "orchestration" / "schemas" / "domain_loop_coder_result.schema.json"
|
||||
STAGE_LOOP_SCHEMA_VERSION = "stage_agent_loop_manifest_v1"
|
||||
STAGE_SUMMARY_SCHEMA_VERSION = "stage_agent_loop_summary_v1"
|
||||
STAGE_QUESTION_REVIEW_SCHEMA_VERSION = "stage_question_generation_review_v1"
|
||||
STAGE_CONTEXT_CAPSULE_SCHEMA_VERSION = "stage_context_capsule_v1"
|
||||
|
||||
STAGE_QUESTION_DOMAIN_MARKERS: dict[str, tuple[str, ...]] = {
|
||||
"business_overview": ("бизнес-обзор", "обзор компании", "компании в целом", "взрослый", "вывод"),
|
||||
"money": ("деньг", "получили", "заплатили", "денежн", "поток", "доходн", "оборот", "выруч"),
|
||||
"vat": ("ндс", "налог", "налогов"),
|
||||
"counterparty": ("контрагент", "группа свк", "свк", "клиент", "поставщик"),
|
||||
"documents": ("документ", "движен", "цепочк", "основан"),
|
||||
"inventory": ("склад", "товар", "остат", "номенклатур"),
|
||||
"debt": ("долг", "долж", "счет 60", "счёт 60", "хвост"),
|
||||
"scope_guard": ("не смешивай", "не тащи", "не называй", "что нельзя", "чего не хватает"),
|
||||
"memory": ("напомни", "уже выяснил", "контекст", "вернись"),
|
||||
}
|
||||
STAGE_QUESTION_FOLLOWUP_MARKERS = (
|
||||
"теперь",
|
||||
"а если",
|
||||
"по этой",
|
||||
"по этому",
|
||||
"по ней",
|
||||
"по нему",
|
||||
"вернись",
|
||||
"собери",
|
||||
"напомни",
|
||||
"дальше",
|
||||
)
|
||||
STAGE_QUESTION_REPORT_MARKERS = (
|
||||
"обзор",
|
||||
"анализ",
|
||||
"вывод",
|
||||
"итог",
|
||||
"оцен",
|
||||
"что можно",
|
||||
"что нельзя",
|
||||
)
|
||||
STAGE_QUESTION_MOJIBAKE_MARKERS = (
|
||||
"\u0420\u00b0",
|
||||
"\u0420\u00b1",
|
||||
"\u0420\u0406",
|
||||
"\u0420\u0456",
|
||||
"\u0420\u0491",
|
||||
"\u0420\u00b5",
|
||||
"\u0420\u0451",
|
||||
"\u0420\u2116",
|
||||
"\u0420\u0454",
|
||||
"\u0420\u00bb",
|
||||
"\u0420\u0458",
|
||||
"\u0420\u0405",
|
||||
"\u0420\u0455",
|
||||
"\u0420\u0457",
|
||||
"\u0421\u0402",
|
||||
"\u0421\u0403",
|
||||
"\u0421\u201a",
|
||||
"\u0421\u0453",
|
||||
"\u0421\u201e",
|
||||
"\u0421\u2026",
|
||||
"\u0421\u2020",
|
||||
"\u0421\u2021",
|
||||
"\u0421\u20ac",
|
||||
"\u0421\u2030",
|
||||
"\u0421\u040a",
|
||||
"\u0421\u2039",
|
||||
"\u0421\u040b",
|
||||
"\u0421\u040f",
|
||||
)
|
||||
|
||||
|
||||
def now_iso() -> str:
|
||||
|
|
@ -75,6 +141,517 @@ def string_list(value: Any) -> list[str]:
|
|||
return result
|
||||
|
||||
|
||||
def normalize_review_text(value: Any) -> str:
    """Return ``value`` as trimmed, lower-cased text with whitespace runs collapsed.

    Falsy inputs (``None``, ``""``, ``0``, ...) yield an empty string.
    """
    lowered = str(value or "").strip().lower()
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", lowered)
|
||||
|
||||
|
||||
def has_any_marker(text: str, markers: tuple[str, ...]) -> bool:
    """Tell whether any marker occurs as a substring of the normalized text.

    ``text`` is passed through ``normalize_review_text`` first; the markers
    are compared as given.
    """
    haystack = normalize_review_text(text)
    for marker in markers:
        if marker in haystack:
            return True
    return False
|
||||
|
||||
|
||||
def looks_like_mojibake(value: Any) -> bool:
    """Heuristically flag text that contains mojibake marker pairs.

    Returns ``True`` when at least two distinct entries of
    ``STAGE_QUESTION_MOJIBAKE_MARKERS`` occur in the stringified value.
    """
    candidate = str(value or "")
    hits = 0
    for marker in STAGE_QUESTION_MOJIBAKE_MARKERS:
        if marker in candidate:
            hits += 1
    return hits >= 2
|
||||
|
||||
|
||||
def resolve_pack_bindings(value: str, bindings: dict[str, Any]) -> str:
    """Substitute ``{{ bindings.<key> }}`` placeholders in *value*.

    The captured key is looked up as a single flat key in *bindings*. A
    placeholder whose key is missing or bound to ``None`` is left untouched.
    The result is stripped of surrounding whitespace; a falsy *value* yields
    the empty string.
    """
    placeholder = re.compile(r"\{\{\s*bindings\.([a-zA-Z0-9_.-]+)\s*\}\}")

    def substitute(found: re.Match[str]) -> str:
        bound = bindings.get(found.group(1))
        if bound is None:
            # Keep the original placeholder text visible for unresolved keys.
            return found.group(0)
        return str(bound)

    template = str(value or "")
    return placeholder.sub(substitute, template).strip()
|
||||
|
||||
|
||||
def iter_stage_pack_steps(pack_manifest: dict[str, Any]) -> list[dict[str, Any]]:
    """Flatten all scenario steps of a pack manifest into normalized records.

    Scenarios that are not dicts and steps that are neither strings nor dicts
    are skipped (they still consume an index). A string step is treated as the
    question itself with a generated ``step_id`` and title. Each record carries
    the raw question template plus the same text with ``{{ bindings.* }}``
    placeholders resolved against the manifest's ``bindings``.

    Returns:
        Step records in manifest order; ``global_index`` is 1-based across
        the whole pack.
    """
    bindings = pack_manifest.get("bindings")
    if not isinstance(bindings, dict):
        bindings = {}
    scenarios = pack_manifest.get("scenarios")
    if not isinstance(scenarios, list):
        scenarios = []

    flattened: list[dict[str, Any]] = []
    for scenario_number, scenario in enumerate(scenarios, start=1):
        if not isinstance(scenario, dict):
            continue
        scenario_id = str(scenario.get("scenario_id") or f"scenario_{scenario_number:02d}").strip()
        scenario_title = str(scenario.get("title") or scenario_id).strip()
        scenario_steps = scenario.get("steps")
        if not isinstance(scenario_steps, list):
            scenario_steps = []

        for step_number, entry in enumerate(scenario_steps, start=1):
            fallback_step_id = f"step_{step_number:02d}"
            if isinstance(entry, str):
                template = entry.strip()
                spec: dict[str, Any] = {
                    "step_id": fallback_step_id,
                    "title": f"Step {step_number:02d}",
                    "question": template,
                }
            elif isinstance(entry, dict):
                spec = entry
                template = str(entry.get("question") or entry.get("question_template") or "").strip()
            else:
                continue

            step_id = str(spec.get("step_id") or fallback_step_id).strip()
            depends_on = string_list(spec.get("depends_on"))
            # Both the plural and the singular manifest spelling are accepted.
            expected_intents = string_list(spec.get("expected_intents") or spec.get("expected_intent"))
            semantic_tags = string_list(spec.get("semantic_tags"))
            answer_shape = str(
                spec.get("required_answer_shape") or spec.get("expected_answer_shape") or ""
            ).strip()
            node_role = str(spec.get("node_role") or spec.get("role") or "").strip()

            flattened.append(
                {
                    "global_index": len(flattened) + 1,
                    "scenario_index": scenario_number,
                    "scenario_id": scenario_id,
                    "scenario_title": scenario_title,
                    "scenario_step_index": step_number,
                    "step_id": step_id,
                    "title": str(spec.get("title") or step_id).strip() or step_id,
                    "question_template": template,
                    "question_resolved": resolve_pack_bindings(template, bindings),
                    "depends_on": depends_on,
                    "expected_intents": expected_intents,
                    "semantic_tags": semantic_tags,
                    "required_answer_shape": answer_shape or None,
                    "forbidden_answer_patterns": string_list(spec.get("forbidden_answer_patterns")),
                    "node_role": node_role or None,
                }
            )
    return flattened
|
||||
|
||||
|
||||
def classify_stage_pack_step(step: dict[str, Any]) -> dict[str, Any]:
    """Annotate one flattened pack step with domains, tags and weak flags.

    Domains come from ``STAGE_QUESTION_DOMAIN_MARKERS``: a domain applies when
    it is named in the step's expected intents or semantic tags, or when one of
    its markers occurs in the question, title, intents or tags. Tags describe
    the step's role; weak flags record quality problems.

    Returns:
        A copy of *step* extended with ``question``, ``domains``, ``tags``,
        ``weak_flags`` and ``length_chars``.
    """
    question = str(step.get("question_resolved") or step.get("question_template") or "")
    title = str(step.get("title") or "")
    expected_intents = string_list(step.get("expected_intents"))
    semantic_tags = string_list(step.get("semantic_tags"))
    searchable = " ".join([question, title, *expected_intents, *semantic_tags])

    domains: list[str] = [
        domain
        for domain, markers in STAGE_QUESTION_DOMAIN_MARKERS.items()
        if domain in expected_intents or domain in semantic_tags or has_any_marker(searchable, markers)
    ]

    # A step is a follow-up when it declares dependencies or is phrased like one.
    is_followup = bool(string_list(step.get("depends_on"))) or has_any_marker(
        question, STAGE_QUESTION_FOLLOWUP_MARKERS
    )
    declared_shape = step.get("required_answer_shape")

    tag_rules = (
        ("contextual_followup", is_followup),
        ("root_question", not is_followup),
        ("answer_shape_declared", bool(declared_shape)),
        ("direct_answer_first_required", str(declared_shape or "") == "direct_answer_first"),
        ("canary_or_guarded_question", bool(step.get("forbidden_answer_patterns"))),
        (
            "report_or_analysis_request",
            has_any_marker(question, STAGE_QUESTION_REPORT_MARKERS) or "business_overview" in domains,
        ),
        ("domain_grounded", bool(domains)),
    )
    tags: list[str] = [tag for tag, applies in tag_rules if applies]

    weak_rules = (
        (
            "mojibake_question_text",
            looks_like_mojibake(question) or looks_like_mojibake(step.get("question_template")),
        ),
        # The first step of a scenario has no earlier turn to lean on.
        (
            "root_question_requires_missing_context",
            int(step.get("scenario_step_index") or 0) == 1 and is_followup,
        ),
        ("question_too_long", len(question) > 700),
        ("low_business_anchor", not (domains or expected_intents or semantic_tags)),
        ("missing_required_answer_shape", not declared_shape),
    )
    weak_flags: list[str] = [flag for flag, applies in weak_rules if applies]

    classified = dict(step)
    classified.update(
        {
            "question": question,
            "domains": domains,
            "tags": tags,
            "weak_flags": weak_flags,
            "length_chars": len(question),
        }
    )
    return classified
|
||||
|
||||
|
||||
def build_stage_question_recommendations(
    *,
    missing_domains: list[str],
    weak_flag_counts: Counter[str],
    score: int,
) -> list[str]:
    """Translate question-pack weak flags into actionable recommendations.

    Structural flags (dependency errors, duplicates, pack size, domain
    diversity, missing report request, weak business anchor, overlong
    questions) each produce their own recommendation, so a pack that was
    penalized for them is never reported as clean.

    Args:
        missing_domains: Required control domains with no matching question.
        weak_flag_counts: Flag name to occurrence count. A ``Counter`` or a
            plain mapping is accepted; absent flags count as zero.
        score: Final 0-100 question-pack score.

    Returns:
        Recommendations in a stable order. The "strong enough" message is
        returned only when ``score >= 85`` and nothing else was recommended.
    """

    def flagged(name: str) -> bool:
        # ``.get`` keeps plain dicts working and never inserts keys.
        return bool(weak_flag_counts.get(name, 0))

    recommendations: list[str] = []
    if flagged("mojibake_question_text"):
        recommendations.append("Repair generated question text to normal UTF-8 Russian before any live replay.")
    if missing_domains:
        recommendations.append("Add questions for missing control domains: " + ", ".join(missing_domains) + ".")

    flag_messages = (
        (
            "too_few_contextual_followups",
            "Add more follow-up turns that depend on prior answers and test carryover.",
        ),
        (
            "root_question_requires_missing_context",
            "Rewrite scenario roots so they are self-contained and do not start as follow-ups.",
        ),
        (
            "missing_required_answer_shape",
            "Declare required_answer_shape for each business-critical step.",
        ),
        (
            "no_canary_or_guarded_question",
            "Add forbidden-answer canaries for internal ids, stale scope, and unsupported claims.",
        ),
        (
            "artifact_path_too_long_for_windows",
            "Shorten loop/scenario/step artifact ids before live replay on Windows.",
        ),
        # Flags below are raised and penalized by the review but previously
        # produced no guidance.
        (
            "dependency_order_errors",
            "Fix depends_on references so each one names an earlier step of the same scenario.",
        ),
        (
            "duplicate_questions",
            "Remove or reword duplicate questions so each step tests a distinct behavior.",
        ),
        (
            "too_few_questions_for_stage_replay",
            "Add more questions; the pack is too small for a meaningful stage replay.",
        ),
        (
            "too_few_scenarios",
            "Split the pack into at least two scenarios.",
        ),
        (
            "missing_report_or_analysis_request",
            "Add at least one report or analysis style question.",
        ),
        (
            "low_domain_diversity",
            "Broaden the pack so questions cover more business domains.",
        ),
        (
            "low_business_anchor",
            "Give every step a business anchor: a recognizable domain, expected_intents, or semantic_tags.",
        ),
        (
            "question_too_long",
            "Shorten overlong questions or split them into several steps.",
        ),
    )
    recommendations.extend(message for flag, message in flag_messages if flagged(flag))

    if score >= 85 and not recommendations:
        recommendations.append("Question pack is strong enough for live semantic replay.")
    return recommendations
|
||||
|
||||
|
||||
def _stage_question_dependency_errors(question_reviews: list[dict[str, Any]]) -> list[dict[str, str]]:
    """Collect ``depends_on`` entries that are unknown or point at a later step.

    Dependencies are resolved only within the step's own scenario.
    """
    declared: dict[str, set[str]] = {}
    for entry in question_reviews:
        declared.setdefault(str(entry.get("scenario_id") or ""), set()).add(str(entry.get("step_id") or ""))

    problems: list[dict[str, str]] = []
    visited: dict[str, set[str]] = {}
    for entry in question_reviews:
        scenario_key = str(entry.get("scenario_id") or "")
        already_seen = visited.setdefault(scenario_key, set())
        for dependency in string_list(entry.get("depends_on")):
            if dependency not in declared.get(scenario_key, set()):
                kind = "unknown_dependency"
            elif dependency not in already_seen:
                kind = "forward_dependency"
            else:
                continue
            problems.append(
                {
                    "scenario_id": scenario_key,
                    "step_id": str(entry.get("step_id") or ""),
                    "dependency": dependency,
                    "error": kind,
                }
            )
        # A step only becomes a valid dependency target after it was visited.
        already_seen.add(str(entry.get("step_id") or ""))
    return problems


def _stage_question_pack_score(weak_flag_counts: Counter[str], missing_domain_count: int) -> int:
    """Turn weak-flag counts into a 0-100 score (100 minus all penalties)."""
    # Flags that cost a fixed amount once, no matter how often they fired.
    flat_penalties = (
        ("too_few_questions_for_stage_replay", 12),
        ("too_few_scenarios", 12),
        ("too_few_contextual_followups", 10),
        ("missing_direct_answer_shape_on_some_steps", 8),
        ("no_canary_or_guarded_question", 8),
        ("missing_report_or_analysis_request", 10),
        ("low_domain_diversity", 10),
        ("artifact_path_too_long_for_windows", 20),
    )
    # Flags that cost ``unit`` per occurrence, up to ``cap``.
    capped_penalties = (
        ("mojibake_question_text", 10, 40),
        ("root_question_requires_missing_context", 10, 20),
        ("low_business_anchor", 6, 24),
        ("duplicate_questions", 5, 20),
        ("dependency_order_errors", 15, 30),
        ("question_too_long", 3, 12),
    )
    penalty = sum(points for flag, points in flat_penalties if weak_flag_counts[flag])
    penalty += sum(min(cap, weak_flag_counts[flag] * unit) for flag, unit, cap in capped_penalties)
    penalty += min(24, missing_domain_count * 6)
    return max(0, min(100, 100 - penalty))


def build_stage_question_generation_review(stage_manifest: dict[str, Any], stage_dir: Path | None = None) -> dict[str, Any]:
    """Review the stage's question pack before any live replay.

    Loads the pack manifest named by ``stage_manifest["pack_manifest"]``,
    classifies every step, aggregates tag/domain/weak-flag counts, validates
    ``depends_on`` ordering, estimates the artifact path length of each step
    and derives a score with a ``strong`` / ``usable_with_gaps`` / ``weak``
    status.

    Args:
        stage_manifest: Stage manifest; ``pack_manifest`` and ``stage_id``
            are read by key.
        stage_dir: Stage output directory. When falsy it is derived from
            ``DEFAULT_STAGE_OUTPUT_ROOT`` and the stage id.

    Returns:
        The review payload, including the per-question classifications.
    """
    manifest_path = repo_path(stage_manifest["pack_manifest"])
    pack = load_json_object(manifest_path, "Stage question pack manifest")
    reviews = [classify_stage_pack_step(raw_step) for raw_step in iter_stage_pack_steps(pack)]

    # Duplicates are detected on normalized text; empty questions are ignored.
    normalized_questions: Counter[str] = Counter()
    for entry in reviews:
        if entry["question"]:
            normalized_questions[normalize_review_text(entry["question"])] += 1
    duplicates = [text for text, seen in normalized_questions.items() if seen > 1]

    tag_counts: Counter[str] = Counter()
    domain_counts: Counter[str] = Counter()
    weak_flag_counts: Counter[str] = Counter()
    for entry in reviews:
        tag_counts.update(entry["tags"])
        domain_counts.update(entry["domains"])
        weak_flag_counts.update(entry["weak_flags"])
    if duplicates:
        weak_flag_counts["duplicate_questions"] += len(duplicates)

    scenario_ids = {str(entry.get("scenario_id") or "") for entry in reviews if entry.get("scenario_id")}
    total_questions = len(reviews)
    pack_level_checks = (
        (total_questions < 8, "too_few_questions_for_stage_replay"),
        (len(scenario_ids) < 2, "too_few_scenarios"),
        (tag_counts["contextual_followup"] < max(2, total_questions // 3), "too_few_contextual_followups"),
        (tag_counts["direct_answer_first_required"] < total_questions, "missing_direct_answer_shape_on_some_steps"),
        (tag_counts["canary_or_guarded_question"] < 2, "no_canary_or_guarded_question"),
        (tag_counts["report_or_analysis_request"] < 1, "missing_report_or_analysis_request"),
        (len(domain_counts) < 5, "low_domain_diversity"),
    )
    for triggered, flag in pack_level_checks:
        if triggered:
            weak_flag_counts[flag] += 1

    required_domains = ["business_overview", "money", "vat", "counterparty", "documents", "scope_guard"]
    missing_domains = [domain for domain in required_domains if domain_counts[domain] == 0]
    for domain in missing_domains:
        weak_flag_counts[f"missing_domain_{domain}"] += 1

    dependency_errors = _stage_question_dependency_errors(reviews)
    if dependency_errors:
        weak_flag_counts["dependency_order_errors"] += len(dependency_errors)

    resolved_stage_dir = stage_dir or stage_dir_for(DEFAULT_STAGE_OUTPUT_ROOT, stage_manifest["stage_id"])
    loop_dir = stage_loop_dir(resolved_stage_dir, stage_manifest)
    artifact_path_budget = int(stage_manifest.get("artifact_path_warning_limit") or 240)
    # Every step path shares this prefix; only scenario and step ids vary.
    scenarios_root = loop_dir / "iterations" / "iteration_00" / "pack_output" / "pack_run" / "scenarios"
    estimated_artifact_paths: list[dict[str, Any]] = []
    for entry in reviews:
        step_path = (
            scenarios_root / str(entry.get("scenario_id") or "") / "steps" / str(entry.get("step_id") or "")
        ).resolve()
        estimated_artifact_paths.append(
            {
                "scenario_id": entry.get("scenario_id"),
                "step_id": entry.get("step_id"),
                "path": str(step_path),
                "length": len(str(step_path)),
            }
        )
    max_estimated_artifact_path = max((int(row["length"]) for row in estimated_artifact_paths), default=0)
    if max_estimated_artifact_path >= artifact_path_budget:
        weak_flag_counts["artifact_path_too_long_for_windows"] += 1

    score = _stage_question_pack_score(weak_flag_counts, len(missing_domains))
    status = "strong" if score >= 85 else ("usable_with_gaps" if score >= 70 else "weak")

    return {
        "schema_version": STAGE_QUESTION_REVIEW_SCHEMA_VERSION,
        "created_at": now_iso(),
        "stage_id": stage_manifest["stage_id"],
        "module_name": stage_manifest.get("module_name"),
        "title": stage_manifest.get("title"),
        "pack_manifest": repo_relative(manifest_path),
        "pack_id": pack.get("pack_id"),
        "status": status,
        "score": score,
        "question_count": total_questions,
        "scenario_count": len(scenario_ids),
        "coverage": {
            "contextual_followup_questions": tag_counts["contextual_followup"],
            "root_questions": tag_counts["root_question"],
            "direct_answer_shape_questions": tag_counts["direct_answer_first_required"],
            "canary_or_guarded_questions": tag_counts["canary_or_guarded_question"],
            "report_or_analysis_questions": tag_counts["report_or_analysis_request"],
        },
        "tag_counts": dict(sorted(tag_counts.items())),
        "domain_counts": dict(sorted(domain_counts.items())),
        "weak_flag_counts": dict(sorted(weak_flag_counts.items())),
        "missing_domains": missing_domains,
        "duplicate_questions": duplicates[:20],
        "dependency_errors": dependency_errors,
        "artifact_path_budget": artifact_path_budget,
        "max_estimated_artifact_path": max_estimated_artifact_path,
        "estimated_artifact_paths": estimated_artifact_paths,
        "recommendations": build_stage_question_recommendations(
            missing_domains=missing_domains,
            weak_flag_counts=weak_flag_counts,
            score=score,
        ),
        "questions": reviews,
    }
|
||||
|
||||
|
||||
def build_stage_question_generation_markdown(review: dict[str, Any]) -> str:
    """Render a question-generation review as a Markdown report.

    The report lists the header fields, coverage, domain and weak-flag counts,
    the artifact path budget, the recommendations and one table row per
    question.

    Table cells are made single-line (whitespace runs, including newlines,
    collapse to one space) so a multi-line question cannot break the row.
    The question is truncated to 180 characters before ``|`` is escaped, so
    truncation never cuts an escape sequence in half.

    Args:
        review: Payload shaped like the output of
            ``build_stage_question_generation_review``.

    Returns:
        The Markdown text, ending with a trailing newline.
    """

    def table_cell(value: Any, limit: int = 0) -> str:
        # Collapse whitespace first: a Markdown table row must stay on one line.
        text = " ".join(str(value or "").split())
        if limit and len(text) > limit:
            text = text[: limit - 3].rstrip() + "..."
        # Escape last so the backslash can never be separated from its pipe.
        return text.replace("|", "\\|")

    lines = [
        "# Stage Question Generation Review",
        "",
        f"- stage_id: `{review.get('stage_id')}`",
        f"- status: `{review.get('status')}`",
        f"- score: `{review.get('score')}`",
        f"- questions: `{review.get('question_count')}`",
        f"- scenarios: `{review.get('scenario_count')}`",
        f"- pack_manifest: `{review.get('pack_manifest')}`",
        "",
        "## Coverage",
        "",
    ]
    coverage = review.get("coverage") if isinstance(review.get("coverage"), dict) else {}
    lines.extend(f"- {key}: `{coverage[key]}`" for key in sorted(coverage))

    lines.extend(["", "## Domains", ""])
    domains = review.get("domain_counts") if isinstance(review.get("domain_counts"), dict) else {}
    lines.extend(f"- {key}: `{domains[key]}`" for key in sorted(domains))

    lines.extend(["", "## Weak Flags", ""])
    weak_flags = review.get("weak_flag_counts") if isinstance(review.get("weak_flag_counts"), dict) else {}
    if weak_flags:
        lines.extend(f"- {key}: `{weak_flags[key]}`" for key in sorted(weak_flags))
    else:
        lines.append("- none")

    lines.extend(["", "## Artifact Path Budget", ""])
    lines.append(f"- budget: `{review.get('artifact_path_budget')}`")
    lines.append(f"- max_estimated_artifact_path: `{review.get('max_estimated_artifact_path')}`")

    lines.extend(["", "## Recommendations", ""])
    lines.extend(f"- {recommendation}" for recommendation in review.get("recommendations") or [])

    lines.extend(
        [
            "",
            "## Questions",
            "",
            "| # | scenario | step | domains | weak_flags | question |",
            "|---:|---|---|---|---|---|",
        ]
    )
    for item in review.get("questions") or []:
        cells = [
            table_cell(item.get("global_index")),
            table_cell(item.get("scenario_id")),
            table_cell(item.get("step_id")),
            table_cell(", ".join(item.get("domains") or [])) or "-",
            table_cell(", ".join(item.get("weak_flags") or [])) or "-",
            table_cell(item.get("question"), limit=180),
        ]
        lines.append("| " + " | ".join(cells) + " |")
    lines.append("")
    return "\n".join(lines)
|
||||
|
||||
|
||||
def build_stage_context_capsule(
    stage_manifest: dict[str, Any],
    stage_dir: Path,
    *,
    question_review: dict[str, Any] | None = None,
    summary: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Assemble the stage context capsule payload.

    The capsule combines stage manifest metadata, the fixed operating-model
    and quality-rule texts, repo-relative artifact locations, and condensed
    views of the question-generation review and the latest loop summary.

    Args:
        stage_manifest: Stage manifest; ``stage_id`` and ``pack_manifest``
            are read by key.
        stage_dir: Stage output directory.
        question_review: Optional review payload; a non-dict is treated as
            empty.
        summary: Optional loop summary payload; a non-dict is treated as
            empty.

    Returns:
        The capsule dictionary.
    """
    repair_mode = dcl.normalize_repair_mode(stage_manifest.get("repair_mode"))
    review = question_review if isinstance(question_review, dict) else {}
    loop_summary = summary if isinstance(summary, dict) else {}

    # Keys are inserted in the order they should appear in the saved JSON.
    capsule: dict[str, Any] = {
        "schema_version": STAGE_CONTEXT_CAPSULE_SCHEMA_VERSION,
        "created_at": now_iso(),
        "stage_id": stage_manifest["stage_id"],
    }
    for manifest_key in ("module_name", "title", "architecture_phase", "current_stage_status"):
        capsule[manifest_key] = stage_manifest.get(manifest_key)
    capsule["repair_mode"] = repair_mode
    capsule["operating_model"] = {
        "lead": "Lead Codex keeps implementation responsibility in the main project context.",
        "business_auditor": "A strong independent read-only Codex analyst reviews user-facing business meaning before technical metadata.",
        "auto_coder": "Disabled by default; use repair_mode=auto-coder only as an explicit opt-in experiment.",
        "human_operator": "Receives only final visual confirmation requests or real unresolved business/architecture decisions.",
    }
    capsule["global_plan_refs"] = stage_manifest.get("global_plan_refs") or []
    capsule["acceptance_invariants"] = stage_manifest.get("acceptance_invariants") or []
    capsule["pack_manifest"] = repo_relative(repo_path(stage_manifest["pack_manifest"]))
    capsule["stage_dir"] = repo_relative(stage_dir)
    capsule["loop_dir"] = repo_relative(stage_loop_dir(stage_dir, stage_manifest))

    question_generation: dict[str, Any] = {
        field: review.get(field) for field in ("status", "score", "question_count", "scenario_count")
    }
    question_generation["missing_domains"] = review.get("missing_domains") or []
    question_generation["weak_flag_counts"] = review.get("weak_flag_counts") or {}
    question_generation["review_json"] = repo_relative(stage_dir / "question_generation_review.json")
    question_generation["review_markdown"] = repo_relative(stage_dir / "question_generation_review.md")
    capsule["question_generation"] = question_generation

    capsule["latest_loop_summary"] = {
        field: loop_summary.get(field)
        for field in (
            "loop_final_status",
            "last_quality_score",
            "last_analyst_decision",
            "next_action",
            "latest_business_audit",
            "latest_lead_coder_handoff",
        )
    }
    capsule["quality_rules"] = [
        "Review the human question and visible answer before internal route ids.",
        "Treat direct-answer-first, business usefulness, temporal honesty, field truth, and answer layering as acceptance gates.",
        "Treat deterministic P0/P1 repair targets as blockers even if the analyst wording sounds optimistic.",
        "After code edits, run targeted tests/build and rebuild graphify before replay evidence is trusted.",
    ]
    return capsule
|
||||
|
||||
|
||||
def build_stage_context_capsule_markdown(capsule: dict[str, Any]) -> str:
    """Render a stage context capsule as Markdown.

    Sections: header fields, operating model, question generation, latest
    loop summary and quality rules. Nested sections that are missing or not
    dicts are rendered as empty.

    In the "Latest Loop" section only ``None`` and the empty string are shown
    as ``n/a``, so a legitimate ``last_quality_score`` of ``0`` is kept.

    Args:
        capsule: Payload shaped like the output of
            ``build_stage_context_capsule``.

    Returns:
        The Markdown text with exactly one trailing newline.
    """

    def section(name: str) -> dict[str, Any]:
        value = capsule.get(name)
        return value if isinstance(value, dict) else {}

    operating_model = section("operating_model")
    question_generation = section("question_generation")
    latest_loop_summary = section("latest_loop_summary")

    lines = [
        "# Stage Context Capsule",
        "",
        f"- stage_id: `{capsule.get('stage_id')}`",
        f"- module_name: `{capsule.get('module_name')}`",
        f"- title: {capsule.get('title')}",
        f"- architecture_phase: `{capsule.get('architecture_phase') or 'n/a'}`",
        f"- current_stage_status: `{capsule.get('current_stage_status') or 'n/a'}`",
        f"- repair_mode: `{capsule.get('repair_mode')}`",
        f"- pack_manifest: `{capsule.get('pack_manifest')}`",
        f"- stage_dir: `{capsule.get('stage_dir')}`",
        f"- loop_dir: `{capsule.get('loop_dir')}`",
        "",
        "## Operating Model",
    ]
    lines.extend([f"- {key}: {value}" for key, value in operating_model.items()] or ["- n/a"])

    lines.extend(["", "## Question Generation"])
    lines.extend(
        [
            f"- status: `{question_generation.get('status')}`",
            f"- score: `{question_generation.get('score')}`",
            f"- question_count: `{question_generation.get('question_count')}`",
            f"- scenario_count: `{question_generation.get('scenario_count')}`",
            f"- review_markdown: `{question_generation.get('review_markdown')}`",
        ]
    )

    lines.extend(["", "## Latest Loop"])
    for key in [
        "loop_final_status",
        "last_quality_score",
        "last_analyst_decision",
        "next_action",
        "latest_business_audit",
        "latest_lead_coder_handoff",
    ]:
        value = latest_loop_summary.get(key)
        # Do not use ``or``: it would turn a real score of 0 into "n/a".
        shown = "n/a" if value is None or value == "" else value
        lines.append(f"- {key}: `{shown}`")

    lines.extend(["", "## Quality Rules"])
    lines.extend([f"- {item}" for item in capsule.get("quality_rules") or []])
    return "\n".join(lines).strip() + "\n"
|
||||
|
||||
|
||||
def save_stage_context_capsule(
    stage_manifest: dict[str, Any],
    stage_dir: Path,
    *,
    question_review: dict[str, Any] | None = None,
    summary: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build the stage context capsule and write it as JSON and Markdown.

    When *summary* is ``None`` and ``stage_loop_summary.json`` already exists
    in *stage_dir*, that file is loaded and used as the summary. The capsule
    is written to ``stage_context_capsule.json`` and
    ``stage_context_capsule.md`` inside *stage_dir*.

    Returns:
        The capsule dictionary that was written.
    """
    if summary is None:
        existing_summary = stage_dir / "stage_loop_summary.json"
        if existing_summary.exists():
            summary = load_json_object(existing_summary, "Existing stage summary")

    capsule = build_stage_context_capsule(
        stage_manifest,
        stage_dir,
        question_review=question_review,
        summary=summary,
    )
    rendered = build_stage_context_capsule_markdown(capsule)
    # Write the JSON first, then the Markdown rendered from the same capsule.
    write_json(stage_dir / "stage_context_capsule.json", capsule)
    write_text(stage_dir / "stage_context_capsule.md", rendered)
    return capsule
|
||||
|
||||
|
||||
def load_stage_manifest(path: Path) -> dict[str, Any]:
|
||||
raw = load_json_object(path, "Stage agent loop manifest")
|
||||
stage_id = slugify(str(raw.get("stage_id") or path.stem), path.stem)
|
||||
|
|
@ -87,6 +664,7 @@ def load_stage_manifest(path: Path) -> dict[str, Any]:
|
|||
raise RuntimeError("Stage manifest `target_score` must be between 0 and 100")
|
||||
if max_iterations < 1:
|
||||
raise RuntimeError("Stage manifest `max_iterations` must be >= 1")
|
||||
repair_mode = dcl.normalize_repair_mode(raw.get("repair_mode"))
|
||||
return {
|
||||
**raw,
|
||||
"schema_version": str(raw.get("schema_version") or STAGE_LOOP_SCHEMA_VERSION),
|
||||
|
|
@ -96,6 +674,7 @@ def load_stage_manifest(path: Path) -> dict[str, Any]:
|
|||
"pack_manifest": pack_manifest,
|
||||
"target_score": target_score,
|
||||
"max_iterations": max_iterations,
|
||||
"repair_mode": repair_mode,
|
||||
"global_plan_refs": string_list(raw.get("global_plan_refs")),
|
||||
"acceptance_invariants": string_list(raw.get("acceptance_invariants")),
|
||||
"save_autorun_on_accept": bool(raw.get("save_autorun_on_accept", True)),
|
||||
|
|
@ -118,6 +697,7 @@ def stage_gui_review_dir(stage_dir: Path, run_id: str) -> Path:
|
|||
|
||||
def build_domain_pack_loop_command(args: argparse.Namespace, stage_manifest: dict[str, Any], stage_dir: Path) -> list[str]:
|
||||
loop_id = str(stage_manifest.get("loop_id") or stage_manifest["stage_id"]).strip()
|
||||
repair_mode = dcl.normalize_repair_mode(getattr(args, "repair_mode", None) or stage_manifest.get("repair_mode"))
|
||||
command = [
|
||||
sys.executable,
|
||||
str(REPO_ROOT / "scripts" / "domain_case_loop.py"),
|
||||
|
|
@ -132,6 +712,8 @@ def build_domain_pack_loop_command(args: argparse.Namespace, stage_manifest: dic
|
|||
str(int(stage_manifest["target_score"])),
|
||||
"--max-iterations",
|
||||
str(int(stage_manifest["max_iterations"])),
|
||||
"--repair-mode",
|
||||
repair_mode,
|
||||
"--backend-url",
|
||||
str(args.backend_url),
|
||||
"--prompt-version",
|
||||
|
|
@ -238,6 +820,8 @@ def build_stage_summary(
|
|||
iterations = loop_state.get("iterations") if isinstance(loop_state.get("iterations"), list) else []
|
||||
last_iteration = iterations[-1] if iterations and isinstance(iterations[-1], dict) else {}
|
||||
final_status = str(loop_state.get("final_status") or "unknown").strip()
|
||||
repair_mode = dcl.normalize_repair_mode(loop_state.get("repair_mode") or stage_manifest.get("repair_mode"))
|
||||
lead_handoff_required = str(last_iteration.get("coder_status") or "") == "lead_handoff_required"
|
||||
raw_loop_accepted = final_status == "accepted" and bool(last_iteration.get("accepted_gate"))
|
||||
closing_gate = build_stage_closing_gate(previous_summary)
|
||||
accepted = raw_loop_accepted and bool(closing_gate.get("passed"))
|
||||
|
|
@ -248,8 +832,12 @@ def build_stage_summary(
|
|||
next_action = "stage_closed_without_manual_confirmation"
|
||||
elif raw_loop_accepted and not bool(closing_gate.get("passed")):
|
||||
next_action = "rerun_same_stage_or_gui_and_ingest_result"
|
||||
elif lead_handoff_required:
|
||||
next_action = "lead_coder_repair_required"
|
||||
elif bool(loop_state.get("last_user_decision_prompt")):
|
||||
next_action = "user_decision_required"
|
||||
elif repair_mode == dcl.REPAIR_MODE_LEAD_HANDOFF and final_status in {"partial", "needs_exact_capability"}:
|
||||
next_action = "rerun_stage_loop_for_lead_handoff"
|
||||
else:
|
||||
next_action = "continue_autonomous_or_fix_blocker"
|
||||
summary = {
|
||||
|
|
@ -260,6 +848,7 @@ def build_stage_summary(
|
|||
"global_plan_refs": stage_manifest.get("global_plan_refs") or [],
|
||||
"target_score": stage_manifest.get("target_score"),
|
||||
"acceptance_invariants": stage_manifest.get("acceptance_invariants") or [],
|
||||
"repair_mode": repair_mode,
|
||||
"loop_dir": repo_relative(loop_dir),
|
||||
"loop_final_status": final_status,
|
||||
"stop_reason": loop_state.get("stop_reason"),
|
||||
|
|
@ -268,6 +857,9 @@ def build_stage_summary(
|
|||
"last_analyst_decision": last_iteration.get("loop_decision") or loop_state.get("last_analyst_decision"),
|
||||
"last_deterministic_gate_ok": last_iteration.get("deterministic_gate_ok"),
|
||||
"last_deterministic_gate_reason": last_iteration.get("deterministic_gate_reason"),
|
||||
"latest_business_audit": repo_relative(Path(str(last_iteration.get("business_audit_path")))) if last_iteration.get("business_audit_path") else None,
|
||||
"latest_lead_coder_handoff": repo_relative(Path(str(last_iteration.get("lead_coder_handoff_markdown_path") or loop_state.get("latest_lead_coder_handoff_markdown_path")))) if (last_iteration.get("lead_coder_handoff_markdown_path") or loop_state.get("latest_lead_coder_handoff_markdown_path")) else None,
|
||||
"latest_lead_coder_handoff_json": repo_relative(Path(str(last_iteration.get("lead_coder_handoff_path") or loop_state.get("latest_lead_coder_handoff_path")))) if (last_iteration.get("lead_coder_handoff_path") or loop_state.get("latest_lead_coder_handoff_path")) else None,
|
||||
"loop_accepted_gate": bool(last_iteration.get("accepted_gate")),
|
||||
"accepted_gate": accepted,
|
||||
"stage_closing_gate": closing_gate,
|
||||
|
|
@ -329,6 +921,15 @@ def build_next_step_guidance(next_action: str) -> dict[str, Any]:
|
|||
"user_decision_required": [
|
||||
"read stage_loop_handoff.md and resolve the recorded user decision point",
|
||||
],
|
||||
"lead_coder_repair_required": [
|
||||
"read stage_loop_handoff.md, latest lead_coder_handoff.md, and business_audit.md",
|
||||
"repair code in the main Lead Codex context; do not run weak auto-coder by default",
|
||||
"run targeted tests/build/graphify, then rerun the same semantic pack or ingest the GUI validation run",
|
||||
],
|
||||
"rerun_stage_loop_for_lead_handoff": [
|
||||
"python scripts/stage_agent_loop.py run --manifest <stage_manifest.json>",
|
||||
"the previous summary predates lead-handoff repair mode; rerun the stage loop to generate business_audit.md and lead_coder_handoff.md",
|
||||
],
|
||||
"continue_autonomous_or_fix_blocker": [
|
||||
"inspect stage_loop_handoff.md and rerun stage_agent_loop.py run after resolving the blocker",
|
||||
],
|
||||
|
|
@ -354,6 +955,7 @@ def build_stage_handoff_markdown(summary: dict[str, Any]) -> str:
|
|||
f"- module_name: `{summary.get('module_name')}`",
|
||||
f"- title: {summary.get('title')}",
|
||||
f"- loop_final_status: `{summary.get('loop_final_status')}`",
|
||||
f"- repair_mode: `{summary.get('repair_mode') or 'n/a'}`",
|
||||
f"- target_score: `{summary.get('target_score')}`",
|
||||
f"- iterations_ran: `{summary.get('iterations_ran')}`",
|
||||
f"- last_quality_score: `{summary.get('last_quality_score')}`",
|
||||
|
|
@ -365,6 +967,8 @@ def build_stage_handoff_markdown(summary: dict[str, Any]) -> str:
|
|||
f"- manual_confirmation_required: `{summary.get('manual_confirmation_required')}`",
|
||||
f"- next_action: `{summary.get('next_action')}`",
|
||||
f"- loop_dir: `{summary.get('loop_dir')}`",
|
||||
f"- latest_business_audit: `{summary.get('latest_business_audit') or 'n/a'}`",
|
||||
f"- latest_lead_coder_handoff: `{summary.get('latest_lead_coder_handoff') or 'n/a'}`",
|
||||
f"- stop_reason: {summary.get('stop_reason') or 'n/a'}",
|
||||
"",
|
||||
"## Plan refs",
|
||||
|
|
@ -988,14 +1592,13 @@ def run_stage_repair(args: argparse.Namespace) -> dict[str, Any]:
|
|||
write_json(iteration_dir / "repair_execution_summary.json", payload)
|
||||
summary_path = stage_dir / "stage_loop_summary.json"
|
||||
previous_summary = load_json_object(summary_path, "Existing stage summary") if summary_path.exists() else None
|
||||
save_stage_summary(
|
||||
stage_dir,
|
||||
build_repair_execution_stage_summary(
|
||||
stage_manifest=stage_manifest,
|
||||
previous_summary=previous_summary,
|
||||
execution=payload,
|
||||
),
|
||||
summary = build_repair_execution_stage_summary(
|
||||
stage_manifest=stage_manifest,
|
||||
previous_summary=previous_summary,
|
||||
execution=payload,
|
||||
)
|
||||
save_stage_summary(stage_dir, summary)
|
||||
save_stage_context_capsule(stage_manifest, stage_dir, summary=summary)
|
||||
return payload
|
||||
|
||||
snapshots = dcl.snapshot_coder_candidate_files(repair_candidate_paths(plan))
|
||||
|
|
@ -1016,14 +1619,13 @@ def run_stage_repair(args: argparse.Namespace) -> dict[str, Any]:
|
|||
write_json(iteration_dir / "repair_execution_summary.json", payload)
|
||||
summary_path = stage_dir / "stage_loop_summary.json"
|
||||
previous_summary = load_json_object(summary_path, "Existing stage summary") if summary_path.exists() else None
|
||||
save_stage_summary(
|
||||
stage_dir,
|
||||
build_repair_execution_stage_summary(
|
||||
stage_manifest=stage_manifest,
|
||||
previous_summary=previous_summary,
|
||||
execution=payload,
|
||||
),
|
||||
summary = build_repair_execution_stage_summary(
|
||||
stage_manifest=stage_manifest,
|
||||
previous_summary=previous_summary,
|
||||
execution=payload,
|
||||
)
|
||||
save_stage_summary(stage_dir, summary)
|
||||
save_stage_context_capsule(stage_manifest, stage_dir, summary=summary)
|
||||
return payload
|
||||
|
||||
|
||||
|
|
@ -1062,6 +1664,7 @@ def ingest_gui_run_review(args: argparse.Namespace) -> dict[str, Any]:
|
|||
previous_summary=previous_summary,
|
||||
)
|
||||
save_stage_summary(stage_dir, summary)
|
||||
save_stage_context_capsule(stage_manifest, stage_dir, summary=summary)
|
||||
save_stage_repair_handoff(stage_dir, build_stage_repair_handoff(summary, review))
|
||||
return summary
|
||||
|
||||
|
|
@ -1135,7 +1738,23 @@ def build_stage_status(stage_manifest: dict[str, Any], stage_dir: Path) -> dict[
|
|||
else {}
|
||||
)
|
||||
domain_command_path = stage_dir / "domain_pack_loop.command.txt"
|
||||
question_review_path = stage_dir / "question_generation_review.json"
|
||||
question_review = (
|
||||
load_json_object(question_review_path, "Stage question generation review")
|
||||
if question_review_path.exists()
|
||||
else {}
|
||||
)
|
||||
repair_mode = str(summary.get("repair_mode") or stage_manifest.get("repair_mode") or dcl.REPAIR_MODE_LEAD_HANDOFF)
|
||||
next_action = str(summary.get("next_action") or "run_stage_loop_or_ingest_gui_run")
|
||||
next_step_guidance = summary.get("next_step_guidance") or build_next_step_guidance(next_action)
|
||||
if (
|
||||
repair_mode == dcl.REPAIR_MODE_LEAD_HANDOFF
|
||||
and next_action == "continue_autonomous_or_fix_blocker"
|
||||
and not summary.get("latest_lead_coder_handoff")
|
||||
and summary.get("loop_final_status") in {"partial", "needs_exact_capability"}
|
||||
):
|
||||
next_action = "rerun_stage_loop_for_lead_handoff"
|
||||
next_step_guidance = build_next_step_guidance(next_action)
|
||||
latest_gui_review = summary.get("latest_gui_review") if isinstance(summary.get("latest_gui_review"), dict) else {}
|
||||
latest_repair_execution = (
|
||||
summary.get("latest_repair_execution")
|
||||
|
|
@ -1159,14 +1778,18 @@ def build_stage_status(stage_manifest: dict[str, Any], stage_dir: Path) -> dict[
|
|||
"title": stage_manifest.get("title"),
|
||||
"stage_dir": repo_relative(stage_dir),
|
||||
"summary_exists": bool(summary),
|
||||
"repair_mode": repair_mode,
|
||||
"loop_final_status": summary.get("loop_final_status"),
|
||||
"accepted_gate": summary.get("accepted_gate"),
|
||||
"loop_accepted_gate": summary.get("loop_accepted_gate"),
|
||||
"stage_closing_gate": stage_closing_gate or None,
|
||||
"next_action": next_action,
|
||||
"next_step_guidance": summary.get("next_step_guidance") or build_next_step_guidance(next_action),
|
||||
"next_step_guidance": next_step_guidance,
|
||||
"latest_gui_run_id": latest_gui_review.get("run_id"),
|
||||
"latest_gui_business_status": latest_gui_review.get("overall_business_status"),
|
||||
"latest_business_audit": summary.get("latest_business_audit"),
|
||||
"latest_lead_coder_handoff": summary.get("latest_lead_coder_handoff"),
|
||||
"latest_lead_coder_handoff_json": summary.get("latest_lead_coder_handoff_json"),
|
||||
"latest_repair_coder_status": latest_repair_execution.get("coder_status"),
|
||||
"latest_repair_dry_run": latest_repair_execution.get("dry_run"),
|
||||
"latest_validation_run_id": latest_repair_validation.get("validation_run_id"),
|
||||
|
|
@ -1175,6 +1798,9 @@ def build_stage_status(stage_manifest: dict[str, Any], stage_dir: Path) -> dict[
|
|||
"summary_path": repo_relative(summary_path) if summary_path.exists() else None,
|
||||
"domain_pack_loop_command_exists": domain_command_path.exists(),
|
||||
"domain_pack_loop_command_path": repo_relative(domain_command_path) if domain_command_path.exists() else None,
|
||||
"question_generation_review_status": question_review.get("status"),
|
||||
"question_generation_review_score": question_review.get("score"),
|
||||
"question_generation_review_path": repo_relative(question_review_path) if question_review_path.exists() else None,
|
||||
"last_continue_action": continue_result.get("performed_action"),
|
||||
"last_continue_next_action": continue_result.get("next_action"),
|
||||
"last_continue_result_path": repo_relative(continue_result_path) if continue_result_path.exists() else None,
|
||||
|
|
@ -1190,6 +1816,19 @@ def handle_status(args: argparse.Namespace) -> int:
|
|||
return 0
|
||||
|
||||
|
||||
def handle_review_questions(args: argparse.Namespace) -> int:
    """Build and persist the question-generation review for one stage.

    Loads the stage manifest named by ``args.manifest``, creates the stage
    directory resolved from ``args.output_root`` and the manifest's
    ``stage_id``, then writes the review as JSON and Markdown, refreshes the
    stage context capsule with it, and prints the review to stdout as JSON.

    Returns:
        Always ``0``.
    """
    manifest = load_stage_manifest(repo_path(args.manifest))
    output_root = repo_path(args.output_root)
    target_dir = stage_dir_for(output_root, manifest["stage_id"])
    target_dir.mkdir(parents=True, exist_ok=True)

    question_review = build_stage_question_generation_review(manifest, stage_dir=target_dir)

    # Persist both the machine-readable and the human-readable form.
    write_json(target_dir / "question_generation_review.json", question_review)
    review_markdown = build_stage_question_generation_markdown(question_review)
    write_text(target_dir / "question_generation_review.md", review_markdown)

    save_stage_context_capsule(manifest, target_dir, question_review=question_review)

    rendered = json.dumps(question_review, ensure_ascii=False, indent=2)
    print(rendered)
    return 0
|
||||
|
||||
|
||||
def args_with(args: argparse.Namespace, **overrides: Any) -> argparse.Namespace:
|
||||
values = vars(args).copy()
|
||||
values.update(overrides)
|
||||
|
|
@ -1277,6 +1916,36 @@ def handle_continue(args: argparse.Namespace) -> int:
|
|||
),
|
||||
}
|
||||
)
|
||||
elif next_action == "lead_coder_repair_required":
|
||||
payload.update(
|
||||
{
|
||||
"performed_action": "wait_for_lead_codex_repair",
|
||||
"next_action": next_action,
|
||||
"business_audit": status_before.get("latest_business_audit"),
|
||||
"lead_coder_handoff": status_before.get("latest_lead_coder_handoff"),
|
||||
"suggested_next_steps": [
|
||||
"repair code in the main Lead Codex context from the handoff artifacts",
|
||||
"run targeted tests/build/graphify after code edits",
|
||||
"rerun the same stage semantic pack or ingest the GUI validation run",
|
||||
],
|
||||
}
|
||||
)
|
||||
elif next_action == "rerun_stage_loop_for_lead_handoff":
|
||||
write_json(stage_dir / "stage_manifest.json", stage_manifest)
|
||||
write_text(stage_dir / "stage_manifest_source.txt", repo_relative(stage_manifest_path) + "\n")
|
||||
command = build_domain_pack_loop_command(args, stage_manifest, stage_dir)
|
||||
write_text(stage_dir / "domain_pack_loop.command.txt", " ".join(command) + "\n")
|
||||
payload.update(
|
||||
{
|
||||
"performed_action": "materialize_lead_handoff_stage_rerun",
|
||||
"domain_pack_loop_command": command,
|
||||
"next_action": next_action,
|
||||
"suggested_command": (
|
||||
"python scripts/stage_agent_loop.py run "
|
||||
"--manifest <stage_manifest.json>"
|
||||
),
|
||||
}
|
||||
)
|
||||
elif next_action == "run_stage_loop_or_ingest_gui_run":
|
||||
if getattr(args, "run_id", None):
|
||||
ingest_summary = ingest_gui_run_review(args)
|
||||
|
|
@ -1326,6 +1995,7 @@ def handle_summarize(args: argparse.Namespace) -> int:
|
|||
previous_summary = load_json_object(summary_path, "Existing stage summary") if summary_path.exists() else None
|
||||
summary = build_stage_summary(stage_manifest, loop_dir, previous_summary=previous_summary)
|
||||
save_stage_summary(stage_dir, summary)
|
||||
save_stage_context_capsule(stage_manifest, stage_dir, summary=summary)
|
||||
print(json.dumps(summary, ensure_ascii=False, indent=2))
|
||||
return 0
|
||||
|
||||
|
|
@ -1337,6 +2007,7 @@ def handle_run(args: argparse.Namespace) -> int:
|
|||
stage_dir.mkdir(parents=True, exist_ok=True)
|
||||
write_json(stage_dir / "stage_manifest.json", stage_manifest)
|
||||
write_text(stage_dir / "stage_manifest_source.txt", repo_relative(stage_manifest_path) + "\n")
|
||||
save_stage_context_capsule(stage_manifest, stage_dir)
|
||||
|
||||
command = build_domain_pack_loop_command(args, stage_manifest, stage_dir)
|
||||
write_text(stage_dir / "domain_pack_loop.command.txt", " ".join(command) + "\n")
|
||||
|
|
@ -1356,6 +2027,7 @@ def handle_run(args: argparse.Namespace) -> int:
|
|||
previous_summary = load_json_object(summary_path, "Existing stage summary") if summary_path.exists() else None
|
||||
summary = build_stage_summary(stage_manifest, loop_dir, previous_summary=previous_summary)
|
||||
save_stage_summary(stage_dir, summary)
|
||||
save_stage_context_capsule(stage_manifest, stage_dir, summary=summary)
|
||||
|
||||
if (
|
||||
bool(summary.get("accepted_gate"))
|
||||
|
|
@ -1390,13 +2062,14 @@ def add_common_args(parser: argparse.ArgumentParser) -> None:
|
|||
parser.add_argument("--max-output-tokens", type=int, default=2048)
|
||||
parser.add_argument("--timeout-seconds", type=int, default=180)
|
||||
parser.add_argument("--use-mock", action="store_true")
|
||||
parser.add_argument("--repair-mode", choices=[dcl.REPAIR_MODE_LEAD_HANDOFF, dcl.REPAIR_MODE_AUTO_CODER])
|
||||
parser.add_argument("--codex-binary", default="codex")
|
||||
parser.add_argument("--codex-profile")
|
||||
parser.add_argument("--codex-model")
|
||||
parser.add_argument("--analyst-codex-model", default="gpt-5.4")
|
||||
parser.add_argument("--coder-codex-model", default="gpt-5.4-mini")
|
||||
parser.add_argument("--coder-codex-model", default="gpt-5.4")
|
||||
parser.add_argument("--analyst-reasoning-effort", default="medium")
|
||||
parser.add_argument("--coder-reasoning-effort", default="low")
|
||||
parser.add_argument("--coder-reasoning-effort", default="high")
|
||||
parser.add_argument("--codex-timeout-seconds", type=int, default=1800)
|
||||
|
||||
|
||||
|
|
@ -1423,6 +2096,13 @@ def build_parser() -> argparse.ArgumentParser:
|
|||
add_common_args(status_parser)
|
||||
status_parser.set_defaults(func=handle_status)
|
||||
|
||||
review_questions_parser = subparsers.add_parser(
|
||||
"review-questions",
|
||||
help="Review generated stage-pack questions before launching the expensive live replay.",
|
||||
)
|
||||
add_common_args(review_questions_parser)
|
||||
review_questions_parser.set_defaults(func=handle_review_questions)
|
||||
|
||||
continue_parser = subparsers.add_parser(
|
||||
"continue",
|
||||
help="Execute the next safe stage-loop step derived from status.next_action.",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,289 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
||||
|
||||
import domain_case_loop as dcl
|
||||
|
||||
|
||||
class DomainCaseLoopLeadHandoffTests(unittest.TestCase):
    """Tests for the lead-handoff repair helpers exposed by ``domain_case_loop``."""

    @staticmethod
    def _blank_repair_targets(step_validation_index=None) -> dict:
        """Return an empty repair-target payload.

        When ``step_validation_index`` is given it is attached under the
        ``step_validation_index`` key; otherwise that key is absent.
        """
        payload = {
            "pack_id": "demo_pack",
            "domain": "demo",
            "target_count": 0,
            "severity_counts": {"P0": 0, "P1": 0, "P2": 0},
            "priority_foci": [],
            "targets": [],
        }
        if step_validation_index is not None:
            payload["step_validation_index"] = step_validation_index
        return payload

    def test_normalize_repair_mode_defaults_to_lead_handoff(self) -> None:
        """``normalize_repair_mode`` maps ``None`` and aliases to canonical mode names."""
        expectations = (
            (None, "lead-handoff"),
            ("lead_codex", "lead-handoff"),
            ("auto_coder", "auto-coder"),
        )
        for raw_mode, canonical in expectations:
            self.assertEqual(dcl.normalize_repair_mode(raw_mode), canonical)

    def test_lead_handoff_captures_business_audit_and_primary_focus(self) -> None:
        """A saved handoff records mode, status, primary focus and the audit reference."""
        primary_focus = {
            "focus_id": "answer_shape",
            "severity": "P0",
            "issue_code": "business_direct_answer_missing",
            "summary": "Direct answer is buried below service scaffolding.",
            "candidate_files": [
                "llm_normalizer/backend/src/services/address_runtime/composeStage.ts"
            ],
        }
        open_target = {
            "severity": "P0",
            "issue_code": "business_direct_answer_missing",
            "step_id": "q01",
        }
        targets_payload = {
            "target_count": 1,
            "severity_counts": {"P0": 1},
            "priority_foci": [primary_focus],
            "targets": [open_target],
        }
        verdict = {
            "quality_score": 42,
            "loop_decision": "partial",
            "user_intent_summary": "User asked for a direct business answer.",
            "expected_direct_answer": "Direct first-line answer.",
            "actual_direct_answer": "Scaffolded long answer.",
            "root_cause_layers": ["answer_shape_mismatch"],
        }

        with tempfile.TemporaryDirectory() as tmp:
            sandbox = Path(tmp)
            pack_dir = sandbox / "pack"
            loop_dir = sandbox / "loop"
            iteration_dir = loop_dir / "iterations" / "iteration_00"

            handoff = dcl.build_lead_coder_handoff(
                loop_state={"loop_id": "demo"},
                iteration_id="iteration_00",
                pack_dir=pack_dir,
                analyst_verdict_path=iteration_dir / "analyst_verdict.json",
                repair_targets_path=pack_dir / "repair_targets.json",
                business_audit_path=iteration_dir / "business_audit.md",
                analyst_verdict=verdict,
                repair_targets=targets_payload,
                target_score=88,
                loop_decision="partial",
                analyst_accepted_gate=False,
                accepted_gate=False,
                deterministic_gate_ok=False,
                deterministic_gate_reason="repair_targets_remaining=P0:1",
                requires_user_decision=False,
                user_decision_type="none",
                user_decision_prompt=None,
            )
            written_paths = dcl.save_lead_coder_handoff(
                loop_dir=loop_dir,
                iteration_dir=iteration_dir,
                handoff=handoff,
            )

            # Read everything that lives on disk before the temp dir is removed.
            handoff_json = iteration_dir / "lead_coder_handoff.json"
            saved = json.loads(handoff_json.read_text(encoding="utf-8"))
            latest_handoff_exists = Path(written_paths["latest_lead_coder_handoff_path"]).exists()

        self.assertEqual(saved["repair_mode"], "lead-handoff")
        self.assertEqual(saved["status"], "lead_coder_repair_required")
        self.assertEqual(saved["assigned_primary_focus"]["focus_id"], "answer_shape")
        self.assertIn("business_audit", saved["artifact_refs"])
        self.assertTrue(latest_handoff_exists)

    def test_analyst_priority_targets_become_lead_repair_targets(self) -> None:
        """Analyst priority targets are merged into repair targets and surface in the handoff."""
        verdict_with_priorities = {
            "priority_targets": [
                {
                    "scenario_id": "svk_pivot",
                    "step_id": "s03_summary",
                    "severity": "P0",
                    "problem_type": "bundle_reuse_gap",
                    "fix_goal": "Reuse the confirmed SVK value-flow bundle in the final summary.",
                },
                {
                    "scenario_id": "biz_scope",
                    "step_id": "s02_money",
                    "severity": "P1",
                    "problem_type": "field_mapping_gap",
                    "fix_goal": "Separate cash source/recipient labels from client/supplier labels.",
                },
            ]
        }

        merged = dcl.merge_analyst_priority_repair_targets(
            self._blank_repair_targets(),
            verdict_with_priorities,
        )
        # The handoff deliberately receives a minimal verdict; the priority
        # targets reach it only through the merged repair targets.
        handoff = dcl.build_lead_coder_handoff(
            loop_state={"loop_id": "demo"},
            iteration_id="iteration_00",
            pack_dir=Path("pack"),
            analyst_verdict_path=Path("analyst_verdict.json"),
            repair_targets_path=Path("semantic_repair_targets.json"),
            business_audit_path=Path("business_audit.md"),
            analyst_verdict={"quality_score": 73, "loop_decision": "continue"},
            repair_targets=merged,
            target_score=88,
            loop_decision="continue",
            analyst_accepted_gate=False,
            accepted_gate=False,
            deterministic_gate_ok=True,
            deterministic_gate_reason="deterministic_gate_passed",
            requires_user_decision=False,
            user_decision_type="none",
            user_decision_prompt=None,
        )

        self.assertEqual(merged["target_count"], 2)
        self.assertEqual(merged["severity_counts"]["P0"], 1)
        self.assertEqual(handoff["assigned_primary_focus"]["problem_type"], "bundle_reuse_gap")
        self.assertEqual(handoff["top_repair_targets"][0]["target_id"], "svk_pivot:s03_summary")
        self.assertIn(
            "llm_normalizer/backend/src/services/assistantMcpDiscoveryResponseCandidate.ts",
            handoff["candidate_files"],
        )

    def test_stale_analyst_validation_target_is_suppressed_by_step_state(self) -> None:
        """A target for an already-validated step is suppressed; the other target is kept."""
        validated_steps = {
            "legacy_canaries:s02_acc60": {
                "acceptance_status": "validated",
                "violated_invariants": [],
                "warnings": [],
                "runtime_factual_answer_validated": False,
                "guarded_insufficiency_validated": True,
            }
        }
        stale_target = {
            "scenario_id": "legacy_canaries",
            "step_id": "s02_acc60",
            "severity": "P0",
            "problem_type": "evidence_gap",
            "fix_goal": (
                "partial heuristic answer without runtime_factual_answer_validated "
                "or guarded_insufficiency_validated must not pass silently"
            ),
        }
        live_target = {
            "scenario_id": "biz_scope",
            "step_id": "s03_best_year",
            "severity": "P2",
            "problem_type": "presentation_gap",
            "fix_goal": "Clarify why this year leads without implying pure profit.",
        }

        merged = dcl.merge_analyst_priority_repair_targets(
            self._blank_repair_targets(validated_steps),
            {"priority_targets": [stale_target, live_target]},
        )

        self.assertEqual(merged["suppressed_analyst_priority_target_count"], 1)
        self.assertEqual(merged["target_count"], 1)
        self.assertEqual(merged["targets"][0]["target_id"], "biz_scope:s03_best_year")
        self.assertEqual(merged["severity_counts"]["P0"], 0)
        self.assertEqual(merged["severity_counts"]["P2"], 1)

    def test_bounded_mcp_evidence_gap_target_is_suppressed_by_step_state(self) -> None:
        """An evidence-gap target is suppressed when the step has a validated bounded MCP answer."""
        validated_steps = {
            "biz_scope:s03_best_year": {
                "acceptance_status": "validated",
                "violated_invariants": [],
                "warnings": [],
                "bounded_mcp_answer_validated": True,
                "mcp_discovery_response_applied": True,
                "mcp_discovery_response_candidate_status": "ready_for_guarded_use",
                "assistant_text_excerpt": (
                    "Коротко: самый доходный год в доступном денежном контуре 1С — 2015. "
                    "Важно: входящие уперлись в лимит выборки MCP; это проверенный срез, "
                    "не чистая бухгалтерская прибыль."
                ),
            }
        }
        evidence_gap_target = {
            "scenario_id": "biz_scope",
            "step_id": "s03_best_year",
            "severity": "P0",
            "problem_type": "evidence_gap",
            "fix_goal": (
                "Убрать asserted winner-year как подтвержденный факт, пока yearly ranking "
                "не имеет exact validated compute; legacy metadata says unsupported/blocked."
            ),
        }

        merged = dcl.merge_analyst_priority_repair_targets(
            self._blank_repair_targets(validated_steps),
            {"priority_targets": [evidence_gap_target]},
        )

        self.assertEqual(merged["suppressed_analyst_priority_target_count"], 1)
        self.assertEqual(merged["target_count"], 0)
        self.assertEqual(merged["severity_counts"], {"P0": 0, "P1": 0, "P2": 0})

    def test_runtime_exact_followup_target_is_suppressed_when_focus_is_proven(self) -> None:
        """A follow-up target is suppressed when the validated step already carries the focus."""
        validated_steps = {
            "svk_pivot:s02_svk_docs": {
                "acceptance_status": "validated",
                "violated_invariants": [],
                "warnings": [],
                "runtime_factual_answer_validated": True,
                "assistant_text_excerpt": "Контрагент: Группа СВК. Найдено документов: 19.",
                "extracted_filters": {"counterparty": "Группа СВК"},
                "focus_object": {"label": "Группа СВК"},
            }
        }
        followup_target = {
            "scenario_id": "svk_pivot",
            "step_id": "s02_svk_docs",
            "severity": "P1",
            "problem_type": "followup_action_resolution_gap",
            "fix_goal": (
                "Добавить pack-level validation на object-centric carryover: docs follow-up "
                "и bundle reuse должны быть явно проверены через stable counterparty/focus."
            ),
        }

        merged = dcl.merge_analyst_priority_repair_targets(
            self._blank_repair_targets(validated_steps),
            {"priority_targets": [followup_target]},
        )

        self.assertEqual(merged["suppressed_analyst_priority_target_count"], 1)
        self.assertEqual(merged["target_count"], 0)
        self.assertEqual(merged["severity_counts"], {"P0": 0, "P1": 0, "P2": 0})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
@ -50,6 +50,580 @@ class DomainCaseLoopStepStateTests(unittest.TestCase):
|
|||
self.assertEqual(step_state["mcp_discovery_catalog_chain_top_match"], "value_flow")
|
||||
self.assertTrue(step_state["mcp_discovery_catalog_chain_selected_matches_top"])
|
||||
|
||||
def test_analysis_context_date_is_not_implicit_business_filter(self) -> None:
    """An ``as_of_date`` in the analysis context alone must not raise filter/date violations."""
    question = "all-time money summary"
    step_spec = {
        "step_id": "step_01",
        "title": "All-time summary",
        "depends_on": [],
        "question_template": question,
    }
    artifact = {
        "assistant_message": {
            "reply_type": "factual_with_explanation",
            "text": "Short: all-time confirmed money summary.",
            "message_id": "msg-1",
            "trace_id": "trace-1",
        },
        # No debug payload: the runtime reported no extracted filters at all.
        "technical_debug_payload": {},
        "session_summary": {},
    }

    state = dcl.build_scenario_step_state(
        scenario_id="stage_pack_demo",
        domain="agentic_loop",
        step=step_spec,
        step_index=1,
        question_resolved=question,
        analysis_context={"as_of_date": "2026-05-09", "source": "stage_pack"},
        turn_artifact=artifact,
        entries=[],
    )

    violations = state["violated_invariants"]
    self.assertNotIn("missing_required_filter", violations)
    self.assertNotIn("wrong_as_of_date", violations)
|
||||
|
||||
def test_analysis_context_date_is_required_for_explicit_date_carryover(self) -> None:
    """With a required ``date_scope`` carryover, a mismatching extracted date is a violation."""
    question = "stock on that date"
    step_spec = {
        "step_id": "step_01",
        "title": "Date carryover",
        "depends_on": [],
        "question_template": question,
        "required_carryover_invariants": ["date_scope"],
    }
    debug_payload = {
        "detected_mode": "address_query",
        "detected_intent": "inventory_on_hand_as_of_date",
        "selected_recipe": "address_inventory_on_hand_as_of_date_v1",
        "capability_id": "confirmed_inventory_on_hand_as_of_date",
        "capability_route_mode": "exact",
        "fallback_type": "none",
        # Differs from the analysis context date (2021-03-31) on purpose.
        "extracted_filters": {"as_of_date": "2020-03-31"},
    }
    artifact = {
        "assistant_message": {
            "reply_type": "factual",
            "text": "Short: stock confirmed.",
            "message_id": "msg-1",
            "trace_id": "trace-1",
        },
        "technical_debug_payload": debug_payload,
        "session_summary": {},
    }

    state = dcl.build_scenario_step_state(
        scenario_id="date_carryover_demo",
        domain="inventory",
        step=step_spec,
        step_index=1,
        question_resolved=question,
        analysis_context={"as_of_date": "2021-03-31"},
        turn_artifact=artifact,
        entries=[],
    )

    self.assertIn("wrong_as_of_date", state["violated_invariants"])
|
||||
|
||||
def test_temporal_reset_question_skips_carried_date_scope(self) -> None:
    """All-time questions reset temporal scope; carryover can omit the date scope."""
    reset_questions = (
        "show money za all time",
        "сколько всего денег за все доступное время",
    )
    for question in reset_questions:
        self.assertTrue(dcl.question_resets_temporal_scope(question))

    previous_context = {
        "semantic_memory": {
            "date_scope": {
                "as_of_date": "2020-12-31",
                "period_from": "2020-10-01",
                "period_to": "2020-12-31",
            },
            "organization_scope": {"label": "ООО Альтернатива Плюс"},
        }
    }
    carried = dcl.carry_forward_analysis_context(
        previous_context,
        {},
        prefer_carryover=True,
        carry_date_scope=False,
    )

    # The date must be dropped while the organization scope still carries over.
    self.assertNotIn("as_of_date", carried)
    self.assertEqual(carried["organization_scope"], {"label": "ООО Альтернатива Плюс"})
|
||||
|
||||
def test_merge_scenario_date_scope_keeps_current_scope_over_stale_previous(self) -> None:
    """The current turn's date scope wins over a carried-over scope from an earlier step."""
    stale_scope = {
        "as_of_date": "2020-12-31",
        "period_from": "2020-10-01",
        "period_to": "2020-12-31",
        "source": "scenario_state_carryover",
    }
    current_scope = {
        "as_of_date": "2021-03-31",
        "period_from": "2021-03-01",
        "period_to": "2021-03-31",
        "source": "current_turn",
    }

    merged = dcl.merge_scenario_date_scope(
        stale_scope,
        current_scope,
        depends_on=["previous_step"],
    )

    self.assertEqual(merged["as_of_date"], "2021-03-31")
    self.assertEqual(merged["source"], "current_turn")
|
||||
|
||||
def test_mcp_business_overview_all_time_scope_overrides_stale_session_date(self) -> None:
    """An applied business-overview answer with no period scope yields an all-time date scope."""
    question = "all-time money summary"
    step_spec = {
        "step_id": "step_01",
        "title": "All-time money",
        "depends_on": ["previous_step"],
        "question_template": question,
        "expected_intents": ["business_overview"],
    }
    assistant_message = {
        "reply_type": "partial_coverage",
        "text": "Short: all-time confirmed money summary.",
        "message_id": "msg-1",
        "trace_id": "trace-1",
    }
    # The discovery entry point reports an overview with an explicit empty period scope.
    discovery_entry_point = {
        "bridge": {
            "pilot": {
                "derived_business_overview": {
                    "period_scope": None,
                }
            }
        }
    }
    debug_payload = {
        "detected_mode": "address_query",
        "detected_intent": "inventory_supplier_stock_overlap_as_of_date",
        "selected_recipe": "address_inventory_supplier_stock_overlap_as_of_date_v1",
        "capability_id": "inventory_inventory_supplier_stock_overlap_as_of_date",
        "mcp_discovery_response_applied": True,
        "mcp_discovery_selected_chain_id": "business_overview",
        "mcp_discovery_catalog_chain_top_match": "business_overview",
        "mcp_discovery_response_candidate_v1": {
            "candidate_status": "ready_for_guarded_use",
            "reply_type": "partial_coverage",
        },
        "assistant_mcp_discovery_entry_point_v1": discovery_entry_point,
    }
    # Session state still holds a dated result set from an earlier turn.
    stale_session = {
        "address_navigation_state": {
            "session_context": {
                "active_result_set_id": "rs-stale",
                "date_scope": {
                    "as_of_date": "2020-12-31",
                    "period_from": "2020-10-01",
                    "period_to": "2020-12-31",
                },
            }
        }
    }

    state = dcl.build_scenario_step_state(
        scenario_id="business_overview_demo",
        domain="agentic_loop",
        step=step_spec,
        step_index=1,
        question_resolved=question,
        analysis_context={},
        turn_artifact={
            "assistant_message": assistant_message,
            "technical_debug_payload": debug_payload,
            "session_summary": stale_session,
        },
        entries=[],
    )

    resolved_scope = state["date_scope"]
    self.assertEqual(resolved_scope["scope"], "all_time")
    self.assertIsNone(resolved_scope["as_of_date"])
    self.assertEqual(state["active_result_set_id"], "mcp-discovery-msg-1")
    self.assertNotIn("wrong_date_scope_state", state["violated_invariants"])
|
||||
|
||||
def test_applied_ready_mcp_discovery_chain_satisfies_expected_intent(self) -> None:
    """An applied, ready discovery chain counts as the expected intent for the step."""
    question = "business overview for 2020"
    step_spec = {
        "step_id": "step_01",
        "title": "Business overview",
        "depends_on": [],
        "question_template": question,
        "expected_intents": ["business_overview"],
    }
    # The detected intent is an inventory one; only the discovery chain
    # matches the step's expected intent.
    debug_payload = {
        "detected_mode": "address_query",
        "detected_intent": "inventory_supplier_stock_overlap_as_of_date",
        "selected_recipe": "address_inventory_supplier_stock_overlap_as_of_date_v1",
        "capability_id": "inventory_inventory_supplier_stock_overlap_as_of_date",
        "mcp_discovery_response_applied": True,
        "mcp_discovery_selected_chain_id": "business_overview",
        "mcp_discovery_catalog_chain_top_match": "business_overview",
        "mcp_discovery_response_candidate_v1": {
            "candidate_status": "ready_for_guarded_use",
            "reply_type": "partial_coverage",
        },
    }
    artifact = {
        "assistant_message": {
            "reply_type": "partial_coverage",
            "text": "Short: business overview from confirmed 1C rows.",
            "message_id": "msg-1",
            "trace_id": "trace-1",
        },
        "technical_debug_payload": debug_payload,
        "session_summary": {},
    }

    state = dcl.build_scenario_step_state(
        scenario_id="business_overview_demo",
        domain="agentic_loop",
        step=step_spec,
        step_index=1,
        question_resolved=question,
        analysis_context={},
        turn_artifact=artifact,
        entries=[],
    )

    self.assertEqual(state["mcp_discovery_effective_intents"], ["business_overview"])
    self.assertNotIn("wrong_intent", state["violated_invariants"])
|
||||
|
||||
def test_ready_bounded_mcp_answer_can_validate_without_exact_route(self) -> None:
    """A ready bounded MCP answer is accepted as validated while execution stays partial."""
    question = "business overview for 2020"
    step_spec = {
        "step_id": "step_01",
        "title": "Business overview",
        "depends_on": [],
        "question_template": question,
        "expected_intents": ["business_overview"],
        "required_answer_shape": "direct_answer_first",
    }
    debug_payload = {
        "detected_mode": "address_query",
        "detected_intent": "inventory_supplier_stock_overlap_as_of_date",
        "selected_recipe": "address_inventory_supplier_stock_overlap_as_of_date_v1",
        "capability_id": "inventory_inventory_supplier_stock_overlap_as_of_date",
        "mcp_discovery_response_applied": True,
        "mcp_discovery_selected_chain_id": "business_overview",
        "mcp_discovery_catalog_chain_top_match": "business_overview",
        "mcp_discovery_response_candidate_v1": {
            "candidate_status": "ready_for_guarded_use",
            "reply_type": "partial_coverage",
        },
    }
    artifact = {
        "assistant_message": {
            "reply_type": "partial_coverage",
            "text": "Short: confirmed bounded business overview from 1C rows.",
            "message_id": "msg-1",
            "trace_id": "trace-1",
        },
        "technical_debug_payload": debug_payload,
        "session_summary": {},
    }

    state = dcl.build_scenario_step_state(
        scenario_id="business_overview_demo",
        domain="agentic_loop",
        step=step_spec,
        step_index=1,
        question_resolved=question,
        analysis_context={},
        turn_artifact=artifact,
        entries=[],
    )

    self.assertEqual(state["execution_status"], "partial")
    self.assertTrue(state["bounded_mcp_answer_validated"])
    self.assertEqual(state["acceptance_status"], "validated")
|
||||
|
||||
def test_required_answer_patterns_block_generic_bounded_mcp_summary(self) -> None:
|
||||
step_state = dcl.build_scenario_step_state(
|
||||
scenario_id="summary_demo",
|
||||
domain="agentic_loop",
|
||||
step={
|
||||
"step_id": "step_01",
|
||||
"title": "Summary",
|
||||
"depends_on": [],
|
||||
"question_template": "summarize company and SVK separately",
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
"required_answer_patterns_all": ["SVK", "company"],
|
||||
},
|
||||
step_index=1,
|
||||
question_resolved="summarize company and SVK separately",
|
||||
analysis_context={},
|
||||
turn_artifact={
|
||||
"assistant_message": {
|
||||
"reply_type": "partial_coverage",
|
||||
"text": "Short: company money summary only.",
|
||||
"message_id": "msg-1",
|
||||
"trace_id": "trace-1",
|
||||
},
|
||||
"technical_debug_payload": {
|
||||
"mcp_discovery_response_applied": True,
|
||||
"mcp_discovery_selected_chain_id": "business_overview",
|
||||
"mcp_discovery_catalog_chain_top_match": "business_overview",
|
||||
"mcp_discovery_response_candidate_v1": {
|
||||
"candidate_status": "ready_for_guarded_use",
|
||||
"reply_type": "partial_coverage",
|
||||
},
|
||||
},
|
||||
"session_summary": {},
|
||||
},
|
||||
entries=[],
|
||||
)
|
||||
|
||||
self.assertIn("required_answer_patterns_all_missing", step_state["violated_invariants"])
|
||||
self.assertFalse(step_state["bounded_mcp_answer_validated"])
|
||||
self.assertEqual(step_state["acceptance_status"], "rejected")
|
||||
|
||||
def test_memory_checkpoint_can_validate_honest_no_scope_answer(self) -> None:
|
||||
step_state = dcl.build_scenario_step_state(
|
||||
scenario_id="memory_demo",
|
||||
domain="agentic_loop",
|
||||
step={
|
||||
"step_id": "step_01",
|
||||
"title": "Memory checkpoint",
|
||||
"depends_on": [],
|
||||
"question_template": "is any company or counterparty selected in the current dialog?",
|
||||
"semantic_tags": ["memory", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
},
|
||||
step_index=1,
|
||||
question_resolved="is any company or counterparty selected in the current dialog?",
|
||||
analysis_context={},
|
||||
turn_artifact={
|
||||
"assistant_message": {
|
||||
"reply_type": "partial_coverage",
|
||||
"text": "В текущем диалоге не выбрана компания или контрагент; память не выдумываю.",
|
||||
"message_id": "msg-1",
|
||||
"trace_id": "trace-1",
|
||||
},
|
||||
"technical_debug_payload": {
|
||||
"detected_mode": "address_query",
|
||||
"detected_intent": "customer_revenue_and_payments",
|
||||
"fallback_type": "no_rows",
|
||||
},
|
||||
"session_summary": {},
|
||||
},
|
||||
entries=[],
|
||||
)
|
||||
|
||||
self.assertEqual(step_state["execution_status"], "partial")
|
||||
self.assertTrue(step_state["memory_checkpoint_validated"])
|
||||
self.assertEqual(step_state["acceptance_status"], "validated")
|
||||
|
||||
def test_deterministic_chat_memory_checkpoint_validates_without_exact_capability(self) -> None:
|
||||
step_state = dcl.build_scenario_step_state(
|
||||
scenario_id="memory_demo",
|
||||
domain="agentic_loop",
|
||||
step={
|
||||
"step_id": "step_01",
|
||||
"title": "Memory checkpoint",
|
||||
"depends_on": [],
|
||||
"question_template": "current dialog memory checkpoint",
|
||||
"semantic_tags": ["memory", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
},
|
||||
step_index=1,
|
||||
question_resolved="current dialog memory checkpoint",
|
||||
analysis_context={},
|
||||
turn_artifact={
|
||||
"assistant_message": {
|
||||
"reply_type": "factual_with_explanation",
|
||||
"text": (
|
||||
"Коротко: в текущем диалоге я не вижу выбранной компании, контрагента или позиции. "
|
||||
"Память про «Группа СВК» в этом диалоге не подтверждена."
|
||||
),
|
||||
"message_id": "msg-1",
|
||||
"trace_id": "trace-1",
|
||||
},
|
||||
"technical_debug_payload": {
|
||||
"detected_mode": "chat",
|
||||
"fallback_type": "none",
|
||||
"living_router_reason": "memory_recap_followup_detected",
|
||||
"living_chat_response_source": "deterministic_memory_recap_contract",
|
||||
},
|
||||
"session_summary": {},
|
||||
},
|
||||
entries=[],
|
||||
)
|
||||
|
||||
self.assertEqual(step_state["execution_status"], "partial")
|
||||
self.assertTrue(step_state["memory_checkpoint_validated"])
|
||||
self.assertEqual(step_state["acceptance_status"], "validated")
|
||||
|
||||
def test_confirmed_runtime_factual_answer_can_validate_without_exact_route_mode(self) -> None:
|
||||
step_state = dcl.build_scenario_step_state(
|
||||
scenario_id="runtime_factual_demo",
|
||||
domain="agentic_loop",
|
||||
step={
|
||||
"step_id": "step_01",
|
||||
"title": "Account 60 tails",
|
||||
"depends_on": [],
|
||||
"question_template": "show account 60 tails",
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
},
|
||||
step_index=1,
|
||||
question_resolved="show account 60 tails",
|
||||
analysis_context={},
|
||||
turn_artifact={
|
||||
"assistant_message": {
|
||||
"reply_type": "factual",
|
||||
"text": "Коротко: по счету 60 найдено 8 строк хвостов; контрагентов с сигналом: 6.",
|
||||
"message_id": "msg-1",
|
||||
"trace_id": "trace-1",
|
||||
},
|
||||
"technical_debug_payload": {
|
||||
"detected_mode": "address_query",
|
||||
"detected_intent": "open_items_by_counterparty_or_contract",
|
||||
"selected_recipe": "address_open_items_by_party_or_contract_v1",
|
||||
"capability_id": "address_open_items_by_counterparty_or_contract",
|
||||
"capability_route_mode": "heuristic",
|
||||
"fallback_type": "none",
|
||||
"mcp_call_status": "matched_non_empty",
|
||||
"response_type": "FACTUAL_LIST",
|
||||
"result_mode": "confirmed_balance",
|
||||
},
|
||||
"session_summary": {},
|
||||
},
|
||||
entries=[],
|
||||
)
|
||||
|
||||
self.assertEqual(step_state["execution_status"], "partial")
|
||||
self.assertTrue(step_state["runtime_factual_answer_validated"])
|
||||
self.assertEqual(step_state["acceptance_status"], "validated")
|
||||
|
||||
def test_exact_confirmed_document_followup_sets_runtime_factual_validation(self) -> None:
|
||||
step_state = dcl.build_scenario_step_state(
|
||||
scenario_id="svk_pivot",
|
||||
domain="agentic_loop",
|
||||
step={
|
||||
"step_id": "s02_svk_docs",
|
||||
"title": "Counterparty documents follow-up",
|
||||
"depends_on": ["s01_svk_money"],
|
||||
"question_template": "show documents by this chain",
|
||||
"semantic_tags": ["counterparty", "documents", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
},
|
||||
step_index=2,
|
||||
question_resolved="show documents by this chain",
|
||||
analysis_context={"as_of_date": "2026-05-09"},
|
||||
turn_artifact={
|
||||
"assistant_message": {
|
||||
"reply_type": "factual",
|
||||
"text": "Контрагент: Группа СВК. Найдено документов: 19.",
|
||||
"message_id": "msg-1",
|
||||
"trace_id": "trace-1",
|
||||
},
|
||||
"technical_debug_payload": {
|
||||
"detected_mode": "address_query",
|
||||
"detected_intent": "list_documents_by_counterparty",
|
||||
"selected_recipe": "address_documents_by_counterparty_v1",
|
||||
"capability_id": "documents_drilldown",
|
||||
"capability_route_mode": "exact",
|
||||
"fallback_type": "none",
|
||||
"mcp_call_status": "matched_non_empty",
|
||||
"response_type": "FACTUAL_LIST",
|
||||
"truth_mode": "confirmed",
|
||||
"answer_shape": "confirmed_factual",
|
||||
"coverage_status": "full",
|
||||
"evidence_grade": "strong",
|
||||
"extracted_filters": {"counterparty": "Группа СВК", "as_of_date": "2026-05-09"},
|
||||
"focus_object": {
|
||||
"object_type": "counterparty",
|
||||
"object_id": "counterparty:группа свк",
|
||||
"label": "Группа СВК",
|
||||
},
|
||||
},
|
||||
"session_summary": {},
|
||||
},
|
||||
entries=[{"item": "2021-11-10T12:00:07Z"}],
|
||||
)
|
||||
|
||||
self.assertEqual(step_state["execution_status"], "exact")
|
||||
self.assertTrue(step_state["runtime_factual_answer_validated"])
|
||||
self.assertEqual(step_state["acceptance_status"], "validated")
|
||||
|
||||
def test_heuristic_open_items_guarded_insufficiency_validates_separately(self) -> None:
|
||||
answer_text = (
|
||||
"\u041a\u043e\u0440\u043e\u0442\u043a\u043e: \u0442\u043e\u0447\u043d\u044b\u0439 "
|
||||
"\u043e\u0442\u043a\u0440\u044b\u0442\u044b\u0439 \u043e\u0441\u0442\u0430\u0442\u043e\u043a "
|
||||
"\u043f\u043e \u0441\u0447\u0435\u0442\u0443 60 \u043d\u0435 "
|
||||
"\u043f\u043e\u0434\u0442\u0432\u0435\u0440\u0436\u0434\u0435\u043d; \u043d\u0438\u0436\u0435 "
|
||||
"\u0442\u043e\u043b\u044c\u043a\u043e \u043f\u0440\u0435\u0434\u0432\u0430\u0440\u0438\u0442\u0435\u043b\u044c\u043d\u044b\u0435 "
|
||||
"\u0441\u0438\u0433\u043d\u0430\u043b\u044b \u043f\u043e \u0434\u0432\u0438\u0436\u0435\u043d\u0438\u044f\u043c: 8 "
|
||||
"\u0441\u0442\u0440\u043e\u043a.\n"
|
||||
"\u042d\u0442\u043e \u043d\u0435 \u043f\u043e\u0434\u0442\u0432\u0435\u0440\u0436\u0434\u0435\u043d\u043d\u043e\u0435 "
|
||||
"\u0441\u0430\u043b\u044c\u0434\u043e: \u0442\u0435\u043a\u0443\u0449\u0438\u0439 "
|
||||
"\u043a\u043e\u043d\u0442\u0443\u0440 \u0432\u0438\u0434\u0438\u0442 "
|
||||
"\u0434\u0432\u0438\u0436\u0435\u043d\u0438\u044f-\u043a\u0430\u043d\u0434\u0438\u0434\u0430\u0442\u044b, "
|
||||
"\u043d\u043e \u043d\u0435 \u0434\u043e\u043a\u0430\u0437\u044b\u0432\u0430\u0435\u0442 "
|
||||
"\u043e\u0441\u0442\u0430\u0442\u043e\u043a."
|
||||
)
|
||||
step_state = dcl.build_scenario_step_state(
|
||||
scenario_id="runtime_factual_demo",
|
||||
domain="agentic_loop",
|
||||
step={
|
||||
"step_id": "step_01",
|
||||
"title": "Account 60 limited tails",
|
||||
"depends_on": [],
|
||||
"question_template": "show account 60 tails; say if exact data is unavailable",
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
},
|
||||
step_index=1,
|
||||
question_resolved="show account 60 tails; say if exact data is unavailable",
|
||||
analysis_context={},
|
||||
turn_artifact={
|
||||
"assistant_message": {
|
||||
"reply_type": "factual",
|
||||
"text": answer_text,
|
||||
"message_id": "msg-1",
|
||||
"trace_id": "trace-1",
|
||||
},
|
||||
"technical_debug_payload": {
|
||||
"detected_mode": "address_query",
|
||||
"detected_intent": "open_items_by_counterparty_or_contract",
|
||||
"selected_recipe": "address_open_items_by_party_or_contract_v1",
|
||||
"capability_id": "address_open_items_by_counterparty_or_contract",
|
||||
"capability_route_mode": "heuristic",
|
||||
"fallback_type": "none",
|
||||
"mcp_call_status": "matched_non_empty",
|
||||
"response_type": "FACTUAL_LIST",
|
||||
"result_mode": "heuristic_candidates",
|
||||
"balance_confirmed": False,
|
||||
"truth_mode": "limited",
|
||||
"answer_shape": "limited_with_reason",
|
||||
},
|
||||
"session_summary": {},
|
||||
},
|
||||
entries=[],
|
||||
)
|
||||
|
||||
self.assertEqual(step_state["execution_status"], "partial")
|
||||
self.assertEqual(step_state["truth_mode"], "limited")
|
||||
self.assertEqual(step_state["answer_shape"], "limited_with_reason")
|
||||
self.assertFalse(step_state["runtime_factual_answer_validated"])
|
||||
self.assertTrue(step_state["guarded_insufficiency_validated"])
|
||||
self.assertEqual(step_state["acceptance_status"], "validated")
|
||||
|
||||
def test_heuristic_open_items_without_limitation_is_rejected(self) -> None:
|
||||
step_state = dcl.build_scenario_step_state(
|
||||
scenario_id="runtime_factual_demo",
|
||||
domain="agentic_loop",
|
||||
step={
|
||||
"step_id": "step_01",
|
||||
"title": "Account 60 unguarded tails",
|
||||
"depends_on": [],
|
||||
"question_template": "show account 60 tails",
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
},
|
||||
step_index=1,
|
||||
question_resolved="show account 60 tails",
|
||||
analysis_context={},
|
||||
turn_artifact={
|
||||
"assistant_message": {
|
||||
"reply_type": "factual",
|
||||
"text": "Short: account 60 has 8 open-item rows and 6 counterparties.",
|
||||
"message_id": "msg-1",
|
||||
"trace_id": "trace-1",
|
||||
},
|
||||
"technical_debug_payload": {
|
||||
"detected_mode": "address_query",
|
||||
"detected_intent": "open_items_by_counterparty_or_contract",
|
||||
"selected_recipe": "address_open_items_by_party_or_contract_v1",
|
||||
"capability_id": "address_open_items_by_counterparty_or_contract",
|
||||
"capability_route_mode": "heuristic",
|
||||
"fallback_type": "none",
|
||||
"mcp_call_status": "matched_non_empty",
|
||||
"response_type": "FACTUAL_LIST",
|
||||
"result_mode": "heuristic_candidates",
|
||||
"balance_confirmed": False,
|
||||
"truth_mode": "limited",
|
||||
"answer_shape": "limited_with_reason",
|
||||
},
|
||||
"session_summary": {},
|
||||
},
|
||||
entries=[],
|
||||
)
|
||||
|
||||
self.assertEqual(step_state["execution_status"], "partial")
|
||||
self.assertFalse(step_state["runtime_factual_answer_validated"])
|
||||
self.assertFalse(step_state["guarded_insufficiency_validated"])
|
||||
self.assertEqual(step_state["acceptance_status"], "rejected")
|
||||
|
||||
def test_truth_harness_warns_on_catalog_alignment_divergence(self) -> None:
|
||||
reviewed = dth.evaluate_truth_step(
|
||||
step={
|
||||
|
|
|
|||
|
|
@ -1,11 +1,8 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
||||
|
|
@ -13,228 +10,55 @@ sys.path.insert(0, str(Path(__file__).resolve().parent))
|
|||
import save_agent_semantic_run as saver
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Write *payload* to *path* as indented UTF-8 JSON with a trailing newline.

    Missing parent directories are created first. Non-ASCII characters are
    written as-is rather than as ``\\uXXXX`` escapes.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(f"{serialized}\n", encoding="utf-8")
|
||||
|
||||
|
||||
class SaveAgentSemanticRunTests(unittest.TestCase):
|
||||
def test_extract_questions_accepts_truth_harness_question_template(self) -> None:
|
||||
questions = saver.extract_questions_from_spec(
|
||||
{
|
||||
"steps": [
|
||||
{"step_id": "step_01", "question_template": "first question"},
|
||||
{"step_id": "step_02", "question": "second question"},
|
||||
]
|
||||
}
|
||||
)
|
||||
|
||||
self.assertEqual(questions, ["first question", "second question"])
|
||||
|
||||
def test_extract_questions_accepts_domain_pack_scenarios(self) -> None:
|
||||
questions = saver.extract_questions_from_spec(
|
||||
{
|
||||
"pack_id": "demo_pack",
|
||||
"scenarios": [
|
||||
{
|
||||
"scenario_id": "scenario_01",
|
||||
"steps": [
|
||||
{"step_id": "step_01", "question_template": "first question"},
|
||||
{"step_id": "step_02", "question": "second question"},
|
||||
],
|
||||
},
|
||||
{
|
||||
"scenario_id": "scenario_02",
|
||||
"steps": [
|
||||
{"step_id": "step_01", "question": "first question"},
|
||||
{"step_id": "step_02", "question": "third question"},
|
||||
],
|
||||
},
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
self.assertEqual(questions, ["first question", "second question", "third question"])
|
||||
|
||||
def test_validate_accepted_run_dir_accepts_clean_business_review(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
run_dir = Path(tmp)
|
||||
write_json(
|
||||
run_dir / "pack_state.json",
|
||||
def test_extract_questions_resolves_scenario_pack_bindings(self) -> None:
|
||||
spec = {
|
||||
"schema_version": "domain_scenario_pack_v1",
|
||||
"bindings": {
|
||||
"main_organization": "ООО Альтернатива Плюс",
|
||||
"control_year": "2020",
|
||||
"svk_counterparty": "Группа СВК",
|
||||
},
|
||||
"scenarios": [
|
||||
{
|
||||
"final_status": "accepted",
|
||||
"review_overall_status": "pass",
|
||||
"acceptance_gate_passed": True,
|
||||
"no_unresolved_p0": True,
|
||||
"unresolved_p0_count": 0,
|
||||
"steps_total": 1,
|
||||
"steps_passed": 1,
|
||||
"steps_failed": 0,
|
||||
},
|
||||
)
|
||||
write_json(run_dir / "truth_review.json", {"summary": {"overall_status": "pass"}})
|
||||
write_json(
|
||||
run_dir / "business_review.json",
|
||||
{
|
||||
"overall_business_status": "pass",
|
||||
"steps_with_business_failures": 0,
|
||||
"steps_with_business_warnings": 0,
|
||||
},
|
||||
)
|
||||
|
||||
metadata = saver.validate_accepted_run_dir(run_dir)
|
||||
|
||||
self.assertEqual(metadata["validation_status"], "accepted_live_replay")
|
||||
self.assertTrue(metadata["saved_after_validated_replay"])
|
||||
|
||||
def test_validate_accepted_run_dir_rejects_business_review_failures(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
run_dir = Path(tmp)
|
||||
write_json(
|
||||
run_dir / "pack_state.json",
|
||||
{
|
||||
"final_status": "accepted",
|
||||
"review_overall_status": "pass",
|
||||
"acceptance_gate_passed": True,
|
||||
"no_unresolved_p0": True,
|
||||
"unresolved_p0_count": 0,
|
||||
},
|
||||
)
|
||||
write_json(run_dir / "truth_review.json", {"summary": {"overall_status": "pass"}})
|
||||
write_json(
|
||||
run_dir / "business_review.json",
|
||||
{
|
||||
"overall_business_status": "fail",
|
||||
"steps_with_business_failures": 1,
|
||||
},
|
||||
)
|
||||
|
||||
with self.assertRaisesRegex(RuntimeError, "business_review"):
|
||||
saver.validate_accepted_run_dir(run_dir)
|
||||
|
||||
def test_validate_accepted_run_dir_accepts_clean_domain_pack_loop(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
loop_dir = Path(tmp)
|
||||
iteration_dir = loop_dir / "iterations" / "iteration_00"
|
||||
analyst_path = iteration_dir / "analyst_verdict.json"
|
||||
repair_targets_path = iteration_dir / "pack_output" / "pack_run" / "repair_targets.json"
|
||||
write_json(
|
||||
loop_dir / "loop_state.json",
|
||||
{
|
||||
"loop_id": "stage_demo",
|
||||
"target_score": 88,
|
||||
"final_status": "accepted",
|
||||
"iterations": [
|
||||
"scenario_id": "biz",
|
||||
"steps": [
|
||||
{
|
||||
"iteration_id": "iteration_00",
|
||||
"quality_score": 91,
|
||||
"accepted_gate": True,
|
||||
"analyst_accepted_gate": True,
|
||||
"deterministic_gate_ok": True,
|
||||
"repair_target_count": 0,
|
||||
"repair_target_severity_counts": {"P0": 0, "P1": 0, "P2": 0},
|
||||
"analyst_verdict_path": str(analyst_path),
|
||||
"repair_targets_path": str(repair_targets_path),
|
||||
}
|
||||
],
|
||||
},
|
||||
)
|
||||
write_json(
|
||||
analyst_path,
|
||||
{
|
||||
"loop_decision": "accepted",
|
||||
"unresolved_p0_count": 0,
|
||||
"regression_detected": False,
|
||||
"direct_answer_ok": True,
|
||||
"business_usefulness_ok": True,
|
||||
"temporal_honesty_ok": True,
|
||||
"field_truth_ok": True,
|
||||
"answer_layering_ok": True,
|
||||
},
|
||||
)
|
||||
write_json(repair_targets_path, {"severity_counts": {"P0": 0, "P1": 0, "P2": 0}})
|
||||
|
||||
metadata = saver.validate_accepted_run_dir(loop_dir)
|
||||
|
||||
self.assertEqual(metadata["validation_status"], "accepted_domain_pack_loop")
|
||||
self.assertEqual(metadata["quality_score"], 91)
|
||||
|
||||
def test_validate_accepted_run_dir_rejects_domain_pack_loop_with_p1_targets(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
loop_dir = Path(tmp)
|
||||
iteration_dir = loop_dir / "iterations" / "iteration_00"
|
||||
analyst_path = iteration_dir / "analyst_verdict.json"
|
||||
repair_targets_path = iteration_dir / "pack_output" / "pack_run" / "repair_targets.json"
|
||||
write_json(
|
||||
loop_dir / "loop_state.json",
|
||||
{
|
||||
"loop_id": "stage_demo",
|
||||
"target_score": 88,
|
||||
"final_status": "accepted",
|
||||
"iterations": [
|
||||
"question": "Дай обзор {{bindings.main_organization}} за {{bindings.control_year}} год.",
|
||||
"semantic_tags": ["business_overview", "money"],
|
||||
},
|
||||
{
|
||||
"quality_score": 91,
|
||||
"accepted_gate": True,
|
||||
"analyst_accepted_gate": True,
|
||||
"deterministic_gate_ok": True,
|
||||
"analyst_verdict_path": str(analyst_path),
|
||||
"repair_targets_path": str(repair_targets_path),
|
||||
}
|
||||
"question": "Отдельно по {{bindings.svk_counterparty}} покажи документы.",
|
||||
"semantic_tags": ["counterparty", "documents"],
|
||||
},
|
||||
],
|
||||
},
|
||||
)
|
||||
write_json(
|
||||
analyst_path,
|
||||
{
|
||||
"loop_decision": "accepted",
|
||||
"unresolved_p0_count": 0,
|
||||
"regression_detected": False,
|
||||
"direct_answer_ok": True,
|
||||
"business_usefulness_ok": True,
|
||||
"temporal_honesty_ok": True,
|
||||
"field_truth_ok": True,
|
||||
"answer_layering_ok": True,
|
||||
},
|
||||
)
|
||||
write_json(repair_targets_path, {"severity_counts": {"P0": 0, "P1": 1, "P2": 0}})
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
with self.assertRaisesRegex(RuntimeError, "repair_targets"):
|
||||
saver.validate_accepted_run_dir(loop_dir)
|
||||
questions = saver.extract_questions_from_spec(spec)
|
||||
|
||||
def test_save_gate_refuses_real_write_without_validation(self) -> None:
|
||||
args = SimpleNamespace(
|
||||
validated_run_dir=None,
|
||||
dry_run=False,
|
||||
allow_unvalidated=False,
|
||||
unvalidated_reason=None,
|
||||
self.assertEqual(
|
||||
questions,
|
||||
[
|
||||
"Дай обзор ООО Альтернатива Плюс за 2020 год.",
|
||||
"Отдельно по Группа СВК покажи документы.",
|
||||
],
|
||||
)
|
||||
self.assertFalse(any("{{bindings." in question for question in questions))
|
||||
self.assertEqual(
|
||||
saver.extract_semantic_tags(spec),
|
||||
["business_overview", "counterparty", "documents", "money"],
|
||||
)
|
||||
|
||||
with self.assertRaisesRegex(RuntimeError, "Refusing to save AGENT autorun"):
|
||||
saver.build_save_gate_metadata(args, {}, Path("demo.json"))
|
||||
def test_extract_questions_refuses_unresolved_bindings(self) -> None:
|
||||
spec = {
|
||||
"questions": ["Что с НДС за {{bindings.control_year}} год?"],
|
||||
"bindings": {},
|
||||
}
|
||||
|
||||
def test_save_gate_requires_reason_for_unvalidated_draft(self) -> None:
|
||||
args = SimpleNamespace(
|
||||
validated_run_dir=None,
|
||||
dry_run=False,
|
||||
allow_unvalidated=True,
|
||||
unvalidated_reason="",
|
||||
)
|
||||
|
||||
with self.assertRaisesRegex(RuntimeError, "--unvalidated-reason"):
|
||||
saver.build_save_gate_metadata(args, {}, Path("demo.json"))
|
||||
|
||||
def test_save_gate_marks_explicit_unvalidated_draft(self) -> None:
|
||||
args = SimpleNamespace(
|
||||
validated_run_dir=None,
|
||||
dry_run=False,
|
||||
allow_unvalidated=True,
|
||||
unvalidated_reason="manual GUI canary before live replay",
|
||||
)
|
||||
|
||||
metadata = saver.build_save_gate_metadata(args, {}, Path("demo.json"))
|
||||
|
||||
self.assertEqual(metadata["validation_status"], "explicitly_unvalidated")
|
||||
self.assertFalse(metadata["saved_after_validated_replay"])
|
||||
with self.assertRaisesRegex(RuntimeError, "unresolved bindings"):
|
||||
saver.extract_questions_from_spec(spec)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -29,13 +29,14 @@ def args() -> argparse.Namespace:
|
|||
temperature=0.0,
|
||||
max_output_tokens=2048,
|
||||
timeout_seconds=180,
|
||||
repair_mode=None,
|
||||
codex_binary="codex",
|
||||
codex_profile=None,
|
||||
codex_model=None,
|
||||
analyst_codex_model="gpt-5.4",
|
||||
coder_codex_model="gpt-5.4-mini",
|
||||
coder_codex_model="gpt-5.4",
|
||||
analyst_reasoning_effort="medium",
|
||||
coder_reasoning_effort="low",
|
||||
coder_reasoning_effort="high",
|
||||
codex_timeout_seconds=1800,
|
||||
analysis_date=None,
|
||||
max_scenarios=None,
|
||||
|
|
@ -81,6 +82,7 @@ class StageAgentLoopTests(unittest.TestCase):
|
|||
|
||||
self.assertEqual(manifest["target_score"], 88)
|
||||
self.assertEqual(manifest["max_iterations"], 6)
|
||||
self.assertEqual(manifest["repair_mode"], "lead-handoff")
|
||||
self.assertTrue(manifest["save_autorun_on_accept"])
|
||||
self.assertTrue(manifest["manual_confirmation_required_after_accept"])
|
||||
|
||||
|
|
@ -98,6 +100,8 @@ class StageAgentLoopTests(unittest.TestCase):
|
|||
self.assertIn("91", command)
|
||||
self.assertIn("--max-iterations", command)
|
||||
self.assertIn("4", command)
|
||||
self.assertIn("--repair-mode", command)
|
||||
self.assertIn("lead-handoff", command)
|
||||
self.assertIn("--output-root", command)
|
||||
|
||||
def test_build_stage_summary_requests_manual_confirmation_after_accept(self) -> None:
|
||||
|
|
@ -143,6 +147,7 @@ class StageAgentLoopTests(unittest.TestCase):
|
|||
loop_dir / "loop_state.json",
|
||||
{
|
||||
"final_status": "partial",
|
||||
"repair_mode": "auto-coder",
|
||||
"target_score": 88,
|
||||
"iterations": [
|
||||
{
|
||||
|
|
@ -162,6 +167,7 @@ class StageAgentLoopTests(unittest.TestCase):
|
|||
"module_name": "Open-World Bounded Autonomy Breadth",
|
||||
"title": "Open-world semantic control gate",
|
||||
"target_score": 88,
|
||||
"repair_mode": "auto-coder",
|
||||
},
|
||||
loop_dir,
|
||||
)
|
||||
|
|
@ -169,6 +175,81 @@ class StageAgentLoopTests(unittest.TestCase):
|
|||
self.assertFalse(summary["manual_confirmation_required"])
|
||||
self.assertEqual(summary["next_action"], "continue_autonomous_or_fix_blocker")
|
||||
|
||||
def test_build_stage_summary_reruns_stale_partial_loop_for_lead_handoff(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
loop_dir = Path(tmp)
|
||||
write_json(
|
||||
loop_dir / "loop_state.json",
|
||||
{
|
||||
"final_status": "needs_exact_capability",
|
||||
"target_score": 88,
|
||||
"iterations": [
|
||||
{
|
||||
"quality_score": 32,
|
||||
"loop_decision": "needs_exact_capability",
|
||||
"accepted_gate": False,
|
||||
"deterministic_gate_ok": False,
|
||||
"coder_status": "no_changes",
|
||||
}
|
||||
],
|
||||
},
|
||||
)
|
||||
|
||||
summary = stage_loop.build_stage_summary(
|
||||
{
|
||||
"stage_id": "agent_loop",
|
||||
"module_name": "Agent Loop",
|
||||
"title": "Agent Loop",
|
||||
"target_score": 88,
|
||||
"repair_mode": "lead-handoff",
|
||||
},
|
||||
loop_dir,
|
||||
)
|
||||
|
||||
self.assertEqual(summary["next_action"], "rerun_stage_loop_for_lead_handoff")
|
||||
self.assertIn("lead-handoff", summary["next_step_guidance"]["command_templates"][1])
|
||||
|
||||
def test_build_stage_summary_routes_lead_handoff_to_lead_repair(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
loop_dir = Path(tmp)
|
||||
iteration_dir = loop_dir / "iterations" / "iteration_00"
|
||||
write_json(
|
||||
loop_dir / "loop_state.json",
|
||||
{
|
||||
"final_status": "needs_exact_capability",
|
||||
"repair_mode": "lead-handoff",
|
||||
"target_score": 88,
|
||||
"latest_lead_coder_handoff_markdown_path": str(loop_dir / "lead_coder_handoff.md"),
|
||||
"iterations": [
|
||||
{
|
||||
"quality_score": 42,
|
||||
"loop_decision": "needs_exact_capability",
|
||||
"accepted_gate": False,
|
||||
"deterministic_gate_ok": False,
|
||||
"business_audit_path": str(iteration_dir / "business_audit.md"),
|
||||
"lead_coder_handoff_markdown_path": str(iteration_dir / "lead_coder_handoff.md"),
|
||||
"coder_status": "lead_handoff_required",
|
||||
}
|
||||
],
|
||||
},
|
||||
)
|
||||
|
||||
summary = stage_loop.build_stage_summary(
|
||||
{
|
||||
"stage_id": "agent_loop",
|
||||
"module_name": "Agent Loop",
|
||||
"title": "Agent Loop",
|
||||
"target_score": 88,
|
||||
"repair_mode": "lead-handoff",
|
||||
},
|
||||
loop_dir,
|
||||
)
|
||||
|
||||
self.assertEqual(summary["repair_mode"], "lead-handoff")
|
||||
self.assertEqual(summary["next_action"], "lead_coder_repair_required")
|
||||
self.assertIn("lead_coder_handoff", summary["latest_lead_coder_handoff"])
|
||||
self.assertIn("business_audit", summary["latest_business_audit"])
|
||||
|
||||
def test_build_stage_summary_blocks_close_when_repair_lacks_validation(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
loop_dir = Path(tmp)
|
||||
|
|
@ -817,6 +898,205 @@ class StageAgentLoopTests(unittest.TestCase):
|
|||
self.assertEqual(result["performed_action"], "materialize_stage_run_dry_run")
|
||||
self.assertIn("run-pack-loop", command_text)
|
||||
|
||||
def test_handle_review_questions_scores_stage_pack_quality(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
root = Path(tmp)
|
||||
manifest_path = root / "stage.json"
|
||||
pack_path = root / "pack.json"
|
||||
output_root = root / "stage_runs"
|
||||
write_json(
|
||||
manifest_path,
|
||||
{
|
||||
"stage_id": "agent_loop",
|
||||
"module_name": "Agent Loop",
|
||||
"title": "Agent Loop",
|
||||
"pack_manifest": str(pack_path),
|
||||
"target_score": 88,
|
||||
},
|
||||
)
|
||||
write_json(
|
||||
pack_path,
|
||||
{
|
||||
"schema_version": "domain_scenario_pack_v1",
|
||||
"pack_id": "agent_loop_pack",
|
||||
"bindings": {
|
||||
"organization": "ООО Альтернатива Плюс",
|
||||
"counterparty": "Группа СВК",
|
||||
},
|
||||
"scenarios": [
|
||||
{
|
||||
"scenario_id": "company_overview",
|
||||
"steps": [
|
||||
{
|
||||
"step_id": "step_01",
|
||||
"question": "Дай бизнес-обзор {{bindings.organization}}: деньги, НДС, долги и что нельзя утверждать.",
|
||||
"semantic_tags": ["business_overview", "money", "vat", "debt", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
"forbidden_answer_patterns": ["(?i)runtime_"],
|
||||
},
|
||||
{
|
||||
"step_id": "step_02",
|
||||
"question": "Раскрой деньги подробнее: сколько получили и заплатили.",
|
||||
"depends_on": ["step_01"],
|
||||
"semantic_tags": ["money"],
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
},
|
||||
{
|
||||
"step_id": "step_03",
|
||||
"question": "Что с НДС за 2020 год и на каких документах это основано?",
|
||||
"depends_on": ["step_02"],
|
||||
"semantic_tags": ["vat", "documents"],
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
},
|
||||
{
|
||||
"step_id": "step_04",
|
||||
"question": "Теперь за все время не тащи НДС за 2020 как общую позицию.",
|
||||
"depends_on": ["step_03"],
|
||||
"semantic_tags": ["vat", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"scenario_id": "counterparty_pivot",
|
||||
"steps": [
|
||||
{
|
||||
"step_id": "step_01",
|
||||
"question": "Отдельно по контрагенту {{bindings.counterparty}}: сколько денег прошло и какие документы есть?",
|
||||
"semantic_tags": ["counterparty", "money", "documents", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
"forbidden_answer_patterns": ["(?i)capability_id"],
|
||||
},
|
||||
{
|
||||
"step_id": "step_02",
|
||||
"question": "Покажи документы по этой цепочке и не смешивай контрагента с организацией.",
|
||||
"depends_on": ["step_01"],
|
||||
"semantic_tags": ["counterparty", "documents", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
},
|
||||
{
|
||||
"step_id": "step_03",
|
||||
"question": "Что было на складе на март 2021?",
|
||||
"depends_on": ["step_02"],
|
||||
"semantic_tags": ["inventory"],
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
},
|
||||
{
|
||||
"step_id": "step_04",
|
||||
"question": "Собери итог: что можно выводить по компании и что нельзя.",
|
||||
"depends_on": ["step_03"],
|
||||
"semantic_tags": ["business_overview", "scope_guard"],
|
||||
"required_answer_shape": "direct_answer_first",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
)
|
||||
|
||||
exit_code = stage_loop.handle_review_questions(
|
||||
stage_args(manifest=str(manifest_path), output_root=str(output_root))
|
||||
)
|
||||
review = json.loads(
|
||||
(output_root / "agent_loop" / "question_generation_review.json").read_text(encoding="utf-8")
|
||||
)
|
||||
capsule = json.loads(
|
||||
(output_root / "agent_loop" / "stage_context_capsule.json").read_text(encoding="utf-8")
|
||||
)
|
||||
|
||||
self.assertEqual(exit_code, 0)
|
||||
self.assertEqual(review["status"], "strong")
|
||||
self.assertEqual(capsule["repair_mode"], "lead-handoff")
|
||||
self.assertEqual(capsule["question_generation"]["status"], "strong")
|
||||
self.assertGreaterEqual(review["score"], 85)
|
||||
self.assertEqual(review["question_count"], 8)
|
||||
self.assertGreaterEqual(review["domain_counts"]["vat"], 2)
|
||||
self.assertFalse(review["weak_flag_counts"])
|
||||
|
||||
def test_review_questions_flags_mojibake_before_live_replay(self) -> None:
    """A pack whose only question is mis-decoded text must be reviewed as weak.

    The review is built straight from the pack manifest and is expected to
    report status ``weak`` with exactly one ``mojibake_question_text`` flag.
    """
    # UTF-8 Cyrillic bytes that were decoded as Windows-1251; kept as escapes
    # so the fixture stays garbled regardless of this file's own encoding.
    garbled_question = (
        "\u0420\u201d\u0420\u00b0\u0420\u2116 \u0420\u00b1\u0420\u0451\u0420\u00b7\u0420\u0405\u0420\u00b5\u0421\u0403-\u0420\u0455\u0420\u00b1\u0420\u00b7\u0420\u0455\u0421\u0402 \u0420\u0454\u0420\u0455\u0420\u0458\u0420\u0457\u0420\u00b0\u0420\u0405\u0420\u0451\u0420\u0451."
    )
    broken_step = {
        "step_id": "step_01",
        "question": garbled_question,
        "required_answer_shape": "direct_answer_first",
    }
    pack_payload = {
        "schema_version": "domain_scenario_pack_v1",
        "pack_id": "broken_pack",
        "scenarios": [
            {
                "scenario_id": "broken",
                "steps": [broken_step],
            }
        ],
    }

    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        pack_path = root / "pack.json"
        write_json(pack_path, pack_payload)

        stage_spec = {
            "stage_id": "broken_stage",
            "module_name": "Broken Stage",
            "title": "Broken Stage",
            "pack_manifest": str(pack_path),
        }
        review = stage_loop.build_stage_question_generation_review(stage_spec)

        self.assertEqual(review["status"], "weak")
        self.assertEqual(review["weak_flag_counts"]["mojibake_question_text"], 1)
|
||||
|
||||
def test_review_questions_flags_windows_artifact_path_risk(self) -> None:
    """Long scenario/step/stage/loop ids must raise the Windows path-length flag.

    With ``artifact_path_warning_limit`` set to 120 the review is expected to
    count one ``artifact_path_too_long_for_windows`` flag and to report a
    ``max_estimated_artifact_path`` of at least 120.
    """
    stage_id = "long_stage_name_for_path_budget"
    long_step = {
        "step_id": "very_long_step_id_for_windows_path_budget",
        "question": "Дай бизнес-обзор компании: деньги, НДС, документы и что нельзя утверждать.",
        "semantic_tags": [
            "business_overview",
            "money",
            "vat",
            "documents",
            "counterparty",
            "scope_guard",
        ],
        "required_answer_shape": "direct_answer_first",
        "forbidden_answer_patterns": ["(?i)runtime_"],
    }
    pack_payload = {
        "schema_version": "domain_scenario_pack_v1",
        "pack_id": "path_pack",
        "scenarios": [
            {
                "scenario_id": "very_long_scenario_id_for_windows_path_budget",
                "steps": [long_step],
            }
        ],
    }

    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        pack_path = root / "pack.json"
        write_json(pack_path, pack_payload)

        stage_spec = {
            "stage_id": stage_id,
            "module_name": "Path Budget",
            "title": "Path Budget",
            "pack_manifest": str(pack_path),
            "loop_id": "long_loop_name_for_path_budget",
            "artifact_path_warning_limit": 120,
        }
        review = stage_loop.build_stage_question_generation_review(
            stage_spec,
            stage_dir=root / "stage_runs" / stage_id,
        )

        self.assertEqual(review["weak_flag_counts"]["artifact_path_too_long_for_windows"], 1)
        self.assertGreaterEqual(review["max_estimated_artifact_path"], 120)
|
||||
|
||||
def test_build_stage_status_reports_cold_start_continue_artifacts(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
root = Path(tmp)
|
||||
|
|
@ -937,12 +1217,14 @@ class StageAgentLoopTests(unittest.TestCase):
|
|||
summary = json.loads((stage_dir / "stage_loop_summary.json").read_text(encoding="utf-8"))
|
||||
handoff_exists = (stage_dir / "stage_loop_handoff.md").exists()
|
||||
repair_handoff_exists = (stage_dir / "stage_repair_handoff.md").exists()
|
||||
context_capsule_exists = (stage_dir / "stage_context_capsule.md").exists()
|
||||
review_exists = (stage_dir / "gui_run_reviews" / run_id / "run_review.json").exists()
|
||||
|
||||
self.assertEqual(exit_code, 0)
|
||||
self.assertEqual(summary["next_action"], "continue_repair_from_gui_review_p0")
|
||||
self.assertTrue(handoff_exists)
|
||||
self.assertTrue(repair_handoff_exists)
|
||||
self.assertTrue(context_capsule_exists)
|
||||
self.assertTrue(review_exists)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue