Добавить post-repair validation в stage-loop
This commit is contained in:
parent
b4f50346cc
commit
a3378a3d52
|
|
@ -157,6 +157,8 @@ python scripts/stage_agent_loop.py run-repair --manifest docs/orchestration/<sta
|
|||
|
||||
`--dry-run` writes `repair_coder.command.txt`, records `repair_execution_summary.json`, updates `stage_loop_summary.json`, and prints the exact non-interactive Codex command without changing code. Without `--dry-run`, it executes the coder command with the prepared `repair_prompt.md`, writes `repair_coder_result.json`, captures stdout/stderr, records `repair_execution_summary.json`, and updates the stage next action to rerun/ingest, inspect, or stop for a decision depending on the coder status. After a real coder patch, rerun the same semantic pack or GUI session and ingest the new `assistant-stage1-<id>`.
|
||||
|
||||
When the coder result is `patched`, the next `ingest-gui-run` is treated as post-repair validation for that repair iteration. `stage_loop_summary.json` records `latest_repair_validation` and `repair_validation_history`, including the validation run id, the remaining P0/P1 findings, and whether the repair was actually accepted after replay. A patch without this rerun/ingest evidence does not close the stage.
|
||||
|
||||
## Placeholder contract
|
||||
|
||||
Scenario questions can reference earlier step outputs with placeholders such as:
|
||||
|
|
|
|||
|
|
@ -291,6 +291,24 @@ def build_stage_handoff_markdown(summary: dict[str, Any]) -> str:
|
|||
f"- execution_summary: `{latest_repair_execution.get('repair_execution_summary')}`",
|
||||
]
|
||||
)
|
||||
latest_repair_validation = (
|
||||
summary.get("latest_repair_validation")
|
||||
if isinstance(summary.get("latest_repair_validation"), dict)
|
||||
else {}
|
||||
)
|
||||
if latest_repair_validation:
|
||||
lines.extend(
|
||||
[
|
||||
"",
|
||||
"## Latest Repair Validation",
|
||||
f"- validation_run_id: `{latest_repair_validation.get('validation_run_id')}`",
|
||||
f"- validation_status: `{latest_repair_validation.get('validation_status')}`",
|
||||
f"- accepted_after_repair: `{latest_repair_validation.get('accepted_after_repair')}`",
|
||||
f"- validated_repair_iteration: `{latest_repair_validation.get('validated_repair_iteration')}`",
|
||||
f"- remaining_p0_findings: `{latest_repair_validation.get('remaining_p0_findings')}`",
|
||||
f"- remaining_p1_findings: `{latest_repair_validation.get('remaining_p1_findings')}`",
|
||||
]
|
||||
)
|
||||
return "\n".join(lines).strip() + "\n"
|
||||
|
||||
|
||||
|
|
@ -354,6 +372,56 @@ def build_repair_execution_stage_summary(
|
|||
return base
|
||||
|
||||
|
||||
def repair_validation_status(*, business_status: str, p0_count: int, p1_count: int) -> str:
    """Classify the outcome of a validation run that follows a repair.

    Checks are applied in order and the first match wins:

    1. any P0 finding -> ``"failed_p0"``
    2. any P1 finding, or a ``"warning"`` business status -> ``"warning_p1"``
    3. ``"pass"`` business status -> ``"passed"``
    4. ``"fail"`` business status -> ``"failed"``
    5. anything else -> ``"unknown"``

    Args:
        business_status: Overall business status reported by the review.
        p0_count: Number of P0 findings remaining after the repair.
        p1_count: Number of P1 findings remaining after the repair.

    Returns:
        One of ``"failed_p0"``, ``"warning_p1"``, ``"passed"``, ``"failed"``
        or ``"unknown"``.
    """
    if p0_count > 0:
        return "failed_p0"
    # NOTE(review): because this check precedes the "fail" check, a "fail"
    # business status with only P1 findings is reported as "warning_p1" --
    # confirm this precedence is intended.
    if p1_count > 0 or business_status == "warning":
        return "warning_p1"
    if business_status == "pass":
        return "passed"
    if business_status == "fail":
        return "failed"
    return "unknown"
|
||||
|
||||
|
||||
def build_latest_repair_validation(
|
||||
*,
|
||||
previous_summary: dict[str, Any] | None,
|
||||
review: dict[str, Any],
|
||||
business_status: str,
|
||||
p0_count: int,
|
||||
p1_count: int,
|
||||
next_action: str,
|
||||
) -> dict[str, Any] | None:
|
||||
previous_repair = (
|
||||
previous_summary.get("latest_repair_execution")
|
||||
if isinstance(previous_summary, dict) and isinstance(previous_summary.get("latest_repair_execution"), dict)
|
||||
else {}
|
||||
)
|
||||
if not previous_repair:
|
||||
return None
|
||||
if bool(previous_repair.get("dry_run")) or previous_repair.get("coder_status") != "patched":
|
||||
return None
|
||||
validation_status = repair_validation_status(
|
||||
business_status=business_status,
|
||||
p0_count=p0_count,
|
||||
p1_count=p1_count,
|
||||
)
|
||||
return {
|
||||
"schema_version": "stage_repair_validation_v1",
|
||||
"validation_run_id": review.get("run_id"),
|
||||
"validated_repair_iteration": previous_repair.get("iteration_dir"),
|
||||
"validated_repair_result": previous_repair.get("repair_coder_result"),
|
||||
"validation_status": validation_status,
|
||||
"accepted_after_repair": bool(validation_status == "passed"),
|
||||
"remaining_p0_findings": p0_count,
|
||||
"remaining_p1_findings": p1_count,
|
||||
"overall_business_status": business_status,
|
||||
"next_action": next_action,
|
||||
"validated_at": now_iso(),
|
||||
}
|
||||
|
||||
|
||||
def build_save_autorun_command(args: argparse.Namespace, stage_manifest: dict[str, Any], loop_dir: Path) -> list[str]:
|
||||
return [
|
||||
sys.executable,
|
||||
|
|
@ -427,6 +495,27 @@ def build_gui_review_stage_summary(
|
|||
},
|
||||
}
|
||||
)
|
||||
latest_repair_validation = build_latest_repair_validation(
|
||||
previous_summary=previous_summary,
|
||||
review=review,
|
||||
business_status=status,
|
||||
p0_count=p0_count,
|
||||
p1_count=p1_count,
|
||||
next_action=next_action,
|
||||
)
|
||||
if latest_repair_validation is not None:
|
||||
base["latest_repair_validation"] = latest_repair_validation
|
||||
validations = base.get("repair_validation_history") if isinstance(base.get("repair_validation_history"), list) else []
|
||||
validations = [
|
||||
item
|
||||
for item in validations
|
||||
if not (
|
||||
isinstance(item, dict)
|
||||
and item.get("validation_run_id") == latest_repair_validation.get("validation_run_id")
|
||||
)
|
||||
]
|
||||
validations.append(latest_repair_validation)
|
||||
base["repair_validation_history"] = validations
|
||||
history = base.get("gui_review_history") if isinstance(base.get("gui_review_history"), list) else []
|
||||
history = [
|
||||
item
|
||||
|
|
|
|||
|
|
@ -204,6 +204,54 @@ class StageAgentLoopTests(unittest.TestCase):
|
|||
self.assertFalse(summary["accepted_gate"])
|
||||
self.assertEqual(summary["latest_gui_review"]["repair_targets_count"], 1)
|
||||
|
||||
def test_gui_review_stage_summary_links_post_repair_validation(self) -> None:
    """A passing rerun ingested after a patched repair is recorded as its validation."""
    with tempfile.TemporaryDirectory() as tmp:
        stage_dir = Path(tmp) / "stage"
        review_dir = stage_dir / "gui_run_reviews" / "assistant-stage1-rerun"
        # Clean rerun: passing status with no P0/P1 findings left.
        review = {
            "run_id": "assistant-stage1-rerun",
            "summary": {
                "overall_business_status": "pass",
                "turn_pairs_total": 5,
                "business_issue_turns": 0,
                "p0_findings": 0,
                "p1_findings": 0,
                "question_quality_status": "strong",
                "question_quality_score": 96,
            },
            "repair_targets": [],
        }
        # Previous summary describes a real (non-dry-run) patched repair.
        previous_summary = {
            "latest_repair_execution": {
                "iteration_dir": "artifacts/domain_runs/stage_agent_loops/agent_loop/repair_iterations/repair_001",
                "repair_coder_result": "artifacts/domain_runs/stage_agent_loops/agent_loop/repair_iterations/repair_001/repair_coder_result.json",
                "dry_run": False,
                "coder_status": "patched",
                "changed_files": ["scripts/stage_agent_loop.py"],
            }
        }

        summary = stage_loop.build_gui_review_stage_summary(
            stage_manifest={
                "stage_id": "agent_loop",
                "module_name": "Agent Loop",
                "title": "Agent Loop",
                "target_score": 88,
                "acceptance_invariants": [],
                "global_plan_refs": [],
            },
            stage_dir=stage_dir,
            review=review,
            review_dir=review_dir,
            previous_summary=previous_summary,
        )

        self.assertEqual(summary["next_action"], "manual_gui_confirmation_or_stage_close")
        self.assertTrue(summary["latest_repair_validation"]["accepted_after_repair"])
        self.assertEqual(summary["latest_repair_validation"]["validation_status"], "passed")
        self.assertEqual(summary["latest_repair_validation"]["validation_run_id"], "assistant-stage1-rerun")
        self.assertEqual(len(summary["repair_validation_history"]), 1)
|
||||
|
||||
def test_stage_repair_handoff_keeps_primary_targets_and_samples(self) -> None:
|
||||
summary = {
|
||||
"stage_id": "agent_loop",
|
||||
|
|
|
|||
Loading…
Reference in New Issue