Связать GUI-прогоны со stage-loop агентной петли
This commit is contained in:
parent
931251d1eb
commit
4089708dfd
|
|
@ -92,6 +92,7 @@ Canonical commands:
|
||||||
```powershell
|
```powershell
|
||||||
python scripts/stage_agent_loop.py plan --manifest docs/orchestration/<stage_loop>.json
|
python scripts/stage_agent_loop.py plan --manifest docs/orchestration/<stage_loop>.json
|
||||||
python scripts/stage_agent_loop.py run --manifest docs/orchestration/<stage_loop>.json
|
python scripts/stage_agent_loop.py run --manifest docs/orchestration/<stage_loop>.json
|
||||||
|
python scripts/stage_agent_loop.py ingest-gui-run --manifest docs/orchestration/<stage_loop>.json --run-id assistant-stage1-<id>
|
||||||
python scripts/stage_agent_loop.py summarize --manifest docs/orchestration/<stage_loop>.json
|
python scripts/stage_agent_loop.py summarize --manifest docs/orchestration/<stage_loop>.json
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
@ -125,6 +126,17 @@ This bridge is intentionally business-first:
|
||||||
|
|
||||||
Use this bridge when the operator would otherwise say “чекни прогон `assistant-stage1-...`”. The expected next step is no longer manual eyeballing first; it is: review by id, inspect `run_review.md`, map `repair_targets.json` into the current stage loop, patch, and rerun.
|
Use this bridge when the operator would otherwise say “чекни прогон `assistant-stage1-...`”. The expected next step is no longer manual eyeballing first; it is: review by id, inspect `run_review.md`, map `repair_targets.json` into the current stage loop, patch, and rerun.
|
||||||
|
|
||||||
|
For stage work, prefer the integrated command:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
python scripts/stage_agent_loop.py ingest-gui-run --manifest docs/orchestration/<stage_loop>.json --run-id assistant-stage1-<id>
|
||||||
|
```
|
||||||
|
|
||||||
|
It stores the GUI review under `artifacts/domain_runs/stage_agent_loops/<stage_id>/gui_run_reviews/<run_id>/`, updates `stage_loop_summary.json`, and writes the next stage action:
|
||||||
|
- `continue_repair_from_gui_review_p0` when the GUI run exposes business-wrong or missing direct-answer defects;
|
||||||
|
- `continue_repair_from_gui_review_p1` when the run is semantically usable but still noisy, over-broad, or poorly layered;
|
||||||
|
- `manual_gui_confirmation_or_stage_close` when the GUI run is clean enough for final human confirmation.
|
||||||
|
|
||||||
## Placeholder contract
|
## Placeholder contract
|
||||||
|
|
||||||
Scenario questions can reference earlier step outputs with placeholders such as:
|
Scenario questions can reference earlier step outputs with placeholders such as:
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,8 @@ from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
import review_assistant_stage1_run as gui_review
|
||||||
|
|
||||||
|
|
||||||
REPO_ROOT = Path(__file__).resolve().parents[1]
|
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||||
DEFAULT_STAGE_OUTPUT_ROOT = REPO_ROOT / "artifacts" / "domain_runs" / "stage_agent_loops"
|
DEFAULT_STAGE_OUTPUT_ROOT = REPO_ROOT / "artifacts" / "domain_runs" / "stage_agent_loops"
|
||||||
|
|
@ -108,6 +110,10 @@ def stage_loop_dir(stage_dir: Path, stage_manifest: dict[str, Any]) -> Path:
|
||||||
return stage_dir / "domain_loops" / slugify(loop_id)
|
return stage_dir / "domain_loops" / slugify(loop_id)
|
||||||
|
|
||||||
|
|
||||||
|
def stage_gui_review_dir(stage_dir: Path, run_id: str) -> Path:
    """Return the directory that holds the stored review of one GUI run.

    The location is ``<stage_dir>/gui_run_reviews/<slugify(run_id)>``.
    """
    reviews_root = stage_dir / "gui_run_reviews"
    return reviews_root / slugify(run_id)
|
||||||
|
|
||||||
|
|
||||||
def build_domain_pack_loop_command(args: argparse.Namespace, stage_manifest: dict[str, Any], stage_dir: Path) -> list[str]:
|
def build_domain_pack_loop_command(args: argparse.Namespace, stage_manifest: dict[str, Any], stage_dir: Path) -> list[str]:
|
||||||
loop_id = str(stage_manifest.get("loop_id") or stage_manifest["stage_id"]).strip()
|
loop_id = str(stage_manifest.get("loop_id") or stage_manifest["stage_id"]).strip()
|
||||||
command = [
|
command = [
|
||||||
|
|
@ -250,6 +256,22 @@ def build_stage_handoff_markdown(summary: dict[str, Any]) -> str:
|
||||||
lines.extend(["", "## Acceptance invariants"])
|
lines.extend(["", "## Acceptance invariants"])
|
||||||
invariants = summary.get("acceptance_invariants") or []
|
invariants = summary.get("acceptance_invariants") or []
|
||||||
lines.extend([f"- {item}" for item in invariants] if invariants else ["- domain loop gate + analyst verdict"])
|
lines.extend([f"- {item}" for item in invariants] if invariants else ["- domain loop gate + analyst verdict"])
|
||||||
|
latest_gui_review = summary.get("latest_gui_review") if isinstance(summary.get("latest_gui_review"), dict) else {}
|
||||||
|
if latest_gui_review:
|
||||||
|
lines.extend(
|
||||||
|
[
|
||||||
|
"",
|
||||||
|
"## Latest GUI Run Review",
|
||||||
|
f"- run_id: `{latest_gui_review.get('run_id')}`",
|
||||||
|
f"- overall_business_status: `{latest_gui_review.get('overall_business_status')}`",
|
||||||
|
f"- p0_findings: `{latest_gui_review.get('p0_findings')}`",
|
||||||
|
f"- p1_findings: `{latest_gui_review.get('p1_findings')}`",
|
||||||
|
f"- question_quality: `{latest_gui_review.get('question_quality_status')}` / `{latest_gui_review.get('question_quality_score')}`",
|
||||||
|
f"- repair_targets_count: `{latest_gui_review.get('repair_targets_count')}`",
|
||||||
|
f"- review_dir: `{latest_gui_review.get('review_dir')}`",
|
||||||
|
f"- review_markdown: `{latest_gui_review.get('review_markdown')}`",
|
||||||
|
]
|
||||||
|
)
|
||||||
return "\n".join(lines).strip() + "\n"
|
return "\n".join(lines).strip() + "\n"
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -275,6 +297,106 @@ def build_save_autorun_command(args: argparse.Namespace, stage_manifest: dict[st
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def build_gui_review_stage_summary(
    *,
    stage_manifest: dict[str, Any],
    stage_dir: Path,
    review: dict[str, Any],
    review_dir: Path,
    previous_summary: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Fold a GUI run review into the stage-loop summary.

    Starts from a shallow copy of ``previous_summary`` (when given), refreshes
    the manifest-derived fields, records the review under
    ``latest_gui_review`` and replaces any ``gui_review_history`` entry that
    carries the same ``run_id``.

    ``next_action`` routing, first match wins:
      * any P0 finding -> ``continue_repair_from_gui_review_p0``
      * any P1 finding, or status ``warning`` -> ``continue_repair_from_gui_review_p1``
      * status ``pass`` -> ``manual_gui_confirmation_or_stage_close``
      * anything else -> ``inspect_gui_review_status``

    ``stage_dir`` is part of the keyword interface but is not read here.
    """
    raw_summary = review.get("summary")
    review_summary = raw_summary if isinstance(raw_summary, dict) else {}
    raw_targets = review.get("repair_targets")
    targets = raw_targets if isinstance(raw_targets, list) else []

    business_status = str(review_summary.get("overall_business_status") or "unknown").strip()
    p0_total = int(review_summary.get("p0_findings") or 0)
    p1_total = int(review_summary.get("p1_findings") or 0)
    passed = business_status == "pass"
    run_id = review.get("run_id")

    # Ordered (condition, action) pairs; the first truthy condition decides.
    routing = (
        (p0_total > 0, "continue_repair_from_gui_review_p0"),
        (p1_total > 0 or business_status == "warning", "continue_repair_from_gui_review_p1"),
        (passed, "manual_gui_confirmation_or_stage_close"),
    )
    next_step = next(
        (action for matched, action in routing if matched),
        "inspect_gui_review_status",
    )

    merged = dict(previous_summary or {})
    latest = {
        "run_id": run_id,
        "review_dir": repo_relative(review_dir),
        "review_json": repo_relative(review_dir / "run_review.json"),
        "review_markdown": repo_relative(review_dir / "run_review.md"),
        "conversation_pairs_json": repo_relative(review_dir / "conversation_pairs.json"),
        "question_quality_json": repo_relative(review_dir / "question_quality_review.json"),
        "repair_targets_json": repo_relative(review_dir / "repair_targets.json"),
        "overall_business_status": business_status,
        "turn_pairs_total": review_summary.get("turn_pairs_total"),
        "business_issue_turns": review_summary.get("business_issue_turns"),
        "p0_findings": p0_total,
        "p1_findings": p1_total,
        "question_quality_status": review_summary.get("question_quality_status"),
        "question_quality_score": review_summary.get("question_quality_score"),
        "repair_targets_count": len(targets),
    }
    merged.update(
        {
            "schema_version": STAGE_SUMMARY_SCHEMA_VERSION,
            "stage_id": stage_manifest["stage_id"],
            "module_name": stage_manifest.get("module_name"),
            "title": stage_manifest.get("title"),
            "global_plan_refs": stage_manifest.get("global_plan_refs") or [],
            "target_score": stage_manifest.get("target_score"),
            "acceptance_invariants": stage_manifest.get("acceptance_invariants") or [],
            # Keep a final status written earlier; only fall back when none exists.
            "loop_final_status": merged.get("loop_final_status") or "gui_review_ingested",
            "accepted_gate": passed and p0_total == 0 and p1_total == 0,
            "manual_confirmation_required": passed,
            "next_action": next_step,
            "updated_at": now_iso(),
            "latest_gui_review": latest,
        }
    )

    # Re-ingesting the same run replaces its earlier history entry.
    earlier = merged.get("gui_review_history")
    if not isinstance(earlier, list):
        earlier = []
    kept = [
        entry
        for entry in earlier
        if not isinstance(entry, dict) or entry.get("run_id") != run_id
    ]
    merged["gui_review_history"] = [*kept, latest]
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
def handle_ingest_gui_run(args: argparse.Namespace) -> int:
    """CLI handler for ``ingest-gui-run``.

    Steps, in order:
      1. Load the stage manifest and write a copy plus its source path into
         the stage directory (created if missing).
      2. Build the GUI run review through ``gui_review`` and save it into the
         review directory (``--review-output-dir`` or the per-stage default).
      3. Merge the review into ``stage_loop_summary.json`` (reusing the
         existing summary when the file is present), save it, and print the
         resulting summary as JSON.

    Returns 0.
    """
    manifest_path = repo_path(args.manifest)
    manifest = load_stage_manifest(manifest_path)
    stage_dir = stage_dir_for(repo_path(args.output_root), manifest["stage_id"])
    stage_dir.mkdir(parents=True, exist_ok=True)
    write_json(stage_dir / "stage_manifest.json", manifest)
    write_text(stage_dir / "stage_manifest_source.txt", f"{repo_relative(manifest_path)}\n")

    run_id = gui_review.run_id_from_value(args.run_id)
    if args.review_output_dir:
        review_dir = repo_path(args.review_output_dir)
    else:
        review_dir = stage_gui_review_dir(stage_dir, run_id)

    explicit_session = repo_path(args.session_file) if args.session_file else None
    session_files = gui_review.resolve_session_files(
        run_id=run_id,
        sessions_dir=repo_path(args.sessions_dir),
        explicit_session_file=explicit_session,
    )
    review = gui_review.build_run_review(
        run_id=run_id,
        session_files=session_files,
        report_path=repo_path(args.reports_dir) / f"{run_id}.md",
    )
    gui_review.save_run_review(review, review_dir)

    summary_path = stage_dir / "stage_loop_summary.json"
    existing = None
    if summary_path.exists():
        existing = load_json_object(summary_path, "Existing stage summary")

    merged_summary = build_gui_review_stage_summary(
        stage_manifest=manifest,
        stage_dir=stage_dir,
        review=review,
        review_dir=review_dir,
        previous_summary=existing,
    )
    save_stage_summary(stage_dir, merged_summary)
    print(json.dumps(merged_summary, ensure_ascii=False, indent=2))
    return 0
|
||||||
|
|
||||||
|
|
||||||
def handle_plan(args: argparse.Namespace) -> int:
|
def handle_plan(args: argparse.Namespace) -> int:
|
||||||
stage_manifest_path = repo_path(args.manifest)
|
stage_manifest_path = repo_path(args.manifest)
|
||||||
stage_manifest = load_stage_manifest(stage_manifest_path)
|
stage_manifest = load_stage_manifest(stage_manifest_path)
|
||||||
|
|
@ -389,6 +511,18 @@ def build_parser() -> argparse.ArgumentParser:
|
||||||
add_common_args(summarize_parser)
|
add_common_args(summarize_parser)
|
||||||
summarize_parser.add_argument("--loop-dir")
|
summarize_parser.add_argument("--loop-dir")
|
||||||
summarize_parser.set_defaults(func=handle_summarize)
|
summarize_parser.set_defaults(func=handle_summarize)
|
||||||
|
|
||||||
|
ingest_parser = subparsers.add_parser(
|
||||||
|
"ingest-gui-run",
|
||||||
|
help="Attach an existing assistant-stage1 GUI run review to the stage loop summary.",
|
||||||
|
)
|
||||||
|
add_common_args(ingest_parser)
|
||||||
|
ingest_parser.add_argument("--run-id", required=True)
|
||||||
|
ingest_parser.add_argument("--session-file")
|
||||||
|
ingest_parser.add_argument("--sessions-dir", default=str(gui_review.DEFAULT_SESSIONS_DIR))
|
||||||
|
ingest_parser.add_argument("--reports-dir", default=str(gui_review.DEFAULT_REPORTS_DIR))
|
||||||
|
ingest_parser.add_argument("--review-output-dir")
|
||||||
|
ingest_parser.set_defaults(func=handle_ingest_gui_run)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,26 @@ def args() -> argparse.Namespace:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def stage_args(**overrides: object) -> argparse.Namespace:
    """Return the default test namespace from ``args()`` with ``overrides`` applied."""
    # NOTE(review): assumes args() builds a fresh Namespace per call — confirm.
    merged = {**vars(args()), **overrides}
    return argparse.Namespace(**merged)
|
||||||
|
|
||||||
|
|
||||||
|
def session_payload(conversation: list[dict[str, object]]) -> dict[str, object]:
    """Build a saved-session fixture dict that wraps ``conversation``.

    Every field other than ``conversation`` is a fixed value; the state and
    counter containers are empty.
    """
    payload: dict[str, object] = {
        "schema_version": "assistant_session_v1",
        "session_id": "assistant-stage1-gui-SAVED-001",
        "started_at": "2026-05-09T00:00:00Z",
        "updated_at": "2026-05-09T00:01:00Z",
    }
    payload["conversation"] = conversation
    payload.update(
        address_navigation_state={"session_context": {}},
        investigation_state={},
        counters={},
        reply_types={},
    )
    return payload
|
||||||
|
|
||||||
|
|
||||||
class StageAgentLoopTests(unittest.TestCase):
|
class StageAgentLoopTests(unittest.TestCase):
|
||||||
def test_load_stage_manifest_defaults_gate_fields(self) -> None:
|
def test_load_stage_manifest_defaults_gate_fields(self) -> None:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
|
@ -148,6 +168,106 @@ class StageAgentLoopTests(unittest.TestCase):
|
||||||
self.assertFalse(summary["manual_confirmation_required"])
|
self.assertFalse(summary["manual_confirmation_required"])
|
||||||
self.assertEqual(summary["next_action"], "continue_autonomous_or_fix_blocker")
|
self.assertEqual(summary["next_action"], "continue_autonomous_or_fix_blocker")
|
||||||
|
|
||||||
|
def test_gui_review_stage_summary_routes_p0_to_repair(self) -> None:
    """A review with one P0 finding routes the stage to P0 repair and keeps the gate closed."""
    run_id = "assistant-stage1-test"
    review_summary = {
        "overall_business_status": "fail",
        "turn_pairs_total": 3,
        "business_issue_turns": 1,
        "p0_findings": 1,
        "p1_findings": 0,
        "question_quality_status": "strong",
        "question_quality_score": 100,
    }
    manifest = {
        "stage_id": "agent_loop",
        "module_name": "Agent Loop",
        "title": "Agent Loop",
        "target_score": 88,
        "acceptance_invariants": [],
        "global_plan_refs": [],
    }
    with tempfile.TemporaryDirectory() as tmp:
        stage_dir = Path(tmp) / "stage"
        result = stage_loop.build_gui_review_stage_summary(
            stage_manifest=manifest,
            stage_dir=stage_dir,
            review={
                "run_id": run_id,
                "summary": review_summary,
                "repair_targets": [{"issue_code": "business_direct_answer_missing"}],
            },
            review_dir=stage_dir / "gui_run_reviews" / run_id,
        )

    self.assertEqual(result["next_action"], "continue_repair_from_gui_review_p0")
    self.assertFalse(result["accepted_gate"])
    self.assertEqual(result["latest_gui_review"]["repair_targets_count"], 1)
|
||||||
|
|
||||||
|
def test_handle_ingest_gui_run_materializes_stage_review(self) -> None:
    """End-to-end ingest: the handler writes the summary, the handoff and the stored review."""
    run_id = "assistant-stage1-gui"
    stage_manifest = {
        "stage_id": "agent_loop",
        "module_name": "Agent Loop",
        "title": "Agent Loop",
        "pack_manifest": "docs/orchestration/demo_pack.json",
    }
    # One user question followed by a long partial-coverage assistant answer.
    assistant_turn = {
        "role": "assistant",
        "text": "Коротко: Ограниченный бизнес-обзор по подтвержденным строкам 1С. "
        + ("лишний текст " * 220),
        "reply_type": "partial_coverage",
        "message_id": "a-1",
        "trace_id": "trace-1",
        "debug": {},
    }
    conversation = [
        {"role": "user", "text": "какой у нас самый доходный год"},
        assistant_turn,
    ]
    report_text = (
        "# Assistant Stage 1 Eval Run\n\n"
        f"- run_id: {run_id}\n"
        "- suite_id: assistant_saved_session_runtime_job-test\n"
    )

    with tempfile.TemporaryDirectory() as tmp:
        workspace = Path(tmp)
        manifest_file = workspace / "stage.json"
        sessions_root = workspace / "sessions"
        reports_root = workspace / "reports"
        runs_root = workspace / "stage_runs"

        write_json(manifest_file, stage_manifest)
        write_json(sessions_root / f"{run_id}-SAVED-001.json", session_payload(conversation))
        reports_root.mkdir(parents=True, exist_ok=True)
        (reports_root / f"{run_id}.md").write_text(report_text, encoding="utf-8")

        namespace = stage_args(
            manifest=str(manifest_file),
            output_root=str(runs_root),
            run_id=run_id,
            session_file=None,
            sessions_dir=str(sessions_root),
            reports_dir=str(reports_root),
            review_output_dir=None,
        )
        exit_code = stage_loop.handle_ingest_gui_run(namespace)

        # Capture everything needed before the temporary directory is removed.
        stage_dir = runs_root / "agent_loop"
        summary_text = (stage_dir / "stage_loop_summary.json").read_text(encoding="utf-8")
        summary = json.loads(summary_text)
        handoff_exists = (stage_dir / "stage_loop_handoff.md").exists()
        review_exists = (stage_dir / "gui_run_reviews" / run_id / "run_review.json").exists()

    self.assertEqual(exit_code, 0)
    self.assertEqual(summary["next_action"], "continue_repair_from_gui_review_p0")
    self.assertTrue(handoff_exists)
    self.assertTrue(review_exists)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue