diff --git a/docs/orchestration/domain_scenario_loop_repo_adapter.md b/docs/orchestration/domain_scenario_loop_repo_adapter.md index 8ac4f91..60778d9 100644 --- a/docs/orchestration/domain_scenario_loop_repo_adapter.md +++ b/docs/orchestration/domain_scenario_loop_repo_adapter.md @@ -92,6 +92,7 @@ Canonical commands: ```powershell python scripts/stage_agent_loop.py plan --manifest docs/orchestration/<stage_manifest>.json python scripts/stage_agent_loop.py run --manifest docs/orchestration/<stage_manifest>.json +python scripts/stage_agent_loop.py ingest-gui-run --manifest docs/orchestration/<stage_manifest>.json --run-id assistant-stage1-<run_suffix> python scripts/stage_agent_loop.py summarize --manifest docs/orchestration/<stage_manifest>.json ``` @@ -125,6 +126,17 @@ This bridge is intentionally business-first: Use this bridge when the operator would otherwise say “чекни прогон `assistant-stage1-...`”. The expected next step is no longer manual eyeballing first; it is: review by id, inspect `run_review.md`, map `repair_targets.json` into the current stage loop, patch, and rerun. +For stage work, prefer the integrated command: + +```powershell +python scripts/stage_agent_loop.py ingest-gui-run --manifest docs/orchestration/<stage_manifest>.json --run-id assistant-stage1-<run_suffix> +``` + +It stores the GUI review under `artifacts/domain_runs/stage_agent_loops/<stage_id>/gui_run_reviews/<run_id>/`, updates `stage_loop_summary.json`, and writes the next stage action: +- `continue_repair_from_gui_review_p0` when the GUI run exposes business-wrong or missing direct-answer defects; +- `continue_repair_from_gui_review_p1` when the run is semantically usable but still noisy, over-broad, or poorly layered; +- `manual_gui_confirmation_or_stage_close` when the GUI run is clean enough for final human confirmation. 
def stage_gui_review_dir(stage_dir: Path, run_id: str) -> Path:
    """Return the directory that holds the GUI review artifacts of one run.

    The directory sits under ``<stage_dir>/gui_run_reviews/`` and is named
    after the slugified ``run_id``.
    """
    review_folder_name = slugify(run_id)
    return stage_dir.joinpath("gui_run_reviews", review_folder_name)
def _gui_review_next_action(status: str, p0_count: int, p1_count: int) -> str:
    """Pick the next stage action for a GUI review outcome.

    Severity wins over the textual status: any P0 finding routes to P0 repair,
    then any P1 finding or a ``warning`` status routes to P1 repair.  Only a
    ``pass`` without positive finding counts reaches manual confirmation;
    every other status is sent to manual inspection.
    """
    if p0_count > 0:
        return "continue_repair_from_gui_review_p0"
    if p1_count > 0 or status == "warning":
        return "continue_repair_from_gui_review_p1"
    if status == "pass":
        return "manual_gui_confirmation_or_stage_close"
    return "inspect_gui_review_status"


def build_gui_review_stage_summary(
    *,
    stage_manifest: dict[str, Any],
    stage_dir: Path,
    review: dict[str, Any],
    review_dir: Path,
    previous_summary: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Fold one GUI run review into the stage loop summary.

    Args:
        stage_manifest: Loaded stage manifest.  ``stage_id`` is required; the
            other keys read here are optional.
        stage_dir: Stage output directory.  Not used by the computation; kept
            so the keyword interface stays stable for callers.
        review: Review payload.  ``summary`` and ``repair_targets`` are
            treated as empty when they are missing or have the wrong type.
        review_dir: Directory of the saved review artifacts; only used to
            build the artifact paths recorded in the summary.
        previous_summary: Existing stage summary to extend.  It is copied
            shallowly and never mutated.

    Returns:
        A new summary dict: the previous summary overlaid with the manifest
        fields, the gate/next-action decision, a ``latest_gui_review`` entry
        and a ``gui_review_history`` list holding one entry per ``run_id``.

    Raises:
        KeyError: If ``stage_manifest`` has no ``stage_id``.
        ValueError, TypeError: If a finding count cannot be converted by
            ``int``.
    """
    review_summary = review.get("summary")
    if not isinstance(review_summary, dict):
        review_summary = {}
    repair_targets = review.get("repair_targets")
    if not isinstance(repair_targets, list):
        repair_targets = []

    run_id = review.get("run_id")
    status = str(review_summary.get("overall_business_status") or "unknown").strip()
    p0_count = int(review_summary.get("p0_findings") or 0)
    p1_count = int(review_summary.get("p1_findings") or 0)
    next_action = _gui_review_next_action(status, p0_count, p1_count)

    # The gate opens only for a finding-free "pass".  Manual confirmation is
    # requested under exactly the same condition: asking a human to confirm a
    # run that is simultaneously routed back to repair would be contradictory.
    accepted = status == "pass" and p0_count == 0 and p1_count == 0

    latest_review = {
        "run_id": run_id,
        "review_dir": repo_relative(review_dir),
        "review_json": repo_relative(review_dir / "run_review.json"),
        "review_markdown": repo_relative(review_dir / "run_review.md"),
        "conversation_pairs_json": repo_relative(review_dir / "conversation_pairs.json"),
        "question_quality_json": repo_relative(review_dir / "question_quality_review.json"),
        "repair_targets_json": repo_relative(review_dir / "repair_targets.json"),
        "overall_business_status": status,
        "turn_pairs_total": review_summary.get("turn_pairs_total"),
        "business_issue_turns": review_summary.get("business_issue_turns"),
        "p0_findings": p0_count,
        "p1_findings": p1_count,
        "question_quality_status": review_summary.get("question_quality_status"),
        "question_quality_score": review_summary.get("question_quality_score"),
        "repair_targets_count": len(repair_targets),
    }

    merged = dict(previous_summary or {})
    merged.update(
        {
            "schema_version": STAGE_SUMMARY_SCHEMA_VERSION,
            "stage_id": stage_manifest["stage_id"],
            "module_name": stage_manifest.get("module_name"),
            "title": stage_manifest.get("title"),
            "global_plan_refs": stage_manifest.get("global_plan_refs") or [],
            "target_score": stage_manifest.get("target_score"),
            "acceptance_invariants": stage_manifest.get("acceptance_invariants") or [],
            # A status recorded earlier (e.g. by a domain loop run) is kept.
            "loop_final_status": merged.get("loop_final_status") or "gui_review_ingested",
            "accepted_gate": accepted,
            "manual_confirmation_required": accepted,
            "next_action": next_action,
            "updated_at": now_iso(),
            "latest_gui_review": latest_review,
        }
    )

    earlier_entries = merged.get("gui_review_history")
    if not isinstance(earlier_entries, list):
        earlier_entries = []
    # Re-ingesting a run replaces its earlier history entry instead of duplicating it.
    history = [
        entry
        for entry in earlier_entries
        if not (isinstance(entry, dict) and entry.get("run_id") == run_id)
    ]
    # Store a copy so the history entry and ``latest_gui_review`` never alias.
    history.append(dict(latest_review))
    merged["gui_review_history"] = history
    return merged


def handle_ingest_gui_run(args: argparse.Namespace) -> int:
    """Review an existing GUI run and attach the result to the stage summary.

    Reads ``manifest``, ``output_root``, ``run_id``, ``session_file``,
    ``sessions_dir``, ``reports_dir`` and ``review_output_dir`` from ``args``.
    The review is saved to ``review_output_dir`` when given, otherwise to the
    stage's ``gui_run_reviews`` directory.  The updated summary is persisted
    through ``save_stage_summary`` and printed as JSON.

    Returns:
        Always ``0``; the outcome of the review is reported through the
        summary's ``next_action`` rather than the exit code.
    """
    stage_manifest_path = repo_path(args.manifest)
    stage_manifest = load_stage_manifest(stage_manifest_path)
    stage_dir = stage_dir_for(repo_path(args.output_root), stage_manifest["stage_id"])
    stage_dir.mkdir(parents=True, exist_ok=True)
    write_json(stage_dir / "stage_manifest.json", stage_manifest)
    write_text(stage_dir / "stage_manifest_source.txt", repo_relative(stage_manifest_path) + "\n")

    run_id = gui_review.run_id_from_value(args.run_id)
    if args.review_output_dir:
        review_dir = repo_path(args.review_output_dir)
    else:
        review_dir = stage_gui_review_dir(stage_dir, run_id)
    session_files = gui_review.resolve_session_files(
        run_id=run_id,
        sessions_dir=repo_path(args.sessions_dir),
        explicit_session_file=repo_path(args.session_file) if args.session_file else None,
    )
    # NOTE(review): the report path is passed on without an existence check;
    # presumably build_run_review tolerates a missing report - confirm.
    report_path = repo_path(args.reports_dir) / f"{run_id}.md"
    review = gui_review.build_run_review(run_id=run_id, session_files=session_files, report_path=report_path)
    gui_review.save_run_review(review, review_dir)

    summary_path = stage_dir / "stage_loop_summary.json"
    previous_summary = load_json_object(summary_path, "Existing stage summary") if summary_path.exists() else None
    summary = build_gui_review_stage_summary(
        stage_manifest=stage_manifest,
        stage_dir=stage_dir,
        review=review,
        review_dir=review_dir,
        previous_summary=previous_summary,
    )
    save_stage_summary(stage_dir, summary)
    print(json.dumps(summary, ensure_ascii=False, indent=2))
    return 0
def stage_args(**overrides: object) -> argparse.Namespace:
    """Return the default namespace from ``args()`` with ``overrides`` applied."""
    merged = vars(args())
    merged.update(overrides)
    return argparse.Namespace(**merged)


def session_payload(
    conversation: list[dict[str, object]],
    *,
    session_id: str = "assistant-stage1-gui-SAVED-001",
) -> dict[str, object]:
    """Build a minimal ``assistant_session_v1`` document for GUI review tests.

    Args:
        conversation: Conversation turns to embed.  The list is copied so that
            later edits by the caller do not leak into the payload.
        session_id: Session identifier to record.  The default keeps the value
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        A session dict with fixed timestamps and empty state sections.
    """
    return {
        "schema_version": "assistant_session_v1",
        "session_id": session_id,
        "started_at": "2026-05-09T00:00:00Z",
        "updated_at": "2026-05-09T00:01:00Z",
        "conversation": list(conversation),
        "address_navigation_state": {"session_context": {}},
        "investigation_state": {},
        "counters": {},
        "reply_types": {},
    }


class StageAgentLoopTests(unittest.TestCase):
    # Only the tests that are visible in full are reproduced here.

    def test_gui_review_stage_summary_routes_p0_to_repair(self) -> None:
        """A review with a P0 finding must route the stage back to P0 repair."""
        with tempfile.TemporaryDirectory() as tmp:
            stage_dir = Path(tmp) / "stage"
            review_dir = stage_dir / "gui_run_reviews" / "assistant-stage1-test"
            review = {
                "run_id": "assistant-stage1-test",
                "summary": {
                    "overall_business_status": "fail",
                    "turn_pairs_total": 3,
                    "business_issue_turns": 1,
                    "p0_findings": 1,
                    "p1_findings": 0,
                    "question_quality_status": "strong",
                    "question_quality_score": 100,
                },
                "repair_targets": [{"issue_code": "business_direct_answer_missing"}],
            }

            summary = stage_loop.build_gui_review_stage_summary(
                stage_manifest={
                    "stage_id": "agent_loop",
                    "module_name": "Agent Loop",
                    "title": "Agent Loop",
                    "target_score": 88,
                    "acceptance_invariants": [],
                    "global_plan_refs": [],
                },
                stage_dir=stage_dir,
                review=review,
                review_dir=review_dir,
            )

            self.assertEqual(summary["next_action"], "continue_repair_from_gui_review_p0")
            self.assertFalse(summary["accepted_gate"])
            self.assertFalse(summary["manual_confirmation_required"])
            self.assertEqual(summary["latest_gui_review"]["repair_targets_count"], 1)
            self.assertEqual(len(summary["gui_review_history"]), 1)

    def test_handle_ingest_gui_run_materializes_stage_review(self) -> None:
        """Ingesting a saved GUI session must write the review, summary and handoff."""
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            manifest_path = root / "stage.json"
            sessions_dir = root / "sessions"
            reports_dir = root / "reports"
            output_root = root / "stage_runs"
            run_id = "assistant-stage1-gui"
            # One source of truth for the saved-session name: the file name and
            # the session id inside the payload are both derived from it.
            session_id = f"{run_id}-SAVED-001"
            write_json(
                manifest_path,
                {
                    "stage_id": "agent_loop",
                    "module_name": "Agent Loop",
                    "title": "Agent Loop",
                    "pack_manifest": "docs/orchestration/demo_pack.json",
                },
            )
            write_json(
                sessions_dir / f"{session_id}.json",
                session_payload(
                    [
                        {"role": "user", "text": "какой у нас самый доходный год"},
                        {
                            "role": "assistant",
                            "text": "Коротко: Ограниченный бизнес-обзор по подтвержденным строкам 1С. "
                            + ("лишний текст " * 220),
                            "reply_type": "partial_coverage",
                            "message_id": "a-1",
                            "trace_id": "trace-1",
                            "debug": {},
                        },
                    ],
                    session_id=session_id,
                ),
            )
            reports_dir.mkdir(parents=True, exist_ok=True)
            (reports_dir / f"{run_id}.md").write_text(
                "# Assistant Stage 1 Eval Run\n\n"
                f"- run_id: {run_id}\n"
                "- suite_id: assistant_saved_session_runtime_job-test\n",
                encoding="utf-8",
            )

            exit_code = stage_loop.handle_ingest_gui_run(
                stage_args(
                    manifest=str(manifest_path),
                    output_root=str(output_root),
                    run_id=run_id,
                    session_file=None,
                    sessions_dir=str(sessions_dir),
                    reports_dir=str(reports_dir),
                    review_output_dir=None,
                )
            )

            stage_dir = output_root / "agent_loop"
            summary = json.loads((stage_dir / "stage_loop_summary.json").read_text(encoding="utf-8"))
            handoff_exists = (stage_dir / "stage_loop_handoff.md").exists()
            review_exists = (stage_dir / "gui_run_reviews" / run_id / "run_review.json").exists()

            self.assertEqual(exit_code, 0)
            self.assertEqual(summary["next_action"], "continue_repair_from_gui_review_p0")
            self.assertTrue(handoff_exists)
            self.assertTrue(review_exists)


if __name__ == "__main__":
    unittest.main()