diff --git a/docs/orchestration/schemas/auto_coder_gate.schema.json b/docs/orchestration/schemas/auto_coder_gate.schema.json index 9428fe5..dab2f86 100644 --- a/docs/orchestration/schemas/auto_coder_gate.schema.json +++ b/docs/orchestration/schemas/auto_coder_gate.schema.json @@ -81,6 +81,18 @@ ] } }, + "detector_results_summary": { + "type": "object", + "additionalProperties": true, + "properties": { + "status": { + "type": "string" + }, + "signal_ok_for_auto_coder": { + "type": "boolean" + } + } + }, "blocking_reasons": { "type": "array", "items": { diff --git a/docs/orchestration/schemas/business_audit_contract.schema.json b/docs/orchestration/schemas/business_audit_contract.schema.json index 132c888..6b8c16d 100644 --- a/docs/orchestration/schemas/business_audit_contract.schema.json +++ b/docs/orchestration/schemas/business_audit_contract.schema.json @@ -103,6 +103,18 @@ "type": "object", "additionalProperties": true }, + "detector_results_summary": { + "type": "object", + "additionalProperties": true, + "properties": { + "status": { + "type": "string" + }, + "signal_ok_for_auto_coder": { + "type": "boolean" + } + } + }, "rerun_matrix": { "type": "array", "items": { diff --git a/docs/orchestration/schemas/domain_loop_lead_coder_handoff.schema.json b/docs/orchestration/schemas/domain_loop_lead_coder_handoff.schema.json index 19b0fa3..d6280b8 100644 --- a/docs/orchestration/schemas/domain_loop_lead_coder_handoff.schema.json +++ b/docs/orchestration/schemas/domain_loop_lead_coder_handoff.schema.json @@ -94,6 +94,18 @@ "type": "string" } }, + "detector_results_summary": { + "type": "object", + "additionalProperties": true, + "properties": { + "status": { + "type": "string" + }, + "signal_ok_for_auto_coder": { + "type": "boolean" + } + } + }, "human_meaning": { "type": "object", "additionalProperties": true diff --git a/scripts/domain_case_loop.py b/scripts/domain_case_loop.py index 1bfe26e..387a380 100644 --- a/scripts/domain_case_loop.py +++ b/scripts/domain_case_loop.py @@ -4581,6 +4581,7 @@ def select_primary_repair_focus(repair_targets: dict[str, Any]) -> dict[str, Any def evaluate_auto_coder_gate( repair_targets: dict[str, Any], assigned_focus: dict[str, Any] | None, + detector_results: dict[str, Any] | None = None, ) -> dict[str, Any]: catalog = load_issue_catalog() issue_codes = normalize_string_list((assigned_focus or {}).get("issue_codes")) @@ -4593,6 +4594,7 @@ def evaluate_auto_coder_gate( catalog_allowed_patch_targets: list[str] = [] catalog_forbidden_patch_targets: list[str] = [] issue_catalog_contracts: dict[str, Any] = {} + detector_results_summary = summarize_detector_results(detector_results) if not assigned_focus: blocking_reasons.append("missing_assigned_focus") @@ -4703,6 +4705,8 @@ def evaluate_auto_coder_gate( blocking_reasons.append(f"target_missing_rerun_matrix:{target_id}") elif "accepted_smoke_pack" not in normalize_string_list(target.get("rerun_matrix")): blocking_reasons.append(f"target_missing_accepted_smoke_pack:{target_id}") + if detector_results is not None and not detector_results_summary["signal_ok_for_auto_coder"]: + blocking_reasons.append(f"detector_results_no_repair_signal:{detector_results_summary['status']}") allowed = not blocking_reasons return { @@ -4717,6 +4721,7 @@ def evaluate_auto_coder_gate( "rerun_matrix": rerun_matrix, "allowlisted_issue_codes": sorted(AUTO_CODER_ALLOWED_ISSUE_CODES), "issue_catalog_contracts": issue_catalog_contracts, + "detector_results_summary": detector_results_summary, "blocking_reasons": blocking_reasons, "reason": "auto_coder_gate_passed" if allowed else ";".join(blocking_reasons), "policy": { @@ -4726,6 +4731,7 @@ def evaluate_auto_coder_gate( "requires_target_evidence_paths": True, "requires_accepted_smoke_pack": True, "requires_catalog_limited_patch_scope": True, + "requires_detector_results_signal_when_available": True, "lead_owns_merge_and_acceptance": True, }, } @@ -5213,6 +5219,48 @@ def build_detector_candidates(repair_targets: dict[str, Any], catalog: dict[str, } +def summarize_detector_results(detector_results: dict[str, Any] | None, *, limit: int = 8) -> dict[str, Any]: + if not isinstance(detector_results, dict): + return { + "status": "not_run", + "detector_count": 0, + "pass": 0, + "fail": 0, + "review": 0, + "skipped": 0, + "failed_detectors": [], + "review_detectors": [], + "skipped_detectors": [], + "signal_ok_for_auto_coder": False, + } + summary = detector_results.get("summary") if isinstance(detector_results.get("summary"), dict) else {} + results = detector_results.get("results") if isinstance(detector_results.get("results"), list) else [] + + def _detectors_with_status(status: str) -> list[str]: + names: list[str] = [] + for item in results: + if not isinstance(item, dict) or str(item.get("status") or "") != status: + continue + detector_name = str(item.get("detector") or "").strip() + if detector_name and detector_name not in names: + names.append(detector_name) + return names[:limit] + + status = str(summary.get("status") or "skipped") + return { + "status": status, + "detector_count": int(summary.get("detector_count") or len(results)), + "pass": int(summary.get("pass") or 0), + "fail": int(summary.get("fail") or 0), + "review": int(summary.get("review") or 0), + "skipped": int(summary.get("skipped") or 0), + "failed_detectors": _detectors_with_status("fail"), + "review_detectors": _detectors_with_status("review"), + "skipped_detectors": _detectors_with_status("skipped"), + "signal_ok_for_auto_coder": status in {"fail", "review"}, + } + + def build_blocking_issue_contract(target: dict[str, Any], catalog: dict[str, Any]) -> dict[str, Any]: issue_code = str(target.get("issue_code") or target.get("problem_type") or "other").strip() entry = issue_catalog_entry(issue_code, catalog) @@ -5257,6 +5305,7 @@ def build_business_audit_contract( rerun_matrix_path: Path | None = None, detector_candidates_path: Path | None = None, detector_results_path: Path | None = None, + detector_results: dict[str, Any] | None = None, ) -> dict[str, Any]: catalog = load_issue_catalog() targets = repair_targets.get("targets") if isinstance(repair_targets.get("targets"), list) else [] @@ -5298,6 +5347,7 @@ def build_business_audit_contract( "severity_counts": repair_targets.get("severity_counts") or {}, "priority_foci": _limited_dict_items(repair_targets.get("priority_foci"), limit=8), }, + "detector_results_summary": summarize_detector_results(detector_results), "rerun_matrix": rerun_matrix, "artifact_refs": { "business_audit_md": repo_relative(business_audit_markdown_path), @@ -5410,6 +5460,7 @@ def build_lead_coder_handoff( rerun_matrix_path: Path | None = None, detector_candidates_path: Path | None = None, detector_results_path: Path | None = None, + detector_results: dict[str, Any] | None = None, analyst_verdict: dict[str, Any], repair_targets: dict[str, Any], target_score: int, @@ -5485,6 +5536,7 @@ def build_lead_coder_handoff( "artifact_refs": artifact_refs, "issue_codes": issue_codes, "rerun_matrix": rerun_matrix, + "detector_results_summary": summarize_detector_results(detector_results), "human_meaning": { "user_intent_summary": analyst_verdict.get("user_intent_summary"), "expected_direct_answer": analyst_verdict.get("expected_direct_answer"), @@ -5501,7 +5553,7 @@ def build_lead_coder_handoff( "candidate_files": candidate_files, "lead_instructions": [ "Read business_audit.md first and judge the user-facing answer before debug metadata.", - "Use business_audit.json, issue_catalog_snapshot.json, rerun_matrix.json, and detector_candidates.json as the repair contract.", + "Use business_audit.json, issue_catalog_snapshot.json, rerun_matrix.json, detector_candidates.json, and detector_results.json as the repair contract.", "Inspect analyst_verdict.json and repair_targets.json only after the semantic defect is clear.", "Patch only inside allowed_patch_targets for the issue_code unless Lead Codex explicitly expands scope.", "Do not touch forbidden_patch_targets and do not repair by masking detector symptoms.", @@ -5516,6 +5568,9 @@ def build_lead_coder_handoff_markdown(handoff: dict[str, Any]) -> str: artifact_refs = handoff.get("artifact_refs") if isinstance(handoff.get("artifact_refs"), dict) else {} human_meaning = handoff.get("human_meaning") if isinstance(handoff.get("human_meaning"), dict) else {} auto_coder_gate = handoff.get("auto_coder_gate") if isinstance(handoff.get("auto_coder_gate"), dict) else {} + detector_summary = ( + handoff.get("detector_results_summary") if isinstance(handoff.get("detector_results_summary"), dict) else {} + ) lines = [ "# Lead Codex repair handoff", "", @@ -5544,6 +5599,18 @@ def build_lead_coder_handoff_markdown(handoff: dict[str, Any]) -> str: f"- rerun_matrix: `{', '.join(normalize_string_list(handoff.get('rerun_matrix'))) or 'n/a'}`", "", ] + if detector_summary: + lines.extend( + [ + "## Detector Results", + f"- status: `{detector_summary.get('status') or 'n/a'}`", + f"- counts: `pass={detector_summary.get('pass') or 0}, fail={detector_summary.get('fail') or 0}, review={detector_summary.get('review') or 0}, skipped={detector_summary.get('skipped') or 0}`", + f"- failed_detectors: `{', '.join(normalize_string_list(detector_summary.get('failed_detectors'))) or 'none'}`", + f"- review_detectors: `{', '.join(normalize_string_list(detector_summary.get('review_detectors'))) or 'none'}`", + f"- skipped_detectors: `{', '.join(normalize_string_list(detector_summary.get('skipped_detectors'))) or 'none'}`", + "", + ] + ) if auto_coder_gate: lines.extend( [ @@ -5799,6 +5866,7 @@ def build_loop_summary(loop_state: dict[str, Any]) -> str: "## Iterations", ] for item in loop_state.get("iterations", []): + detector_summary = item.get("detector_results_summary") if isinstance(item.get("detector_results_summary"), dict) else {} lines.extend( [ f"- `{item['iteration_id']}`", @@ -5822,6 +5890,8 @@ def build_loop_summary(loop_state: dict[str, Any]) -> str: f" detector_candidates: `{item.get('detector_candidates_path') or 'n/a'}`", f" detector_results: `{item.get('detector_results_path') or 'n/a'}`", f" detector_results_status: `{item.get('detector_results_status') or 'n/a'}`", + f" detector_failed: `{', '.join(normalize_string_list(detector_summary.get('failed_detectors'))) or 'none'}`", + f" detector_review: `{', '.join(normalize_string_list(detector_summary.get('review_detectors'))) or 'none'}`", f" auto_coder_gate: `{item.get('auto_coder_gate_path') or 'n/a'}`", f" lead_coder_handoff: `{item.get('lead_coder_handoff_path') or 'n/a'}`", f" repair_target_count: `{item.get('repair_target_count')}`", @@ -5987,6 +6057,14 @@ def handle_run_pack_loop(args: argparse.Namespace) -> int: "items": collect_rerun_matrix(repair_targets), } detector_candidates = build_detector_candidates(repair_targets) + write_json(detector_candidates_path, detector_candidates) + detector_results = agent_detector_runner.build_detector_results( + pack_dir, + detector_candidates_path=detector_candidates_path, + include_default_global=False, + ) + write_json(detector_results_path, detector_results) + detector_results_summary = summarize_detector_results(detector_results) business_audit_contract = build_business_audit_contract( analyst_verdict=analyst_verdict, repair_targets=repair_targets, @@ -6004,17 +6082,11 @@ def handle_run_pack_loop(args: argparse.Namespace) -> int: rerun_matrix_path=rerun_matrix_path, detector_candidates_path=detector_candidates_path, detector_results_path=detector_results_path, + detector_results=detector_results, ) write_json(business_audit_json_path, business_audit_contract) write_json(issue_catalog_snapshot_path, issue_catalog_snapshot) write_json(rerun_matrix_path, rerun_matrix_contract) - write_json(detector_candidates_path, detector_candidates) - detector_results = agent_detector_runner.build_detector_results( - pack_dir, - detector_candidates_path=detector_candidates_path, - include_default_global=False, - ) - write_json(detector_results_path, detector_results) repair_target_count = int(repair_targets.get("target_count") or 0) if isinstance(repair_targets, dict) else 0 repair_target_severity_counts = ( repair_targets.get("severity_counts") @@ -6051,7 +6123,8 @@ def handle_run_pack_loop(args: argparse.Namespace) -> int: "rerun_matrix_path": str(rerun_matrix_path), "detector_candidates_path": str(detector_candidates_path), "detector_results_path": str(detector_results_path), - "detector_results_status": detector_results.get("summary", {}).get("status"), + "detector_results_status": detector_results_summary.get("status"), + "detector_results_summary": detector_results_summary, "repair_target_count": repair_target_count, "repair_target_severity_counts": repair_target_severity_counts, "coder_status": None, @@ -6098,6 +6171,7 @@ def handle_run_pack_loop(args: argparse.Namespace) -> int: rerun_matrix_path=rerun_matrix_path, detector_candidates_path=detector_candidates_path, detector_results_path=detector_results_path, + detector_results=detector_results, analyst_verdict=analyst_verdict, repair_targets=repair_targets, target_score=target_score, @@ -6137,7 +6211,7 @@ def handle_run_pack_loop(args: argparse.Namespace) -> int: assigned_focus = select_primary_repair_focus(repair_targets) auto_coder_gate_path = iteration_dir / "auto_coder_gate.json" - auto_coder_gate = evaluate_auto_coder_gate(repair_targets, assigned_focus) + auto_coder_gate = evaluate_auto_coder_gate(repair_targets, assigned_focus, detector_results=detector_results) write_json(auto_coder_gate_path, auto_coder_gate) iteration_record["auto_coder_gate_path"] = str(auto_coder_gate_path) if not bool(auto_coder_gate.get("allowed")): @@ -6153,6 +6227,7 @@ def handle_run_pack_loop(args: argparse.Namespace) -> int: rerun_matrix_path=rerun_matrix_path, detector_candidates_path=detector_candidates_path, detector_results_path=detector_results_path, + detector_results=detector_results, analyst_verdict=analyst_verdict, repair_targets=repair_targets, target_score=target_score, diff --git a/scripts/test_domain_case_loop_lead_handoff.py b/scripts/test_domain_case_loop_lead_handoff.py index 0bfdb63..0eb1eff 100644 --- a/scripts/test_domain_case_loop_lead_handoff.py +++ b/scripts/test_domain_case_loop_lead_handoff.py @@ -69,6 +69,11 @@ class DomainCaseLoopLeadHandoffTests(unittest.TestCase): issue_catalog_snapshot_path=iteration_dir / "issue_catalog_snapshot.json", rerun_matrix_path=iteration_dir / "rerun_matrix.json", detector_candidates_path=iteration_dir / "detector_candidates.json", + detector_results_path=iteration_dir / "detector_results.json", + detector_results={ + "summary": {"status": "fail", "detector_count": 1, "pass": 0, "fail": 1, "review": 0, "skipped": 0}, + "results": [{"detector": "first_line_not_direct_answer", "status": "fail"}], + }, analyst_verdict=analyst_verdict, repair_targets=repair_targets, target_score=88, @@ -96,6 +101,9 @@ class DomainCaseLoopLeadHandoffTests(unittest.TestCase): self.assertIn("business_audit", saved["artifact_refs"]) self.assertIn("business_audit_json", saved["artifact_refs"]) self.assertIn("issue_catalog_snapshot", saved["artifact_refs"]) + self.assertIn("detector_results", saved["artifact_refs"]) + self.assertEqual(saved["detector_results_summary"]["status"], "fail") + self.assertEqual(saved["detector_results_summary"]["failed_detectors"], ["first_line_not_direct_answer"]) self.assertIn("business_direct_answer_missing", saved["issue_codes"]) self.assertIn("failed_scenario", saved["rerun_matrix"]) self.assertTrue(latest_handoff_exists) @@ -140,6 +148,11 @@ class DomainCaseLoopLeadHandoffTests(unittest.TestCase): issue_catalog_snapshot_path=Path("issue_catalog_snapshot.json"), rerun_matrix_path=Path("rerun_matrix.json"), detector_candidates_path=Path("detector_candidates.json"), + detector_results_path=Path("detector_results.json"), + detector_results={ + "summary": {"status": "review", "detector_count": 1, "pass": 0, "fail": 0, "review": 1, "skipped": 0}, + "results": [{"detector": "missing_revenue_cogs_margin_fields", "status": "review"}], + }, ) self.assertEqual(contract["overall_status"], "partial") @@ -147,6 +160,9 @@ class DomainCaseLoopLeadHandoffTests(unittest.TestCase): self.assertEqual(contract["blocking_issues"][0]["expected_business_answer_contract"], "margin_profitability_v1") self.assertIn("failed_margin_scenario", contract["rerun_matrix"]) self.assertIn("detector_candidates_json", contract["artifact_refs"]) + self.assertIn("detector_results_json", contract["artifact_refs"]) + self.assertEqual(contract["detector_results_summary"]["status"], "review") + self.assertEqual(contract["detector_results_summary"]["review_detectors"], ["missing_revenue_cogs_margin_fields"]) def test_auto_coder_gate_blocks_non_allowlisted_issue_codes(self) -> None: repair_targets = { @@ -209,6 +225,81 @@ class DomainCaseLoopLeadHandoffTests(unittest.TestCase): "direct_answer_surface_v1", ) + def test_auto_coder_gate_blocks_when_detector_results_have_no_repair_signal(self) -> None: + repair_targets = { + "targets": [ + { + "target_id": "pack:s01", + "issue_code": "business_direct_answer_missing", + "root_cause_layers": ["answer_surface"], + "expected_business_answer_contract": "direct_answer_surface_v1", + "evidence_paths": ["artifacts/domain_runs/pack/steps/s01/output.md"], + "allowed_patch_targets": ["llm_normalizer/backend/src/services/address_runtime/composeStage.ts"], + "forbidden_patch_targets": ["routing rewrites", "fake evidence", "global runtime rewrite"], + "rerun_matrix": ["failed_scenario", "direct_answer_surface_pack", "accepted_smoke_pack"], + } + ], + } + assigned_focus = { + "focus_id": "answer_shape|composeStage", + "issue_codes": ["business_direct_answer_missing"], + "root_cause_layers": ["answer_surface"], + "allowed_patch_targets": ["llm_normalizer/backend/src/services/address_runtime/composeStage.ts"], + "forbidden_patch_targets": ["routing rewrites", "fake evidence", "global runtime rewrite"], + "rerun_matrix": ["failed_scenario", "direct_answer_surface_pack", "accepted_smoke_pack"], + "target_ids": ["pack:s01"], + } + + gate = dcl.evaluate_auto_coder_gate( + repair_targets, + assigned_focus, + detector_results={ + "summary": {"status": "pass", "detector_count": 1, "pass": 1, "fail": 0, "review": 0, "skipped": 0}, + "results": [{"detector": "first_line_not_direct_answer", "status": "pass"}], + }, + ) + + self.assertFalse(gate["allowed"]) + self.assertEqual(gate["detector_results_summary"]["status"], "pass") + self.assertIn("detector_results_no_repair_signal:pass", gate["blocking_reasons"]) + + def test_auto_coder_gate_allows_when_detector_results_confirm_failure(self) -> None: + repair_targets = { + "targets": [ + { + "target_id": "pack:s01", + "issue_code": "business_direct_answer_missing", + "root_cause_layers": ["answer_surface"], + "expected_business_answer_contract": "direct_answer_surface_v1", + "evidence_paths": ["artifacts/domain_runs/pack/steps/s01/output.md"], + "allowed_patch_targets": ["llm_normalizer/backend/src/services/address_runtime/composeStage.ts"], + "forbidden_patch_targets": ["routing rewrites", "fake evidence", "global runtime rewrite"], + "rerun_matrix": ["failed_scenario", "direct_answer_surface_pack", "accepted_smoke_pack"], + } + ], + } + assigned_focus = { + "focus_id": "answer_shape|composeStage", + "issue_codes": ["business_direct_answer_missing"], + "root_cause_layers": ["answer_surface"], + "allowed_patch_targets": ["llm_normalizer/backend/src/services/address_runtime/composeStage.ts"], + "forbidden_patch_targets": ["routing rewrites", "fake evidence", "global runtime rewrite"], + "rerun_matrix": ["failed_scenario", "direct_answer_surface_pack", "accepted_smoke_pack"], + "target_ids": ["pack:s01"], + } + + gate = dcl.evaluate_auto_coder_gate( + repair_targets, + assigned_focus, + detector_results={ + "summary": {"status": "fail", "detector_count": 1, "pass": 0, "fail": 1, "review": 0, "skipped": 0}, + "results": [{"detector": "first_line_not_direct_answer", "status": "fail"}], + }, + ) + + self.assertTrue(gate["allowed"]) + self.assertEqual(gate["detector_results_summary"]["failed_detectors"], ["first_line_not_direct_answer"]) + def test_auto_coder_gate_blocks_broad_or_blind_patch_scope(self) -> None: repair_targets = { "targets": [ @@ -302,6 +393,7 @@ class DomainCaseLoopLeadHandoffTests(unittest.TestCase): "business_audit": "artifacts/domain_runs/demo/business_audit.md", "analyst_verdict": "artifacts/domain_runs/demo/analyst_verdict.json", "repair_targets": "artifacts/domain_runs/demo/repair_targets.json", + "detector_results": "artifacts/domain_runs/demo/detector_results.json", "auto_coder_gate": "artifacts/domain_runs/demo/auto_coder_gate.json", "pack_dir": "artifacts/domain_runs/demo/pack", }, @@ -311,6 +403,16 @@ class DomainCaseLoopLeadHandoffTests(unittest.TestCase): "top_repair_targets": [], "candidate_files": [], "lead_instructions": [], + "detector_results_summary": { + "status": "fail", + "pass": 0, + "fail": 1, + "review": 0, + "skipped": 0, + "failed_detectors": ["first_line_not_direct_answer"], + "review_detectors": [], + "skipped_detectors": [], + }, "auto_coder_gate": { "allowed": False, "reason": "target_missing_evidence_paths:pack:s01", @@ -331,6 +433,8 @@ class DomainCaseLoopLeadHandoffTests(unittest.TestCase): markdown = dcl.build_lead_coder_handoff_markdown(handoff) self.assertIn("## Auto-Coder Gate", markdown) + self.assertIn("## Detector Results", markdown) + self.assertIn("first_line_not_direct_answer", markdown) self.assertIn("target_missing_evidence_paths:pack:s01", markdown) self.assertIn("## Auto-Coder Catalog Contracts", markdown) self.assertIn("direct_answer_surface_v1", markdown)