NODEDC_1C/scripts/run_address_nightly_regress...

#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
import subprocess
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any


# Directory one level above the folder that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Default parent folder in which each nightly bundle directory is created.
DEFAULT_OUTPUT_ROOT = PROJECT_ROOT / "docs" / "ADDRESS" / "runs"
# Child scripts launched per pack, in this order: runner, validator, comparator.
RUNNER_SCRIPT = PROJECT_ROOT / "scripts" / "run_address_live_slang_stress.py"
VALIDATOR_SCRIPT = PROJECT_ROOT / "scripts" / "validate_address_run_pack.py"
COMPARATOR_SCRIPT = PROJECT_ROOT / "scripts" / "compare_address_run_summary.py"


@dataclass
class NightlyPackConfig:
    """Describe one regression pack that the nightly job executes."""

    # Short label used in run ids, log prefixes and the summary report.
    name: str
    # Question-set file handed to the runner script via --questions-file.
    questions_file: Path
    # Baseline run_summary.json handed to the comparator via --baseline-summary.
    baseline_summary: Path


def parse_args() -> argparse.Namespace:
    """Define the nightly-regression command line and parse the process arguments.

    Returns:
        The parsed namespace. Every option has a default, so the script can be
        started without any arguments.
    """
    cli = argparse.ArgumentParser(
        description="Run ADDRESS nightly regression packs (102 + 25), validate run packs and compare vs baseline."
    )

    # Plain string options. They are registered first and in this order because
    # argparse lists options in --help in registration order.
    string_options = (
        ("--backend-url", "http://127.0.0.1:8787/api/assistant/message"),
        ("--prompt-version", "address_query_runtime_v1"),
        ("--llm-provider", "local"),
        ("--llm-model", "qwen2.5-14b-instruct-1m"),
        ("--llm-base-url", "http://127.0.0.1:1234"),
    )
    for flag, default_value in string_options:
        cli.add_argument(flag, default=default_value)

    # Typed / constrained options.
    cli.add_argument("--temperature", default=0.0, type=float)
    cli.add_argument("--max-output-tokens", default=900, type=int)
    cli.add_argument("--timeout-sec", default=120, type=int)
    cli.add_argument("--strict-policy", choices=["semantic", "route", "factual"], default="route")

    # Options that control the nightly bundle itself.
    cli.add_argument(
        "--output-root",
        help="Root where nightly bundle folder will be created.",
        default=str(DEFAULT_OUTPUT_ROOT),
    )
    cli.add_argument(
        "--nightly-run-id",
        help="Optional nightly bundle id. Default: <date>_Address_Nightly_Regression_<time>.",
        default="",
    )
    cli.add_argument(
        "--python-bin",
        help="Python executable to run child scripts.",
        default=sys.executable,
    )

    # Boolean switches.
    cli.add_argument(
        "--dry-run",
        help="Print planned commands and baseline checks without executing live requests.",
        action="store_true",
    )
    cli.add_argument(
        "--skip-comparator",
        help="Run packs + validator only, skip baseline comparison.",
        action="store_true",
    )

    namespace = cli.parse_args()
    return namespace


def now_stamp() -> str:
    """Return the current local date and time as ``YYYY-MM-DD_HH-MM-SS``."""
    moment = datetime.now()
    # format() on a datetime applies the spec through strftime.
    return format(moment, "%Y-%m-%d_%H-%M-%S")


def run_command(cmd: list[str], *, cwd: Path, dry_run: bool) -> tuple[int, str, str]:
    """Echo a command and, unless this is a dry run, execute it and capture its output.

    Args:
        cmd: Argument vector; ``cmd[0]`` is the executable.
        cwd: Working directory for the child process.
        dry_run: When true, only print the command and report success.

    Returns:
        ``(exit_code, stdout, stderr)``. A dry run yields ``(0, "", "")``.
        If the process cannot be started at all (missing executable, missing
        working directory, permission error) the result is
        ``(127, "", <error message>)`` instead of an exception, so the caller
        can record the failure like any other non-zero exit.
    """
    # Quote tokens that contain spaces so the echoed line is readable.
    rendered = " ".join(f'"{token}"' if " " in token else token for token in cmd)
    print(f"$ {rendered}")
    if dry_run:
        return 0, "", ""
    try:
        completed = subprocess.run(
            cmd,
            cwd=str(cwd),
            check=False,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
        )
    except OSError as exc:
        # A launch failure must not abort the whole nightly job before its
        # summary is written; 127 follows the shell's "command not found" code.
        message = f"failed to launch command: {exc}"
        print(message)
        return 127, "", message
    if completed.stdout:
        print(completed.stdout.strip())
    if completed.stderr:
        print(completed.stderr.strip())
    return completed.returncode, completed.stdout, completed.stderr


def _python_command(python_bin: str) -> str:
    """Return the interpreter command for child scripts without following symlinks."""
    candidate = Path(python_bin)
    if candidate.is_absolute():
        return python_bin
    # Children run with cwd=PROJECT_ROOT, so a path given relative to the
    # caller's directory has to be anchored here. Symlinks are deliberately NOT
    # resolved: on POSIX a virtualenv's bin/python is normally a symlink, and
    # launching its target would run the children outside the virtualenv.
    anchored = Path.cwd() / candidate
    if anchored.exists():
        return str(anchored)
    # Bare command name such as "python3": leave it for PATH lookup.
    return python_bin


def _nightly_packs() -> list[NightlyPackConfig]:
    """Return the fixed list of packs (stress_102, followup_25) with their baselines."""
    address_docs = PROJECT_ROOT / "docs" / "ADDRESS"
    question_sets = address_docs / "question_sets"
    baseline_runs = address_docs / "runs"
    return [
        NightlyPackConfig(
            name="stress_102",
            questions_file=(question_sets / "address_slang_stress_full_2026-04-02.json").resolve(),
            baseline_summary=(
                baseline_runs
                / "2026-04-02_Address_Slang_Live_Stress_2026-04-02_12-57-27"
                / "run_summary.json"
            ).resolve(),
        ),
        NightlyPackConfig(
            name="followup_25",
            questions_file=(question_sets / "address_followup_context_chains_2026-04-02.json").resolve(),
            baseline_summary=(
                baseline_runs
                / "2026-04-02_Address_Followup_Context_Chains_2026-04-02_19-15-Run5"
                / "run_summary.json"
            ).resolve(),
        ),
    ]


def _run_pack(
    pack: NightlyPackConfig,
    args: argparse.Namespace,
    nightly_dir: Path,
    python_bin: str,
) -> dict[str, Any]:
    """Run runner, validator and (optionally) comparator for one pack; return its report row."""
    # One clock read so the date and time parts of the run id cannot disagree.
    started = datetime.now()
    run_id = f"{started.date().isoformat()}_Address_Nightly_{pack.name}_{started:%H-%M-%S}"
    run_dir = nightly_dir / run_id

    row: dict[str, Any] = {
        "pack": pack.name,
        "run_id": run_id,
        "questions_file": str(pack.questions_file),
        "baseline_summary": str(pack.baseline_summary),
        "run_dir": str(run_dir),
        "runner_ok": False,
        "validator_ok": False,
        # None marks "not applicable" when the comparator is skipped.
        "comparator_ok": None if args.skip_comparator else False,
        "errors": [],
    }

    # Stage 1: live run. The runner is given the nightly folder as its output
    # root together with the run id; later stages read from run_dir.
    runner_cmd = [
        python_bin,
        str(RUNNER_SCRIPT),
        "--questions-file",
        str(pack.questions_file),
        "--backend-url",
        args.backend_url,
        "--prompt-version",
        args.prompt_version,
        "--llm-provider",
        args.llm_provider,
        "--llm-model",
        args.llm_model,
        "--llm-base-url",
        args.llm_base_url,
        "--temperature",
        str(args.temperature),
        "--max-output-tokens",
        str(args.max_output_tokens),
        "--timeout-sec",
        str(args.timeout_sec),
        "--strict-policy",
        args.strict_policy,
        "--run-id",
        run_id,
        "--output-root",
        str(nightly_dir),
    ]
    code, _, _ = run_command(runner_cmd, cwd=PROJECT_ROOT, dry_run=args.dry_run)
    if code != 0:
        row["errors"].append(f"runner failed with exit code {code}")
        return row
    row["runner_ok"] = True

    # Stage 2: structural validation of the produced run pack.
    validator_report = nightly_dir / f"{run_id}_validator_report.json"
    validator_cmd = [
        python_bin,
        str(VALIDATOR_SCRIPT),
        str(run_dir),
        "--report-json",
        str(validator_report),
    ]
    code, _, _ = run_command(validator_cmd, cwd=PROJECT_ROOT, dry_run=args.dry_run)
    if code != 0:
        row["errors"].append(f"validator failed with exit code {code}")
        return row
    row["validator_ok"] = True
    row["validator_report"] = str(validator_report)

    if args.skip_comparator:
        return row

    # Stage 3: compare the fresh summary against the pinned baseline.
    comparator_report = nightly_dir / f"{run_id}_comparator_report.json"
    comparator_cmd = [
        python_bin,
        str(COMPARATOR_SCRIPT),
        "--baseline-summary",
        str(pack.baseline_summary),
        "--candidate-summary",
        str(run_dir / "run_summary.json"),
        "--report-json",
        str(comparator_report),
    ]
    code, _, _ = run_command(comparator_cmd, cwd=PROJECT_ROOT, dry_run=args.dry_run)
    if code != 0:
        row["errors"].append(f"comparator failed with exit code {code}")
        return row
    row["comparator_ok"] = True
    row["comparator_report"] = str(comparator_report)
    return row


def _write_reports(nightly_dir: Path, report: dict[str, Any]) -> Path:
    """Write nightly_summary.json and README.md into the bundle; return the JSON path."""
    report_path = nightly_dir / "nightly_summary.json"
    report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")

    md_lines = [
        f"# {report['nightly_run_id']}",
        "",
        f"Generated at: {report['generated_at']}",
        f"Dry run: {report['dry_run']}",
        f"Strict policy: {report['strict_policy']}",
        f"Overall: {'PASS' if report['overall_ok'] else 'FAIL'}",
        "",
        "## Packs",
    ]
    for item in report["packs"]:
        md_lines.extend(
            [
                f"### {item['pack']}",
                f"- run_id: {item['run_id']}",
                f"- runner_ok: {item['runner_ok']}",
                f"- validator_ok: {item['validator_ok']}",
                f"- comparator_ok: {item['comparator_ok']}",
                f"- run_dir: {item['run_dir']}",
            ]
        )
        if item.get("errors"):
            md_lines.append("- errors:")
            md_lines.extend(f"  - {err}" for err in item["errors"])
        md_lines.append("")

    (nightly_dir / "README.md").write_text("\n".join(md_lines) + "\n", encoding="utf-8")
    return report_path


def main() -> None:
    """Run every nightly pack and write the bundle summary.

    Steps: create the nightly bundle folder, check that each pack's question
    file and baseline summary exist, run runner -> validator -> comparator per
    pack (a failed stage skips the remaining stages of that pack only), then
    write ``nightly_summary.json`` and ``README.md`` into the bundle.

    Raises:
        SystemExit: With code 1 when a required input file is missing or when
            any pack did not pass all of its stages.
    """
    args = parse_args()
    python_bin = _python_command(args.python_bin)

    output_root = Path(args.output_root).resolve()
    nightly_run_id = (
        args.nightly_run_id.strip()
        or f"{datetime.now().date().isoformat()}_Address_Nightly_Regression_{now_stamp()}"
    )
    nightly_dir = output_root / nightly_run_id
    nightly_dir.mkdir(parents=True, exist_ok=True)

    packs = _nightly_packs()

    print(f"Nightly bundle: {nightly_dir}")

    # Pre-flight: report every missing input at once, before any live request.
    failures: list[str] = []
    for pack in packs:
        if not pack.questions_file.exists():
            failures.append(f"[{pack.name}] missing questions file: {pack.questions_file}")
        if not pack.baseline_summary.exists():
            failures.append(f"[{pack.name}] missing baseline summary: {pack.baseline_summary}")
    if failures:
        for item in failures:
            print(f"ERROR: {item}")
        raise SystemExit(1)

    # Exactly one report row is produced per pack, pass or fail.
    packs_report: list[dict[str, Any]] = [
        _run_pack(pack, args, nightly_dir, python_bin) for pack in packs
    ]

    overall_ok = all(
        item.get("runner_ok")
        and item.get("validator_ok")
        and (args.skip_comparator or item.get("comparator_ok"))
        for item in packs_report
    )

    report = {
        "nightly_run_id": nightly_run_id,
        "generated_at": datetime.now().isoformat(timespec="seconds"),
        "dry_run": bool(args.dry_run),
        "overall_ok": overall_ok,
        "strict_policy": args.strict_policy,
        "packs": packs_report,
    }
    report_path = _write_reports(nightly_dir, report)

    print(f"\nNightly summary: {report_path}")
    print(f"Overall: {'PASS' if overall_ok else 'FAIL'}")
    if not overall_ok:
        raise SystemExit(1)


if __name__ == "__main__":
    main()