NODEDC_1C/scripts/run_address_nightly_regress...

286 lines
9.9 KiB
Python

#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import subprocess
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any
# Directory one level above the folder that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Default parent folder for nightly bundle directories (see --output-root).
DEFAULT_OUTPUT_ROOT = PROJECT_ROOT.joinpath("docs", "ADDRESS", "runs")
# Child scripts launched for every pack: runner -> validator -> comparator.
RUNNER_SCRIPT = PROJECT_ROOT.joinpath("scripts", "run_address_live_slang_stress.py")
VALIDATOR_SCRIPT = PROJECT_ROOT.joinpath("scripts", "validate_address_run_pack.py")
COMPARATOR_SCRIPT = PROJECT_ROOT.joinpath("scripts", "compare_address_run_summary.py")
@dataclass
class NightlyPackConfig:
    """Static description of one regression pack executed by the nightly run."""

    # Short pack label; embedded in run ids, error prefixes and the report.
    name: str
    # Question-set file handed to the runner script via --questions-file.
    questions_file: Path
    # Reference run summary handed to the comparator via --baseline-summary.
    baseline_summary: Path
def parse_args() -> argparse.Namespace:
    """Parse the nightly-regression command line from ``sys.argv``.

    Returns:
        Namespace carrying the backend/LLM settings forwarded to the runner
        script plus the orchestration switches (output root, bundle id,
        interpreter, dry-run and comparator toggles).
    """
    cli = argparse.ArgumentParser(
        description="Run ADDRESS nightly regression packs (102 + 25), validate run packs and compare vs baseline."
    )
    # (flag, add_argument keyword arguments), registered in this exact order
    # so that the generated --help output keeps its layout.
    option_table: list[tuple[str, dict[str, Any]]] = [
        ("--backend-url", dict(default="http://127.0.0.1:8787/api/assistant/message")),
        ("--prompt-version", dict(default="address_query_runtime_v1")),
        ("--llm-provider", dict(default="local")),
        ("--llm-model", dict(default="qwen2.5-14b-instruct-1m")),
        ("--llm-base-url", dict(default="http://127.0.0.1:1234")),
        ("--temperature", dict(type=float, default=0.0)),
        ("--max-output-tokens", dict(type=int, default=900)),
        ("--timeout-sec", dict(type=int, default=120)),
        ("--strict-policy", dict(default="route", choices=["semantic", "route", "factual"])),
        (
            "--output-root",
            dict(
                default=str(DEFAULT_OUTPUT_ROOT),
                help="Root where nightly bundle folder will be created.",
            ),
        ),
        (
            "--nightly-run-id",
            dict(
                default="",
                help="Optional nightly bundle id. Default: <date>_Address_Nightly_Regression_<time>.",
            ),
        ),
        (
            "--python-bin",
            dict(
                default=sys.executable,
                help="Python executable to run child scripts.",
            ),
        ),
        (
            "--dry-run",
            dict(
                action="store_true",
                help="Print planned commands and baseline checks without executing live requests.",
            ),
        ),
        (
            "--skip-comparator",
            dict(
                action="store_true",
                help="Run packs + validator only, skip baseline comparison.",
            ),
        ),
    ]
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()
def now_stamp() -> str:
    """Return the current naive local time formatted as ``YYYY-MM-DD_HH-MM-SS``."""
    return f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
def run_command(cmd: list[str], *, cwd: Path, dry_run: bool) -> tuple[int, str, str]:
    """Echo a child command, run it and return its exit status and output.

    Args:
        cmd: Argument vector; executed directly, without a shell.
        cwd: Working directory for the child process.
        dry_run: When true, only echo the command and report success.

    Returns:
        ``(exit_code, stdout, stderr)``. A dry run yields ``(0, "", "")``.
        If the process cannot be started at all (missing executable, bad
        working directory, no execute permission) the result is
        ``(127, "", <diagnostic>)`` instead of an exception, so the caller
        can record the failure like any other non-zero exit.
    """
    rendered = " ".join(f'"{token}"' if " " in token else token for token in cmd)
    print(f"$ {rendered}")
    if dry_run:
        return 0, "", ""
    try:
        completed = subprocess.run(
            cmd,
            cwd=str(cwd),
            check=False,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
        )
    except OSError as exc:
        # Launch failures raise before any exit code exists; 127 mirrors the
        # shell convention for "command not found".
        message = f"failed to start command `{rendered}`: {exc}"
        print(message)
        return 127, "", message
    if completed.stdout:
        print(completed.stdout.strip())
    if completed.stderr:
        print(completed.stderr.strip())
    return completed.returncode, completed.stdout, completed.stderr
def _python_executable(python_bin: str) -> str:
    """Return the interpreter command used to launch the child scripts.

    An existing file is made absolute, because children run with
    ``cwd=PROJECT_ROOT`` and a relative path would otherwise be looked up
    there. Symlinks are deliberately NOT resolved: a virtualenv interpreter
    is commonly a symlink to the base Python, and launching the resolved
    target would run the children outside that environment. Anything that is
    not an existing file (e.g. a bare ``python3``) is returned unchanged so
    the OS can look it up on PATH.
    """
    candidate = Path(python_bin).absolute()
    if candidate.is_file():
        return str(candidate)
    return python_bin


def _default_packs() -> list[NightlyPackConfig]:
    """Return the two nightly packs (102-question stress, 25-question follow-up)."""
    address_docs = PROJECT_ROOT / "docs" / "ADDRESS"
    question_sets = address_docs / "question_sets"
    baseline_runs = address_docs / "runs"
    return [
        NightlyPackConfig(
            name="stress_102",
            questions_file=(question_sets / "address_slang_stress_full_2026-04-02.json").resolve(),
            baseline_summary=(
                baseline_runs
                / "2026-04-02_Address_Slang_Live_Stress_2026-04-02_12-57-27"
                / "run_summary.json"
            ).resolve(),
        ),
        NightlyPackConfig(
            name="followup_25",
            questions_file=(question_sets / "address_followup_context_chains_2026-04-02.json").resolve(),
            baseline_summary=(
                baseline_runs
                / "2026-04-02_Address_Followup_Context_Chains_2026-04-02_19-15-Run5"
                / "run_summary.json"
            ).resolve(),
        ),
    ]


def _run_pack(
    pack: NightlyPackConfig,
    args: argparse.Namespace,
    nightly_dir: Path,
    python_exe: str,
) -> dict[str, Any]:
    """Run runner -> validator -> comparator for one pack and return its report row.

    The chain stops at the first stage that exits non-zero; the row then
    carries the error text and the ``*_ok`` flags of the stages reached.
    """
    # Single clock read so the date and time parts cannot straddle midnight.
    started = datetime.now()
    run_id = f"{started.date().isoformat()}_Address_Nightly_{pack.name}_{started:%H-%M-%S}"
    run_dir = nightly_dir / run_id
    row: dict[str, Any] = {
        "pack": pack.name,
        "run_id": run_id,
        "questions_file": str(pack.questions_file),
        "baseline_summary": str(pack.baseline_summary),
        "run_dir": str(run_dir),
        "runner_ok": False,
        "validator_ok": False,
        # None means "not requested" when the comparator is skipped.
        "comparator_ok": None if args.skip_comparator else False,
        "errors": [],
    }

    runner_cmd = [
        python_exe,
        str(RUNNER_SCRIPT),
        "--questions-file",
        str(pack.questions_file),
        "--backend-url",
        args.backend_url,
        "--prompt-version",
        args.prompt_version,
        "--llm-provider",
        args.llm_provider,
        "--llm-model",
        args.llm_model,
        "--llm-base-url",
        args.llm_base_url,
        "--temperature",
        str(args.temperature),
        "--max-output-tokens",
        str(args.max_output_tokens),
        "--timeout-sec",
        str(args.timeout_sec),
        "--strict-policy",
        args.strict_policy,
        "--run-id",
        run_id,
        "--output-root",
        str(nightly_dir),
    ]
    code, _, _ = run_command(runner_cmd, cwd=PROJECT_ROOT, dry_run=args.dry_run)
    if code != 0:
        row["errors"].append(f"runner failed with exit code {code}")
        return row
    row["runner_ok"] = True

    validator_report = nightly_dir / f"{run_id}_validator_report.json"
    validator_cmd = [
        python_exe,
        str(VALIDATOR_SCRIPT),
        str(run_dir),
        "--report-json",
        str(validator_report),
    ]
    code, _, _ = run_command(validator_cmd, cwd=PROJECT_ROOT, dry_run=args.dry_run)
    if code != 0:
        row["errors"].append(f"validator failed with exit code {code}")
        return row
    row["validator_ok"] = True
    row["validator_report"] = str(validator_report)

    if args.skip_comparator:
        return row

    comparator_report = nightly_dir / f"{run_id}_comparator_report.json"
    comparator_cmd = [
        python_exe,
        str(COMPARATOR_SCRIPT),
        "--baseline-summary",
        str(pack.baseline_summary),
        "--candidate-summary",
        str(run_dir / "run_summary.json"),
        "--report-json",
        str(comparator_report),
    ]
    code, _, _ = run_command(comparator_cmd, cwd=PROJECT_ROOT, dry_run=args.dry_run)
    if code != 0:
        row["errors"].append(f"comparator failed with exit code {code}")
        return row
    row["comparator_ok"] = True
    row["comparator_report"] = str(comparator_report)
    return row


def _render_markdown(report: dict[str, Any]) -> str:
    """Render the nightly report dictionary as the bundle's README.md text."""
    md_lines = [
        f"# {report['nightly_run_id']}",
        "",
        f"Generated at: {report['generated_at']}",
        f"Dry run: {report['dry_run']}",
        f"Strict policy: {report['strict_policy']}",
        f"Overall: {'PASS' if report['overall_ok'] else 'FAIL'}",
        "",
        "## Packs",
    ]
    for item in report["packs"]:
        md_lines.extend(
            [
                f"### {item['pack']}",
                f"- run_id: {item['run_id']}",
                f"- runner_ok: {item['runner_ok']}",
                f"- validator_ok: {item['validator_ok']}",
                f"- comparator_ok: {item['comparator_ok']}",
                f"- run_dir: {item['run_dir']}",
            ]
        )
        if item.get("errors"):
            md_lines.append("- errors:")
            md_lines.extend(f" - {err}" for err in item["errors"])
        md_lines.append("")
    return "\n".join(md_lines) + "\n"


def main() -> None:
    """Run every nightly pack and write the bundle summary.

    Creates ``<output-root>/<nightly-run-id>/``, verifies that each pack's
    question file and baseline summary exist, runs the packs one after
    another, then writes ``nightly_summary.json`` and ``README.md`` into the
    bundle folder.

    Raises:
        SystemExit: With status 1 when an input file is missing or when any
            pack fails one of its stages.
    """
    args = parse_args()
    output_root = Path(args.output_root).resolve()
    nightly_run_id = args.nightly_run_id.strip()
    if not nightly_run_id:
        # One clock read: the leading date is taken from the same stamp that
        # supplies the trailing date-time part.
        stamp = now_stamp()
        nightly_run_id = f"{stamp.split('_', 1)[0]}_Address_Nightly_Regression_{stamp}"
    nightly_dir = output_root / nightly_run_id
    nightly_dir.mkdir(parents=True, exist_ok=True)

    packs = _default_packs()
    print(f"Nightly bundle: {nightly_dir}")

    # Pre-flight: report every missing input at once, before any live request.
    failures: list[str] = []
    for pack in packs:
        if not pack.questions_file.exists():
            failures.append(f"[{pack.name}] missing questions file: {pack.questions_file}")
        if not pack.baseline_summary.exists():
            failures.append(f"[{pack.name}] missing baseline summary: {pack.baseline_summary}")
    if failures:
        for item in failures:
            print(f"ERROR: {item}")
        raise SystemExit(1)

    python_exe = _python_executable(args.python_bin)
    packs_report = [_run_pack(pack, args, nightly_dir, python_exe) for pack in packs]

    # An empty report must never count as a pass.
    overall_ok = bool(packs_report) and all(
        item["runner_ok"] and item["validator_ok"] and (args.skip_comparator or item["comparator_ok"])
        for item in packs_report
    )

    report = {
        "nightly_run_id": nightly_run_id,
        "generated_at": datetime.now().isoformat(timespec="seconds"),
        "dry_run": bool(args.dry_run),
        "overall_ok": overall_ok,
        "strict_policy": args.strict_policy,
        "packs": packs_report,
    }
    report_path = nightly_dir / "nightly_summary.json"
    report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    (nightly_dir / "README.md").write_text(_render_markdown(report), encoding="utf-8")

    print(f"\nNightly summary: {report_path}")
    print(f"Overall: {'PASS' if overall_ok else 'FAIL'}")
    if not overall_ok:
        raise SystemExit(1)
# Run the nightly regression only when executed as a script, not on import.
if __name__ == "__main__":
    main()