286 lines
9.9 KiB
Python
286 lines
9.9 KiB
Python
#!/usr/bin/env python3
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import subprocess
|
|
import sys
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
# Two levels up from this file: the directory that contains this script's folder.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Default parent folder for nightly bundles (overridable via --output-root).
DEFAULT_OUTPUT_ROOT = PROJECT_ROOT.joinpath("docs", "ADDRESS", "runs")

# Child scripts launched for every pack: live runner, run-pack validator,
# and baseline comparator.
RUNNER_SCRIPT = PROJECT_ROOT.joinpath("scripts", "run_address_live_slang_stress.py")
VALIDATOR_SCRIPT = PROJECT_ROOT.joinpath("scripts", "validate_address_run_pack.py")
COMPARATOR_SCRIPT = PROJECT_ROOT.joinpath("scripts", "compare_address_run_summary.py")
|
|
|
|
|
|
@dataclass
class NightlyPackConfig:
    """Static description of one regression pack run by the nightly job."""

    # Short label; embedded in the per-pack run id and in the reports.
    name: str
    # Question set passed to the runner script via --questions-file.
    questions_file: Path
    # Reference run summary passed to the comparator via --baseline-summary.
    baseline_summary: Path
|
|
|
|
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the nightly regression script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses;
            passing an explicit list allows programmatic invocation.

    Returns:
        The populated namespace. Backend/LLM options are forwarded verbatim
        to the runner script; the remaining ones steer this orchestrator.
    """
    parser = argparse.ArgumentParser(
        description="Run ADDRESS nightly regression packs (102 + 25), validate run packs and compare vs baseline."
    )

    # Options forwarded to the live runner script.
    parser.add_argument("--backend-url", default="http://127.0.0.1:8787/api/assistant/message")
    parser.add_argument("--prompt-version", default="address_query_runtime_v1")
    parser.add_argument("--llm-provider", default="local")
    parser.add_argument("--llm-model", default="qwen2.5-14b-instruct-1m")
    parser.add_argument("--llm-base-url", default="http://127.0.0.1:1234")
    parser.add_argument("--temperature", type=float, default=0.0)
    parser.add_argument("--max-output-tokens", type=int, default=900)
    parser.add_argument("--timeout-sec", type=int, default=120)
    parser.add_argument("--strict-policy", default="route", choices=["semantic", "route", "factual"])

    # Options that control the orchestration itself.
    parser.add_argument(
        "--output-root",
        default=str(DEFAULT_OUTPUT_ROOT),
        help="Root where nightly bundle folder will be created.",
    )
    parser.add_argument(
        "--nightly-run-id",
        default="",
        help="Optional nightly bundle id. Default: <date>_Address_Nightly_Regression_<time>.",
    )
    parser.add_argument(
        "--python-bin",
        default=sys.executable,
        help="Python executable to run child scripts.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print planned commands and baseline checks without executing live requests.",
    )
    parser.add_argument(
        "--skip-comparator",
        action="store_true",
        help="Run packs + validator only, skip baseline comparison.",
    )
    return parser.parse_args(argv)
|
|
|
|
|
|
def now_stamp() -> str:
    """Return the current local time formatted as ``YYYY-MM-DD_HH-MM-SS``."""
    return f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
|
|
|
|
|
|
def run_command(cmd: list[str], *, cwd: Path, dry_run: bool) -> tuple[int, str, str]:
    """Echo a command, run it unless in dry-run mode, and report its outcome.

    Args:
        cmd: Executable followed by its arguments (run without a shell).
        cwd: Working directory for the child process.
        dry_run: When true, only print the command and report success.

    Returns:
        ``(returncode, stdout, stderr)``. A dry run yields ``(0, "", "")``.
        If the process cannot be started at all (missing executable or
        working directory, permission problem), ``(127, "", <message>)`` is
        returned instead of raising, so the caller can record the failure
        like any other non-zero exit.
    """
    # Quote only tokens containing spaces; this is for display, not for a shell.
    rendered = " ".join(f'"{token}"' if " " in token else token for token in cmd)
    print(f"$ {rendered}")
    if dry_run:
        return 0, "", ""

    try:
        completed = subprocess.run(
            cmd,
            cwd=str(cwd),
            check=False,
            capture_output=True,
            text=True,
            encoding="utf-8",
            # Never let undecodable child output abort the run.
            errors="replace",
        )
    except OSError as exc:
        # 127 mirrors the conventional "command not found" exit status.
        message = f"failed to start command: {exc}"
        print(message)
        return 127, "", message

    # Replay captured output so it still shows up in the nightly log.
    if completed.stdout:
        print(completed.stdout.strip())
    if completed.stderr:
        print(completed.stderr.strip())
    return completed.returncode, completed.stdout, completed.stderr
|
|
|
|
|
|
def _child_python(python_bin: str) -> str:
    """Return the interpreter to launch child scripts with, without following symlinks.

    Resolving symlinks would swap a virtualenv's ``bin/python`` for the base
    interpreter and silently drop the environment's packages, so the path is
    only made absolute. A bare command name that is not a file in the current
    directory is returned unchanged so the OS can look it up on ``PATH``.
    """
    candidate = Path(python_bin)
    if candidate.name == python_bin and not candidate.exists():
        return python_bin
    return str(candidate.absolute())


def _nightly_packs() -> list[NightlyPackConfig]:
    """Return the fixed pack list: ``stress_102`` followed by ``followup_25``."""
    address_docs = PROJECT_ROOT / "docs" / "ADDRESS"
    question_sets = address_docs / "question_sets"
    baseline_runs = address_docs / "runs"
    return [
        NightlyPackConfig(
            name="stress_102",
            questions_file=(question_sets / "address_slang_stress_full_2026-04-02.json").resolve(),
            baseline_summary=(
                baseline_runs
                / "2026-04-02_Address_Slang_Live_Stress_2026-04-02_12-57-27"
                / "run_summary.json"
            ).resolve(),
        ),
        NightlyPackConfig(
            name="followup_25",
            questions_file=(question_sets / "address_followup_context_chains_2026-04-02.json").resolve(),
            baseline_summary=(
                baseline_runs
                / "2026-04-02_Address_Followup_Context_Chains_2026-04-02_19-15-Run5"
                / "run_summary.json"
            ).resolve(),
        ),
    ]


def _preflight_failures(packs: list[NightlyPackConfig], *, need_baseline: bool) -> list[str]:
    """Return one message per input file that is required but missing."""
    failures: list[str] = []
    for pack in packs:
        if not pack.questions_file.exists():
            failures.append(f"[{pack.name}] missing questions file: {pack.questions_file}")
        # The baseline is only read by the comparator; do not demand it otherwise.
        if need_baseline and not pack.baseline_summary.exists():
            failures.append(f"[{pack.name}] missing baseline summary: {pack.baseline_summary}")
    return failures


def _run_pack(
    pack: NightlyPackConfig,
    *,
    args: argparse.Namespace,
    nightly_dir: Path,
    python_exe: str,
) -> dict[str, Any]:
    """Run runner, validator and (optionally) comparator for one pack.

    The stages run in order and the first non-zero exit code stops the pack.
    The returned row records which stages succeeded and any error messages.
    """
    # One timestamp for both parts of the id, so date and time cannot disagree
    # when the call happens right at midnight.
    started = datetime.now()
    run_id = f"{started.date().isoformat()}_Address_Nightly_{pack.name}_{started:%H-%M-%S}"
    # Assumes the runner writes its pack to <output-root>/<run-id>.
    run_dir = nightly_dir / run_id

    row: dict[str, Any] = {
        "pack": pack.name,
        "run_id": run_id,
        "questions_file": str(pack.questions_file),
        "baseline_summary": str(pack.baseline_summary),
        "run_dir": str(run_dir),
        "runner_ok": False,
        "validator_ok": False,
        # None marks "not applicable" when the comparator is skipped.
        "comparator_ok": None if args.skip_comparator else False,
        "errors": [],
    }

    runner_cmd = [
        python_exe,
        str(RUNNER_SCRIPT),
        "--questions-file",
        str(pack.questions_file),
        "--backend-url",
        args.backend_url,
        "--prompt-version",
        args.prompt_version,
        "--llm-provider",
        args.llm_provider,
        "--llm-model",
        args.llm_model,
        "--llm-base-url",
        args.llm_base_url,
        "--temperature",
        str(args.temperature),
        "--max-output-tokens",
        str(args.max_output_tokens),
        "--timeout-sec",
        str(args.timeout_sec),
        "--strict-policy",
        args.strict_policy,
        "--run-id",
        run_id,
        "--output-root",
        str(nightly_dir),
    ]
    code, _, _ = run_command(runner_cmd, cwd=PROJECT_ROOT, dry_run=args.dry_run)
    if code != 0:
        row["errors"].append(f"runner failed with exit code {code}")
        return row
    row["runner_ok"] = True

    validator_report = nightly_dir / f"{run_id}_validator_report.json"
    validator_cmd = [
        python_exe,
        str(VALIDATOR_SCRIPT),
        str(run_dir),
        "--report-json",
        str(validator_report),
    ]
    code, _, _ = run_command(validator_cmd, cwd=PROJECT_ROOT, dry_run=args.dry_run)
    if code != 0:
        row["errors"].append(f"validator failed with exit code {code}")
        return row
    row["validator_ok"] = True
    row["validator_report"] = str(validator_report)

    if args.skip_comparator:
        return row

    comparator_report = nightly_dir / f"{run_id}_comparator_report.json"
    comparator_cmd = [
        python_exe,
        str(COMPARATOR_SCRIPT),
        "--baseline-summary",
        str(pack.baseline_summary),
        "--candidate-summary",
        str(run_dir / "run_summary.json"),
        "--report-json",
        str(comparator_report),
    ]
    code, _, _ = run_command(comparator_cmd, cwd=PROJECT_ROOT, dry_run=args.dry_run)
    if code != 0:
        row["errors"].append(f"comparator failed with exit code {code}")
        return row
    row["comparator_ok"] = True
    row["comparator_report"] = str(comparator_report)
    return row


def _render_readme(report: dict[str, Any]) -> str:
    """Render the nightly report dictionary as the Markdown README text."""
    md_lines = [
        f"# {report['nightly_run_id']}",
        "",
        f"Generated at: {report['generated_at']}",
        f"Dry run: {report['dry_run']}",
        f"Strict policy: {report['strict_policy']}",
        f"Overall: {'PASS' if report['overall_ok'] else 'FAIL'}",
        "",
        "## Packs",
    ]
    for item in report["packs"]:
        md_lines.extend(
            [
                f"### {item['pack']}",
                f"- run_id: {item['run_id']}",
                f"- runner_ok: {item['runner_ok']}",
                f"- validator_ok: {item['validator_ok']}",
                f"- comparator_ok: {item['comparator_ok']}",
                f"- run_dir: {item['run_dir']}",
            ]
        )
        if item.get("errors"):
            md_lines.append("- errors:")
            # Two-space indent so Markdown nests these under "- errors:".
            md_lines.extend(f"  - {err}" for err in item["errors"])
        md_lines.append("")
    return "\n".join(md_lines) + "\n"


def main() -> None:
    """Run every nightly pack and write the bundle summary.

    Writes ``nightly_summary.json`` and ``README.md`` into the bundle folder
    ``<output-root>/<nightly-run-id>``.

    Raises:
        SystemExit: With status 1 when a required input file is missing
            (before anything is executed or created) or when any pack stage
            failed (after the summary files have been written).
    """
    args = parse_args()

    output_root = Path(args.output_root).resolve()
    nightly_run_id = args.nightly_run_id.strip() or (
        f"{datetime.now().date().isoformat()}_Address_Nightly_Regression_{now_stamp()}"
    )
    nightly_dir = output_root / nightly_run_id
    packs = _nightly_packs()

    print(f"Nightly bundle: {nightly_dir}")
    failures = _preflight_failures(packs, need_baseline=not args.skip_comparator)
    if failures:
        for item in failures:
            print(f"ERROR: {item}")
        raise SystemExit(1)

    # Create the bundle folder only after the preflight passed, so an aborted
    # run does not leave an empty directory behind.
    nightly_dir.mkdir(parents=True, exist_ok=True)

    python_exe = _child_python(args.python_bin)
    packs_report = [
        _run_pack(pack, args=args, nightly_dir=nightly_dir, python_exe=python_exe)
        for pack in packs
    ]

    overall_ok = all(
        item.get("runner_ok")
        and item.get("validator_ok")
        and (args.skip_comparator or item.get("comparator_ok"))
        for item in packs_report
    )

    report = {
        "nightly_run_id": nightly_run_id,
        "generated_at": datetime.now().isoformat(timespec="seconds"),
        "dry_run": bool(args.dry_run),
        "overall_ok": overall_ok,
        "strict_policy": args.strict_policy,
        "packs": packs_report,
    }

    report_path = nightly_dir / "nightly_summary.json"
    report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    (nightly_dir / "README.md").write_text(_render_readme(report), encoding="utf-8")

    print(f"\nNightly summary: {report_path}")
    print(f"Overall: {'PASS' if overall_ok else 'FAIL'}")
    if not overall_ok:
        raise SystemExit(1)
|
|
|
|
|
|
# Run the nightly regression only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|