1483 lines
65 KiB
Python
1483 lines
65 KiB
Python
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import re
|
||
from datetime import date, datetime, timezone
|
||
from pathlib import Path
|
||
from types import SimpleNamespace
|
||
from typing import Any
|
||
|
||
import domain_case_loop as dcl
|
||
import check_mcp_live_readiness as mcp_readiness
|
||
import scenario_acceptance_policy as sap
|
||
|
||
|
||
# Directory two levels above this file; default artifacts live underneath it.
REPO_ROOT = Path(__file__).resolve().parent.parent
DEFAULT_OUTPUT_ROOT = REPO_ROOT.joinpath("artifacts", "domain_runs")

# Schema identifiers stamped into the payloads this module reads and writes.
TRUTH_HARNESS_SPEC_SCHEMA_VERSION = "domain_truth_harness_spec_v1"
TRUTH_HARNESS_STATE_SCHEMA_VERSION = "domain_truth_harness_state_v1"
TRUTH_HARNESS_REVIEW_SCHEMA_VERSION = "domain_truth_harness_review_v1"
TRUTH_HARNESS_MANIFEST_SCHEMA_VERSION = "domain_truth_harness_manifest_v1"

# Severity used when a step or finding does not carry a recognised one.
DEFAULT_CRITICALITY = "critical"

# Step fields that build_generated_manifest copies as-is from a spec step
# into the corresponding manifest step.
TECHNICAL_QUESTION_FIELDS = (
    "expected_intents",
    "expected_capability",
    "expected_recipe",
    "expected_result_mode",
    "expected_catalog_alignment_status",
    "expected_catalog_chain_top_match",
    "expected_catalog_selected_matches_top",
    "expected_route_candidate_status",
    "expected_route_candidate_executable_now",
    "expected_route_candidate_missing_axes",
    "required_filters",
    "forbidden_capabilities",
    "forbidden_recipes",
    "required_state_objects",
    "required_answer_shape",
    "forbidden_answer_patterns",
    "required_carryover_invariants",
    "invariant_severity",
)
|
||
|
||
|
||
def dump_json(payload: Any) -> str:
    """Serialize *payload* as 2-space-indented JSON, keeping non-ASCII characters unescaped."""
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    return serialized
|
||
|
||
|
||
def write_text(file_path: Path, text: str) -> None:
|
||
file_path.parent.mkdir(parents=True, exist_ok=True)
|
||
file_path.write_text(text, encoding="utf-8", newline="\n")
|
||
|
||
|
||
def write_json(file_path: Path, payload: Any) -> None:
    """Write *payload* to *file_path* as pretty-printed JSON followed by a newline."""
    serialized = dump_json(payload)
    write_text(file_path, f"{serialized}\n")
|
||
|
||
|
||
def read_json(file_path: Path) -> dict[str, Any]:
    """Load *file_path* and return its top-level JSON object.

    The file is decoded as ``utf-8-sig``, so a leading BOM is tolerated.

    Raises:
        RuntimeError: if the top-level JSON value is not an object.
    """
    raw_text = file_path.read_text(encoding="utf-8-sig")
    parsed = json.loads(raw_text)
    if isinstance(parsed, dict):
        return parsed
    raise RuntimeError(f"Expected JSON object in {file_path}")
|
||
|
||
|
||
def normalize_criticality(value: Any) -> str:
    """Return *value* lower-cased and trimmed when it is a known severity.

    Known severities are ``critical``, ``warning`` and ``info``; anything else
    (including falsy input) falls back to ``DEFAULT_CRITICALITY``.
    """
    candidate = str(value or "").strip().lower()
    return candidate if candidate in ("critical", "warning", "info") else DEFAULT_CRITICALITY
|
||
|
||
|
||
def normalize_pattern_list(raw_value: Any) -> list[str]:
    """Normalize *raw_value* via ``dcl.normalize_string_list`` and drop falsy entries."""
    return list(filter(None, dcl.normalize_string_list(raw_value)))
|
||
|
||
|
||
def normalize_filter_values_mapping(raw_value: Any) -> dict[str, list[str]]:
    """Turn a ``{filter_key: values}`` mapping into ``{str: list[str]}``.

    Non-dict input yields an empty dict. Keys are stringified and trimmed;
    entries with a blank key or with no values left after normalization are
    dropped.
    """
    cleaned: dict[str, list[str]] = {}
    if not isinstance(raw_value, dict):
        return cleaned
    for raw_key, raw_entries in raw_value.items():
        key_text = str(raw_key or "").strip()
        if key_text:
            entries = normalize_pattern_list(raw_entries)
            if entries:
                cleaned[key_text] = entries
    return cleaned
|
||
|
||
|
||
def normalize_step_spec(index: int, raw_step: Any) -> dict[str, Any]:
    """Normalize one raw spec step and attach the truth-harness review fields.

    The base shape comes from ``dcl.normalize_step_definition``. The fields
    below are then read from *raw_step* (when it is a dict) and written onto
    the result in a fixed order.
    """
    normalized = dcl.normalize_step_definition(index, raw_step)
    source = raw_step if isinstance(raw_step, dict) else {}

    def optional_text(field_name: str) -> str | None:
        # Trimmed string, or None when the field is absent or blank.
        return str(source.get(field_name) or "").strip() or None

    def pattern_field(field_name: str) -> list[str]:
        return normalize_pattern_list(source.get(field_name))

    normalized["criticality"] = normalize_criticality(source.get("criticality"))
    normalized["semantic_tags"] = dcl.normalize_string_list(source.get("semantic_tags"))
    for field_name in ("allowed_reply_types", "allowed_limited_reason_categories"):
        normalized[field_name] = pattern_field(field_name)

    for field_name in ("expected_catalog_alignment_status", "expected_catalog_chain_top_match"):
        normalized[field_name] = optional_text(field_name)
    # The two flag-style expectations are stored raw, without coercion here.
    normalized["expected_catalog_selected_matches_top"] = source.get("expected_catalog_selected_matches_top")
    normalized["expected_route_candidate_status"] = optional_text("expected_route_candidate_status")
    normalized["expected_route_candidate_executable_now"] = source.get("expected_route_candidate_executable_now")

    for field_name in (
        "expected_route_candidate_missing_axes",
        "required_answer_patterns_any",
        "required_answer_patterns_all",
        "required_direct_answer_patterns_any",
        "required_direct_answer_patterns_all",
        "forbidden_direct_answer_patterns",
        "forbidden_filter_keys",
    ):
        normalized[field_name] = pattern_field(field_name)

    normalized["forbidden_filter_values"] = normalize_filter_values_mapping(source.get("forbidden_filter_values"))

    # Keep only pairs whose key and value are both non-blank after trimming.
    period_refs = source.get("required_filter_within_previous_step_period")
    cleaned_refs: dict[str, str] = {}
    if isinstance(period_refs, dict):
        for raw_key, raw_ref in period_refs.items():
            key_text = str(raw_key).strip()
            ref_text = str(raw_ref).strip()
            if key_text and ref_text:
                cleaned_refs[key_text] = ref_text
    normalized["required_filter_within_previous_step_period"] = cleaned_refs

    normalized["notes"] = optional_text("notes")
    return normalized
|
||
|
||
|
||
def load_truth_harness_spec(file_path: Path) -> dict[str, Any]:
    """Read a truth-harness spec file and return its normalized form.

    Raises:
        RuntimeError: if ``steps`` is not a non-empty list, or if
            ``scenario_id`` or ``domain`` is missing or blank. ``read_json``
            raises the same type when the file is not a JSON object.
    """
    raw_spec = read_json(file_path)

    def text_of(field_name: str) -> str:
        return str(raw_spec.get(field_name) or "").strip()

    raw_steps = raw_spec.get("steps")
    if not (isinstance(raw_steps, list) and raw_steps):
        raise RuntimeError("Truth harness spec must define non-empty `steps`")

    scenario_id = text_of("scenario_id")
    domain = text_of("domain")
    if not scenario_id:
        raise RuntimeError("Truth harness spec must define `scenario_id`")
    if not domain:
        raise RuntimeError("Truth harness spec must define `domain`")

    return {
        "schema_version": str(raw_spec.get("schema_version") or TRUTH_HARNESS_SPEC_SCHEMA_VERSION),
        "scenario_id": scenario_id,
        "domain": domain,
        # A blank title falls back to the scenario id.
        "title": str(raw_spec.get("title") or scenario_id).strip() or scenario_id,
        "description": text_of("description") or None,
        "source_export": text_of("source_export") or None,
        "bindings": dcl.normalize_bindings(raw_spec.get("bindings")),
        # Steps are numbered from 1.
        "steps": [normalize_step_spec(position, raw_step) for position, raw_step in enumerate(raw_steps, start=1)],
    }
|
||
|
||
|
||
def build_runtime_bindings() -> dict[str, Any]:
    """Return date/time values for the current run.

    Contains today's date (from ``date.today()``) in ISO and ``DD.MM.YYYY``
    form, regex-escaped variants of both, and a UTC timestamp truncated to
    whole seconds.
    """
    current_day = date.today()
    iso_text = current_day.isoformat()
    dotted_text = current_day.strftime("%d.%m.%Y")
    utc_now = datetime.now(timezone.utc).replace(microsecond=0)
    return {
        "today_iso": iso_text,
        "today_dot": dotted_text,
        "today_iso_regex": re.escape(iso_text),
        "today_dot_regex": re.escape(dotted_text),
        "generated_at_utc": utc_now.isoformat(),
    }
|
||
|
||
|
||
def build_review_root(
    bindings: dict[str, Any], runtime_bindings: dict[str, Any], step_results: dict[str, Any]
) -> dict[str, Any]:
    """Build the root mapping against which placeholder paths are resolved.

    The root exposes ``bindings``, ``runtime`` and ``step_results``. When
    *step_results* is a dict its entries are also merged in at the top level,
    where they replace a same-named entry already present.
    """
    namespaces: dict[str, Any] = {
        "bindings": bindings,
        "runtime": runtime_bindings,
        "step_results": step_results,
    }
    if not isinstance(step_results, dict):
        return namespaces
    return {**namespaces, **step_results}
|
||
|
||
|
||
def lookup_review_value(
    path_expression: str, step_results: dict[str, Any], bindings: dict[str, Any], runtime_bindings: dict[str, Any]
) -> Any:
    """Resolve *path_expression* against the review root and return the value found.

    Tokens come from ``dcl.parse_path_tokens``. Integer tokens index into
    lists; every other token is looked up as a dict key.

    Raises:
        RuntimeError: if an index is applied to a non-list, an index is past
            the end of the list, or a key cannot be found.
    """
    node: Any = build_review_root(bindings, runtime_bindings, step_results)
    for token in dcl.parse_path_tokens(path_expression):
        if isinstance(token, int):
            if not isinstance(node, list):
                raise RuntimeError(f"Placeholder `{path_expression}` does not point to a list before index access")
            if token >= len(node):
                raise RuntimeError(f"Placeholder `{path_expression}` index {token} is out of range")
        elif not isinstance(node, dict) or token not in node:
            raise RuntimeError(f"Placeholder `{path_expression}` could not be resolved at `{token}`")
        node = node[token]
    return node
|
||
|
||
|
||
def resolve_template_string(
    template: str, step_results: dict[str, Any], bindings: dict[str, Any], runtime_bindings: dict[str, Any]
) -> str:
    """Replace every ``{{ path }}`` placeholder in *template* with its resolved value.

    Dict and list values are rendered with ``dump_json``; everything else is
    rendered with ``str``. Resolution errors from ``lookup_review_value``
    propagate.
    """
    placeholder = re.compile(r"{{\s*([^{}]+?)\s*}}")

    def render(found: re.Match[str]) -> str:
        resolved = lookup_review_value(found.group(1), step_results, bindings, runtime_bindings)
        return dump_json(resolved) if isinstance(resolved, (dict, list)) else str(resolved)

    return placeholder.sub(render, template)
|
||
|
||
|
||
def resolve_nested_placeholders(
    raw_value: Any, step_results: dict[str, Any], bindings: dict[str, Any], runtime_bindings: dict[str, Any]
) -> Any:
    """Resolve placeholders in every string found inside *raw_value*.

    Lists and dicts are rebuilt recursively, with dict keys stringified.
    Values of any other type are returned unchanged.
    """

    def recurse(inner: Any) -> Any:
        return resolve_nested_placeholders(inner, step_results, bindings, runtime_bindings)

    if isinstance(raw_value, dict):
        return {str(key): recurse(value) for key, value in raw_value.items()}
    if isinstance(raw_value, list):
        return [recurse(item) for item in raw_value]
    if isinstance(raw_value, str):
        return resolve_template_string(raw_value, step_results, bindings, runtime_bindings)
    return raw_value
|
||
|
||
|
||
def build_generated_manifest(spec: dict[str, Any]) -> dict[str, Any]:
    """Build a manifest from a normalized spec.

    Steps are chained linearly: each step depends on the one before it and
    the first step has no dependencies. Every name in
    ``TECHNICAL_QUESTION_FIELDS`` is copied from the spec step, as ``None``
    when the step does not have it.
    """
    manifest_steps: list[dict[str, Any]] = []
    dependency: list[str] = []
    for step in spec["steps"]:
        step_id = step["step_id"]
        entry: dict[str, Any] = {
            "step_id": step_id,
            "title": step["title"],
            "question": step["question_template"],
            "depends_on": dependency,
            "analysis_context": {},
        }
        entry.update((field_name, step.get(field_name)) for field_name in TECHNICAL_QUESTION_FIELDS)
        entry["semantic_tags"] = step.get("semantic_tags") or []
        manifest_steps.append(entry)
        # A fresh list per step, so entries never share a dependency list.
        dependency = [step_id] if step_id else []

    return {
        "schema_version": TRUTH_HARNESS_MANIFEST_SCHEMA_VERSION,
        "scenario_id": spec["scenario_id"],
        "domain": spec["domain"],
        "title": spec["title"],
        "description": spec.get("description"),
        "analysis_context": {},
        "bindings": spec.get("bindings") or {},
        "steps": manifest_steps,
    }
|
||
|
||
|
||
def build_runner_args(args: argparse.Namespace) -> SimpleNamespace:
    """Copy the runner-relevant CLI options into a fresh ``SimpleNamespace``.

    All options are passed through unchanged except ``use_mock``, which is
    coerced to ``bool``.
    """
    passthrough_names = (
        "backend_url",
        "prompt_version",
        "llm_provider",
        "llm_model",
        "llm_base_url",
        "llm_api_key",
        "temperature",
        "max_output_tokens",
        "timeout_seconds",
    )
    copied = {name: getattr(args, name) for name in passthrough_names}
    return SimpleNamespace(**copied, use_mock=bool(args.use_mock))
|
||
|
||
|
||
def build_plain_assistant_message_payload(
    args: argparse.Namespace, *, question: str, session_id: str | None
) -> dict[str, Any]:
    """Build the assistant-mode message payload for *question*.

    The question is sent under both ``user_message`` and ``message``. The
    assembled dict is passed through ``dcl.drop_none_values`` before it is
    returned.
    """
    payload: dict[str, Any] = {
        "session_id": session_id,
        "user_message": question,
        "message": question,
        "mode": "assistant",
        "llmProvider": args.llm_provider,
        "apiKey": args.llm_api_key,
        "model": args.llm_model,
        "baseUrl": args.llm_base_url,
        "temperature": args.temperature,
        "maxOutputTokens": args.max_output_tokens,
        "promptVersion": args.prompt_version,
        "useMock": bool(args.use_mock),
    }
    return dcl.drop_none_values(payload)
|
||
|
||
|
||
def build_placeholder_step_result(step_state: dict[str, Any]) -> dict[str, Any]:
    """Return a shallow copy of *step_state* with a ``filters`` entry added.

    ``filters`` is a copy of ``extracted_filters``, or an empty dict when that
    entry is missing or falsy. The input mapping is not modified.
    """
    filters_copy = dict(step_state.get("extracted_filters") or {})
    return {**step_state, "filters": filters_copy}
|
||
|
||
|
||
def append_finding(
|
||
findings: list[dict[str, Any]],
|
||
step: dict[str, Any],
|
||
code: str,
|
||
message: str,
|
||
*,
|
||
actual: Any = None,
|
||
expected: Any = None,
|
||
severity: str | None = None,
|
||
) -> None:
|
||
findings.append(
|
||
{
|
||
"code": code,
|
||
"severity": severity or step.get("criticality") or DEFAULT_CRITICALITY,
|
||
"message": message,
|
||
"actual": actual,
|
||
"expected": expected,
|
||
}
|
||
)
|
||
|
||
|
||
# Message attached to a finding for each known business-review issue code.
BUSINESS_REVIEW_FINDING_MESSAGES = {
    "technical_garbage_in_answer": "User-facing answer leaked internal runtime or MCP identifiers.",
    "business_direct_answer_missing": "The answer did not put the direct business answer first.",
    "answer_layering_noise": "The answer opened with scaffolding or report framing instead of a clean business result.",
    "business_answer_too_verbose": "The answer is too verbose for a direct business question.",
}

# Fixed severity for each known business-review issue code.
BUSINESS_REVIEW_FINDING_SEVERITY = {
    "technical_garbage_in_answer": "critical",
    "business_direct_answer_missing": "critical",
    "answer_layering_noise": "critical",
    "business_answer_too_verbose": "warning",
}
|
||
|
||
|
||
def append_business_review_findings(findings: list[dict[str, Any]], step: dict[str, Any], step_state: dict[str, Any]) -> None:
    """Append one finding per issue code found in ``step_state["business_first_review"]``.

    Does nothing when that entry is not a dict. Known issue codes use the
    message and severity tables; unknown codes get a generic message and the
    step's criticality (or ``DEFAULT_CRITICALITY``).
    """
    review = step_state.get("business_first_review")
    if not isinstance(review, dict):
        return

    fallback_message = "Business-first answer review detected a semantic quality issue."
    for issue_code in dcl.normalize_string_list(review.get("issue_codes")):
        # Fresh dicts per finding so that records never share mutable state.
        observed = {
            "direct_answer": review.get("actual_direct_answer"),
            "answer_length_chars": review.get("answer_length_chars"),
            "technical_garbage_hits": review.get("technical_garbage_hits"),
            "top_line_scaffold_hits": review.get("top_line_scaffold_hits"),
        }
        wanted = {
            "direct_answer_first_ok": True,
            "business_usefulness_ok": True,
            "answer_layering_ok": True,
        }
        issue_severity = BUSINESS_REVIEW_FINDING_SEVERITY.get(
            issue_code, step.get("criticality") or DEFAULT_CRITICALITY
        )
        append_finding(
            findings,
            step,
            f"business_review:{issue_code}",
            BUSINESS_REVIEW_FINDING_MESSAGES.get(issue_code, fallback_message),
            actual=observed,
            expected=wanted,
            severity=issue_severity,
        )
|
||
|
||
|
||
def matches_any_pattern(text: str, patterns: list[str]) -> bool:
    """Return True when at least one non-empty regex in *patterns* is found in *text*.

    Matching is case-insensitive and uses ``re.search``.
    """
    for candidate in patterns:
        if candidate and re.search(candidate, text, flags=re.IGNORECASE):
            return True
    return False
|
||
|
||
|
||
def find_missing_patterns(text: str, patterns: list[str]) -> list[str]:
    """Return the non-empty regexes from *patterns* that are not found in *text*.

    Matching is case-insensitive and uses ``re.search``; input order is kept.
    """
    absent: list[str] = []
    for candidate in patterns:
        if not candidate:
            continue
        if re.search(candidate, text, flags=re.IGNORECASE) is None:
            absent.append(candidate)
    return absent
|
||
|
||
|
||
def normalize_actual_filter_value(filter_key: str, raw_value: Any) -> str:
    """Normalize an extracted filter value so it can be compared as text.

    The date filters ``as_of_date``, ``period_from`` and ``period_to`` go
    through ``dcl.normalize_iso_date``; all other values are stringified and
    trimmed. A falsy value yields ``""``.
    """
    date_filter_keys = ("as_of_date", "period_from", "period_to")
    if filter_key not in date_filter_keys:
        return str(raw_value or "").strip()
    return dcl.normalize_iso_date(raw_value) or ""
|
||
|
||
|
||
def normalize_optional_bool(value: Any) -> bool | None:
|
||
if isinstance(value, bool):
|
||
return value
|
||
raw = str(value or "").strip().lower()
|
||
if raw in {"true", "1", "yes", "y"}:
|
||
return True
|
||
if raw in {"false", "0", "no", "n"}:
|
||
return False
|
||
return None
|
||
|
||
|
||
def evaluate_truth_step(
    *,
    step: dict[str, Any],
    step_state: dict[str, Any],
    step_results: dict[str, Any],
    bindings: dict[str, Any],
    runtime_bindings: dict[str, Any],
) -> dict[str, Any]:
    """Review one executed step against its spec and return the annotated state.

    Expectations from *step* are resolved through ``resolve_nested_placeholders``
    and compared with the values recorded in *step_state*. Each mismatch
    becomes a finding. The checks run in a fixed order, so findings appear in
    a stable order.

    Returns:
        A shallow copy of *step_state* extended with ``review_findings``,
        ``review_status`` (``fail`` > ``warning`` > ``info`` > ``pass``),
        per-severity counters and ``truth_harness_notes``.
    """
    findings: list[dict[str, Any]] = []

    # ---- small local helpers over the shared context ---------------------
    def resolve(raw_value: Any) -> Any:
        return resolve_nested_placeholders(raw_value, step_results, bindings, runtime_bindings)

    def resolved_text(field_name: str) -> str:
        return str(resolve(step.get(field_name)) or "").strip()

    def resolved_patterns(field_name: str) -> list[str]:
        return normalize_pattern_list(resolve(step.get(field_name) or []))

    def state_text(state_key: str) -> str:
        return str(step_state.get(state_key) or "").strip()

    def report(code: str, message: str, **details: Any) -> None:
        append_finding(findings, step, code, message, **details)

    def expect_exact_text(field_name: str, actual_text: str, code: str, message: str) -> None:
        # Only checked when the spec states an expectation for the field.
        expected_text = resolved_text(field_name)
        if expected_text and actual_text != expected_text:
            report(code, message, actual=actual_text or None, expected=expected_text)

    def expect_flag(field_name: str, state_key: str, code: str, message: str) -> None:
        expected_flag = normalize_optional_bool(resolve(step.get(field_name)))
        if expected_flag is None:
            return
        # Anything other than a literal True in the state counts as False.
        actual_flag = step_state.get(state_key) is True
        if actual_flag != expected_flag:
            report(code, message, actual=actual_flag, expected=expected_flag)

    def expect_identifier(
        field_name: str, actual_identifier: str, code: str, message: str, *, skip_blank_actual: bool = False
    ) -> None:
        expected_identifier = resolved_text(field_name)
        if not expected_identifier:
            return
        if skip_blank_actual and not actual_identifier:
            return
        if not dcl.identifiers_match(actual_identifier, expected_identifier):
            report(code, message, actual=actual_identifier or None, expected=expected_identifier)

    def require_any(field_name: str, text: str, code: str, message: str) -> None:
        wanted = resolved_patterns(field_name)
        if wanted and not matches_any_pattern(text, wanted):
            report(code, message, actual=text, expected=wanted)

    def require_all(field_name: str, text: str, code: str, message: str) -> None:
        absent = find_missing_patterns(text, resolved_patterns(field_name))
        if absent:
            report(code, message, actual=text, expected=absent)

    def forbid(field_name: str, text: str, code: str, message: str) -> None:
        hits = [
            pattern
            for pattern in resolved_patterns(field_name)
            if re.search(pattern, text, flags=re.IGNORECASE)
        ]
        if hits:
            report(code, message, actual=hits, expected="absent")

    # ---- snapshot of the actual step state --------------------------------
    reply_type = state_text("reply_type")
    # The full answer is deliberately not stripped; patterns see it verbatim.
    assistant_text = str(step_state.get("assistant_text") or "")
    direct_answer = state_text("actual_direct_answer")
    detected_intent = state_text("detected_intent")
    effective_intents = [detected_intent]
    effective_intents.extend(dcl.normalize_string_list(step_state.get("mcp_discovery_effective_intents")))
    selected_recipe = state_text("selected_recipe")
    capability_id = state_text("capability_id")
    catalog_alignment_status = state_text("mcp_discovery_catalog_chain_alignment_status")
    catalog_chain_top_match = state_text("mcp_discovery_catalog_chain_top_match")
    route_candidate_status = state_text("mcp_discovery_route_candidate_status")
    limited_reason_category = state_text("limited_reason_category")
    raw_filters = step_state.get("extracted_filters")
    extracted_filters = raw_filters if isinstance(raw_filters, dict) else {}

    append_business_review_findings(findings, step, step_state)

    # ---- catalog chain alignment -----------------------------------------
    diverged = catalog_alignment_status in {"selected_lower_rank", "selected_outside_match_set"}
    if diverged and not bool(step.get("allow_catalog_alignment_divergence")):
        report(
            "catalog_alignment_divergence",
            "Planner selected chain diverges from the top reviewed catalog-chain match and needs semantic review.",
            actual={
                "alignment_status": catalog_alignment_status,
                "top_match": step_state.get("mcp_discovery_catalog_chain_top_match"),
                "selected_matches_top": step_state.get("mcp_discovery_catalog_chain_selected_matches_top"),
            },
            expected="selected_matches_top or explicit allow_catalog_alignment_divergence",
            severity="warning",
        )

    expect_exact_text(
        "expected_catalog_alignment_status",
        catalog_alignment_status,
        "wrong_catalog_alignment_status",
        "Catalog-chain alignment status does not match the expected planner/catalog verdict for this step.",
    )
    expect_exact_text(
        "expected_catalog_chain_top_match",
        catalog_chain_top_match,
        "wrong_catalog_chain_top_match",
        "Top reviewed catalog-chain match does not match the expected chain for this step.",
    )
    expect_flag(
        "expected_catalog_selected_matches_top",
        "mcp_discovery_catalog_chain_selected_matches_top",
        "wrong_catalog_selected_matches_top",
        "Selected chain top-match flag does not match the expected planner/catalog verdict for this step.",
    )

    # ---- runtime route candidate -------------------------------------------
    expect_exact_text(
        "expected_route_candidate_status",
        route_candidate_status,
        "wrong_route_candidate_status",
        "Runtime route candidate status does not match the expected autonomy handoff verdict for this step.",
    )
    expect_flag(
        "expected_route_candidate_executable_now",
        "mcp_discovery_route_candidate_executable_now",
        "wrong_route_candidate_executable_now",
        "Runtime route candidate executability does not match the expected autonomy handoff verdict.",
    )

    expected_route_candidate_missing_axes = dcl.normalize_string_list(
        resolve(step.get("expected_route_candidate_missing_axes"))
    )
    if expected_route_candidate_missing_axes:
        actual_missing_axes = dcl.normalize_string_list(step_state.get("mcp_discovery_route_candidate_missing_axes"))
        # Only the expected axes must be present; extra actual axes are tolerated.
        missing_expected_axes = [
            axis for axis in expected_route_candidate_missing_axes if axis not in actual_missing_axes
        ]
        if missing_expected_axes:
            report(
                "missing_route_candidate_axes",
                "Runtime route candidate does not expose expected missing axes for the handoff.",
                actual=actual_missing_axes,
                expected=expected_route_candidate_missing_axes,
            )

    # ---- question, reply type, limitation category -------------------------
    asked_question = step_state.get("question_resolved")
    if asked_question != step["question_template"]:
        report(
            "question_sequence_mismatch",
            "В live/export прогоне вопрос отличается от зафиксированного exact-сценария.",
            actual=asked_question,
            expected=step["question_template"],
        )

    if step["allowed_reply_types"] and reply_type not in step["allowed_reply_types"]:
        report(
            "unexpected_reply_type",
            "Тип ответа не соответствует ожидаемому режиму шага.",
            actual=reply_type,
            expected=step["allowed_reply_types"],
        )

    allowed_limited_reason_categories = step.get("allowed_limited_reason_categories") or []
    if (
        allowed_limited_reason_categories
        and limited_reason_category
        and limited_reason_category not in allowed_limited_reason_categories
    ):
        report(
            "unexpected_limited_reason_category",
            "Категория честного ограничения не соответствует допустимым сценарным условиям.",
            actual=limited_reason_category,
            expected=allowed_limited_reason_categories,
        )

    # ---- intent, capability, recipe, result mode ---------------------------
    expected_intents = dcl.normalize_string_list(resolve(step.get("expected_intents") or []))
    if expected_intents:
        intent_matched = any(
            dcl.identifier_in_list(intent, expected_intents) for intent in effective_intents if intent
        )
        if not intent_matched:
            report(
                "wrong_intent",
                "Интент не соответствует ожидаемому бизнес-смыслу шага.",
                actual=effective_intents,
                expected=expected_intents,
            )

    expect_identifier(
        "expected_capability",
        capability_id,
        "wrong_capability",
        "Выбрана не та capability, которую должен был отработать этот шаг.",
    )
    expect_identifier(
        "expected_recipe",
        selected_recipe,
        "wrong_recipe",
        "Выбран не тот рецепт адресного контура.",
    )
    # The result mode is compared only when the state actually reports one.
    expect_identifier(
        "expected_result_mode",
        state_text("result_mode"),
        "wrong_result_mode",
        "Режим результата не соответствует ожидаемой форме ответа.",
        skip_blank_actual=True,
    )

    # ---- filters -----------------------------------------------------------
    required_filters = dcl.normalize_validation_filters(resolve(step.get("required_filters") or {}))
    for filter_key, expected_value in required_filters.items():
        actual_value = normalize_actual_filter_value(filter_key, extracted_filters.get(filter_key))
        if not actual_value:
            report(
                f"missing_filter:{filter_key}",
                f"В техчате отсутствует обязательный фильтр `{filter_key}`.",
                actual=None,
                expected=expected_value,
            )
        elif actual_value != expected_value:
            report(
                f"wrong_filter:{filter_key}",
                f"Фильтр `{filter_key}` не совпадает с ожидаемым carryover/period состоянием.",
                actual=actual_value,
                expected=expected_value,
            )

    forbidden_filter_keys = dcl.normalize_string_list(resolve(step.get("forbidden_filter_keys") or []))
    for filter_key in forbidden_filter_keys:
        actual_value = normalize_actual_filter_value(filter_key, extracted_filters.get(filter_key))
        if actual_value:
            report(
                f"forbidden_filter_key:{filter_key}",
                f"Шаг не должен был тащить фильтр `{filter_key}` в техчат.",
                actual=actual_value,
                expected="absent",
            )

    forbidden_filter_values = normalize_filter_values_mapping(resolve(step.get("forbidden_filter_values") or {}))
    for filter_key, forbidden_values in forbidden_filter_values.items():
        actual_value = normalize_actual_filter_value(filter_key, extracted_filters.get(filter_key))
        if actual_value and actual_value in forbidden_values:
            report(
                f"forbidden_filter_value:{filter_key}",
                f"Шаг унаследовал запрещенное значение фильтра `{filter_key}`.",
                actual=actual_value,
                expected={"not_in": forbidden_values},
            )

    # ---- date filters that must fall inside an earlier step's period -------
    within_previous_period = step.get("required_filter_within_previous_step_period") or {}
    for filter_key, step_ref in within_previous_period.items():
        resolved_step_ref = str(resolve(step_ref) or "").strip()
        previous_step = step_results.get(resolved_step_ref)
        actual_value = dcl.normalize_iso_date(extracted_filters.get(filter_key))
        if not isinstance(previous_step, dict):
            report(
                f"previous_step_missing:{filter_key}",
                f"Не удалось разрешить предыдущий шаг `{resolved_step_ref}` для проверки диапазона фильтра `{filter_key}`.",
                actual=actual_value,
                expected=resolved_step_ref,
            )
            continue

        candidate_filters = previous_step.get("filters")
        previous_filters = candidate_filters if isinstance(candidate_filters, dict) else {}
        previous_from = dcl.normalize_iso_date(previous_filters.get("period_from"))
        previous_to = dcl.normalize_iso_date(previous_filters.get("period_to"))
        if not (actual_value and previous_from and previous_to):
            report(
                f"period_carryover_missing:{filter_key}",
                f"Не удалось подтвердить, что `{filter_key}` попал в диапазон предыдущего шага `{resolved_step_ref}`.",
                actual=actual_value,
                expected={"period_from": previous_from, "period_to": previous_to},
            )
            continue

        if not (previous_from <= actual_value <= previous_to):
            report(
                f"period_carryover_outside:{filter_key}",
                f"Фильтр `{filter_key}` вышел за диапазон предыдущего релевантного периода.",
                actual=actual_value,
                expected={"period_from": previous_from, "period_to": previous_to},
            )

    # ---- patterns over the full answer -------------------------------------
    require_any(
        "required_answer_patterns_any",
        assistant_text,
        "required_answer_patterns_any_missing",
        "Полный ответ модели не содержит ни одного из обязательных смысловых паттернов.",
    )
    require_all(
        "required_answer_patterns_all",
        assistant_text,
        "required_answer_patterns_all_missing",
        "Полный ответ модели не закрыл обязательные паттерны шага.",
    )
    forbid(
        "forbidden_answer_patterns",
        assistant_text,
        "forbidden_answer_pattern_hit",
        "В полном ответе модели найден запрещенный паттерн.",
    )

    # ---- patterns over the direct answer -----------------------------------
    require_any(
        "required_direct_answer_patterns_any",
        direct_answer,
        "required_direct_answer_patterns_any_missing",
        "Первая строка ответа не содержит ни одного из обязательных direct-answer паттернов.",
    )
    require_all(
        "required_direct_answer_patterns_all",
        direct_answer,
        "required_direct_answer_patterns_all_missing",
        "Первая строка ответа не закрыла обязательные direct-answer паттерны.",
    )
    forbid(
        "forbidden_direct_answer_patterns",
        direct_answer,
        "forbidden_direct_answer_pattern_hit",
        "Первая строка ответа попала в запрещенный direct-answer паттерн.",
    )

    # ---- verdict -----------------------------------------------------------
    def with_severity(level: str) -> list[dict[str, Any]]:
        return [item for item in findings if item.get("severity") == level]

    critical_findings = with_severity("critical")
    warning_findings = with_severity("warning")
    info_findings = with_severity("info")

    if critical_findings:
        review_status = "fail"
    elif warning_findings:
        review_status = "warning"
    elif info_findings:
        review_status = "info"
    else:
        review_status = "pass"

    reviewed_state = dict(step_state)
    reviewed_state["review_findings"] = findings
    reviewed_state["review_status"] = review_status
    reviewed_state["critical_findings_count"] = len(critical_findings)
    reviewed_state["warning_findings_count"] = len(warning_findings)
    reviewed_state["info_findings_count"] = len(info_findings)
    reviewed_state["truth_harness_notes"] = step.get("notes")
    return reviewed_state
|
||
|
||
|
||
def build_truth_review_summary(spec: dict[str, Any], scenario_state: dict[str, Any], review_source: str) -> dict[str, Any]:
    """Aggregate per-step review statuses into a scenario-level summary.

    Args:
        spec: Truth harness spec; ``scenario_id``, ``domain``, ``title`` and
            ``steps`` (each with a ``step_id``) are read from it.
        scenario_state: Scenario state; ``step_outputs`` and ``session_id``
            are read from it. A non-dict ``step_outputs`` is treated as empty.
        review_source: Label copied verbatim into the summary.

    Returns:
        A dict with step counters and ``overall_status``, which is ``"fail"``
        when any step failed, otherwise ``"warning"`` when any step has a
        warning, otherwise ``"pass"``.

    Only the ``pass``, ``warning`` and ``fail`` statuses are counted; a step
    with any other status (for example ``info``) or without a recorded output
    contributes to ``steps_total`` only.
    """
    raw_outputs = scenario_state.get("step_outputs")
    step_outputs = raw_outputs if isinstance(raw_outputs, dict) else {}

    status_counts: dict[str, int] = {}
    for step in spec["steps"]:
        step_state = step_outputs.get(step["step_id"])
        # A missing, null or otherwise malformed step output must not crash the
        # summary; it simply has no status to count.
        status = str(step_state.get("review_status") or "") if isinstance(step_state, dict) else ""
        status_counts[status] = status_counts.get(status, 0) + 1

    passed = status_counts.get("pass", 0)
    warnings = status_counts.get("warning", 0)
    failed = status_counts.get("fail", 0)
    return {
        "schema_version": TRUTH_HARNESS_REVIEW_SCHEMA_VERSION,
        "review_source": review_source,
        "scenario_id": spec["scenario_id"],
        "domain": spec["domain"],
        "title": spec["title"],
        "session_id": scenario_state.get("session_id"),
        "steps_total": len(spec["steps"]),
        "steps_passed": passed,
        "steps_with_warning": warnings,
        "steps_failed": failed,
        "overall_status": "fail" if failed else ("warning" if warnings else "pass"),
    }
|
||
|
||
|
||
def build_business_review_summary(spec: dict[str, Any], scenario_state: dict[str, Any]) -> dict[str, Any]:
    """Collect the business-first review of every step into one summary dict.

    For each spec step the ``business_first_review`` dict stored in the step
    output (if any) is flattened into a row, issue codes are tallied, and the
    number of steps with blocking issues / verbosity warnings is counted.
    """
    raw_outputs = scenario_state.get("step_outputs")
    outputs_by_step = raw_outputs if isinstance(raw_outputs, dict) else {}

    # Issue codes that make a step count as a business failure.
    blocking_codes = {"technical_garbage_in_answer", "business_direct_answer_missing", "answer_layering_noise"}
    # Review fields copied into the row under the same key.
    passthrough_fields = (
        "answer_length_chars",
        "direct_answer_required",
        "direct_answer_first_ok",
        "business_usefulness_ok",
        "answer_layering_ok",
        "technical_garbage_present",
        "too_verbose_for_direct_question",
    )

    rows: list[dict[str, Any]] = []
    issue_counts: dict[str, int] = {}
    for position, step in enumerate(spec["steps"], start=1):
        state = outputs_by_step.get(step["step_id"], {})
        state_is_dict = isinstance(state, dict)
        review = state.get("business_first_review") if state_is_dict else None
        if not isinstance(review, dict):
            review = {}

        codes = dcl.normalize_string_list(review.get("issue_codes"))
        for code in codes:
            issue_counts[code] = 1 + issue_counts.get(code, 0)

        row: dict[str, Any] = {
            "index": position,
            "step_id": step["step_id"],
            "question": step["question_template"],
            "review_status": state.get("review_status") if state_is_dict else None,
            "direct_answer": review.get("actual_direct_answer"),
        }
        row.update((field, review.get(field)) for field in passthrough_fields)
        row["issue_codes"] = codes
        row["suggested_root_cause_layers"] = review.get("suggested_root_cause_layers") or []
        rows.append(row)

    failure_total = 0
    for row in rows:
        if any(code in blocking_codes for code in row["issue_codes"]):
            failure_total += 1
    warning_total = 0
    for row in rows:
        if "business_answer_too_verbose" in row["issue_codes"]:
            warning_total += 1

    if failure_total:
        overall = "fail"
    elif warning_total:
        overall = "warning"
    else:
        overall = "pass"

    return {
        "schema_version": "business_first_run_review_v1",
        "scenario_id": spec["scenario_id"],
        "domain": spec["domain"],
        "title": spec["title"],
        "session_id": scenario_state.get("session_id"),
        "steps_total": len(rows),
        "steps_with_business_failures": failure_total,
        "steps_with_business_warnings": warning_total,
        "issue_counts": issue_counts,
        "overall_business_status": overall,
        "steps": rows,
    }
|
||
|
||
|
||
def build_business_review_markdown(business_review: dict[str, Any]) -> str:
    """Render a business-first review summary dict as a Markdown document.

    Entries of ``steps`` that are not dicts are skipped. The result is
    stripped and terminated with exactly one newline.
    """
    # Step fields printed verbatim inside backticks, without an "n/a" fallback.
    raw_step_fields = (
        "answer_length_chars",
        "direct_answer_required",
        "direct_answer_first_ok",
        "business_usefulness_ok",
        "answer_layering_ok",
        "technical_garbage_present",
        "too_verbose_for_direct_question",
    )

    out: list[str] = ["# Business-first review", ""]
    out.append(f"- scenario_id: `{business_review.get('scenario_id') or 'n/a'}`")
    out.append(f"- domain: `{business_review.get('domain') or 'n/a'}`")
    out.append(f"- title: {business_review.get('title') or 'n/a'}")
    out.append(f"- session_id: `{business_review.get('session_id') or 'n/a'}`")
    out.append(f"- overall_business_status: `{business_review.get('overall_business_status') or 'n/a'}`")
    for counter in ("steps_total", "steps_with_business_failures", "steps_with_business_warnings"):
        out.append(f"- {counter}: `{business_review.get(counter)}`")
    out.append(f"- issue_counts: `{dump_json(business_review.get('issue_counts') or {})}`")
    out += ["", "## Human Answer Surface"]

    for entry in business_review.get("steps") or []:
        if isinstance(entry, dict):
            out.append(f"{entry.get('index')}. `{entry.get('step_id')}` - {entry.get('question')}")
            out.append(f"review_status: `{entry.get('review_status') or 'n/a'}`")
            out.append(f"direct_answer: {entry.get('direct_answer') or 'n/a'}")
            for field in raw_step_fields:
                out.append(f"{field}: `{entry.get(field)}`")
            issue_text = ", ".join(entry.get("issue_codes") or []) or "none"
            out.append(f"issue_codes: `{issue_text}`")
            layer_text = ", ".join(entry.get("suggested_root_cause_layers") or []) or "none"
            out.append(f"suggested_root_cause_layers: `{layer_text}`")
            out.append("")

    document = "\n".join(out)
    return document.strip() + "\n"
|
||
|
||
|
||
def build_truth_review_markdown(spec: dict[str, Any], scenario_state: dict[str, Any], review_summary: dict[str, Any]) -> str:
    """Render the truth-harness review as a Markdown report.

    Args:
        spec: Truth harness spec; ``scenario_id``, ``domain``, ``title`` and
            ``steps`` are read from it.
        scenario_state: Scenario state; ``session_id`` and ``step_outputs``
            are read from it.
        review_summary: Summary dict whose counters and ``overall_status``
            are printed in the header.

    Returns:
        The Markdown text, stripped and terminated with a single newline.

    A step whose output is missing or is not a dict is rendered with the
    fallback values instead of raising.
    """
    lines = [
        "# Truth harness review",
        "",
        f"- scenario_id: `{spec['scenario_id']}`",
        f"- domain: `{spec['domain']}`",
        f"- title: {spec['title']}",
        f"- review_source: `{review_summary.get('review_source') or 'n/a'}`",
        f"- session_id: `{scenario_state.get('session_id') or 'n/a'}`",
        f"- overall_status: `{review_summary.get('overall_status') or 'n/a'}`",
        f"- steps_total: `{review_summary.get('steps_total')}`",
        f"- steps_passed: `{review_summary.get('steps_passed')}`",
        f"- steps_with_warning: `{review_summary.get('steps_with_warning')}`",
        f"- steps_failed: `{review_summary.get('steps_failed')}`",
        "",
        "## Steps",
    ]
    raw_outputs = scenario_state.get("step_outputs")
    step_outputs = raw_outputs if isinstance(raw_outputs, dict) else {}
    for index, step in enumerate(spec["steps"], start=1):
        # Normalise once, before any .get call, so every lookup below is safe
        # even when the stored step output is null or malformed.
        raw_state = step_outputs.get(step["step_id"])
        step_state = raw_state if isinstance(raw_state, dict) else {}
        raw_findings = step_state.get("review_findings")
        findings = raw_findings if isinstance(raw_findings, list) else []
        raw_business = step_state.get("business_first_review")
        business_review = raw_business if isinstance(raw_business, dict) else {}
        lines.extend(
            [
                f"{index}. `{step['step_id']}` - {step['question_template']}",
                f"review_status: `{step_state.get('review_status') or 'n/a'}`",
                f"reply_type: `{step_state.get('reply_type') or 'n/a'}`",
                f"intent: `{step_state.get('detected_intent') or 'n/a'}`",
                f"recipe: `{step_state.get('selected_recipe') or 'n/a'}`",
                f"capability: `{step_state.get('capability_id') or 'n/a'}`",
                f"catalog_alignment_status: `{step_state.get('mcp_discovery_catalog_chain_alignment_status') or 'n/a'}`",
                f"catalog_top_match: `{step_state.get('mcp_discovery_catalog_chain_top_match') or 'n/a'}`",
                f"catalog_selected_matches_top: `{step_state.get('mcp_discovery_catalog_chain_selected_matches_top')}`",
                f"route_candidate_status: `{step_state.get('mcp_discovery_route_candidate_status') or 'n/a'}`",
                f"route_candidate_missing_axes: `{', '.join(step_state.get('mcp_discovery_route_candidate_missing_axes') or []) or 'n/a'}`",
                f"route_candidate_executable_now: `{step_state.get('mcp_discovery_route_candidate_executable_now')}`",
                f"limited_reason_category: `{step_state.get('limited_reason_category') or 'n/a'}`",
                f"filters: `{dump_json(step_state.get('extracted_filters') or {})}`",
                f"direct_answer: {step_state.get('actual_direct_answer') or 'n/a'}",
                f"business_first: status=`{business_review.get('business_usefulness_ok')}`, "
                f"direct_first=`{business_review.get('direct_answer_first_ok')}`, "
                f"layering=`{business_review.get('answer_layering_ok')}`, "
                f"length=`{business_review.get('answer_length_chars')}`, "
                f"issues=`{', '.join(business_review.get('issue_codes') or []) or 'none'}`",
            ]
        )
        if step.get("notes"):
            lines.append(f"notes: {step['notes']}")
        if findings:
            lines.append("findings:")
            for finding in findings:
                lines.append(
                    f"- [{finding.get('severity')}] {finding.get('code')}: {finding.get('message')} "
                    f"(actual={dump_json(finding.get('actual'))}, expected={dump_json(finding.get('expected'))})"
                )
        else:
            lines.append("findings: none")
        # Blank line separates consecutive steps.
        lines.append("")
    return "\n".join(lines).strip() + "\n"
|
||
|
||
|
||
def write_acceptance_artifacts(
    output_dir: Path,
    spec: dict[str, Any],
    scenario_state: dict[str, Any],
    review_summary: dict[str, Any],
) -> dict[str, Any]:
    """Build the acceptance matrix and pack state and write them to ``output_dir``.

    Both objects are computed through ``sap`` before anything is written.
    Four files are produced: the matrix as JSON and Markdown, the pack state
    as JSON, and the final status as Markdown. Returns both objects under the
    keys ``acceptance_matrix`` and ``pack_state``.
    """
    matrix = sap.build_scenario_acceptance_matrix(spec, scenario_state, review_summary)
    state = sap.derive_truth_harness_pack_state(spec, scenario_state, review_summary, matrix)
    bundle = {"acceptance_matrix": matrix, "pack_state": state}

    matrix_markdown = sap.build_scenario_acceptance_matrix_markdown
    final_status_markdown = sap.build_truth_harness_final_status_markdown

    write_json(output_dir / "scenario_acceptance_matrix.json", matrix)
    write_text(output_dir / "scenario_acceptance_matrix.md", matrix_markdown(matrix))
    write_json(output_dir / "pack_state.json", state)
    write_text(output_dir / "final_status.md", final_status_markdown(state))
    return bundle
|
||
|
||
|
||
def save_step_bundle(
    *,
    step_dir: Path,
    export_markdown: str,
    turn_artifact: dict[str, Any],
    session_record: dict[str, Any] | None,
    response_payload: dict[str, Any] | None,
    step_state: dict[str, Any],
) -> None:
    """Write all per-step artifacts into ``step_dir``.

    ``session.json`` is written only when ``session_record`` is not ``None``;
    a missing ``response_payload`` or debug payload is stored as ``{}``.
    """
    write_text(step_dir / "output.md", export_markdown)

    json_artifacts: list[tuple[str, Any]] = [
        ("debug.json", step_state.get("technical_debug_payload") or {}),
        ("turn.json", turn_artifact),
        ("assistant_response.json", response_payload or {}),
        ("step_state.json", step_state),
    ]
    if session_record is not None:
        json_artifacts.append(("session.json", session_record))
    for file_name, content in json_artifacts:
        write_json(step_dir / file_name, content)

    resolved_question = step_state.get("question_resolved") or ""
    write_text(step_dir / "resolved_question.txt", f"{resolved_question}\n")
|
||
|
||
|
||
def build_step_state_from_turn(
    *,
    spec: dict[str, Any],
    step: dict[str, Any],
    step_index: int,
    question: str,
    conversation: list[dict[str, Any]],
    session_record: dict[str, Any] | None,
    export_markdown: str,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Build the unreviewed step state and the turn artifact for one step.

    Only the last two conversation items are handed to ``dcl`` (the whole
    conversation when it has fewer than two items). Returns
    ``(step_state, turn_artifact)``.
    """
    if len(conversation) < 2:
        recent_turns = conversation
    else:
        recent_turns = conversation[-2:]

    scenario_id = spec["scenario_id"]
    domain = spec["domain"]

    artifact = dcl.build_turn_artifact(
        slot="step",
        domain=domain,
        case_id=scenario_id,
        question=question,
        session_id=str(spec.get("session_id") or "n/a"),
        conversation=recent_turns,
        session_record=session_record,
        job_record=None,
        report_case=None,
        export_file_name="output.md",
    )
    # Overwrite the schema version and attach scenario metadata to the artifact.
    artifact["schema_version"] = "domain_truth_harness_turn_artifact_v1"
    artifact["scenario"] = {
        "scenario_id": spec["scenario_id"],
        "step_id": step["step_id"],
        "step_index": step_index,
        "question_template": step["question_template"],
        "question_resolved": question,
        "review_mode": "truth_harness_strict_replay",
    }

    assistant_message = dcl.find_last_assistant(recent_turns)
    structured_entries = dcl.extract_structured_entries(str(assistant_message.get("text") or ""))
    state = dcl.build_scenario_step_state(
        scenario_id=scenario_id,
        domain=domain,
        step=step,
        step_index=step_index,
        question_resolved=question,
        analysis_context={},
        turn_artifact=artifact,
        entries=structured_entries,
    )
    state["export_markdown"] = export_markdown
    debug_payload = assistant_message.get("debug")
    state["technical_debug_payload"] = debug_payload if isinstance(debug_payload, dict) else {}
    return state, artifact
|
||
|
||
|
||
def build_conversation_pairs(conversation: list[dict[str, Any]]) -> list[tuple[dict[str, Any], dict[str, Any] | None]]:
|
||
pairs: list[tuple[dict[str, Any], dict[str, Any] | None]] = []
|
||
for index, item in enumerate(conversation):
|
||
if item.get("role") != "user":
|
||
continue
|
||
assistant_item: dict[str, Any] | None = None
|
||
if index + 1 < len(conversation) and conversation[index + 1].get("role") == "assistant":
|
||
assistant_item = conversation[index + 1]
|
||
pairs.append((item, assistant_item))
|
||
return pairs
|
||
|
||
|
||
def review_export(spec: dict[str, Any], export_path: Path, output_dir: Path) -> dict[str, Any]:
    """Review an existing technical export against a truth harness spec.

    The export is parsed into user/assistant pairs, each pair is evaluated
    against the spec step at the same position, and the per-step and
    scenario-level artifacts are written under ``output_dir``.

    Raises:
        RuntimeError: When the number of user turns differs from the number
            of spec steps, or a user turn has no assistant reply.

    Returns:
        A dict with ``scenario_state``, ``review_summary``,
        ``acceptance_matrix`` and ``pack_state``.
    """

    def utc_stamp() -> str:
        # Second-precision UTC timestamp in ISO format.
        return datetime.now(timezone.utc).replace(microsecond=0).isoformat()

    export_text = export_path.read_text(encoding="utf-8-sig")
    session_id, conversation = dcl.parse_export_markdown(export_text)
    turn_pairs = build_conversation_pairs(conversation)
    if len(turn_pairs) != len(spec["steps"]):
        raise RuntimeError(
            f"Spec steps ({len(spec['steps'])}) and export user turns ({len(turn_pairs)}) do not match for {export_path}"
        )

    output_dir.mkdir(parents=True, exist_ok=True)
    write_json(output_dir / "truth_harness_spec.json", spec)
    write_text(output_dir / "source_export.md", export_text)

    placeholder_results: dict[str, Any] = {}
    runtime_bindings = build_runtime_bindings()
    reviewed_outputs: dict[str, Any] = {}
    scenario_state: dict[str, Any] = {
        "schema_version": TRUTH_HARNESS_STATE_SCHEMA_VERSION,
        "scenario_id": spec["scenario_id"],
        "domain": spec["domain"],
        "title": spec["title"],
        "session_id": session_id,
        "review_source": str(export_path),
        "step_outputs": reviewed_outputs,
        "updated_at": utc_stamp(),
    }
    steps_dir = output_dir / "steps"
    steps_dir.mkdir(parents=True, exist_ok=True)

    numbered = enumerate(zip(spec["steps"], turn_pairs, strict=True), start=1)
    for position, (step, (user_item, assistant_item)) in numbered:
        if assistant_item is None:
            raise RuntimeError(f"Export pair for step `{step['step_id']}` is missing assistant message")
        single_turn = [user_item, assistant_item]
        turn_markdown = dcl.build_conversation_export(session_id, single_turn, mode="technical")
        raw_state, turn_artifact = build_step_state_from_turn(
            spec={**spec, "session_id": session_id},
            step=step,
            step_index=position,
            question=str(user_item.get("text") or ""),
            conversation=single_turn,
            session_record=None,
            export_markdown=turn_markdown,
        )
        reviewed = evaluate_truth_step(
            step=step,
            step_state=raw_state,
            step_results=placeholder_results,
            bindings=spec.get("bindings") or {},
            runtime_bindings=runtime_bindings,
        )
        reviewed_outputs[step["step_id"]] = reviewed
        # Later steps may reference this step's result through placeholders.
        placeholder_results[step["step_id"]] = build_placeholder_step_result(reviewed)
        save_step_bundle(
            step_dir=steps_dir / step["step_id"],
            export_markdown=turn_markdown,
            turn_artifact=turn_artifact,
            session_record=None,
            response_payload=None,
            step_state=reviewed,
        )

    scenario_state["updated_at"] = utc_stamp()
    review_summary = build_truth_review_summary(spec, scenario_state, f"export:{export_path}")
    review_markdown = build_truth_review_markdown(spec, scenario_state, review_summary)
    business_review = build_business_review_summary(spec, scenario_state)

    write_json(output_dir / "scenario_state.json", scenario_state)
    write_json(output_dir / "truth_review.json", {"summary": review_summary, "steps": scenario_state["step_outputs"]})
    write_text(output_dir / "truth_review.md", review_markdown)
    write_json(output_dir / "business_review.json", business_review)
    write_text(output_dir / "business_review.md", build_business_review_markdown(business_review))

    acceptance = write_acceptance_artifacts(output_dir, spec, scenario_state, review_summary)
    return {
        "scenario_state": scenario_state,
        "review_summary": review_summary,
        "acceptance_matrix": acceptance["acceptance_matrix"],
        "pack_state": acceptance["pack_state"],
    }
|
||
|
||
|
||
def run_live(spec: dict[str, Any], output_dir: Path, args: argparse.Namespace) -> dict[str, Any]:
    """Replay the scenario against the live backend, one step after another.

    Each step's ``question_template`` is posted as-is to
    ``/api/assistant/message``; the session id returned by the backend is
    reused for the following steps. After every step the reviewed state and
    the step bundle are written to disk, and the scenario-level artifacts are
    written once all steps have run.

    Raises:
        RuntimeError: When no session id is known after a step's response.

    Returns:
        A dict with ``scenario_state``, ``review_summary``,
        ``acceptance_matrix`` and ``pack_state``.
    """

    def utc_stamp() -> str:
        # Second-precision UTC timestamp in ISO format.
        return datetime.now(timezone.utc).replace(microsecond=0).isoformat()

    runner_args = build_runner_args(args)
    dcl.ensure_backend_health(runner_args.backend_url, runner_args.timeout_seconds)

    output_dir.mkdir(parents=True, exist_ok=True)
    manifest = build_generated_manifest(spec)
    write_json(output_dir / "truth_harness_spec.json", spec)
    write_json(output_dir / "scenario_manifest.json", manifest)

    steps_dir = output_dir / "steps"
    steps_dir.mkdir(parents=True, exist_ok=True)

    runtime_bindings = build_runtime_bindings()
    placeholder_results: dict[str, Any] = {}
    scenario_state: dict[str, Any] = {
        "schema_version": TRUTH_HARNESS_STATE_SCHEMA_VERSION,
        "scenario_id": spec["scenario_id"],
        "domain": spec["domain"],
        "title": spec["title"],
        "session_id": None,
        "review_source": "live_strict_replay",
        "step_outputs": {},
        "updated_at": utc_stamp(),
    }
    state_path = output_dir / "scenario_state.json"
    write_json(state_path, scenario_state)

    active_session: str | None = None
    for position, step in enumerate(spec["steps"], start=1):
        question = step["question_template"]
        request_body = build_plain_assistant_message_payload(runner_args, question=question, session_id=active_session)
        response_payload = dcl.http_json(
            f"{runner_args.backend_url}/api/assistant/message",
            method="POST",
            payload=request_body,
            timeout=max(30, int(runner_args.timeout_seconds)),
        )
        # Prefer the session id from the response, fall back to the known one.
        active_session = str(response_payload.get("session_id") or active_session or "").strip() or None
        if not active_session:
            raise RuntimeError(f"Assistant response for step `{step['step_id']}` does not contain session_id")

        session_record = dcl.fetch_session_snapshot(
            runner_args.backend_url, active_session, runner_args.timeout_seconds
        )
        conversation = dcl.extract_conversation_from_session(session_record)
        session_markdown = dcl.build_conversation_export(active_session, conversation, mode="technical")
        raw_state, turn_artifact = build_step_state_from_turn(
            spec={**spec, "session_id": active_session},
            step=step,
            step_index=position,
            question=question,
            conversation=conversation,
            session_record=session_record,
            export_markdown=session_markdown,
        )
        reviewed = evaluate_truth_step(
            step=step,
            step_state=raw_state,
            step_results=placeholder_results,
            bindings=spec.get("bindings") or {},
            runtime_bindings=runtime_bindings,
        )

        scenario_state["session_id"] = active_session
        scenario_state["step_outputs"][step["step_id"]] = reviewed
        placeholder_results[step["step_id"]] = build_placeholder_step_result(reviewed)
        scenario_state["updated_at"] = utc_stamp()
        save_step_bundle(
            step_dir=steps_dir / step["step_id"],
            export_markdown=session_markdown,
            turn_artifact=turn_artifact,
            session_record=session_record,
            response_payload=response_payload,
            step_state=reviewed,
        )
        # Persist progress after every step.
        write_json(state_path, scenario_state)
        progress = f"[truth-harness] step {position}/{len(spec['steps'])}: {step['step_id']} -> "
        outcome = f"{reviewed.get('review_status')} ({reviewed.get('reply_type') or 'n/a'})"
        print(progress + outcome)

    review_summary = build_truth_review_summary(spec, scenario_state, "live_strict_replay")
    review_markdown = build_truth_review_markdown(spec, scenario_state, review_summary)
    business_review = build_business_review_summary(spec, scenario_state)

    write_text(output_dir / "session_id.txt", f"{scenario_state.get('session_id') or ''}\n")
    write_json(state_path, scenario_state)
    write_json(output_dir / "truth_review.json", {"summary": review_summary, "steps": scenario_state["step_outputs"]})
    write_text(output_dir / "truth_review.md", review_markdown)
    write_json(output_dir / "business_review.json", business_review)
    write_text(output_dir / "business_review.md", build_business_review_markdown(business_review))

    acceptance = write_acceptance_artifacts(output_dir, spec, scenario_state, review_summary)
    print(f"[truth-harness] saved artifacts to {output_dir}")
    print(f"[truth-harness] overall_status={review_summary['overall_status']}")
    return {
        "scenario_state": scenario_state,
        "review_summary": review_summary,
        "acceptance_matrix": acceptance["acceptance_matrix"],
        "pack_state": acceptance["pack_state"],
    }
|
||
|
||
|
||
def build_bootstrap_spec(export_path: Path, scenario_id: str, domain: str, title: str | None) -> dict[str, Any]:
    """Create a truth harness spec with one step per user turn of an export.

    Each step carries the stripped text of the user message as ``question``
    and the default criticality. ``title`` falls back to ``scenario_id``.
    """
    raw_export = export_path.read_text(encoding="utf-8-sig")
    _session_id, conversation = dcl.parse_export_markdown(raw_export)
    bootstrapped_steps = [
        {
            "step_id": f"step_{number:02d}",
            "title": f"Step {number:02d}",
            "question": str(user_message.get("text") or "").strip(),
            "criticality": DEFAULT_CRITICALITY,
        }
        for number, (user_message, _reply) in enumerate(build_conversation_pairs(conversation), start=1)
    ]
    spec: dict[str, Any] = {"schema_version": TRUTH_HARNESS_SPEC_SCHEMA_VERSION}
    spec["scenario_id"] = scenario_id
    spec["domain"] = domain
    spec["title"] = title or scenario_id
    spec["description"] = f"Exact sequential replay spec bootstrapped from {export_path}"
    spec["source_export"] = str(export_path)
    spec["bindings"] = {}
    spec["steps"] = bootstrapped_steps
    return spec
|
||
|
||
|
||
def default_output_dir(base_name: str) -> Path:
    """Return ``DEFAULT_OUTPUT_ROOT/<base_name>_<UTC timestamp>``."""
    timestamp = format(datetime.now(timezone.utc), "%Y%m%d_%H%M%S")
    return DEFAULT_OUTPUT_ROOT.joinpath(f"{base_name}_{timestamp}")
|
||
|
||
|
||
def handle_bootstrap(args: argparse.Namespace) -> int:
    """CLI handler for ``bootstrap``: write a spec derived from an export and return 0."""
    source = Path(args.export).resolve()
    destination = Path(args.output).resolve()
    bootstrapped = build_bootstrap_spec(
        export_path=source,
        scenario_id=args.scenario_id,
        domain=args.domain,
        title=args.title,
    )
    write_json(destination, bootstrapped)
    print(f"[truth-harness] wrote bootstrap spec to {destination}")
    return 0
|
||
|
||
|
||
def handle_review_export(args: argparse.Namespace) -> int:
    """CLI handler for ``review-export``.

    The export path comes from ``--export`` or, when that is not given, from
    the spec's ``source_export``. The output directory comes from
    ``--output-dir`` or defaults to a timestamped ``<scenario_id>_review``
    directory.

    Raises:
        RuntimeError: When no export path is configured, or the configured
            path is not an existing file.

    Returns:
        0 after the review artifacts have been written.
    """
    spec_path = Path(args.spec).resolve()
    spec = load_truth_harness_spec(spec_path)

    export_source = args.export or spec.get("source_export")
    if not export_source:
        # Path("").resolve() is the current working directory, which exists,
        # so an unset export must be rejected before it is turned into a path.
        raise RuntimeError(
            "Export file for review-export is not specified: pass --export or set source_export in the spec"
        )
    export_path = Path(export_source).resolve()
    # is_file() also rejects directories, which cannot be read as an export.
    if not export_path.is_file():
        raise RuntimeError("Export file for review-export does not exist")

    output_dir = Path(args.output_dir).resolve() if args.output_dir else default_output_dir(
        f"{spec['scenario_id']}_review"
    )
    result = review_export(spec, export_path, output_dir)
    print(f"[truth-harness] review-export overall_status={result['review_summary']['overall_status']}")
    print(f"[truth-harness] review-export final_status={result['pack_state']['final_status']}")
    print(f"[truth-harness] artifacts={output_dir}")
    return 0
|
||
|
||
|
||
def handle_run_live(args: argparse.Namespace) -> int:
    """CLI handler for ``run-live``.

    When ``--require-mcp-live-readiness`` is set, a readiness check runs
    first and its result is saved to ``mcp_live_readiness.json``; if the
    verdict is not ready the replay is skipped and 2 is returned. Otherwise
    the live replay runs and 0 is returned.
    """
    spec = load_truth_harness_spec(Path(args.spec).resolve())
    if args.output_dir:
        output_dir = Path(args.output_dir).resolve()
    else:
        output_dir = default_output_dir(f"{spec['scenario_id']}_live")

    if args.require_mcp_live_readiness:
        output_dir.mkdir(parents=True, exist_ok=True)
        readiness = mcp_readiness.check_readiness(
            backend_url=args.backend_url,
            proxy_url=args.mcp_proxy_url,
            channel=args.mcp_channel,
            confirm_live=True,
            require_polling_before_live=not bool(args.mcp_live_probe_without_observed_polling),
            wait_for_polling_seconds=float(args.mcp_wait_for_polling_seconds),
            poll_interval_seconds=float(args.mcp_poll_interval_seconds),
            probe_timeout_seconds=float(args.mcp_readiness_probe_timeout_seconds),
            probe_limit=int(args.mcp_readiness_probe_limit),
        )
        write_json(output_dir / "mcp_live_readiness.json", readiness)
        verdict = readiness["verdict"]
        readiness_report = (
            "[truth-harness] mcp-live-readiness "
            f"status={verdict['status']} "
            f"ready={verdict['ready_for_live_replay']} "
            f"reason={verdict['reason']}"
        )
        print(readiness_report)
        if not verdict["ready_for_live_replay"]:
            print(f"[truth-harness] run-live skipped before first step; artifacts={output_dir}")
            return 2

    outcome = run_live(spec, output_dir, args)
    print(f"[truth-harness] run-live overall_status={outcome['review_summary']['overall_status']}")
    print(f"[truth-harness] run-live final_status={outcome['pack_state']['final_status']}")
    print(f"[truth-harness] artifacts={output_dir}")
    return 0
|
||
|
||
|
||
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser with the ``bootstrap``, ``review-export`` and ``run-live`` commands.

    Every sub-command stores its handler in the ``func`` default.
    """
    root = argparse.ArgumentParser(description="Strict sequential truth harness for real assistant dialogue scenarios")
    commands = root.add_subparsers(dest="command", required=True)

    # bootstrap
    bootstrap_cmd = commands.add_parser(
        "bootstrap", help="Create an exact-question truth harness spec from a technical export"
    )
    for required_flag in ("--export", "--output", "--scenario-id", "--domain"):
        bootstrap_cmd.add_argument(required_flag, required=True)
    bootstrap_cmd.add_argument("--title")
    bootstrap_cmd.set_defaults(func=handle_bootstrap)

    # review-export
    review_cmd = commands.add_parser(
        "review-export", help="Review an existing technical export against a truth harness spec"
    )
    review_cmd.add_argument("--spec", required=True)
    for optional_flag in ("--export", "--output-dir"):
        review_cmd.add_argument(optional_flag)
    review_cmd.set_defaults(func=handle_review_export)

    # run-live
    live_cmd = commands.add_parser(
        "run-live",
        help="Run the exact scenario live, strictly sequentially, with no injected carryover context",
    )
    live_cmd.add_argument("--spec", required=True)
    live_cmd.add_argument("--output-dir")
    string_options = (
        ("--backend-url", dcl.DEFAULT_BACKEND_URL),
        ("--prompt-version", dcl.DEFAULT_PROMPT_VERSION),
        ("--llm-provider", dcl.DEFAULT_LLM_PROVIDER),
        ("--llm-model", dcl.DEFAULT_LLM_MODEL),
        ("--llm-base-url", dcl.DEFAULT_LLM_BASE_URL),
        ("--llm-api-key", dcl.DEFAULT_LLM_API_KEY),
    )
    for flag, fallback in string_options:
        live_cmd.add_argument(flag, default=fallback)
    live_cmd.add_argument("--temperature", type=float, default=dcl.DEFAULT_TEMPERATURE)
    live_cmd.add_argument("--max-output-tokens", type=int, default=dcl.DEFAULT_MAX_OUTPUT_TOKENS)
    live_cmd.add_argument("--timeout-seconds", type=int, default=120)
    live_cmd.add_argument("--use-mock", action="store_true")
    live_cmd.add_argument(
        "--require-mcp-live-readiness",
        action="store_true",
        help="Run a backend/proxy/live-1C readiness gate before sending the first assistant turn.",
    )
    live_cmd.add_argument("--mcp-proxy-url", default=mcp_readiness.DEFAULT_PROXY_URL)
    live_cmd.add_argument("--mcp-channel", default=mcp_readiness.DEFAULT_CHANNEL)
    live_cmd.add_argument(
        "--mcp-readiness-probe-timeout-seconds",
        type=float,
        default=mcp_readiness.DEFAULT_PROBE_TIMEOUT_SECONDS,
    )
    live_cmd.add_argument("--mcp-readiness-probe-limit", type=int, default=1)
    live_cmd.add_argument(
        "--mcp-wait-for-polling-seconds",
        type=float,
        default=0,
        help="Wait for proxy health to observe /1c/poll activity before the live readiness probe.",
    )
    live_cmd.add_argument(
        "--mcp-poll-interval-seconds",
        type=float,
        default=mcp_readiness.DEFAULT_POLL_INTERVAL_SECONDS,
    )
    live_cmd.add_argument(
        "--mcp-live-probe-without-observed-polling",
        action="store_true",
        help="Allow the preflight live probe even when proxy health has not observed /1c/poll activity.",
    )
    live_cmd.set_defaults(func=handle_run_live)

    return root
|
||
|
||
|
||
def main() -> int:
    """Parse the command line, run the selected handler and return its result as an int."""
    namespace = build_parser().parse_args()
    handler = namespace.func
    exit_code = handler(namespace)
    return int(exit_code)
|
||
|
||
|
||
# Script entry point: the value returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|