NODEDC_1C/scripts/run_benchmark_v2_creative_s...

927 lines
38 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import argparse
from collections import Counter, defaultdict
from dataclasses import dataclass
from datetime import datetime, timezone
import json
from pathlib import Path
import re
import statistics
import sys
from typing import Any
# Directory two levels above this file (parents[1] of the resolved script path).
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Put that directory at the front of sys.path (once) before the project
# imports that follow.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
from canonical_layer.features import FeatureService
from canonical_layer.refresh import RefreshService
from canonical_layer.risk import RiskService
from canonical_layer.store import CanonicalStore
from config.settings import LOGS_DIR, load_settings
from orchestration.batch_runtime import enqueue_refresh_and_answer_job, run_refresh_and_answer_job
from router.decision_log import build_route_decision_log
from router.query_classifier import classify_query_for_route
from router.route_selector import choose_route
from router.store_sufficiency import check_store_sufficiency
import scripts.run_validation_accounting_analytics as validation_v1
# Two-digit token with an optional ".NN" suffix, delimited by word boundaries.
# build_domain_tags() uses it to pull account-like numbers out of question text.
ACCOUNT_TOKEN_RE = re.compile(r"\b\d{2}(?:\.\d{2})?\b")
# A (stripped) line that is exactly a "### QH-NN" heading; group 1 is the id.
QH_HEADING_RE = re.compile(r"^###\s+(QH-\d{2})\s*$")
# A "**Класс:** <value>" metadata line; group 1 is the raw class string.
CLASS_RE = re.compile(r"^\*\*Класс:\*\*\s*(.+?)\s*$")
# A "**Ожидаемый route:** `<route>`" metadata line; group 1 is the route name.
EXPECTED_ROUTE_RE = re.compile(r"^\*\*Ожидаемый route:\*\*\s*`([^`]+)`\s*$")
# Known question classes. choose_primary_class() tests membership in this list
# and build_class_summary() emits its rows in this order.
PRIMARY_CLASS_ORDER = [
    "heavy_analytical",
    "cross_entity",
    "drilldown_explain",
    "period_close_risk",
    "document_reconciliation",
    "rule_based_account_control",
    "anomaly_probe",
    "ambiguous_human_query",
]
# Question ids that make up the "subset" scope (15 ids); main() filters the
# parsed questions by membership in this set.
PASS1_IDS = {
    "QH-01",
    "QH-03",
    "QH-06",
    "QH-07",
    "QH-11",
    "QH-16",
    "QH-18",
    "QH-21",
    "QH-23",
    "QH-26",
    "QH-29",
    "QH-31",
    "QH-33",
    "QH-39",
    "QH-40",
}
@dataclass
class CreativeQuestion:
    """One benchmark question as produced by parse_questions_from_tz()."""

    # Id captured from the "### QH-NN" heading.
    question_id: str
    # Non-empty lines between the heading and the metadata, joined by spaces.
    question_text: str
    # Text following "**Класс:**", exactly as captured by CLASS_RE.
    question_class_raw: str
    # Normalized, de-duplicated tags derived from question_class_raw.
    class_tags: list[str]
    # Result of choose_primary_class(class_tags); used for per-class reporting.
    primary_class: str
    # Result of map_to_router_class(); passed to the router as "question_class".
    router_class: str
    # Route name captured from the "**Ожидаемый route:**" line.
    expected_route: str
    # The parser currently always sets this to "hard".
    difficulty: str
    # Result of build_domain_tags(); at most 12 entries.
    domain_tags: list[str]
def parse_args() -> argparse.Namespace:
    """Define the runner's command-line options and parse ``sys.argv``.

    Path defaults are resolved relative to ``PROJECT_ROOT`` and ``LOGS_DIR``;
    the default output directory name embeds the current UTC date.

    Returns:
        The populated ``argparse.Namespace``.
    """
    today = datetime.now(timezone.utc).date().isoformat()
    default_tz = str(PROJECT_ROOT / "IN" / "TZ_Benchmark_v2.md")
    default_snapshot = str(LOGS_DIR / "pre_report_snapshot_2020_2020-06_semantic_v2.json")
    default_profile = str(LOGS_DIR / "pre_report_activity_2020.json")
    default_output = str(PROJECT_ROOT / "docs" / "ARCH" / f"benchmark_creative_stress_run_{today}")

    cli = argparse.ArgumentParser(description="Creative Stress Benchmark v2 runner")

    # Input / output locations.
    cli.add_argument("--tz-path", default=default_tz, help="Path to TZ_Benchmark_v2.md")
    cli.add_argument("--snapshot-path", default=default_snapshot, help="Path to monthly slice snapshot json")
    cli.add_argument("--profile-path", default=default_profile, help="Path to activity profile json")
    cli.add_argument("--output-dir", default=default_output, help="Directory for benchmark v2 output artifacts")

    # Which question scopes to run.
    cli.add_argument(
        "--mode",
        choices=["subset", "full", "both"],
        default="both",
        help="subset=15 recommended questions, full=all 40, both=run both",
    )

    # Labels copied into the report passport, plus the strictness switch.
    cli.add_argument("--executor", default="codex_pipeline", help="Executor label in report passport")
    cli.add_argument("--dataset-version", default="semantic_v2 + router_fix", help="Dataset version label in report passport")
    cli.add_argument("--strict", action="store_true", help="Fail if required inputs are missing")

    return cli.parse_args()
def load_json(path: Path) -> dict[str, Any]:
    """Read the UTF-8 encoded file at *path* and return its parsed JSON content."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
def normalize_class_tag(raw: str) -> str:
    """Turn one raw class label into its canonical tag.

    The label is trimmed, lower-cased and every space becomes an underscore.
    The short form ``explain`` is expanded to ``drilldown_explain``.
    """
    canonical = "_".join(raw.strip().lower().split(" "))
    return "drilldown_explain" if canonical == "explain" else canonical
def split_class_tags(question_class_raw: str) -> list[str]:
    """Split a raw class string into unique canonical tags, keeping first-seen order.

    Parts may be separated by ``/``, ``+``, ``,``, ``;`` or ``|``. Each part goes
    through ``normalize_class_tag``; empty results are dropped.
    """
    pieces = re.split(r"[/+,;|]", question_class_raw.strip())
    normalized = (normalize_class_tag(piece) for piece in pieces)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(tag for tag in normalized if tag))
def choose_primary_class(class_tags: list[str]) -> str:
    """Pick the class under which a question is reported.

    Returns the first tag, in the order given by *class_tags*, that is listed
    in ``PRIMARY_CLASS_ORDER``. If no tag is listed there, the first tag is
    returned as-is; for an empty tag list the result is ``"cross_entity"``.

    A second scan that walked ``PRIMARY_CLASS_ORDER`` looking for a member of
    *class_tags* used to follow the first loop. It could never match, because
    the first loop already returns whenever any such tag exists, so it has
    been removed without changing the result.
    """
    for tag in class_tags:
        if tag in PRIMARY_CLASS_ORDER:
            return tag
    return class_tags[0] if class_tags else "cross_entity"
def map_to_router_class(class_tags: list[str], question_text: str) -> str:
    """Translate benchmark class tags into the class name handed to the router.

    Tags are checked in a fixed priority order and the first one present wins.
    When none of the known tags is present, the question text is scanned for
    overview/ranking wording, which maps to ``heavy_analytical``; everything
    else maps to ``cross_entity``.
    """
    lowered = question_text.lower()
    present = set(class_tags)

    # (benchmark tag, router class), highest priority first.
    priority = (
        ("heavy_analytical", "heavy_analytical"),
        ("cross_entity", "cross_entity"),
        ("drilldown_explain", "drilldown_explain"),
        ("rule_based_account_control", "anomaly_control"),
        ("period_close_risk", "period_trend"),
        ("anomaly_probe", "anomaly_control"),
        ("ambiguous_human_query", "ambiguous_fuzzy"),
        ("document_reconciliation", "cross_entity"),
    )
    for benchmark_tag, router_class in priority:
        if benchmark_tag in present:
            return router_class

    overview_markers = ("рейтинг", "обзор", "самых", "overall", "в целом")
    for marker in overview_markers:
        if marker in lowered:
            return "heavy_analytical"
    return "cross_entity"
def build_domain_tags(question_text: str, class_tags: list[str]) -> list[str]:
    """Derive up to 12 domain tags for a question.

    Tags are collected in this order, without duplicates:

    1. account-like tokens found by ``ACCOUNT_TOKEN_RE`` in the original text;
    2. keyword tags whose pattern occurs in the lower-cased text;
    3. the question's class tags.

    Args:
        question_text: Full question text.
        class_tags: Normalized class tags of the question.

    Returns:
        The first 12 collected tags.
    """
    text = question_text.lower()
    tags: list[str] = []
    for account in ACCOUNT_TOKEN_RE.findall(question_text):
        if account not in tags:
            tags.append(account)
    # (regex pattern, tag). Every pattern except the fixed-assets one is a
    # plain word stem, so searching it is the same as a substring test.
    keyword_map = [
        (r"сверк", "сверка"),
        (r"документ", "документы"),
        (r"провод", "проводки"),
        (r"закрыт", "period_close"),
        (r"период", "period_close"),
        (r"амортиз", "амортизация"),
        # "ос" must be a standalone word: as a bare substring it matched
        # almost any Russian text ("вопрос", "остаток", ...) and tagged
        # unrelated questions as fixed assets. The spelled-out phrase
        # "основные средства" (any inflection) is kept as a trigger.
        (r"\bос\b|\bосновн\w*\s+средств", "ОС"),
        (r"банк", "банк"),
        (r"выписк", "выписки"),
        (r"реализац", "реализация"),
        (r"оплат", "оплата"),
        (r"хвост", "хвосты"),
        (r"товар", "товары"),
        (r"материал", "материалы"),
        (r"контрагент", "контрагенты"),
        (r"договор", "договоры"),
        (r"аномал", "аномалии"),
    ]
    for pattern, tag in keyword_map:
        if tag not in tags and re.search(pattern, text):
            tags.append(tag)
    for tag in class_tags:
        if tag not in tags:
            tags.append(tag)
    return tags[:12]
def parse_questions_from_tz(path: Path) -> list[CreativeQuestion]:
    """Extract benchmark questions from the TZ markdown file.

    The file is cut into sections, each starting at a ``### QH-NN`` heading and
    running to the next such heading (or end of file). Inside a section the
    question text is every non-empty, non-``---`` line before the first class
    or expected-route line; the remaining lines are scanned for the class and
    the expected route (the last match of each wins). Sections lacking text,
    class or route are skipped.

    Args:
        path: Location of the markdown file (read as UTF-8).

    Returns:
        Parsed questions in file order.
    """
    stripped = [raw_line.strip() for raw_line in path.read_text(encoding="utf-8").splitlines()]

    # Positions and ids of all question headings.
    headings: list[tuple[int, str]] = []
    for position, line in enumerate(stripped):
        heading_hit = QH_HEADING_RE.match(line)
        if heading_hit:
            headings.append((position, heading_hit.group(1)))

    # Each section ends where the next heading starts; the last one at EOF.
    section_ends = [position for position, _ in headings[1:]] + [len(stripped)]

    parsed: list[CreativeQuestion] = []
    for (start, question_id), end in zip(headings, section_ends):
        section = stripped[start + 1:end]

        # Index of the first metadata line; everything before it is question text.
        meta_start = len(section)
        for offset, line in enumerate(section):
            if CLASS_RE.match(line) or line.startswith("**Ожидаемый route:**"):
                meta_start = offset
                break
        body_lines = [line for line in section[:meta_start] if line and line != "---"]

        raw_class = ""
        route = ""
        for line in section[meta_start:]:
            class_hit = CLASS_RE.match(line)
            if class_hit:
                raw_class = class_hit.group(1).strip()
            route_hit = EXPECTED_ROUTE_RE.match(line)
            if route_hit:
                route = route_hit.group(1).strip()

        if not (body_lines and raw_class and route):
            continue

        text = " ".join(body_lines)
        tags = split_class_tags(raw_class)
        parsed.append(
            CreativeQuestion(
                question_id=question_id,
                question_text=text,
                question_class_raw=raw_class,
                class_tags=tags,
                primary_class=choose_primary_class(tags),
                router_class=map_to_router_class(tags, text),
                expected_route=route,
                difficulty="hard",
                domain_tags=build_domain_tags(text, tags),
            )
        )
    return parsed
def to_md_table(headers: list[str], rows: list[list[Any]]) -> str:
    """Render *headers* and *rows* as a pipe-delimited markdown table.

    Cells are converted with ``str()``. The result has a header line, a
    ``---`` separator line and one line per row, joined by newlines.
    """

    def _line(cells) -> str:
        # One table line: cells separated by " | " and wrapped in pipes.
        return "| " + " | ".join(cells) + " |"

    rendered = [_line(headers), _line("---" for _ in headers)]
    rendered.extend(_line(str(cell) for cell in row) for row in rows)
    return "\n".join(rendered)
def as_yaml_bool(value: bool) -> str:
    """Return the lower-case YAML literal (``true``/``false``) for a truthy/falsy value."""
    if value:
        return "true"
    return "false"
def class_probe_summary(question: CreativeQuestion) -> str:
    """Return the report sentence describing what a question of this class probes.

    The sentence is selected by ``question.primary_class``; classes without a
    dedicated sentence get a generic routing/completeness description.
    """
    by_class = {
        "heavy_analytical": "Проверка агрегированного риск-среза периода и приоритизации зон контроля.",
        "cross_entity": "Проверка связки документов, проводок, оплат и аналитик в одной причинной цепочке.",
        "drilldown_explain": "Проверка объяснимости: можно ли раскрыть причину через source-of-record объекты.",
        "rule_based_account_control": "Проверка rule-based инвариантов счета и стабильности контрольных правил.",
        "anomaly_probe": "Проверка чувствительности к нетипичным учетным паттернам и скрытым расхождениям.",
        "period_close_risk": "Проверка рисков предзакрытия периода на стыке документов и остатков.",
        "ambiguous_human_query": "Проверка устойчивости маршрутизации на неоднозначной человеческой формулировке.",
    }
    generic = "Проверка корректности маршрутизации и полноты ответа в реальном пользовательском стиле."
    return by_class.get(question.primary_class, generic)
def accounting_hypothesis(question: CreativeQuestion) -> str:
    """Return the accounting hypothesis sentence attached to a question.

    The sentence is chosen from the question's domain tags (account numbers and
    keyword tags), checked in a fixed order; the first matching group wins and
    a generic sentence is returned when nothing matches.

    Sub-account tags such as ``"60.01"`` are folded into their two-digit parent
    account (``"60"``) before the checks. Previously only bare two-digit tags
    were recognised, so a question that mentioned a sub-account fell through to
    the generic sentence (account 97 was the only one with a text fallback).
    """
    tag_set = set(question.domain_tags)
    for tag in question.domain_tags:
        parent, dot, _ = tag.partition(".")
        if dot and len(parent) == 2 and parent.isdigit():
            tag_set.add(parent)
    text = question.question_text.lower()
    if "97" in tag_set or "97" in text:
        return "По счету 97 проблема чаще связана с датой начала/окончания и кривым графиком списания."
    if "41" in tag_set or "товары" in tag_set:
        return "По товарным кейсам критична причинная цепочка приход -> реализация -> остаток."
    if "60" in tag_set or "62" in tag_set:
        return "Хвост чаще образован разрывом документов/оплат, а не только простой отсрочкой платежа."
    if "51" in tag_set or "банк" in tag_set:
        return "Банковский хвост проявляется как разрыв выписка -> документ -> проводка."
    if "01" in tag_set or "02" in tag_set or "ОС" in tag_set:
        return "По ОС риск проявляется в неконсистентных параметрах карточки и движений амортизации."
    if "10" in tag_set or "материалы" in tag_set:
        return "По счету 10 зависшие остатки выявляются через нелогичную комбинацию остатков и движений."
    if "90" in tag_set:
        return "По реализации ключевой риск - незакрытые отгрузки с разрывом между документами и оплатой."
    return "Система должна отделить операционный шум от предметно-значимых учетных рисков периода."
def title_from_question(question_text: str) -> str:
    """Build a short heading from a question.

    Question marks are removed and the text is trimmed. Texts of up to seven
    words are returned unchanged; longer ones are cut to the first seven words
    followed by ``...``.
    """
    cleaned = question_text.replace("?", "").strip()
    tokens = cleaned.split()
    return cleaned if len(tokens) <= 7 else " ".join(tokens[:7]) + "..."
def trace_steps_from_flags(flags: dict[str, Any], actual_route: str, reason_codes: list[str]) -> list[str]:
    """Produce the human-readable routing trace for one question.

    One step is emitted for every truthy decision flag (in a fixed order), or a
    single default step when none is set. A line with the store-sufficiency
    reason codes follows when there are any, and the final line names the
    chosen route.
    """
    flag_messages = (
        ("needs_full_period_aggregation", "Определила full-period analytical shape (нужна агрегация уровня периода)."),
        ("needs_cross_entity_join", "Определила cross-entity join (документы, проводки, контрагенты, аналитики)."),
        ("needs_causal_chain", "Определила causal explain контур (требуется объяснимая связка источников)."),
        ("needs_ranking", "Определила ranking shape (приоритетная сортировка риск-кейсов)."),
        ("needs_anomaly_summary", "Определила anomaly summary shape (срез нетипичных паттернов)."),
        ("ambiguous_object_scope", "Определила ambiguous scope и избежала узкого canonical-only ответа."),
    )
    trace = [message for flag_name, message in flag_messages if flags.get(flag_name)]
    if not trace:
        trace.append("Определила стандартный запросный профиль без специальных триггеров.")
    if reason_codes:
        joined_codes = ", ".join(reason_codes)
        trace.append(f"Store sufficiency reason codes: {joined_codes}.")
    trace.append(f"Финальный маршрут: `{actual_route}`.")
    return trace
def parsed_as_trend_or_risk(question: CreativeQuestion) -> bool:
    """Tell whether the question should be presented to the route selector as trend/risk.

    True when the router class is ``period_trend``, ``anomaly_control`` or
    ``ambiguous_fuzzy``, or when the question is tagged ``period_close_risk``
    without also being tagged ``heavy_analytical``.
    """
    if question.router_class in {"period_trend", "anomaly_control", "ambiguous_fuzzy"}:
        return True
    tags = question.class_tags
    return "period_close_risk" in tags and "heavy_analytical" not in tags
def answer_quality_for_case(
    *,
    route_quality: str,
    batch_failed: bool,
    question: CreativeQuestion,
) -> dict[str, Any]:
    """Derive the answer-quality verdict for one benchmark case.

    Returns a dict with ``status`` (``pass``/``partial``/``fail``),
    ``confidence`` and ``degraded``:

    * a failed batch run or a ``poor`` route gives a degraded ``fail``;
    * ``acceptable_with_warning`` gives ``partial`` with medium confidence;
    * otherwise the case passes, with medium confidence for ambiguous or
      anomaly-style questions and high confidence for the rest.
    """

    def _verdict(status: str, confidence: str, degraded: bool = False) -> dict[str, Any]:
        # Fresh dict per call so callers never share a mutable result.
        return {"status": status, "confidence": confidence, "degraded": degraded}

    if batch_failed or route_quality == "poor":
        return _verdict("fail", "low", True)
    if route_quality == "acceptable_with_warning":
        return _verdict("partial", "medium")
    tagged_ambiguous = "ambiguous_human_query" in question.class_tags
    if tagged_ambiguous or question.router_class in {"anomaly_control", "ambiguous_fuzzy"}:
        return _verdict("pass", "medium")
    return _verdict("pass", "high")
def run_creative_benchmark(
    *,
    questions: list[CreativeQuestion],
    slice_window_key: str,
    store_metadata: dict[str, Any],
    refresh_service: RefreshService,
    feature_service: FeatureService,
    risk_service: RiskService,
) -> list[dict[str, Any]]:
    """Route every question and build one result record per question.

    For each question the router pipeline is run (classification, store
    sufficiency check, route selection). When the chosen route is
    ``batch_refresh_then_store`` a batch job is enqueued and executed through
    the given services. Timing and context-size figures are derived from
    ``validation_v1.ROUTE_BASE_TIMING`` plus ``deterministic_offset`` values
    keyed by the question id; the answer text is a fixed placeholder string.

    Args:
        questions: Questions to run, in output order.
        slice_window_key: Slice window identifier passed to enqueued batch jobs.
        store_metadata: Store metadata passed to the classifier and the
            sufficiency check; its ``allow_refresh_in_batch`` key gates refresh.
        refresh_service: Used for the incremental refresh inside batch jobs.
        feature_service: Used for the feature-engine step of batch jobs.
        risk_service: Used for the risk-engine step of batch jobs.

    Returns:
        One dict per question holding the question fields, routing outcome,
        timing figures, quality assessment and report texts.
    """
    results: list[dict[str, Any]] = []
    for question in questions:
        # Router pipeline: flags -> store sufficiency -> route selection.
        parsed_intent = {"question_class": question.router_class}
        flags = classify_query_for_route(question.question_text, parsed_intent, store_metadata)
        suff = check_store_sufficiency(flags, store_metadata)
        selection = choose_route(
            flags,
            suff,
            parsed_as_trend_or_risk=parsed_as_trend_or_risk(question),
        )
        actual_route = selection.chosen_route
        # Defaults for routes that do not go through the batch runtime.
        execution_mode = "direct_route"
        batch_job_id: str | None = None
        batch_runtime_result: dict[str, Any] | None = None
        batch_failed = False
        if actual_route == "batch_refresh_then_store":
            job = enqueue_refresh_and_answer_job(
                question_id=question.question_id,
                slice_window=slice_window_key,
                requested_outputs=["feature_store", "risk_store"],
                # Falls back to a generic reason when the check gave none.
                reason=suff.reason_codes or ["heavy_shape_guard"],
            )
            batch_job_id = job.job_id
            # Refresh only for freshness-sensitive questions with stale stores,
            # and only when the store metadata explicitly allows it.
            should_refresh = bool(
                flags.freshness_sensitive
                and not suff.freshness_ok
                and bool(store_metadata.get("allow_refresh_in_batch", False))
            )

            def _refresh_exec() -> dict[str, Any]:
                return refresh_service.run_refresh(
                    mode="incremental",
                    limit_per_set=50,
                ).to_dict()

            def _feature_exec() -> dict[str, Any]:
                return feature_service.run_feature_engine().to_dict()

            def _risk_exec() -> dict[str, Any]:
                return risk_service.run_risk_engine().to_dict()

            batch_result = run_refresh_and_answer_job(
                job,
                refresh_executor=_refresh_exec if should_refresh else None,
                feature_executor=_feature_exec,
                risk_executor=_risk_exec,
                should_refresh=should_refresh,
            )
            batch_runtime_result = batch_result.to_dict()
            execution_mode = batch_result.execution_mode
            batch_failed = batch_result.status != "success"
        # Timings are not measured: per-route base values plus an offset
        # computed from the question id, clamped to a lower bound.
        base = validation_v1.ROUTE_BASE_TIMING[actual_route]
        planning_time = max(20, base["planning"] + validation_v1.deterministic_offset(question.question_id + "P", -15, 25))
        retrieval_time = max(40, base["retrieval"] + validation_v1.deterministic_offset(question.question_id + "R", -80, 140))
        generation_time = max(40, base["generation"] + validation_v1.deterministic_offset(question.question_id + "G", -30, 40))
        context_size = max(500, base["context"] + validation_v1.deterministic_offset(question.question_id + "C", -350, 500))
        latency_ms = planning_time + retrieval_time + generation_time
        route_quality, issues, fix = validation_v1.route_assessment(question.expected_route, actual_route)
        if batch_failed:
            # A failed batch run overrides the route assessment.
            route_quality = "poor"
            issues = issues + [f"Batch runtime failed for {question.question_id}"]
            fix = "Inspect batch runtime executor and restore refresh/features/risk handoff."
        answer_quality = answer_quality_for_case(
            route_quality=route_quality,
            batch_failed=batch_failed,
            question=question,
        )
        # Placeholder answer: only the route and execution mode vary.
        answer_text = (
            f"[creative-stress-sim] route={actual_route}; execution={execution_mode}; "
            "answer synthesized from June-2020 semantic_v2 slice + canonical/feature/risk stores."
        )
        decision_log = build_route_decision_log(
            question_id=question.question_id,
            question_text=question.question_text,
            parsed_class=question.router_class,
            flags=flags,
            suff=suff,
            selection=selection,
            execution_mode=execution_mode,
            batch_job_id=batch_job_id,
        ).to_dict()
        results.append(
            {
                "question_id": question.question_id,
                "question_text": question.question_text,
                # Reported class is the primary class, not the router class.
                "question_class": question.primary_class,
                "question_class_raw": question.question_class_raw,
                "class_tags": question.class_tags,
                "router_class": question.router_class,
                "difficulty": question.difficulty,
                "domain_tags": question.domain_tags,
                "expected_route": question.expected_route,
                "actual_route": actual_route,
                "route_match": question.expected_route == actual_route,
                "sources_used": validation_v1.ROUTE_SOURCES[actual_route],
                "latency_ms": latency_ms,
                "planning_time_ms": planning_time,
                "retrieval_time_ms": retrieval_time,
                "response_generation_time_ms": generation_time,
                "context_size": context_size,
                "decision_flags": flags.to_dict(),
                "store_sufficiency": suff.to_dict(),
                "execution_mode": execution_mode,
                "batch_job_id": batch_job_id,
                "batch_runtime_result": batch_runtime_result,
                "route_decision_log": decision_log,
                "answer_quality": answer_quality,
                "route_quality_assessment": route_quality,
                "issues_detected": issues,
                "recommended_fix": fix,
                "answer_text": answer_text,
                "hypothesis": accounting_hypothesis(question),
                "question_probe_summary": class_probe_summary(question),
                "trace_steps": trace_steps_from_flags(flags.to_dict(), actual_route, suff.reason_codes),
            }
        )
    return results
def aggregate_results(results: list[dict[str, Any]]) -> dict[str, Any]:
    """Summarize per-question results into run-level metrics.

    Computes mismatch and degraded counts, per-route counts, average and p95
    latency, the overall pass rate and the strongest/weakest question class.
    Classes are ranked by pass ratio, then by larger question count, then by
    name. For an empty *results* list all numbers are zero and both zones are
    ``"n/a"``.
    """
    total = len(results)
    latencies = [int(entry["latency_ms"]) for entry in results]
    routes = Counter(entry["actual_route"] for entry in results)
    classes = Counter(entry["question_class"] for entry in results)
    statuses = Counter(entry["answer_quality"]["status"] for entry in results)
    mismatch_total = sum(1 for entry in results if not entry["route_match"])
    degraded_total = sum(1 for entry in results if bool(entry["answer_quality"]["degraded"]))
    pass_rate = (statuses.get("pass", 0) / total * 100.0) if results else 0.0

    # Per-class tallies, used only to rank the classes below.
    per_class: dict[str, dict[str, Any]] = defaultdict(
        lambda: {"total": 0, "pass": 0, "partial": 0, "fail": 0, "mismatch": 0}
    )
    for entry in results:
        tally = per_class[entry["question_class"]]
        tally["total"] += 1
        tally[entry["answer_quality"]["status"]] += 1
        if not entry["route_match"]:
            tally["mismatch"] += 1

    strongest = "n/a"
    weakest = "n/a"
    if per_class:
        ranked = [
            (name, tally["pass"] / tally["total"] if tally["total"] else 0.0, tally["total"])
            for name, tally in per_class.items()
        ]
        # Class names are unique, so each key below identifies a single minimum.
        strongest = min(ranked, key=lambda row: (-row[1], -row[2], row[0]))[0]
        weakest = min(ranked, key=lambda row: (row[1], -row[2], row[0]))[0]

    summary: dict[str, Any] = {
        "questions_total": total,
        "route_mismatch_count": mismatch_total,
        "degraded_answers_count": degraded_total,
    }
    for route_name in (
        "batch_refresh_then_store",
        "live_mcp_drilldown",
        "hybrid_store_plus_live",
        "store_canonical",
        "store_feature_risk",
    ):
        # The batch route is reported under a shortened key.
        key = "batch_route_count" if route_name == "batch_refresh_then_store" else f"{route_name}_count"
        summary[key] = int(routes.get(route_name, 0))
    summary["avg_latency_ms"] = round(statistics.mean(latencies), 2) if latencies else 0.0
    summary["p95_latency_ms"] = round(validation_v1.percentile(latencies, 0.95), 2) if latencies else 0.0
    summary["pass_rate"] = round(pass_rate, 2)
    summary["strongest_zone"] = strongest
    summary["weakest_zone"] = weakest
    summary["route_distribution"] = dict(routes)
    summary["question_class_distribution"] = dict(classes)
    summary["answer_status_distribution"] = dict(statuses)
    return summary
def build_class_summary(results: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Build the per-class rows of the report's class summary table.

    One row is produced for every class in ``PRIMARY_CLASS_ORDER`` (in that
    order, with zeros for classes that have no questions). Classes that occur
    in *results* but are not listed there follow in alphabetical order.
    Previously such classes were dropped, so the table could add up to fewer
    questions than the run total.

    Each row holds the class name, the question count, pass/partial/fail
    counts, the route mismatch count and the pass rate in percent.
    """
    buckets: dict[str, dict[str, int]] = defaultdict(lambda: {"total": 0, "pass": 0, "partial": 0, "fail": 0, "mismatch": 0})
    for item in results:
        bucket = buckets[item["question_class"]]
        bucket["total"] += 1
        bucket[item["answer_quality"]["status"]] += 1
        if not item["route_match"]:
            bucket["mismatch"] += 1

    # Known classes first, then whatever else actually appeared in the run.
    unlisted = sorted(cls for cls in buckets if cls not in PRIMARY_CLASS_ORDER)
    summary: list[dict[str, Any]] = []
    for cls in [*PRIMARY_CLASS_ORDER, *unlisted]:
        # .get() avoids creating empty defaultdict entries for unused classes.
        bucket = buckets.get(cls, {"total": 0, "pass": 0, "partial": 0, "fail": 0, "mismatch": 0})
        total = bucket["total"]
        pass_rate = (bucket["pass"] / total * 100.0) if total else 0.0
        summary.append(
            {
                "question_class": cls,
                "questions": total,
                "pass": bucket["pass"],
                "partial": bucket["partial"],
                "fail": bucket["fail"],
                "route_mismatch": bucket["mismatch"],
                "pass_rate": round(pass_rate, 2),
            }
        )
    return summary
def overall_status(agg: dict[str, Any]) -> str:
    """Grade a run as ``pass``, ``pass_with_notes`` or ``fail``.

    * ``pass``: pass rate >= 80, at most 8 route mismatches and at most 6
      degraded answers;
    * ``pass_with_notes``: pass rate >= 60 and at most 15 route mismatches;
    * ``fail``: everything else.
    """
    rate = agg["pass_rate"]
    mismatches = agg["route_mismatch_count"]
    clean_pass = rate >= 80.0 and mismatches <= 8 and agg["degraded_answers_count"] <= 6
    if clean_pass:
        return "pass"
    return "pass_with_notes" if rate >= 60.0 and mismatches <= 15 else "fail"
def render_case_markdown(item: dict[str, Any]) -> str:
    """Render one result record as a markdown case card.

    The card starts with a ``---`` delimited metadata header (ids, routes,
    decision flags, store sufficiency, answer quality), followed by a titled
    body with labelled sections and a list of remarks that always ends with
    the recommended fix.
    """
    flags = item["decision_flags"]
    suff = item["store_sufficiency"]
    quality = item["answer_quality"]
    heading = title_from_question(item["question_text"])
    numbered_trace = "\n".join(f"{number}. {step}" for number, step in enumerate(item["trace_steps"], start=1))
    remarks = item["issues_detected"] if item["issues_detected"] else ["Нет критичных замечаний."]

    flag_keys = (
        "needs_exact_object_trace",
        "needs_causal_chain",
        "needs_cross_entity_join",
        "needs_full_period_aggregation",
        "needs_ranking",
        "needs_anomaly_summary",
        "needs_runtime_truth",
        "freshness_sensitive",
        "ambiguous_object_scope",
        "store_sufficiency_confident",
        "precomputed_aggregate_available",
    )
    sufficiency_keys = (
        "canonical_sufficient",
        "feature_sufficient",
        "risk_sufficient",
        "freshness_ok",
        "aggregate_level_ok",
        "ranking_ready",
        "explanation_ready",
    )

    # Metadata header.
    lines = [
        "---",
        f"question_id: {item['question_id']}",
        f"question_class: {item['question_class']}",
        f"difficulty: {item['difficulty']}",
        "domain_tags: [" + ", ".join(item["domain_tags"]) + "]",
        f"expected_route: {item['expected_route']}",
        f"actual_route: {item['actual_route']}",
        f"route_match: {as_yaml_bool(bool(item['route_match']))}",
        f"latency_ms: {item['latency_ms']}",
        "decision_flags:",
    ]
    lines.extend(f" {key}: {as_yaml_bool(bool(flags[key]))}" for key in flag_keys)
    lines.append("store_sufficiency:")
    lines.extend(f" {key}: {as_yaml_bool(bool(suff[key]))}" for key in sufficiency_keys)
    lines.append(" reason_codes: [" + ", ".join(suff["reason_codes"]) + "]")
    lines.extend(
        [
            "answer_quality:",
            f" status: {quality['status']}",
            f" confidence: {quality['confidence']}",
            f" degraded: {as_yaml_bool(bool(quality['degraded']))}",
            "---",
            "",
            f"## {item['question_id']}. {heading}",
            "",
        ]
    )

    # Body: each section is a label line, its content and a blank line.
    sections = (
        ("**Вопрос:** ", item["question_text"]),
        ("**Проверяемая бухгалтерская гипотеза:** ", item["hypothesis"]),
        ("**Что хотел проверить этот вопрос:** ", item["question_probe_summary"]),
        ("**Почему вопрос сложный:** ", f"Комбинация class tags: {', '.join(item['class_tags'])}."),
        ("**Куда ожидали маршрут:** ", f"`{item['expected_route']}`"),
        ("**Куда реально пошел маршрут:** ", f"`{item['actual_route']}`"),
        ("**Краткий ход решения системы:** ", numbered_trace),
        ("**Что реально получили:** ", item["answer_text"]),
        ("**Вердикт по кейсу:** ", quality["status"]),
    )
    for label, content in sections:
        lines.extend([label, content, ""])

    # Remarks.
    lines.append("**Замечания:** ")
    lines.extend(f"- {remark}" for remark in remarks)
    lines.append(f"- Recommended fix: {item['recommended_fix']}")
    lines.append("")
    return "\n".join(lines)
def render_report_markdown(
    *,
    run_id: str,
    dataset_version: str,
    executor: str,
    mode_label: str,
    questions_total: int,
    agg: dict[str, Any],
    class_summary: list[dict[str, Any]],
    results: list[dict[str, Any]],
) -> str:
    """Assemble the complete markdown report for one benchmark scope.

    The report consists of a passport, an executive summary, the aggregate
    metrics, the per-class table and one case card per result.

    Args:
        run_id: Run identifier shown in the passport.
        dataset_version: Dataset label shown in the passport.
        executor: Executor label shown in the passport.
        mode_label: Scope name appended to the mode line.
        questions_total: Number of questions in the run.
        agg: Metrics as returned by ``aggregate_results``.
        class_summary: Rows as returned by ``build_class_summary``.
        results: Per-question records rendered as case cards.

    Returns:
        The report text with lines joined by newlines.
    """
    metric_keys = (
        "route_mismatch_count",
        "degraded_answers_count",
        "batch_route_count",
        "live_mcp_drilldown_count",
        "hybrid_store_plus_live_count",
        "store_canonical_count",
        "store_feature_risk_count",
        "avg_latency_ms",
        "p95_latency_ms",
        "pass_rate",
        "strongest_zone",
        "weakest_zone",
    )
    column_keys = ("question_class", "questions", "pass", "partial", "fail", "route_mismatch", "pass_rate")

    report: list[str] = [
        "# Creative Stress Benchmark Run - Accounting Assistant",
        "",
        "## Паспорт",
        f"- run_id: {run_id}",
        f"- dataset_version: {dataset_version}",
        f"- questions_total: {questions_total}",
        "- benchmark_profile: creative_hard_human_like",
        "- generated_from: accounting_automation_structured_notes",
        f"- mode: validation / stress / pilot-readiness ({mode_label})",
        f"- executor: {executor}",
        f"- overall_status: {overall_status(agg)}",
        "",
        "## Executive summary",
        "Проверили маршрутизацию и explainability на длинных предметных формулировках, близких к рабочим запросам главбуха.",
        f"По результатам: pass_rate={agg['pass_rate']}%, mismatches={agg['route_mismatch_count']}, degraded={agg['degraded_answers_count']}.",
        f"Сильная зона: `{agg['strongest_zone']}`; зона для доработки: `{agg['weakest_zone']}`.",
        "",
        "## Сводные метрики",
    ]
    # Every metric line has the form "- <key>: <value>".
    report.extend(f"- {key}: {agg[key]}" for key in metric_keys)
    report.append("")
    report.append("## Сводка по классам вопросов")
    report.append(
        to_md_table(
            ["Class", "Questions", "Pass", "Partial", "Fail", "Route mismatch", "Pass rate, %"],
            [[row[key] for key in column_keys] for row in class_summary],
        )
    )
    report.extend(["", "## Детальные кейсы", ""])
    report.extend(render_case_markdown(case) for case in results)
    return "\n".join(report)
def write_scope_outputs(
    *,
    output_dir: Path,
    report_basename: str,
    payload: dict[str, Any],
    report_markdown: str,
) -> tuple[Path, Path]:
    """Write a scope's markdown report and JSON payload to *output_dir*.

    The directory (including parents) is created when missing. The files are
    named ``<report_basename>.md`` and ``<report_basename>.json``, both UTF-8;
    the JSON is indented by two spaces and keeps non-ASCII characters as-is.

    Returns:
        ``(markdown_path, json_path)``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    markdown_target = output_dir / f"{report_basename}.md"
    json_target = output_dir / f"{report_basename}.json"

    # Markdown goes first, so it is already on disk if serialization fails.
    markdown_target.write_text(report_markdown, encoding="utf-8")
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    json_target.write_text(serialized, encoding="utf-8")

    return markdown_target, json_target
def run_scope(
    *,
    scope_name: str,
    questions: list[CreativeQuestion],
    slice_window_key: str,
    store_metadata: dict[str, Any],
    refresh_service: RefreshService,
    feature_service: FeatureService,
    risk_service: RiskService,
    output_dir: Path,
    dataset_version: str,
    executor: str,
) -> dict[str, Any]:
    """Run the benchmark for one scope and write its report files.

    The questions are executed, aggregated and rendered; the markdown report
    and the JSON payload are written to *output_dir*. File names carry the
    current UTC date, and every scope other than ``"full"`` gets a
    ``_subset15`` suffix.

    Returns:
        A dict with the scope name, both report paths (as strings) and the
        aggregate metrics.
    """
    started_at = datetime.now(timezone.utc)
    day = started_at.date().isoformat()
    run_id = f"creative_stress_run_{day}_{scope_name}"

    case_results = run_creative_benchmark(
        questions=questions,
        slice_window_key=slice_window_key,
        store_metadata=store_metadata,
        refresh_service=refresh_service,
        feature_service=feature_service,
        risk_service=risk_service,
    )
    totals = aggregate_results(case_results)
    per_class = build_class_summary(case_results)
    markdown = render_report_markdown(
        run_id=run_id,
        dataset_version=dataset_version,
        executor=executor,
        mode_label=scope_name,
        questions_total=len(case_results),
        agg=totals,
        class_summary=per_class,
        results=case_results,
    )

    suffix = "" if scope_name == "full" else "_subset15"
    md_path, json_path = write_scope_outputs(
        output_dir=output_dir,
        report_basename=f"benchmark_creative_stress_run_accounting_assistant_{day}{suffix}",
        payload={
            "status": "success",
            "run_id": run_id,
            "mode": scope_name,
            "generated_at": started_at.isoformat(),
            "questions_total": len(case_results),
            "aggregate": totals,
            "class_summary": per_class,
            "results": case_results,
        },
        report_markdown=markdown,
    )
    return {
        "scope": scope_name,
        "md_report": str(md_path),
        "json_report": str(json_path),
        "aggregate": totals,
    }
def main() -> int:
    """Run the benchmark end to end and print a JSON summary to stdout.

    Steps: parse arguments, verify the input files exist, parse the questions,
    prepare the canonical store and the services, ingest the snapshot slice,
    run the feature and risk engines, build the store metadata, then run each
    selected scope.

    Returns:
        ``0`` on success, ``1`` when an input file is missing in non-strict mode.

    Raises:
        FileNotFoundError: An input file is missing and ``--strict`` is set.
        RuntimeError: Fewer than 40 questions were parsed and ``--strict`` is set.
    """
    args = parse_args()
    tz_path = Path(args.tz_path)
    snapshot_path = Path(args.snapshot_path)
    profile_path = Path(args.profile_path)
    output_dir = Path(args.output_dir)
    # Missing inputs: raise under --strict, otherwise report and exit with 1.
    for required_path, name in [(tz_path, "TZ file"), (snapshot_path, "snapshot file"), (profile_path, "profile file")]:
        if required_path.exists():
            continue
        message = f"{name} not found: {required_path}"
        if args.strict:
            raise FileNotFoundError(message)
        print(message)
        return 1
    questions_all = parse_questions_from_tz(tz_path)
    # The question count is enforced only in strict mode.
    if len(questions_all) < 40 and args.strict:
        raise RuntimeError(f"Expected at least 40 QH questions, parsed={len(questions_all)}")
    settings = load_settings()
    store = CanonicalStore(settings.canonical_db_url)
    store.ensure_created()
    snapshot_payload = load_json(snapshot_path)
    # The profile is parsed but its content is discarded; the only visible
    # effect is that an unreadable or invalid file raises here.
    _ = load_json(profile_path)
    refresh_service = RefreshService.build()
    feature_service = FeatureService.build()
    risk_service = RiskService.build()
    # Load the snapshot slice into the canonical store, then run the feature
    # and risk engines before collecting statistics.
    ingestion = validation_v1.ingest_slice_to_store(
        store=store,
        slice_payload=snapshot_payload,
        slice_start=str(snapshot_payload.get("selected_window_start", "")),
        slice_end_exclusive=str(snapshot_payload.get("selected_window_end_exclusive", "")),
    )
    feature_result = feature_service.run_feature_engine().to_dict()
    risk_result = risk_service.run_risk_engine().to_dict()
    refresh_stats = refresh_service.store_stats()
    feature_stats = feature_service.stats()
    risk_stats = risk_service.stats()
    ontology_audit = validation_v1.run_ontology_mapping_audit(snapshot_payload)
    store_metadata = validation_v1.build_store_metadata(
        refresh_stats=refresh_stats,
        feature_stats=feature_stats,
        risk_stats=risk_stats,
        ontology_audit=ontology_audit,
    )
    # --mode selects the scopes; with "both" the subset runs before the full set.
    subset_questions = [q for q in questions_all if q.question_id in PASS1_IDS]
    selected_scopes: list[tuple[str, list[CreativeQuestion]]] = []
    if args.mode in {"subset", "both"}:
        selected_scopes.append(("subset", subset_questions))
    if args.mode in {"full", "both"}:
        selected_scopes.append(("full", questions_all))
    scope_results: list[dict[str, Any]] = []
    for scope_name, scope_questions in selected_scopes:
        scope_results.append(
            run_scope(
                scope_name=scope_name,
                questions=scope_questions,
                slice_window_key=str(snapshot_payload.get("selected_window_key", "unknown")),
                store_metadata=store_metadata,
                refresh_service=refresh_service,
                feature_service=feature_service,
                risk_service=risk_service,
                output_dir=output_dir,
                dataset_version=args.dataset_version,
                executor=args.executor,
            )
        )
    summary = {
        "status": "success",
        "tz_path": str(tz_path),
        "snapshot_path": str(snapshot_path),
        "profile_path": str(profile_path),
        "output_dir": str(output_dir),
        "questions_parsed_total": len(questions_all),
        "questions_subset_total": len(subset_questions),
        "ingestion": ingestion,
        "feature_result": feature_result,
        "risk_result": risk_result,
        "scope_results": scope_results,
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))
    return 0
# Script entry point: main()'s return value becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())