927 lines
38 KiB
Python
927 lines
38 KiB
Python
from __future__ import annotations
|
||
|
||
import argparse
|
||
from collections import Counter, defaultdict
|
||
from dataclasses import dataclass
|
||
from datetime import datetime, timezone
|
||
import json
|
||
from pathlib import Path
|
||
import re
|
||
import statistics
|
||
import sys
|
||
from typing import Any
|
||
|
||
# Repository root: the parent of the directory that contains this file.
# It is put at the front of sys.path (once) so that the project packages
# imported further down resolve when this file is executed directly.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
|
||
|
||
from canonical_layer.features import FeatureService
|
||
from canonical_layer.refresh import RefreshService
|
||
from canonical_layer.risk import RiskService
|
||
from canonical_layer.store import CanonicalStore
|
||
from config.settings import LOGS_DIR, load_settings
|
||
from orchestration.batch_runtime import enqueue_refresh_and_answer_job, run_refresh_and_answer_job
|
||
from router.decision_log import build_route_decision_log
|
||
from router.query_classifier import classify_query_for_route
|
||
from router.route_selector import choose_route
|
||
from router.store_sufficiency import check_store_sufficiency
|
||
import scripts.run_validation_accounting_analytics as validation_v1
|
||
|
||
|
||
# Stand-alone two-digit token with an optional ".NN" suffix (e.g. "41" or
# "10.01"); build_domain_tags uses it to pull account-like numbers out of the
# question text.
ACCOUNT_TOKEN_RE = re.compile(r"\b\d{2}(?:\.\d{2})?\b")
# Question heading of the form "### QH-NN"; group 1 is the question id.
QH_HEADING_RE = re.compile(r"^###\s+(QH-\d{2})\s*$")
# "**Класс:** <value>" metadata line; group 1 is the raw class string.
CLASS_RE = re.compile(r"^\*\*Класс:\*\*\s*(.+?)\s*$")
# "**Ожидаемый route:** `<route>`" metadata line; group 1 is the route name
# found between the backticks.
EXPECTED_ROUTE_RE = re.compile(r"^\*\*Ожидаемый route:\*\*\s*`([^`]+)`\s*$")
|
||
|
||
# Known primary question classes.
# choose_primary_class uses this list to recognise which tags count as a
# primary class; build_class_summary iterates it to order the summary rows.
PRIMARY_CLASS_ORDER = [
    "heavy_analytical",
    "cross_entity",
    "drilldown_explain",
    "period_close_risk",
    "document_reconciliation",
    "rule_based_account_control",
    "anomaly_probe",
    "ambiguous_human_query",
]
|
||
|
||
# Question ids of the reduced run (15 ids). main() selects the subset by
# membership in this set; the --mode help text describes it as the
# "15 recommended questions".
PASS1_IDS = {
    "QH-01",
    "QH-03",
    "QH-06",
    "QH-07",
    "QH-11",
    "QH-16",
    "QH-18",
    "QH-21",
    "QH-23",
    "QH-26",
    "QH-29",
    "QH-31",
    "QH-33",
    "QH-39",
    "QH-40",
}
|
||
|
||
|
||
@dataclass
class CreativeQuestion:
    """One benchmark question as produced by parse_questions_from_tz."""

    # Id captured from the "### QH-NN" heading, e.g. "QH-01".
    question_id: str
    # Question body: the non-empty lines under the heading joined by spaces.
    question_text: str
    # Raw value of the "**Класс:**" metadata line.
    question_class_raw: str
    # Normalized, de-duplicated tags returned by split_class_tags.
    class_tags: list[str]
    # Tag picked by choose_primary_class; reported as "question_class".
    primary_class: str
    # Class label passed to the router; computed by map_to_router_class.
    router_class: str
    # Route name taken from the "**Ожидаемый route:**" metadata line.
    expected_route: str
    # Difficulty label; the parser in this file always stores "hard".
    difficulty: str
    # Tags returned by build_domain_tags.
    domain_tags: list[str]
|
||
|
||
|
||
def parse_args() -> argparse.Namespace:
    """Build the command-line parser of the benchmark runner and parse argv.

    The four path options default to locations under ``PROJECT_ROOT`` and
    ``LOGS_DIR``; the default output directory name embeds the current UTC
    date.

    Returns:
        The parsed ``argparse.Namespace``.
    """
    utc_today = datetime.now(timezone.utc).date().isoformat()
    cli = argparse.ArgumentParser(description="Creative Stress Benchmark v2 runner")

    # (flag, default path, help text), registered in this order.
    path_options = (
        (
            "--tz-path",
            PROJECT_ROOT / "IN" / "TZ_Benchmark_v2.md",
            "Path to TZ_Benchmark_v2.md",
        ),
        (
            "--snapshot-path",
            LOGS_DIR / "pre_report_snapshot_2020_2020-06_semantic_v2.json",
            "Path to monthly slice snapshot json",
        ),
        (
            "--profile-path",
            LOGS_DIR / "pre_report_activity_2020.json",
            "Path to activity profile json",
        ),
        (
            "--output-dir",
            PROJECT_ROOT / "docs" / "ARCH" / f"benchmark_creative_stress_run_{utc_today}",
            "Directory for benchmark v2 output artifacts",
        ),
    )
    for flag, default_path, help_text in path_options:
        cli.add_argument(flag, default=str(default_path), help=help_text)

    cli.add_argument(
        "--mode",
        default="both",
        choices=["subset", "full", "both"],
        help="subset=15 recommended questions, full=all 40, both=run both",
    )
    cli.add_argument(
        "--executor",
        default="codex_pipeline",
        help="Executor label in report passport",
    )
    cli.add_argument(
        "--dataset-version",
        default="semantic_v2 + router_fix",
        help="Dataset version label in report passport",
    )
    cli.add_argument(
        "--strict",
        action="store_true",
        help="Fail if required inputs are missing",
    )
    return cli.parse_args()
|
||
|
||
|
||
def load_json(path: Path) -> dict[str, Any]:
    """Open *path* as UTF-8 text and return the decoded JSON document."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
|
||
|
||
|
||
def normalize_class_tag(raw: str) -> str:
    """Turn one raw class label into its canonical tag.

    The label is stripped, lower-cased and every space becomes an underscore.
    The short form ``explain`` is expanded to ``drilldown_explain``.
    """
    normalized = "_".join(raw.strip().lower().split(" "))
    return "drilldown_explain" if normalized == "explain" else normalized
|
||
|
||
|
||
def split_class_tags(question_class_raw: str) -> list[str]:
    """Split a raw class string into unique normalized tags.

    The string is cut on any of ``/ + , ; |``. Each piece goes through
    ``normalize_class_tag``; empty results are dropped and duplicates keep
    only their first occurrence, so input order is preserved.
    """
    seen: dict[str, None] = {}
    for piece in re.split(r"[/+,;|]", question_class_raw.strip()):
        label = normalize_class_tag(piece)
        if label:
            # dict keys remember insertion order, which gives ordered dedup.
            seen.setdefault(label, None)
    return list(seen)
|
||
|
||
|
||
def choose_primary_class(class_tags: list[str]) -> str:
    """Pick the primary class for a question from its tags.

    Returns the first tag, in the order the tags were written, that is listed
    in ``PRIMARY_CLASS_ORDER``. If none is listed, the first tag is returned
    as is. With no tags at all the result is ``"cross_entity"``.

    The priority therefore comes from the tag order of the question, not from
    the order of ``PRIMARY_CLASS_ORDER``. An earlier second pass over
    ``PRIMARY_CLASS_ORDER`` could never match once this scan had failed, so it
    was removed as unreachable code.
    """
    for tag in class_tags:
        if tag in PRIMARY_CLASS_ORDER:
            return tag
    return class_tags[0] if class_tags else "cross_entity"
|
||
|
||
|
||
def map_to_router_class(class_tags: list[str], question_text: str) -> str:
    """Translate benchmark class tags into the class label given to the router.

    Tags are checked in a fixed precedence order and the first tag present
    decides the result. When no known tag is present, the question text is
    scanned (case-insensitively) for overview/ranking markers. Otherwise the
    fallback is ``"cross_entity"``.
    """
    present = frozenset(class_tags)

    # (benchmark tag, router class) pairs, highest precedence first.
    precedence = (
        ("heavy_analytical", "heavy_analytical"),
        ("cross_entity", "cross_entity"),
        ("drilldown_explain", "drilldown_explain"),
        ("rule_based_account_control", "anomaly_control"),
        ("period_close_risk", "period_trend"),
        ("anomaly_probe", "anomaly_control"),
        ("ambiguous_human_query", "ambiguous_fuzzy"),
        ("document_reconciliation", "cross_entity"),
    )
    for benchmark_tag, router_label in precedence:
        if benchmark_tag in present:
            return router_label

    lowered = question_text.lower()
    for marker in ("рейтинг", "обзор", "самых", "overall", "в целом"):
        if marker in lowered:
            return "heavy_analytical"
    return "cross_entity"
|
||
|
||
|
||
def build_domain_tags(question_text: str, class_tags: list[str]) -> list[str]:
    """Derive up to 12 domain tags for a question.

    Tags are collected in this order, without duplicates:

    1. account-like tokens found by ``ACCOUNT_TOKEN_RE`` in the original text;
    2. keyword tags whose needle occurs in the lower-cased text;
    3. the question's class tags.

    Args:
        question_text: Full question text.
        class_tags: Normalized class tags of the question.

    Returns:
        The first 12 collected tags.
    """
    text = question_text.lower()
    tags: list[str] = []

    for account in ACCOUNT_TOKEN_RE.findall(question_text):
        if account not in tags:
            tags.append(account)

    # (needle, tag) pairs; needles are word stems matched as substrings.
    keyword_map = [
        ("сверк", "сверка"),
        ("документ", "документы"),
        ("провод", "проводки"),
        ("закрыт", "period_close"),
        ("период", "period_close"),
        ("амортиз", "амортизация"),
        ("ос", "ОС"),
        ("банк", "банк"),
        ("выписк", "выписки"),
        ("реализац", "реализация"),
        ("оплат", "оплата"),
        ("хвост", "хвосты"),
        ("товар", "товары"),
        ("материал", "материалы"),
        ("контрагент", "контрагенты"),
        ("договор", "договоры"),
        ("аномал", "аномалии"),
    ]
    # "ос" is an abbreviation, not a stem. As a substring it occurs inside
    # ordinary words ("вопрос", "остаток", "после") and would tag almost every
    # question with "ОС", so it must match as a whole word.
    whole_word_needles = {"ос"}

    for needle, tag in keyword_map:
        if needle in whole_word_needles:
            found = re.search(rf"\b{re.escape(needle)}\b", text) is not None
        else:
            found = needle in text
        if found and tag not in tags:
            tags.append(tag)

    for tag in class_tags:
        if tag not in tags:
            tags.append(tag)
    return tags[:12]
|
||
|
||
|
||
def parse_questions_from_tz(path: Path) -> list[CreativeQuestion]:
    """Parse benchmark questions out of the TZ markdown file.

    The file is scanned line by line. Each "### QH-NN" heading opens a
    question: the following non-empty lines (except "---") form the question
    text, until a class line, an expected-route line or the next heading.
    The metadata lines are then read up to the next heading. A question with
    no text, no class or no expected route is skipped silently.

    Args:
        path: Markdown file to read (UTF-8).

    Returns:
        Parsed questions in file order.
    """
    lines = path.read_text(encoding="utf-8").splitlines()
    questions: list[CreativeQuestion] = []
    index = 0

    while index < len(lines):
        # Skip forward to the next question heading.
        header_match = QH_HEADING_RE.match(lines[index].strip())
        if not header_match:
            index += 1
            continue

        question_id = header_match.group(1)
        index += 1

        # Phase 1: collect the question text lines.
        question_lines: list[str] = []
        while index < len(lines):
            current = lines[index].strip()
            # Stop, without consuming the line, at the first metadata line or
            # at the next heading.
            if QH_HEADING_RE.match(current) or CLASS_RE.match(current) or current.startswith("**Ожидаемый route:**"):
                break
            if current and current != "---":
                question_lines.append(current)
            index += 1

        question_class_raw = ""
        expected_route = ""

        # Phase 2: read metadata up to the next heading (or end of file).
        # If a class or route line appears more than once, the last one wins.
        while index < len(lines):
            current = lines[index].strip()
            if QH_HEADING_RE.match(current):
                break
            class_match = CLASS_RE.match(current)
            if class_match:
                question_class_raw = class_match.group(1).strip()
            route_match = EXPECTED_ROUTE_RE.match(current)
            if route_match:
                expected_route = route_match.group(1).strip()
            index += 1

        # Incomplete questions are dropped; index already points at the next
        # heading, so the outer loop resumes there.
        if not question_lines or not question_class_raw or not expected_route:
            continue

        question_text = " ".join(question_lines)
        class_tags = split_class_tags(question_class_raw)
        primary_class = choose_primary_class(class_tags)
        router_class = map_to_router_class(class_tags, question_text)
        domain_tags = build_domain_tags(question_text, class_tags)

        questions.append(
            CreativeQuestion(
                question_id=question_id,
                question_text=question_text,
                question_class_raw=question_class_raw,
                class_tags=class_tags,
                primary_class=primary_class,
                router_class=router_class,
                expected_route=expected_route,
                difficulty="hard",
                domain_tags=domain_tags,
            )
        )
    return questions
|
||
|
||
|
||
def to_md_table(headers: list[str], rows: list[list[Any]]) -> str:
    """Render *headers* and *rows* as a pipe-delimited Markdown table.

    Output is a header row, a ``---`` separator row with one cell per header,
    then one row per entry of *rows* with every cell passed through ``str``.
    Rows are joined with newlines and there is no trailing newline.
    """

    def as_row(cells) -> str:
        return "| " + " | ".join(cells) + " |"

    table = [as_row(headers), as_row("---" for _ in headers)]
    table.extend(as_row(str(cell) for cell in row) for row in rows)
    return "\n".join(table)
|
||
|
||
|
||
def as_yaml_bool(value: bool) -> str:
    """Return the YAML literal ``true`` for a truthy value, else ``false``."""
    if value:
        return "true"
    return "false"
|
||
|
||
|
||
def class_probe_summary(question: CreativeQuestion) -> str:
    """Return the fixed sentence describing what a question's class probes.

    The sentence is selected by ``question.primary_class``; a class with no
    dedicated sentence gets the generic one.
    """
    summaries = {
        "heavy_analytical": "Проверка агрегированного риск-среза периода и приоритизации зон контроля.",
        "cross_entity": "Проверка связки документов, проводок, оплат и аналитик в одной причинной цепочке.",
        "drilldown_explain": "Проверка объяснимости: можно ли раскрыть причину через source-of-record объекты.",
        "rule_based_account_control": "Проверка rule-based инвариантов счета и стабильности контрольных правил.",
        "anomaly_probe": "Проверка чувствительности к нетипичным учетным паттернам и скрытым расхождениям.",
        "period_close_risk": "Проверка рисков предзакрытия периода на стыке документов и остатков.",
        "ambiguous_human_query": "Проверка устойчивости маршрутизации на неоднозначной человеческой формулировке.",
    }
    generic = "Проверка корректности маршрутизации и полноты ответа в реальном пользовательском стиле."
    return summaries.get(question.primary_class, generic)
|
||
|
||
|
||
def accounting_hypothesis(question: CreativeQuestion) -> str:
    """Return the fixed accounting hypothesis sentence for a question.

    Rules are tried in order and the first match wins. The rule for account
    97 also fires when "97" occurs anywhere in the question text; every other
    rule looks only at ``question.domain_tags``. When nothing matches, a
    generic sentence is returned.
    """
    tags = frozenset(question.domain_tags)
    lowered = question.question_text.lower()

    if "97" in tags or "97" in lowered:
        return "По счету 97 проблема чаще связана с датой начала/окончания и кривым графиком списания."

    # (trigger tags, hypothesis); a rule matches when any trigger is present.
    rules = (
        (("41", "товары"), "По товарным кейсам критична причинная цепочка приход -> реализация -> остаток."),
        (("60", "62"), "Хвост чаще образован разрывом документов/оплат, а не только простой отсрочкой платежа."),
        (("51", "банк"), "Банковский хвост проявляется как разрыв выписка -> документ -> проводка."),
        (("01", "02", "ОС"), "По ОС риск проявляется в неконсистентных параметрах карточки и движений амортизации."),
        (("10", "материалы"), "По счету 10 зависшие остатки выявляются через нелогичную комбинацию остатков и движений."),
        (("90",), "По реализации ключевой риск - незакрытые отгрузки с разрывом между документами и оплатой."),
    )
    for triggers, hypothesis in rules:
        if not tags.isdisjoint(triggers):
            return hypothesis
    return "Система должна отделить операционный шум от предметно-значимых учетных рисков периода."
|
||
|
||
|
||
def title_from_question(question_text: str) -> str:
    """Build a short heading from a question.

    Question marks are removed and the text is stripped. Up to seven words
    are returned unchanged; longer text is cut to its first seven words,
    joined by single spaces and suffixed with ``...``.
    """
    stripped = question_text.replace("?", "").strip()
    tokens = stripped.split()
    return stripped if len(tokens) <= 7 else " ".join(tokens[:7]) + "..."
|
||
|
||
|
||
def trace_steps_from_flags(flags: dict[str, Any], actual_route: str, reason_codes: list[str]) -> list[str]:
    """Produce the human-readable decision trace for one routed question.

    One step is emitted per truthy flag, in a fixed order, or a single
    default step when none is set. A step listing the store-sufficiency
    reason codes follows when there are any. The final step names the
    chosen route.
    """
    # (flag key, trace sentence) in emission order.
    flag_steps = (
        ("needs_full_period_aggregation", "Определила full-period analytical shape (нужна агрегация уровня периода)."),
        ("needs_cross_entity_join", "Определила cross-entity join (документы, проводки, контрагенты, аналитики)."),
        ("needs_causal_chain", "Определила causal explain контур (требуется объяснимая связка источников)."),
        ("needs_ranking", "Определила ranking shape (приоритетная сортировка риск-кейсов)."),
        ("needs_anomaly_summary", "Определила anomaly summary shape (срез нетипичных паттернов)."),
        ("ambiguous_object_scope", "Определила ambiguous scope и избежала узкого canonical-only ответа."),
    )
    trace = [sentence for key, sentence in flag_steps if flags.get(key)]
    if not trace:
        trace = ["Определила стандартный запросный профиль без специальных триггеров."]
    if reason_codes:
        trace.append(f"Store sufficiency reason codes: {', '.join(reason_codes)}.")
    trace.append(f"Финальный маршрут: `{actual_route}`.")
    return trace
|
||
|
||
|
||
def parsed_as_trend_or_risk(question: CreativeQuestion) -> bool:
    """Tell whether a question counts as a trend/risk question.

    True when the router class is ``period_trend``, ``anomaly_control`` or
    ``ambiguous_fuzzy``, or when the question is tagged ``period_close_risk``
    without also being tagged ``heavy_analytical``.
    """
    if question.router_class in {"period_trend", "anomaly_control", "ambiguous_fuzzy"}:
        return True
    tags = question.class_tags
    return "period_close_risk" in tags and "heavy_analytical" not in tags
|
||
|
||
|
||
def answer_quality_for_case(
    *,
    route_quality: str,
    batch_failed: bool,
    question: CreativeQuestion,
) -> dict[str, Any]:
    """Grade one benchmark case.

    Returns a dict with ``status``, ``confidence`` and ``degraded``:

    * a failed batch or ``route_quality == "poor"`` gives fail / low / degraded;
    * ``"acceptable_with_warning"`` gives partial / medium;
    * otherwise the case passes, with medium confidence for ambiguous or
      anomaly-type questions and high confidence for the rest.
    """

    def verdict(status: str, confidence: str, degraded: bool = False) -> dict[str, Any]:
        return {"status": status, "confidence": confidence, "degraded": degraded}

    if batch_failed or route_quality == "poor":
        return verdict("fail", "low", degraded=True)
    if route_quality == "acceptable_with_warning":
        return verdict("partial", "medium")

    cautious = (
        "ambiguous_human_query" in question.class_tags
        or question.router_class in {"anomaly_control", "ambiguous_fuzzy"}
    )
    return verdict("pass", "medium" if cautious else "high")
|
||
|
||
|
||
def run_creative_benchmark(
    *,
    questions: list[CreativeQuestion],
    slice_window_key: str,
    store_metadata: dict[str, Any],
    refresh_service: RefreshService,
    feature_service: FeatureService,
    risk_service: RiskService,
) -> list[dict[str, Any]]:
    """Route every question and collect one result record per question.

    For each question the function classifies the query, checks store
    sufficiency and selects a route. When the selected route is
    ``batch_refresh_then_store`` it also enqueues and runs a batch job. It
    then derives timing figures, assesses the route against the expected one,
    grades the case and builds the decision log.

    Timing and context-size values are computed from
    ``validation_v1.ROUTE_BASE_TIMING`` plus ``validation_v1.deterministic_offset``
    rather than measured here. ``answer_text`` is a fixed template, not a
    generated answer.

    Args:
        questions: Questions to run, processed in order.
        slice_window_key: Slice window passed to the batch job.
        store_metadata: Metadata dict given to the classifier and the
            sufficiency check; ``allow_refresh_in_batch`` is also read here.
        refresh_service: Used only inside the batch job, when refresh applies.
        feature_service: Feature engine runner for the batch job.
        risk_service: Risk engine runner for the batch job.

    Returns:
        One dict per question, in input order.
    """
    results: list[dict[str, Any]] = []

    for question in questions:
        parsed_intent = {"question_class": question.router_class}
        flags = classify_query_for_route(question.question_text, parsed_intent, store_metadata)
        suff = check_store_sufficiency(flags, store_metadata)
        selection = choose_route(
            flags,
            suff,
            parsed_as_trend_or_risk=parsed_as_trend_or_risk(question),
        )
        actual_route = selection.chosen_route

        # Defaults for every route other than the batch route.
        execution_mode = "direct_route"
        batch_job_id: str | None = None
        batch_runtime_result: dict[str, Any] | None = None
        batch_failed = False

        if actual_route == "batch_refresh_then_store":
            job = enqueue_refresh_and_answer_job(
                question_id=question.question_id,
                slice_window=slice_window_key,
                requested_outputs=["feature_store", "risk_store"],
                # Fall back to a generic reason when the check gave none.
                reason=suff.reason_codes or ["heavy_shape_guard"],
            )
            batch_job_id = job.job_id
            # Refresh only when the query is freshness-sensitive, the store
            # is stale, and the metadata explicitly allows it.
            should_refresh = bool(
                flags.freshness_sensitive
                and not suff.freshness_ok
                and bool(store_metadata.get("allow_refresh_in_batch", False))
            )

            def _refresh_exec() -> dict[str, Any]:
                return refresh_service.run_refresh(
                    mode="incremental",
                    limit_per_set=50,
                ).to_dict()

            def _feature_exec() -> dict[str, Any]:
                return feature_service.run_feature_engine().to_dict()

            def _risk_exec() -> dict[str, Any]:
                return risk_service.run_risk_engine().to_dict()

            batch_result = run_refresh_and_answer_job(
                job,
                refresh_executor=_refresh_exec if should_refresh else None,
                feature_executor=_feature_exec,
                risk_executor=_risk_exec,
                should_refresh=should_refresh,
            )
            batch_runtime_result = batch_result.to_dict()
            execution_mode = batch_result.execution_mode
            batch_failed = batch_result.status != "success"

        # Per-route base figures plus an offset keyed by question id and a
        # stage letter, floored with max().
        # NOTE(review): deterministic_offset presumably returns a stable value
        # within the given bounds for a given key; confirm in validation_v1.
        base = validation_v1.ROUTE_BASE_TIMING[actual_route]
        planning_time = max(20, base["planning"] + validation_v1.deterministic_offset(question.question_id + "P", -15, 25))
        retrieval_time = max(40, base["retrieval"] + validation_v1.deterministic_offset(question.question_id + "R", -80, 140))
        generation_time = max(40, base["generation"] + validation_v1.deterministic_offset(question.question_id + "G", -30, 40))
        context_size = max(500, base["context"] + validation_v1.deterministic_offset(question.question_id + "C", -350, 500))
        latency_ms = planning_time + retrieval_time + generation_time

        route_quality, issues, fix = validation_v1.route_assessment(question.expected_route, actual_route)
        # A failed batch overrides the route assessment.
        if batch_failed:
            route_quality = "poor"
            issues = issues + [f"Batch runtime failed for {question.question_id}"]
            fix = "Inspect batch runtime executor and restore refresh/features/risk handoff."

        answer_quality = answer_quality_for_case(
            route_quality=route_quality,
            batch_failed=batch_failed,
            question=question,
        )

        answer_text = (
            f"[creative-stress-sim] route={actual_route}; execution={execution_mode}; "
            "answer synthesized from June-2020 semantic_v2 slice + canonical/feature/risk stores."
        )

        decision_log = build_route_decision_log(
            question_id=question.question_id,
            question_text=question.question_text,
            parsed_class=question.router_class,
            flags=flags,
            suff=suff,
            selection=selection,
            execution_mode=execution_mode,
            batch_job_id=batch_job_id,
        ).to_dict()

        results.append(
            {
                "question_id": question.question_id,
                "question_text": question.question_text,
                # The reported class is the primary class, not the router class.
                "question_class": question.primary_class,
                "question_class_raw": question.question_class_raw,
                "class_tags": question.class_tags,
                "router_class": question.router_class,
                "difficulty": question.difficulty,
                "domain_tags": question.domain_tags,
                "expected_route": question.expected_route,
                "actual_route": actual_route,
                "route_match": question.expected_route == actual_route,
                "sources_used": validation_v1.ROUTE_SOURCES[actual_route],
                "latency_ms": latency_ms,
                "planning_time_ms": planning_time,
                "retrieval_time_ms": retrieval_time,
                "response_generation_time_ms": generation_time,
                "context_size": context_size,
                "decision_flags": flags.to_dict(),
                "store_sufficiency": suff.to_dict(),
                "execution_mode": execution_mode,
                "batch_job_id": batch_job_id,
                "batch_runtime_result": batch_runtime_result,
                "route_decision_log": decision_log,
                "answer_quality": answer_quality,
                "route_quality_assessment": route_quality,
                "issues_detected": issues,
                "recommended_fix": fix,
                "answer_text": answer_text,
                "hypothesis": accounting_hypothesis(question),
                "question_probe_summary": class_probe_summary(question),
                "trace_steps": trace_steps_from_flags(flags.to_dict(), actual_route, suff.reason_codes),
            }
        )

    return results
|
||
|
||
|
||
def aggregate_results(results: list[dict[str, Any]]) -> dict[str, Any]:
    """Fold per-question result records into run-level metrics.

    The result holds totals, per-route counts, average and p95 latency, the
    pass rate in percent, the strongest and weakest question class by pass
    ratio, and the raw route/class/status distributions. For an empty input
    every number is zero and both zones are ``"n/a"``.
    """
    latency_values = [int(entry["latency_ms"]) for entry in results]
    routes = Counter(entry["actual_route"] for entry in results)
    classes = Counter(entry["question_class"] for entry in results)
    statuses = Counter(entry["answer_quality"]["status"] for entry in results)

    mismatch_total = len([entry for entry in results if not entry["route_match"]])
    degraded_total = len([entry for entry in results if bool(entry["answer_quality"]["degraded"])])
    pass_rate = (statuses.get("pass", 0) / len(results) * 100.0) if results else 0.0

    per_class: dict[str, dict[str, Any]] = defaultdict(
        lambda: dict.fromkeys(("total", "pass", "partial", "fail", "mismatch"), 0)
    )
    for entry in results:
        bucket = per_class[entry["question_class"]]
        bucket["total"] += 1
        bucket[entry["answer_quality"]["status"]] += 1
        if not entry["route_match"]:
            bucket["mismatch"] += 1

    strongest_zone = weakest_zone = "n/a"
    if per_class:
        ranked = [
            (name, bucket["pass"] / bucket["total"] if bucket["total"] else 0.0, bucket["total"])
            for name, bucket in per_class.items()
        ]
        # Ties go to the larger class, then to the alphabetically first name.
        strongest_zone = min(ranked, key=lambda row: (-row[1], -row[2], row[0]))[0]
        weakest_zone = min(ranked, key=lambda row: (row[1], -row[2], row[0]))[0]

    # (output key, route name) pairs, in output order.
    route_count_keys = (
        ("batch_route_count", "batch_refresh_then_store"),
        ("live_mcp_drilldown_count", "live_mcp_drilldown"),
        ("hybrid_store_plus_live_count", "hybrid_store_plus_live"),
        ("store_canonical_count", "store_canonical"),
        ("store_feature_risk_count", "store_feature_risk"),
    )

    summary: dict[str, Any] = {
        "questions_total": len(results),
        "route_mismatch_count": mismatch_total,
        "degraded_answers_count": degraded_total,
    }
    for output_key, route_name in route_count_keys:
        summary[output_key] = int(routes.get(route_name, 0))
    summary["avg_latency_ms"] = round(statistics.mean(latency_values), 2) if latency_values else 0.0
    summary["p95_latency_ms"] = round(validation_v1.percentile(latency_values, 0.95), 2) if latency_values else 0.0
    summary["pass_rate"] = round(pass_rate, 2)
    summary["strongest_zone"] = strongest_zone
    summary["weakest_zone"] = weakest_zone
    summary["route_distribution"] = dict(routes)
    summary["question_class_distribution"] = dict(classes)
    summary["answer_status_distribution"] = dict(statuses)
    return summary
|
||
|
||
|
||
def build_class_summary(results: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Build the per-class summary rows for the report table.

    Every class in ``PRIMARY_CLASS_ORDER`` gets a row, in that order, even
    when it has no questions. A class that appears in *results* but is not in
    ``PRIMARY_CLASS_ORDER`` gets a row after the ordered ones, sorted by
    name. Such classes arise when a question has only unknown tags; without
    this the row counts would not add up to the number of questions.

    Args:
        results: Per-question result records; each needs ``question_class``,
            ``route_match`` and ``answer_quality["status"]``.

    Returns:
        One dict per class with question, pass/partial/fail and mismatch
        counts, and the pass rate in percent rounded to two decimals.
    """

    def _empty_bucket() -> dict[str, int]:
        return {"total": 0, "pass": 0, "partial": 0, "fail": 0, "mismatch": 0}

    buckets: dict[str, dict[str, int]] = defaultdict(_empty_bucket)
    for item in results:
        bucket = buckets[item["question_class"]]
        bucket["total"] += 1
        bucket[item["answer_quality"]["status"]] += 1
        if not item["route_match"]:
            bucket["mismatch"] += 1

    # Known classes first, then any observed class the fixed order omits.
    ordered_classes = list(PRIMARY_CLASS_ORDER)
    ordered_classes.extend(sorted(cls for cls in buckets if cls not in PRIMARY_CLASS_ORDER))

    summary: list[dict[str, Any]] = []
    for cls in ordered_classes:
        # .get() keeps the defaultdict from growing for classes never seen.
        bucket = buckets.get(cls) or _empty_bucket()
        total = bucket["total"]
        pass_rate = (bucket["pass"] / total * 100.0) if total else 0.0
        summary.append(
            {
                "question_class": cls,
                "questions": total,
                "pass": bucket["pass"],
                "partial": bucket["partial"],
                "fail": bucket["fail"],
                "route_mismatch": bucket["mismatch"],
                "pass_rate": round(pass_rate, 2),
            }
        )
    return summary
|
||
|
||
|
||
def overall_status(agg: dict[str, Any]) -> str:
    """Reduce the aggregate metrics to a single run verdict.

    * ``"pass"``: pass rate >= 80, at most 8 route mismatches and at most 6
      degraded answers;
    * ``"pass_with_notes"``: pass rate >= 60 and at most 15 route mismatches;
    * ``"fail"``: anything else.
    """
    rate = agg["pass_rate"]
    # Written as "not >=" so a non-comparable value such as NaN also fails.
    if not rate >= 60.0:
        return "fail"
    mismatches = agg["route_mismatch_count"]
    if rate >= 80.0 and mismatches <= 8 and agg["degraded_answers_count"] <= 6:
        return "pass"
    return "pass_with_notes" if mismatches <= 15 else "fail"
|
||
|
||
|
||
def render_case_markdown(item: dict[str, Any]) -> str:
    """Render one result record as a Markdown section.

    The section starts with a front-matter block between ``---`` lines that
    holds the ids, routes, decision flags, store sufficiency and answer
    quality. It continues with a heading, the labelled narrative paragraphs,
    and a remarks list that always ends with the recommended fix.
    """
    flags = item["decision_flags"]
    suff = item["store_sufficiency"]
    quality = item["answer_quality"]
    heading = title_from_question(item["question_text"])
    numbered_trace = "\n".join(
        f"{number}. {step}" for number, step in enumerate(item["trace_steps"], start=1)
    )
    remarks = item["issues_detected"] or ["Нет критичных замечаний."]

    # Keys are emitted in exactly this order.
    flag_keys = (
        "needs_exact_object_trace",
        "needs_causal_chain",
        "needs_cross_entity_join",
        "needs_full_period_aggregation",
        "needs_ranking",
        "needs_anomaly_summary",
        "needs_runtime_truth",
        "freshness_sensitive",
        "ambiguous_object_scope",
        "store_sufficiency_confident",
        "precomputed_aggregate_available",
    )
    sufficiency_keys = (
        "canonical_sufficient",
        "feature_sufficient",
        "risk_sufficient",
        "freshness_ok",
        "aggregate_level_ok",
        "ranking_ready",
        "explanation_ready",
    )

    lines: list[str] = [
        "---",
        f"question_id: {item['question_id']}",
        f"question_class: {item['question_class']}",
        f"difficulty: {item['difficulty']}",
        "domain_tags: [" + ", ".join(item["domain_tags"]) + "]",
        f"expected_route: {item['expected_route']}",
        f"actual_route: {item['actual_route']}",
        f"route_match: {as_yaml_bool(bool(item['route_match']))}",
        f"latency_ms: {item['latency_ms']}",
        "decision_flags:",
    ]
    lines.extend(f" {key}: {as_yaml_bool(bool(flags[key]))}" for key in flag_keys)
    lines.append("store_sufficiency:")
    lines.extend(f" {key}: {as_yaml_bool(bool(suff[key]))}" for key in sufficiency_keys)
    lines.append(" reason_codes: [" + ", ".join(suff["reason_codes"]) + "]")
    lines += [
        "answer_quality:",
        f" status: {quality['status']}",
        f" confidence: {quality['confidence']}",
        f" degraded: {as_yaml_bool(bool(quality['degraded']))}",
        "---",
        "",
        f"## {item['question_id']}. {heading}",
        "",
    ]

    # (label line, body line); each pair is followed by a blank line.
    sections = (
        ("**Вопрос:** ", item["question_text"]),
        ("**Проверяемая бухгалтерская гипотеза:** ", item["hypothesis"]),
        ("**Что хотел проверить этот вопрос:** ", item["question_probe_summary"]),
        ("**Почему вопрос сложный:** ", f"Комбинация class tags: {', '.join(item['class_tags'])}."),
        ("**Куда ожидали маршрут:** ", f"`{item['expected_route']}`"),
        ("**Куда реально пошел маршрут:** ", f"`{item['actual_route']}`"),
        ("**Краткий ход решения системы:** ", numbered_trace),
        ("**Что реально получили:** ", item["answer_text"]),
        ("**Вердикт по кейсу:** ", quality["status"]),
    )
    for label, body in sections:
        lines += [label, body, ""]

    lines.append("**Замечания:** ")
    lines.extend(f"- {remark}" for remark in remarks)
    lines.append(f"- Recommended fix: {item['recommended_fix']}")
    lines.append("")
    return "\n".join(lines)
|
||
|
||
|
||
def render_report_markdown(
    *,
    run_id: str,
    dataset_version: str,
    executor: str,
    mode_label: str,
    questions_total: int,
    agg: dict[str, Any],
    class_summary: list[dict[str, Any]],
    results: list[dict[str, Any]],
) -> str:
    """Render the complete Markdown report for one benchmark scope.

    Sections, in order: passport, executive summary, aggregate metrics,
    per-class summary table, and one detailed case section per result.
    """
    # Aggregate keys listed under the metrics heading, in display order.
    metric_keys = (
        "route_mismatch_count",
        "degraded_answers_count",
        "batch_route_count",
        "live_mcp_drilldown_count",
        "hybrid_store_plus_live_count",
        "store_canonical_count",
        "store_feature_risk_count",
        "avg_latency_ms",
        "p95_latency_ms",
        "pass_rate",
        "strongest_zone",
        "weakest_zone",
    )
    summary_columns = ("question_class", "questions", "pass", "partial", "fail", "route_mismatch", "pass_rate")

    report: list[str] = [
        "# Creative Stress Benchmark Run - Accounting Assistant",
        "",
        "## Паспорт",
        f"- run_id: {run_id}",
        f"- dataset_version: {dataset_version}",
        f"- questions_total: {questions_total}",
        "- benchmark_profile: creative_hard_human_like",
        "- generated_from: accounting_automation_structured_notes",
        f"- mode: validation / stress / pilot-readiness ({mode_label})",
        f"- executor: {executor}",
        f"- overall_status: {overall_status(agg)}",
        "",
        "## Executive summary",
        "Проверили маршрутизацию и explainability на длинных предметных формулировках, близких к рабочим запросам главбуха.",
        f"По результатам: pass_rate={agg['pass_rate']}%, mismatches={agg['route_mismatch_count']}, degraded={agg['degraded_answers_count']}.",
        f"Сильная зона: `{agg['strongest_zone']}`; зона для доработки: `{agg['weakest_zone']}`.",
        "",
        "## Сводные метрики",
    ]
    report.extend(f"- {key}: {agg[key]}" for key in metric_keys)
    report.append("")

    report.append("## Сводка по классам вопросов")
    table_rows = [[row[column] for column in summary_columns] for row in class_summary]
    report.append(
        to_md_table(
            ["Class", "Questions", "Pass", "Partial", "Fail", "Route mismatch", "Pass rate, %"],
            table_rows,
        )
    )
    report.append("")

    report.append("## Детальные кейсы")
    report.append("")
    report.extend(render_case_markdown(case) for case in results)
    return "\n".join(report)
|
||
|
||
|
||
def write_scope_outputs(
    *,
    output_dir: Path,
    report_basename: str,
    payload: dict[str, Any],
    report_markdown: str,
) -> tuple[Path, Path]:
    """Write the Markdown report and the JSON payload of one scope.

    The output directory is created if missing, including parents. Both files
    share *report_basename* and are written as UTF-8. The JSON is indented by
    two spaces and keeps non-ASCII characters unescaped.

    Returns:
        ``(markdown_path, json_path)``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    markdown_target = output_dir / f"{report_basename}.md"
    json_target = output_dir / f"{report_basename}.json"

    with markdown_target.open("w", encoding="utf-8") as handle:
        handle.write(report_markdown)

    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    with json_target.open("w", encoding="utf-8") as handle:
        handle.write(serialized)

    return markdown_target, json_target
|
||
|
||
|
||
def run_scope(
    *,
    scope_name: str,
    questions: list[CreativeQuestion],
    slice_window_key: str,
    store_metadata: dict[str, Any],
    refresh_service: RefreshService,
    feature_service: FeatureService,
    risk_service: RiskService,
    output_dir: Path,
    dataset_version: str,
    executor: str,
) -> dict[str, Any]:
    """Run one benchmark scope end to end and write its report files.

    The function runs the benchmark for *questions*, aggregates the results,
    renders the Markdown report and writes both the report and a JSON payload
    to *output_dir*. File names carry the current UTC date; every scope other
    than ``"full"`` gets the ``_subset15`` suffix.

    Returns:
        A dict with the scope name, the two report paths as strings, and the
        aggregate metrics.
    """
    started_at = datetime.now(timezone.utc)
    day = started_at.date().isoformat()
    run_id = f"creative_stress_run_{day}_{scope_name}"

    results = run_creative_benchmark(
        questions=questions,
        slice_window_key=slice_window_key,
        store_metadata=store_metadata,
        refresh_service=refresh_service,
        feature_service=feature_service,
        risk_service=risk_service,
    )
    aggregate = aggregate_results(results)
    per_class = build_class_summary(results)
    markdown = render_report_markdown(
        run_id=run_id,
        dataset_version=dataset_version,
        executor=executor,
        mode_label=scope_name,
        questions_total=len(results),
        agg=aggregate,
        class_summary=per_class,
        results=results,
    )

    suffix = "" if scope_name == "full" else "_subset15"
    basename = f"benchmark_creative_stress_run_accounting_assistant_{day}{suffix}"

    md_path, json_path = write_scope_outputs(
        output_dir=output_dir,
        report_basename=basename,
        payload={
            "status": "success",
            "run_id": run_id,
            "mode": scope_name,
            "generated_at": started_at.isoformat(),
            "questions_total": len(results),
            "aggregate": aggregate,
            "class_summary": per_class,
            "results": results,
        },
        report_markdown=markdown,
    )
    return {
        "scope": scope_name,
        "md_report": str(md_path),
        "json_report": str(json_path),
        "aggregate": aggregate,
    }
|
||
|
||
|
||
def main() -> int:
    """Command-line entry point for the creative stress benchmark.

    Checks the input files, parses the questions, prepares the canonical
    store, runs ingestion plus the feature and risk engines, executes the
    scopes selected by ``args.mode`` and prints a JSON summary to stdout.

    Returns:
        ``0`` after the summary has been printed, or ``1`` when a required
        input file is missing and ``args.strict`` is falsy.

    Raises:
        FileNotFoundError: A required input file is missing and
            ``args.strict`` is truthy.
        RuntimeError: Fewer than 40 questions were parsed and
            ``args.strict`` is truthy.
    """
    args = parse_args()
    tz_path = Path(args.tz_path)
    snapshot_path = Path(args.snapshot_path)
    profile_path = Path(args.profile_path)
    output_dir = Path(args.output_dir)

    # Stop at the first missing input: raise in strict mode, otherwise
    # report it and exit with status 1.
    required_inputs = (
        (tz_path, "TZ file"),
        (snapshot_path, "snapshot file"),
        (profile_path, "profile file"),
    )
    for candidate, label in required_inputs:
        if not candidate.exists():
            message = f"{label} not found: {candidate}"
            if args.strict:
                raise FileNotFoundError(message)
            print(message)
            return 1

    questions_all = parse_questions_from_tz(tz_path)
    if args.strict and len(questions_all) < 40:
        raise RuntimeError(f"Expected at least 40 QH questions, parsed={len(questions_all)}")

    store = CanonicalStore(load_settings().canonical_db_url)
    store.ensure_created()

    snapshot_payload = load_json(snapshot_path)
    # The profile is loaded but its content is discarded.
    # NOTE(review): presumably a parse check only; confirm.
    _ = load_json(profile_path)

    refresh_service = RefreshService.build()
    feature_service = FeatureService.build()
    risk_service = RiskService.build()

    # Ingestion runs before the engines; stats are collected after them.
    ingestion = validation_v1.ingest_slice_to_store(
        store=store,
        slice_payload=snapshot_payload,
        slice_start=str(snapshot_payload.get("selected_window_start", "")),
        slice_end_exclusive=str(snapshot_payload.get("selected_window_end_exclusive", "")),
    )
    feature_result = feature_service.run_feature_engine().to_dict()
    risk_result = risk_service.run_risk_engine().to_dict()

    store_metadata = validation_v1.build_store_metadata(
        refresh_stats=refresh_service.store_stats(),
        feature_stats=feature_service.stats(),
        risk_stats=risk_service.stats(),
        ontology_audit=validation_v1.run_ontology_mapping_audit(snapshot_payload),
    )

    subset_questions = [q for q in questions_all if q.question_id in PASS1_IDS]

    # "both" selects every scope; otherwise only the scope named by the mode.
    # The subset scope always comes before the full scope.
    available_scopes = (("subset", subset_questions), ("full", questions_all))
    selected_scopes: list[tuple[str, list[CreativeQuestion]]] = [
        (label, scoped) for label, scoped in available_scopes if args.mode in {label, "both"}
    ]

    window_key = str(snapshot_payload.get("selected_window_key", "unknown"))
    scope_results: list[dict[str, Any]] = [
        run_scope(
            scope_name=label,
            questions=scoped,
            slice_window_key=window_key,
            store_metadata=store_metadata,
            refresh_service=refresh_service,
            feature_service=feature_service,
            risk_service=risk_service,
            output_dir=output_dir,
            dataset_version=args.dataset_version,
            executor=args.executor,
        )
        for label, scoped in selected_scopes
    ]

    summary = {
        "status": "success",
        "tz_path": str(tz_path),
        "snapshot_path": str(snapshot_path),
        "profile_path": str(profile_path),
        "output_dir": str(output_dir),
        "questions_parsed_total": len(questions_all),
        "questions_subset_total": len(subset_questions),
        "ingestion": ingestion,
        "feature_result": feature_result,
        "risk_result": risk_result,
        "scope_results": scope_results,
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))
    return 0
|
||
|
||
|
||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|