# NODEDC_1C/scripts/agent_semantic_pack_builder.py

# 525 lines
# 24 KiB
# Python

from __future__ import annotations
import argparse
import json
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# Repository root: the parent of the directory that contains this script.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Directory scanned for truth-harness spec files (filtered by TRUTH_HARNESS_GLOB).
ORCHESTRATION_DIR = REPO_ROOT / "docs" / "orchestration"
# Directory whose *.json files are read as saved sessions.
SAVED_SESSIONS_DIR = REPO_ROOT / "llm_normalizer" / "data" / "autorun_generators" / "saved_sessions"
# Glob pattern, applied inside ORCHESTRATION_DIR, that selects truth-harness specs.
TRUTH_HARNESS_GLOB = "address_truth_harness*.json"
# Value written to the "schema_version" field of the generated catalog.
CATALOG_SCHEMA_VERSION = "agent_semantic_source_catalog_v1"
# ---------------------------------------------------------------------------
# Keyword tuples consumed by _base_step_tags.  Every needle is matched as a
# case-insensitive *substring* of "question | step_id | title | intents".
# NOTE(review): because matching is substring-based, very short needles such
# as "йо" also hit inside longer words (e.g. "район") - confirm this is intended.
# ---------------------------------------------------------------------------
# "остатк" (stem of "balances/stock"), "склад" ("warehouse"), "41 счет" ("account 41").
INVENTORY_KEYWORDS = ("остатк", "склад", "41 счет")
# "for which company", "for which organization", "across the shared base", "for which base".
META_SCOPE_KEYWORDS = ("по какой компании", "по какой организации", "по общей базе", "по какой базе")
# "what can you do", "what are you able to", "can you".
META_CAPABILITY_KEYWORDS = ("что ты умеешь", "что можешь", "умеешь")
# "do you remember", "what have we already found out", "on this item already".
META_MEMORY_KEYWORDS = ("ты помнишь", "что мы уже выяснили", "по этой позиции уже")
# Stem of "historical".
META_HISTORICAL_KEYWORDS = ("историческ",)
# Greetings / small talk: "hi", "how are you", "yo", "what's up", "hiya".
SMALLTALK_KEYWORDS = ("привет", "как дела", "йо", "че как", "приветик")
# "show all documents", "show all docs" (colloquial).
COUNTERPARTY_DOCS_KEYWORDS = ("покажи все документы", "покажи все доки")
# "what did (they) ship to us", "which goods or service".
COUNTERPARTY_SHIPMENT_KEYWORDS = ("что нам отгружал", "какой товар или услугу")
# "tails on account 60".
ACCOUNT_60_KEYWORDS = ("хвосты по счету 60",)
# "who owes us".
RECEIVABLES_KEYWORDS = ("кто нам должен",)
# Russian abbreviation for VAT.
VAT_KEYWORDS = ("ндс",)
# Registry of mixed-pack recipes, keyed by the recipe name passed to
# build_recipe_spec.  Each recipe carries the scenario metadata copied into the
# generated spec ("scenario_id", "domain", "title", "description", "bindings")
# plus an ordered "step_plan".  Every slot of the step plan has:
#   - "slot_id": label recorded in the generated step's "notes";
#   - "criticality": overrides the criticality of the selected source step;
#   - "preferred_candidate_ids": catalog entry ids ("<spec file stem>:<step_id>")
#     tried in order by _select_recipe_entry;
#   - "required_tags": semantic tags the selected entry must all carry; also
#     used to pick a fallback entry when no preferred candidate is usable;
#   - "override_fields" (optional): merged into the copied step payload.
RECIPE_LIBRARY: dict[str, dict[str, Any]] = {
"turnaround_11_phase7_meta_domain_mix": {
"scenario_id": "address_truth_harness_phase7_meta_domain_mix",
"domain": "address_phase7_meta_domain_mix",
"title": "Phase 7 mixed replay for documents, selected-object continuity, meta context, and cross-domain pivots",
"description": (
"Mixed AGENT replay for turnaround 11. The pack interleaves counterparty documents, "
"inventory root and selected-object continuity, meta-space interruptions, memory recap, "
"receivables to inventory same-date pivot, and an account 60 tail check."
),
"bindings": {},
# NOTE(review): slot ids jump from slot_01 to slot_03 - confirm that the
# absence of a slot_02 is intentional.
"step_plan": [
{
"slot_id": "slot_01_smalltalk",
"criticality": "info",
"preferred_candidate_ids": [
"address_truth_harness_phase6_provider_axis_mix:step_01_smalltalk",
"address_truth_harness_test2:step_01_chat_opening",
],
"required_tags": ["meta_smalltalk"],
},
{
"slot_id": "slot_03_counterparty_documents",
"criticality": "critical",
"preferred_candidate_ids": [
"address_truth_harness_phase4_coverage_evidence_mix:step_01_counterparty_documents",
"address_truth_harness_targeted_counterparty_tails:step_01_documents_by_counterparty",
],
"required_tags": ["counterparty_documents"],
},
{
"slot_id": "slot_04_counterparty_shipment_fallback",
"criticality": "critical",
"preferred_candidate_ids": [
"address_truth_harness_phase4_coverage_evidence_mix:step_02_counterparty_shipments_or_fallback",
"address_truth_harness_targeted_counterparty_tails:step_02_counterparty_item_flow",
],
"required_tags": ["counterparty_shipment_fallback"],
},
{
"slot_id": "slot_05_inventory_root",
"criticality": "critical",
"preferred_candidate_ids": [
"address_truth_harness_phase7_acceptance_gate_mix:step_01_inventory_march_2021",
"address_truth_harness_phase5_meta_memory_mix:step_01_inventory_root_march_2021",
],
"required_tags": ["inventory_root"],
},
{
"slot_id": "slot_06_selected_object_supplier",
"criticality": "critical",
"preferred_candidate_ids": [
"address_truth_harness_phase7_acceptance_gate_mix:step_02_selected_item_supplier",
"address_truth_harness_inventory_provenance_restore:step_02_selected_item_supplier",
],
"required_tags": ["selected_object_supplier"],
},
{
"slot_id": "slot_07_meta_capability",
"criticality": "warning",
"preferred_candidate_ids": [
"address_truth_harness_phase5_meta_memory_mix:step_05_capability_meta_interrupt",
"address_truth_harness_phase6_provider_axis_mix:step_03_capability_meta",
],
"required_tags": ["meta_capability"],
},
{
"slot_id": "slot_08_selected_object_documents",
"criticality": "critical",
"preferred_candidate_ids": [
"address_truth_harness_phase7_acceptance_gate_mix:step_03_selected_item_documents",
"address_truth_harness_inventory_provenance_restore:step_03_selected_item_documents",
],
"required_tags": ["selected_object_documents"],
},
{
"slot_id": "slot_09_meta_memory",
"criticality": "warning",
"preferred_candidate_ids": [
"address_truth_harness_phase5_meta_memory_mix:step_06_memory_recap_after_interrupts",
],
"required_tags": ["meta_memory"],
# The only slot with overrides.  The two strings look like case-insensitive
# regexes ("I remember" and an item name with literal asterisks escaped);
# presumably every pattern must match the answer - the consumer of
# "required_answer_patterns_all" is not part of this file, TODO confirm.
"override_fields": {
"required_answer_patterns_all": [
"(?i)помню",
"(?i)столешница 600\\*3050\\*26 альмандин"
]
},
},
{
"slot_id": "slot_10_same_date_restore",
"criticality": "critical",
"preferred_candidate_ids": [
"address_truth_harness_phase7_acceptance_gate_mix:step_04_inventory_same_date_restore",
"address_truth_harness_phase4_coverage_evidence_mix:step_05_inventory_same_date_restore",
],
"required_tags": ["same_date_restore"],
},
{
"slot_id": "slot_11_receivables_root",
"criticality": "critical",
"preferred_candidate_ids": [
"address_truth_harness_test2:step_02_receivables_march_2020",
],
"required_tags": ["settlements_receivables"],
},
{
"slot_id": "slot_12_same_date_pivot",
"criticality": "critical",
"preferred_candidate_ids": [
"address_truth_harness_test2:step_03_inventory_same_date",
],
"required_tags": ["same_date_pivot"],
},
{
"slot_id": "slot_13_meta_historical",
"criticality": "warning",
"preferred_candidate_ids": [
"address_truth_harness_phase7_acceptance_gate_mix:step_06_historical_capability_followup",
"address_truth_harness_phase5_meta_memory_mix:step_02_inventory_history_capability_followup",
],
"required_tags": ["meta_historical_capability"],
},
{
"slot_id": "slot_14_account_60",
"criticality": "critical",
"preferred_candidate_ids": [
"address_truth_harness_targeted_counterparty_tails:step_04_open_items_account_60",
],
"required_tags": ["settlements_account_60"],
},
{
"slot_id": "slot_15_meta_scope",
"criticality": "warning",
"preferred_candidate_ids": [
"address_truth_harness_phase6_provider_axis_mix:step_02_data_scope_meta",
"address_truth_harness_phase5_meta_memory_mix:step_03_data_scope_meta_interrupt",
],
"required_tags": ["meta_scope"],
},
],
}
}
def read_json(path: Path) -> Any:
    """Parse the JSON document stored at ``path``.

    The file is decoded as ``utf-8-sig`` so a leading UTF-8 byte-order mark,
    if present, is skipped before parsing.
    """
    with path.open("r", encoding="utf-8-sig") as handle:
        return json.load(handle)
def write_json(path: Path, payload: Any) -> None:
    """Serialize ``payload`` as pretty-printed JSON into ``path``.

    Missing parent directories are created.  Output is UTF-8 with non-ASCII
    characters kept verbatim, a 2-space indent, ``\\n`` line endings and a
    single trailing newline.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Serialize before opening the file so a serialization error cannot
    # leave a truncated file behind.
    document = json.dumps(payload, ensure_ascii=False, indent=2) + "\n"
    with path.open("w", encoding="utf-8", newline="\n") as handle:
        handle.write(document)
def write_text(path: Path, content: str) -> None:
    """Write ``content`` to ``path`` as UTF-8, creating parent directories.

    Newlines in ``content`` are written as ``\\n`` without platform
    translation.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8", newline="\n") as handle:
        handle.write(content)
def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with second precision."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat(timespec="seconds")
def _normalize_tags(raw_tags: list[str]) -> list[str]:
    """Return the tags stripped and lower-cased, de-duplicated in first-seen order.

    Falsy items (``None``, ``""``, ...) and items that are blank after
    stripping are dropped.
    """
    cleaned = (str(item or "").strip().lower() for item in raw_tags)
    # dict keys keep insertion order, so fromkeys() removes duplicates while
    # preserving the position of each tag's first occurrence.
    return list(dict.fromkeys(label for label in cleaned if label))
def _contains_any(text: str, needles: tuple[str, ...]) -> bool:
    """Report whether any needle occurs in ``text`` as a case-insensitive substring.

    Both sides are compared after ``str.casefold``.  An empty ``needles``
    tuple yields ``False``.
    """
    folded_text = text.casefold()
    for candidate in needles:
        if candidate.casefold() in folded_text:
            return True
    return False
def _base_step_tags(question: str, step_id: str, title: str, expected_intents: list[str]) -> list[str]:
    """Derive heuristic semantic tags for one step or question.

    Keyword tuples are searched case-insensitively in the combined text
    ``"question | step_id | title | intents"``; fragment checks against
    ``step_id`` are case-sensitive.  Rules are evaluated in a fixed order and
    the result is passed through ``_normalize_tags``, so the output is
    de-duplicated with tags in first-match order.
    """
    haystack = " | ".join((question, step_id, title, " ".join(expected_intents)))

    def mentions(keywords: tuple[str, ...]) -> bool:
        # Case-insensitive keyword hit anywhere in the combined text.
        return _contains_any(haystack, keywords)

    def id_has(*fragments: str) -> bool:
        # Case-sensitive fragment hit in the step id.
        return any(fragment in step_id for fragment in fragments)

    def expects(intent: str) -> bool:
        return intent in expected_intents

    # (condition, tags to add) pairs; the order defines the output tag order.
    rules: list[tuple[bool, tuple[str, ...]]] = [
        (mentions(SMALLTALK_KEYWORDS), ("meta_smalltalk",)),
        (mentions(META_SCOPE_KEYWORDS) or id_has("data_scope"), ("meta_scope",)),
        (mentions(META_CAPABILITY_KEYWORDS) or id_has("capability_meta"), ("meta_capability",)),
        (mentions(META_MEMORY_KEYWORDS) or id_has("memory"), ("meta_memory",)),
        (mentions(META_HISTORICAL_KEYWORDS) or id_has("historical"), ("meta_historical_capability",)),
        (
            mentions(COUNTERPARTY_DOCS_KEYWORDS)
            or id_has("counterparty_documents", "documents_by_counterparty"),
            ("counterparty_documents",),
        ),
        (
            mentions(COUNTERPARTY_SHIPMENT_KEYWORDS)
            or id_has("counterparty_shipments", "counterparty_item_flow"),
            ("counterparty_shipment_fallback",),
        ),
        (expects("inventory_on_hand_as_of_date") or mentions(INVENTORY_KEYWORDS), ("inventory_root",)),
        (id_has("same_date_restore") or "same_date_restore" in title.casefold(), ("same_date_restore",)),
        # A same-date inventory step counts as a pivot only when it is not a restore step.
        (id_has("inventory_same_date") and not id_has("restore"), ("same_date_pivot",)),
        (
            expects("receivables_confirmed_as_of_date") or mentions(RECEIVABLES_KEYWORDS),
            ("settlements_receivables",),
        ),
        (id_has("open_items_account_60") or mentions(ACCOUNT_60_KEYWORDS), ("settlements_account_60",)),
        (id_has("vat") or mentions(VAT_KEYWORDS), ("vat",)),
        (expects("inventory_purchase_provenance_for_item"), ("selected_object", "selected_object_supplier")),
        (expects("inventory_purchase_documents_for_item"), ("selected_object", "selected_object_documents")),
        (expects("inventory_sale_trace_for_item"), ("selected_object", "selected_object_sale")),
        (id_has("selected_item") or "по выбранному объекту" in question.casefold(), ("selected_object",)),
    ]

    collected: list[str] = []
    for applies, labels in rules:
        if applies:
            collected.extend(labels)
    return _normalize_tags(collected)
def classify_truth_harness_step(spec_path: Path, spec: dict[str, Any], step: dict[str, Any]) -> dict[str, Any]:
    """Build the catalog entry for one step of a truth-harness spec.

    Args:
        spec_path: Location of the spec file; must lie under ``REPO_ROOT``.
        spec: Parsed spec document; ``title``, ``scenario_id`` and ``domain``
            are copied into the entry.
        step: The raw step mapping from the spec.  It is stored by reference
            under ``"step_payload"``, not copied.

    Returns:
        A catalog entry whose ``entry_id`` is ``"<spec file stem>:<step_id>"``
        and whose ``semantic_tags`` combine the step's own ``semantic_tags``
        with the heuristic tags from ``_base_step_tags``.  The tag
        ``"planner_catalog_alignment"`` is appended when any of the
        ``expected_catalog_*`` fields is set.

    Raises:
        ValueError: If ``spec_path`` is not located under ``REPO_ROOT``.
    """
    question = str(step.get("question") or "").strip()
    step_id = str(step.get("step_id") or "").strip()
    # Fall back to the step id when the title is missing or blank.
    title = str(step.get("title") or step_id).strip() or step_id
    expected_intents = [str(item).strip() for item in (step.get("expected_intents") or []) if str(item).strip()]
    expected_catalog_alignment_status = str(step.get("expected_catalog_alignment_status") or "").strip()
    expected_catalog_chain_top_match = str(step.get("expected_catalog_chain_top_match") or "").strip()
    # Kept as-is (not coerced) because an explicit False is a meaningful expectation.
    expected_catalog_selected_matches_top = step.get("expected_catalog_selected_matches_top")
    # `or []` instead of a .get() default: a spec may carry an explicit
    # `"semantic_tags": null`, which a default value would not replace and
    # which cannot be unpacked.
    declared_tags = step.get("semantic_tags") or []
    semantic_tag_inputs = [*declared_tags, *_base_step_tags(question, step_id, title, expected_intents)]
    if (
        expected_catalog_alignment_status
        or expected_catalog_chain_top_match
        or expected_catalog_selected_matches_top is not None
    ):
        semantic_tag_inputs.append("planner_catalog_alignment")
    semantic_tags = _normalize_tags(semantic_tag_inputs)
    return {
        "entry_id": f"{spec_path.stem}:{step_id}",
        "source_type": "truth_harness_step",
        # Forward slashes keep the catalog identical across operating systems.
        "source_file": str(spec_path.relative_to(REPO_ROOT)).replace("\\", "/"),
        "source_title": spec.get("title"),
        "scenario_id": spec.get("scenario_id"),
        "domain": spec.get("domain"),
        "reusable_in_agent_pack": True,
        "step_id": step_id,
        "title": title,
        "question": question,
        "criticality": str(step.get("criticality") or "critical"),
        "expected_intents": expected_intents,
        "expected_catalog_alignment_status": expected_catalog_alignment_status or None,
        "expected_catalog_chain_top_match": expected_catalog_chain_top_match or None,
        "expected_catalog_selected_matches_top": expected_catalog_selected_matches_top,
        "semantic_tags": semantic_tags,
        "step_payload": step,
    }
def classify_saved_session_question(session_path: Path, session: dict[str, Any], question: str, index: int) -> dict[str, Any]:
    """Build the catalog entry for one question of a saved session.

    The entry is marked ``reusable_in_agent_pack = False`` and has fixed
    ``"info"`` criticality and no expected intents.  Its tags come from
    ``_base_step_tags`` using a synthetic step id
    ``saved_session_qNN`` and the session title.

    Raises:
        ValueError: If ``session_path`` is not located under ``REPO_ROOT``.
    """
    synthetic_step_id = f"saved_session_q{index:02d}"
    session_title = session.get("title")
    heuristic_tags = _base_step_tags(
        question=question,
        step_id=synthetic_step_id,
        title=str(session_title or ""),
        expected_intents=[],
    )
    relative_source = str(session_path.relative_to(REPO_ROOT)).replace("\\", "/")
    # The agent-run flag may sit at the top level or inside "context".
    nested_context = session.get("context") or {}
    is_agent_run = bool(session.get("agent_run") or nested_context.get("agent_run"))
    return {
        "entry_id": f"{session_path.stem}:q{index:02d}",
        "source_type": "saved_session_question",
        "source_file": relative_source,
        "source_title": session_title,
        "scenario_id": session.get("generation_id"),
        "domain": session.get("domain"),
        "reusable_in_agent_pack": False,
        "step_id": synthetic_step_id,
        "title": f"Saved session question {index}",
        "question": question,
        "criticality": "info",
        "expected_intents": [],
        "semantic_tags": heuristic_tags,
        "session_mode": session.get("mode"),
        "agent_run": is_agent_run,
    }
def build_source_catalog() -> dict[str, Any]:
    """Scan harness specs and saved sessions and assemble the source catalog.

    Truth-harness specs are read from ``ORCHESTRATION_DIR`` (files matching
    ``TRUTH_HARNESS_GLOB``) and saved sessions from every ``*.json`` file in
    ``SAVED_SESSIONS_DIR``, both in sorted path order.  Steps that are not
    mappings or have no question, and blank session questions, are skipped.

    Returns:
        A mapping with ``schema_version``, ``generated_at``, a ``summary``
        (entry totals plus per-tag counts sorted by tag) and the two entry
        lists.
    """
    harness_records: list[dict[str, Any]] = []
    for spec_file in sorted(ORCHESTRATION_DIR.glob(TRUTH_HARNESS_GLOB)):
        spec_doc = read_json(spec_file)
        harness_records.extend(
            classify_truth_harness_step(spec_file, spec_doc, raw_step)
            for raw_step in spec_doc.get("steps") or []
            if isinstance(raw_step, dict) and raw_step.get("question")
        )

    session_records: list[dict[str, Any]] = []
    for session_file in sorted(SAVED_SESSIONS_DIR.glob("*.json")):
        session_doc = read_json(session_file)
        # Positions are 1-based and count blank questions too, so ids stay
        # aligned with the question's place in the session.
        for position, raw_question in enumerate(session_doc.get("questions") or [], start=1):
            stripped = str(raw_question or "").strip()
            if not stripped:
                continue
            session_records.append(
                classify_saved_session_question(session_file, session_doc, stripped, position)
            )

    all_tag_counts: Counter[str] = Counter()
    reusable_tag_counts: Counter[str] = Counter()
    for record in harness_records + session_records:
        record_tags = record.get("semantic_tags") or []
        all_tag_counts.update(record_tags)
        if record.get("reusable_in_agent_pack"):
            reusable_tag_counts.update(record_tags)

    return {
        "schema_version": CATALOG_SCHEMA_VERSION,
        "generated_at": _now_iso(),
        "summary": {
            "truth_harness_steps_total": len(harness_records),
            "saved_session_questions_total": len(session_records),
            "reusable_truth_harness_tags": dict(sorted(reusable_tag_counts.items())),
            "all_tags": dict(sorted(all_tag_counts.items())),
        },
        "truth_harness_entries": harness_records,
        "saved_session_entries": session_records,
    }
def _catalog_markdown(catalog: dict[str, Any]) -> str:
    """Render the catalog as a Markdown summary.

    The document lists the two totals, the reusable tag counts, one bullet
    per truth-harness entry (with its catalog-alignment expectations, when
    any are set) and one bullet per saved-session question.  The result is
    stripped and ends with exactly one newline.
    """
    raw_summary = catalog.get("summary")
    summary = raw_summary if isinstance(raw_summary, dict) else {}

    out = [
        "# Agent semantic source catalog",
        "",
        f"- truth_harness_steps_total: `{summary.get('truth_harness_steps_total', 0)}`",
        f"- saved_session_questions_total: `{summary.get('saved_session_questions_total', 0)}`",
        "",
        "## Reusable truth-harness tags",
    ]
    tag_counts = summary.get("reusable_truth_harness_tags") or {}
    out += [f"- `{tag}`: `{count}`" for tag, count in tag_counts.items()]

    out += ["", "## Reusable truth-harness steps"]
    for item in catalog.get("truth_harness_entries") or []:
        tag_text = ", ".join(item.get("semantic_tags") or []) or "none"
        status = item.get("expected_catalog_alignment_status")
        top_match = item.get("expected_catalog_chain_top_match")
        selected_matches = item.get("expected_catalog_selected_matches_top")
        alignment: list[str] = []
        if status:
            alignment.append(f"status={status}")
        if top_match:
            alignment.append(f"top={top_match}")
        # Compared against None so that an explicit False is still reported.
        if selected_matches is not None:
            alignment.append(f"selected_matches_top={selected_matches}")
        suffix = ""
        if alignment:
            suffix = f" | catalog_alignment: {', '.join(alignment)}"
        out.append(
            f"- `{item.get('entry_id')}` | tags: {tag_text}{suffix} | question: {item.get('question')}"
        )

    out += ["", "## Saved session questions"]
    for item in catalog.get("saved_session_entries") or []:
        tag_text = ", ".join(item.get("semantic_tags") or []) or "none"
        out.append(f"- `{item.get('entry_id')}` | tags: {tag_text} | question: {item.get('question')}")

    return "\n".join(out).strip() + "\n"
def _find_catalog_entry(catalog: dict[str, Any], entry_id: str) -> dict[str, Any] | None:
    """Return the first truth-harness entry whose ``entry_id`` equals ``entry_id``.

    Only ``catalog["truth_harness_entries"]`` is searched; ``None`` is
    returned when nothing matches.
    """
    harness_entries = catalog.get("truth_harness_entries") or []
    matches = (item for item in harness_entries if item.get("entry_id") == entry_id)
    return next(matches, None)
def _entry_matches_tags(entry: dict[str, Any], required_tags: list[str]) -> bool:
    """Report whether the entry's ``semantic_tags`` include every required tag.

    An empty ``required_tags`` list always matches.
    """
    available = set(entry.get("semantic_tags") or [])
    return available.issuperset(required_tags)
def _select_recipe_entry(
    catalog: dict[str, Any],
    slot: dict[str, Any],
    used_entry_ids: set[str],
) -> dict[str, Any]:
    """Pick the catalog entry that fills one recipe slot.

    The slot's ``preferred_candidate_ids`` are tried in order; the first one
    that exists, is reusable, has not been used yet and carries all
    ``required_tags`` wins.  Otherwise the first truth-harness entry in
    catalog order meeting the same conditions is returned.

    Raises:
        RuntimeError: If no entry satisfies the slot.
    """
    preferred_ids = [
        text
        for text in (str(raw).strip() for raw in slot.get("preferred_candidate_ids") or [])
        if text
    ]
    required_tags = [
        text.lower()
        for text in (str(raw).strip() for raw in slot.get("required_tags") or [])
        if text
    ]

    def usable(candidate: dict[str, Any]) -> bool:
        # Shared acceptance test for both the preferred and the fallback path.
        # With no required tags the tag check is vacuously true.
        return bool(
            candidate.get("reusable_in_agent_pack")
            and candidate.get("entry_id") not in used_entry_ids
            and _entry_matches_tags(candidate, required_tags)
        )

    for wanted_id in preferred_ids:
        hit = _find_catalog_entry(catalog, wanted_id)
        if hit and usable(hit):
            return hit

    fallback = next(
        (item for item in catalog.get("truth_harness_entries") or [] if usable(item)),
        None,
    )
    if fallback is None:
        raise RuntimeError(
            f"Could not resolve slot `{slot.get('slot_id')}` with tags {required_tags or ['<none>']}"
        )
    return fallback
def build_recipe_spec(catalog: dict[str, Any], recipe_name: str) -> dict[str, Any]:
    """Assemble a truth-harness spec from catalog entries according to a recipe.

    For each slot of the recipe's ``step_plan`` one not-yet-used catalog entry
    is selected, its step payload is shallow-copied, and the copy receives the
    slot's criticality, the merged semantic tags, any ``override_fields`` and
    a ``notes`` suffix recording the slot and source entry.

    Args:
        catalog: Catalog as produced by ``build_source_catalog``.
        recipe_name: Key into ``RECIPE_LIBRARY``.

    Returns:
        A spec mapping with ``schema_version`` ``"domain_truth_harness_spec_v1"``,
        the recipe's scenario metadata and the generated ``steps``.

    Raises:
        RuntimeError: If the recipe is unknown or a slot cannot be resolved.
    """
    recipe = RECIPE_LIBRARY.get(recipe_name)
    if recipe is None:
        supported = ", ".join(sorted(RECIPE_LIBRARY))
        raise RuntimeError(f"Unknown recipe `{recipe_name}`. Supported recipes: {supported}")
    used_entry_ids: set[str] = set()
    steps: list[dict[str, Any]] = []
    for slot in recipe.get("step_plan") or []:
        entry = _select_recipe_entry(catalog, slot, used_entry_ids)
        # Remember the entry so a later slot cannot reuse the same source step.
        used_entry_ids.add(str(entry["entry_id"]))
        # Shallow copy: top-level keys are rewritten below without touching
        # the payload stored in the catalog.
        payload = dict(entry["step_payload"])
        payload["criticality"] = str(slot.get("criticality") or payload.get("criticality") or "critical")
        # `or []` also covers an explicit `"semantic_tags": null` in the
        # source step, which a .get() default would not replace.
        payload["semantic_tags"] = _normalize_tags(
            [*(payload.get("semantic_tags") or []), *(entry.get("semantic_tags") or [])]
        )
        override_fields = slot.get("override_fields")
        if isinstance(override_fields, dict):
            payload.update(override_fields)
        # Appended after overrides so the provenance marker is always present.
        payload["notes"] = (
            f"{str(payload.get('notes') or '').strip()} "
            f"[mixed_pack_slot={slot.get('slot_id')} source={entry.get('entry_id')}]"
        ).strip()
        steps.append(payload)
    return {
        "schema_version": "domain_truth_harness_spec_v1",
        "scenario_id": recipe["scenario_id"],
        "domain": recipe["domain"],
        "title": recipe["title"],
        "description": recipe["description"],
        "bindings": recipe.get("bindings") or {},
        "steps": steps,
    }
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command line of the pack-builder CLI.

    Two sub-commands are defined: ``inventory`` (options ``--output-json``
    and ``--output-md``) and ``build-pack`` (required ``--recipe`` and
    optional ``--output-spec``).  The chosen sub-command is stored in
    ``command``.

    Args:
        argv: Argument list to parse instead of the process arguments.
            ``None`` (the default) makes argparse read ``sys.argv[1:]``.

    Returns:
        The parsed namespace.
    """
    parser = argparse.ArgumentParser(description="Inventory reusable semantic sources and build mixed AGENT replay packs.")
    subparsers = parser.add_subparsers(dest="command", required=True)
    inventory_parser = subparsers.add_parser("inventory", help="Build a source catalog from harness specs and saved sessions.")
    inventory_parser.add_argument(
        "--output-json",
        default="docs/orchestration/agent_semantic_source_catalog.json",
        help="Where to write the catalog JSON.",
    )
    inventory_parser.add_argument(
        "--output-md",
        default="docs/orchestration/agent_semantic_source_catalog.md",
        help="Where to write the catalog markdown summary.",
    )
    build_parser = subparsers.add_parser("build-pack", help="Build a mixed truth-harness pack from catalogued sources.")
    build_parser.add_argument("--recipe", required=True, help="Recipe name to build.")
    build_parser.add_argument(
        "--output-spec",
        default="docs/orchestration/address_truth_harness_phase7_meta_domain_mix.json",
        help="Where to write the generated truth-harness spec.",
    )
    return parser.parse_args(argv)
def _resolve_repo_path(raw_path: str) -> Path:
    """Turn a CLI path argument into a path usable from any working directory.

    Absolute paths are returned unchanged; relative paths are anchored at
    ``REPO_ROOT`` and resolved.
    """
    candidate = Path(raw_path)
    if candidate.is_absolute():
        return candidate
    return (REPO_ROOT / candidate).resolve()
def run_inventory(args: argparse.Namespace) -> int:
    """Handle the ``inventory`` command.

    Builds the source catalog, writes it as JSON to ``args.output_json`` and
    as a Markdown summary to ``args.output_md``, and prints the written
    locations plus the sorted reusable tag names.

    Returns:
        ``0`` as the process exit code.
    """
    catalog = build_source_catalog()
    json_target = _resolve_repo_path(args.output_json)
    markdown_target = _resolve_repo_path(args.output_md)

    write_json(json_target, catalog)
    write_text(markdown_target, _catalog_markdown(catalog))

    print(f"Catalog written to {json_target}")
    print(f"Summary written to {markdown_target}")
    reusable_tags = (catalog.get("summary") or {}).get("reusable_truth_harness_tags", {})
    print("Reusable truth-harness tags:", ", ".join(sorted(reusable_tags.keys())))
    return 0
def run_build_pack(args: argparse.Namespace) -> int:
    """Handle the ``build-pack`` command.

    Rebuilds the source catalog, assembles the spec for ``args.recipe``,
    writes it to ``args.output_spec`` and prints the location, scenario id
    and step count.

    Returns:
        ``0`` as the process exit code.
    """
    pack_spec = build_recipe_spec(build_source_catalog(), args.recipe)
    spec_target = _resolve_repo_path(args.output_spec)
    write_json(spec_target, pack_spec)

    scenario_id = pack_spec["scenario_id"]
    step_count = len(pack_spec["steps"])
    print(f"Mixed pack written to {spec_target}")
    print(f"scenario_id={scenario_id}")
    print(f"steps={step_count}")
    return 0
def main() -> int:
    """Parse the command line and dispatch to the matching command handler.

    Returns:
        The exit code returned by the handler.

    Raises:
        RuntimeError: If the parsed command has no handler.
    """
    args = parse_args()
    handlers = {
        "inventory": run_inventory,
        "build-pack": run_build_pack,
    }
    handler = handlers.get(args.command)
    if handler is None:
        raise RuntimeError(f"Unsupported command: {args.command}")
    return handler(args)


if __name__ == "__main__":
    raise SystemExit(main())