# NODEDC_1C/scripts/agent_semantic_pack_builder.py

from __future__ import annotations

import argparse
import json
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


# Repository root: the directory two levels above this file.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Directory that build_source_catalog() scans for truth-harness spec files.
ORCHESTRATION_DIR = REPO_ROOT / "docs" / "orchestration"
# Directory whose *.json files build_source_catalog() reads as saved sessions.
SAVED_SESSIONS_DIR = REPO_ROOT / "llm_normalizer" / "data" / "autorun_generators" / "saved_sessions"
# Glob pattern selecting truth-harness spec files inside ORCHESTRATION_DIR.
# NOTE(review): the default `--output-spec` of the `build-pack` command lives in
# ORCHESTRATION_DIR and matches this pattern, so a generated pack written there is
# picked up by later catalog builds — confirm that this is intended.
TRUTH_HARNESS_GLOB = "address_truth_harness*.json"
# Value stored in the "schema_version" field of the generated catalog.
CATALOG_SCHEMA_VERSION = "agent_semantic_source_catalog_v1"

# Keyword groups used by _base_step_tags(). Every needle is matched as a
# case-insensitive substring (see _contains_any) of the step's question, step id,
# title and expected intents joined into one string. The trailing comment on each
# group names the semantic tag it produces.
# NOTE(review): because matching is by substring, short needles such as "йо" or
# "ндс" also match inside longer words — confirm that this is acceptable.
INVENTORY_KEYWORDS = ("остатк", "склад", "41 счет")  # -> "inventory_root"
META_SCOPE_KEYWORDS = ("по какой компании", "по какой организации", "по общей базе", "по какой базе")  # -> "meta_scope"
META_CAPABILITY_KEYWORDS = ("что ты умеешь", "что можешь", "умеешь")  # -> "meta_capability"
META_MEMORY_KEYWORDS = ("ты помнишь", "что мы уже выяснили", "по этой позиции уже")  # -> "meta_memory"
META_HISTORICAL_KEYWORDS = ("историческ",)  # -> "meta_historical_capability"
SMALLTALK_KEYWORDS = ("привет", "как дела", "йо", "че как", "приветик")  # -> "meta_smalltalk"
COUNTERPARTY_DOCS_KEYWORDS = ("покажи все документы", "покажи все доки")  # -> "counterparty_documents"
COUNTERPARTY_SHIPMENT_KEYWORDS = ("что нам отгружал", "какой товар или услугу")  # -> "counterparty_shipment_fallback"
ACCOUNT_60_KEYWORDS = ("хвосты по счету 60",)  # -> "settlements_account_60"
RECEIVABLES_KEYWORDS = ("кто нам должен",)  # -> "settlements_receivables"
VAT_KEYWORDS = ("ндс",)  # -> "vat"

# Recipes for mixed replay packs, keyed by the name passed to `build-pack --recipe`.
#
# Each recipe supplies the top-level fields of the generated spec ("scenario_id",
# "domain", "title", "description", "bindings") plus a "step_plan": an ordered list
# of slots, each resolved to one catalog entry by _select_recipe_entry().
#
# Slot fields:
#   slot_id                 - label recorded in the generated step's "notes".
#   criticality             - takes precedence over the source step's criticality.
#   preferred_candidate_ids - catalog entry ids ("<spec file stem>:<step_id>") tried
#                             first, in the listed order.
#   required_tags           - semantic tags that a candidate entry must all carry.
#   override_fields         - optional; merged into the copied step payload.
#
# NOTE(review): slot ids jump from slot_01 to slot_03 (there is no slot_02) —
# confirm that the gap is intentional.
RECIPE_LIBRARY: dict[str, dict[str, Any]] = {
    "turnaround_11_phase7_meta_domain_mix": {
        "scenario_id": "address_truth_harness_phase7_meta_domain_mix",
        "domain": "address_phase7_meta_domain_mix",
        "title": "Phase 7 mixed replay for documents, selected-object continuity, meta context, and cross-domain pivots",
        "description": (
            "Mixed AGENT replay for turnaround 11. The pack interleaves counterparty documents, "
            "inventory root and selected-object continuity, meta-space interruptions, memory recap, "
            "receivables to inventory same-date pivot, and an account 60 tail check."
        ),
        "bindings": {},
        "step_plan": [
            {
                "slot_id": "slot_01_smalltalk",
                "criticality": "info",
                "preferred_candidate_ids": [
                    "address_truth_harness_phase6_provider_axis_mix:step_01_smalltalk",
                    "address_truth_harness_test2:step_01_chat_opening",
                ],
                "required_tags": ["meta_smalltalk"],
            },
            {
                "slot_id": "slot_03_counterparty_documents",
                "criticality": "critical",
                "preferred_candidate_ids": [
                    "address_truth_harness_phase4_coverage_evidence_mix:step_01_counterparty_documents",
                    "address_truth_harness_targeted_counterparty_tails:step_01_documents_by_counterparty",
                ],
                "required_tags": ["counterparty_documents"],
            },
            {
                "slot_id": "slot_04_counterparty_shipment_fallback",
                "criticality": "critical",
                "preferred_candidate_ids": [
                    "address_truth_harness_phase4_coverage_evidence_mix:step_02_counterparty_shipments_or_fallback",
                    "address_truth_harness_targeted_counterparty_tails:step_02_counterparty_item_flow",
                ],
                "required_tags": ["counterparty_shipment_fallback"],
            },
            {
                "slot_id": "slot_05_inventory_root",
                "criticality": "critical",
                "preferred_candidate_ids": [
                    "address_truth_harness_phase7_acceptance_gate_mix:step_01_inventory_march_2021",
                    "address_truth_harness_phase5_meta_memory_mix:step_01_inventory_root_march_2021",
                ],
                "required_tags": ["inventory_root"],
            },
            {
                "slot_id": "slot_06_selected_object_supplier",
                "criticality": "critical",
                "preferred_candidate_ids": [
                    "address_truth_harness_phase7_acceptance_gate_mix:step_02_selected_item_supplier",
                    "address_truth_harness_inventory_provenance_restore:step_02_selected_item_supplier",
                ],
                "required_tags": ["selected_object_supplier"],
            },
            {
                "slot_id": "slot_07_meta_capability",
                "criticality": "warning",
                "preferred_candidate_ids": [
                    "address_truth_harness_phase5_meta_memory_mix:step_05_capability_meta_interrupt",
                    "address_truth_harness_phase6_provider_axis_mix:step_03_capability_meta",
                ],
                "required_tags": ["meta_capability"],
            },
            {
                "slot_id": "slot_08_selected_object_documents",
                "criticality": "critical",
                "preferred_candidate_ids": [
                    "address_truth_harness_phase7_acceptance_gate_mix:step_03_selected_item_documents",
                    "address_truth_harness_inventory_provenance_restore:step_03_selected_item_documents",
                ],
                "required_tags": ["selected_object_documents"],
            },
            {
                "slot_id": "slot_09_meta_memory",
                "criticality": "warning",
                "preferred_candidate_ids": [
                    "address_truth_harness_phase5_meta_memory_mix:step_06_memory_recap_after_interrupts",
                ],
                "required_tags": ["meta_memory"],
                "override_fields": {
                    "required_answer_patterns_all": [
                        "(?i)помню",
                        "(?i)столешница 600\\*3050\\*26 альмандин"
                    ]
                },
            },
            {
                "slot_id": "slot_10_same_date_restore",
                "criticality": "critical",
                "preferred_candidate_ids": [
                    "address_truth_harness_phase7_acceptance_gate_mix:step_04_inventory_same_date_restore",
                    "address_truth_harness_phase4_coverage_evidence_mix:step_05_inventory_same_date_restore",
                ],
                "required_tags": ["same_date_restore"],
            },
            {
                "slot_id": "slot_11_receivables_root",
                "criticality": "critical",
                "preferred_candidate_ids": [
                    "address_truth_harness_test2:step_02_receivables_march_2020",
                ],
                "required_tags": ["settlements_receivables"],
            },
            {
                "slot_id": "slot_12_same_date_pivot",
                "criticality": "critical",
                "preferred_candidate_ids": [
                    "address_truth_harness_test2:step_03_inventory_same_date",
                ],
                "required_tags": ["same_date_pivot"],
            },
            {
                "slot_id": "slot_13_meta_historical",
                "criticality": "warning",
                "preferred_candidate_ids": [
                    "address_truth_harness_phase7_acceptance_gate_mix:step_06_historical_capability_followup",
                    "address_truth_harness_phase5_meta_memory_mix:step_02_inventory_history_capability_followup",
                ],
                "required_tags": ["meta_historical_capability"],
            },
            {
                "slot_id": "slot_14_account_60",
                "criticality": "critical",
                "preferred_candidate_ids": [
                    "address_truth_harness_targeted_counterparty_tails:step_04_open_items_account_60",
                ],
                "required_tags": ["settlements_account_60"],
            },
            {
                "slot_id": "slot_15_meta_scope",
                "criticality": "warning",
                "preferred_candidate_ids": [
                    "address_truth_harness_phase6_provider_axis_mix:step_02_data_scope_meta",
                    "address_truth_harness_phase5_meta_memory_mix:step_03_data_scope_meta_interrupt",
                ],
                "required_tags": ["meta_scope"],
            },
        ],
    }
}


def read_json(path: Path) -> Any:
    """Read the file at *path* and return its parsed JSON content.

    The text is decoded as ``utf-8-sig``, so a leading UTF-8 byte-order mark
    is dropped before parsing.
    """
    document_text = path.read_text(encoding="utf-8-sig")
    return json.loads(document_text)


def write_json(path: Path, payload: Any) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8", newline="\n")


def write_text(path: Path, content: str) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8", newline="\n")


def _now_iso() -> str:
    """Return the current UTC time as an ISO 8601 string with whole-second precision."""
    current = datetime.now(tz=timezone.utc)
    whole_seconds = current.replace(microsecond=0)
    return whole_seconds.isoformat()


def _normalize_tags(raw_tags: list[str]) -> list[str]:
    """Return the tags stripped, lower-cased and de-duplicated.

    Falsy items and items that are empty after stripping are dropped. The
    first occurrence of each tag determines its position in the result.
    """
    cleaned = (str(candidate or "").strip().lower() for candidate in raw_tags)
    # dict keys keep insertion order, so fromkeys() removes later duplicates
    # while preserving the order of first appearance.
    return list(dict.fromkeys(tag for tag in cleaned if tag))


def _contains_any(text: str, needles: tuple[str, ...]) -> bool:
    """Return True when at least one needle occurs in *text*, ignoring case.

    Both sides are compared after ``str.casefold()``; matching is by plain
    substring, so a needle may match in the middle of a longer word.
    """
    folded_text = text.casefold()
    for needle in needles:
        if needle.casefold() in folded_text:
            return True
    return False


def _base_step_tags(question: str, step_id: str, title: str, expected_intents: list[str]) -> list[str]:
    """Derive semantic tags for a step from its text, id and expected intents.

    Keyword groups are matched case-insensitively against the question, step
    id, title and expected intents joined into one string. Step-id fragments
    are matched case-sensitively against *step_id* alone.

    Returns:
        The matching tags, normalized and de-duplicated by ``_normalize_tags``,
        in the order the rules below are listed.
    """
    searchable = " | ".join((question, step_id, title, " ".join(expected_intents)))

    def mentions(keywords: tuple[str, ...]) -> bool:
        return _contains_any(searchable, keywords)

    def step_has(*fragments: str) -> bool:
        return any(fragment in step_id for fragment in fragments)

    def expects(intent_name: str) -> bool:
        return intent_name in expected_intents

    # Ordered (condition, tags) table; the order fixes the order of the result.
    rules: list[tuple[bool, tuple[str, ...]]] = [
        (mentions(SMALLTALK_KEYWORDS), ("meta_smalltalk",)),
        (mentions(META_SCOPE_KEYWORDS) or step_has("data_scope"), ("meta_scope",)),
        (mentions(META_CAPABILITY_KEYWORDS) or step_has("capability_meta"), ("meta_capability",)),
        (mentions(META_MEMORY_KEYWORDS) or step_has("memory"), ("meta_memory",)),
        (mentions(META_HISTORICAL_KEYWORDS) or step_has("historical"), ("meta_historical_capability",)),
        (
            mentions(COUNTERPARTY_DOCS_KEYWORDS)
            or step_has("counterparty_documents", "documents_by_counterparty"),
            ("counterparty_documents",),
        ),
        (
            mentions(COUNTERPARTY_SHIPMENT_KEYWORDS)
            or step_has("counterparty_shipments", "counterparty_item_flow"),
            ("counterparty_shipment_fallback",),
        ),
        (expects("inventory_on_hand_as_of_date") or mentions(INVENTORY_KEYWORDS), ("inventory_root",)),
        (step_has("same_date_restore") or "same_date_restore" in title.casefold(), ("same_date_restore",)),
        # A same-date inventory step counts as a pivot only when it is not a restore step.
        (step_has("inventory_same_date") and not step_has("restore"), ("same_date_pivot",)),
        (
            expects("receivables_confirmed_as_of_date") or mentions(RECEIVABLES_KEYWORDS),
            ("settlements_receivables",),
        ),
        (step_has("open_items_account_60") or mentions(ACCOUNT_60_KEYWORDS), ("settlements_account_60",)),
        (step_has("vat") or mentions(VAT_KEYWORDS), ("vat",)),
        (expects("inventory_purchase_provenance_for_item"), ("selected_object", "selected_object_supplier")),
        (expects("inventory_purchase_documents_for_item"), ("selected_object", "selected_object_documents")),
        (expects("inventory_sale_trace_for_item"), ("selected_object", "selected_object_sale")),
        (step_has("selected_item") or "по выбранному объекту" in question.casefold(), ("selected_object",)),
    ]

    collected: list[str] = []
    for matched, tags in rules:
        if matched:
            collected.extend(tags)
    return _normalize_tags(collected)


def classify_truth_harness_step(spec_path: Path, spec: dict[str, Any], step: dict[str, Any]) -> dict[str, Any]:
    """Build the catalog entry for one step of a truth-harness spec.

    Args:
        spec_path: Path of the spec file the step came from. It must be
            located under ``REPO_ROOT`` because the entry stores it as a
            repository-relative path.
        spec: Parsed spec document; its ``title``, ``scenario_id`` and
            ``domain`` values are copied into the entry.
        step: Parsed step object taken from the spec.

    Returns:
        A dict describing the step. ``semantic_tags`` combines the tags
        declared on the step with those derived by ``_base_step_tags``, plus
        ``planner_catalog_alignment`` when any ``expected_catalog_*`` field
        is set. The untouched step is kept under ``step_payload`` and the
        entry is marked ``reusable_in_agent_pack``.

    Raises:
        ValueError: If ``spec_path`` is not inside ``REPO_ROOT``.
    """
    question = str(step.get("question") or "").strip()
    step_id = str(step.get("step_id") or "").strip()
    title = str(step.get("title") or step_id).strip() or step_id
    expected_intents = [str(item).strip() for item in (step.get("expected_intents") or []) if str(item).strip()]
    expected_catalog_alignment_status = str(step.get("expected_catalog_alignment_status") or "").strip()
    expected_catalog_chain_top_match = str(step.get("expected_catalog_chain_top_match") or "").strip()
    # Kept as-is (not stringified): None means "no expectation", and False is a valid value.
    expected_catalog_selected_matches_top = step.get("expected_catalog_selected_matches_top")
    # `or []` instead of a get() default: the default only applies to a missing
    # key, so an explicit JSON null would otherwise fail when unpacked.
    declared_tags = step.get("semantic_tags") or []
    semantic_tag_inputs = [*declared_tags, *_base_step_tags(question, step_id, title, expected_intents)]
    if (
        expected_catalog_alignment_status
        or expected_catalog_chain_top_match
        or expected_catalog_selected_matches_top is not None
    ):
        semantic_tag_inputs.append("planner_catalog_alignment")
    semantic_tags = _normalize_tags(semantic_tag_inputs)
    return {
        "entry_id": f"{spec_path.stem}:{step_id}",
        "source_type": "truth_harness_step",
        # Forward slashes keep the stored path identical across platforms.
        "source_file": str(spec_path.relative_to(REPO_ROOT)).replace("\\", "/"),
        "source_title": spec.get("title"),
        "scenario_id": spec.get("scenario_id"),
        "domain": spec.get("domain"),
        "reusable_in_agent_pack": True,
        "step_id": step_id,
        "title": title,
        "question": question,
        "criticality": str(step.get("criticality") or "critical"),
        "expected_intents": expected_intents,
        "expected_catalog_alignment_status": expected_catalog_alignment_status or None,
        "expected_catalog_chain_top_match": expected_catalog_chain_top_match or None,
        "expected_catalog_selected_matches_top": expected_catalog_selected_matches_top,
        "semantic_tags": semantic_tags,
        "step_payload": step,
    }


def classify_saved_session_question(session_path: Path, session: dict[str, Any], question: str, index: int) -> dict[str, Any]:
    """Build the catalog entry for one question of a saved session.

    The entry is tagged by ``_base_step_tags`` using the question text, a
    synthetic step id (``saved_session_qNN``) and the session title, with no
    expected intents. Saved-session entries are never marked reusable in an
    agent pack and always have ``info`` criticality.

    Raises:
        ValueError: If ``session_path`` is not inside ``REPO_ROOT``.
    """
    synthetic_step_id = f"saved_session_q{index:02d}"
    session_title = session.get("title")
    derived_tags = _base_step_tags(
        question=question,
        step_id=synthetic_step_id,
        title=str(session_title or ""),
        expected_intents=[],
    )
    relative_source = str(session_path.relative_to(REPO_ROOT)).replace("\\", "/")
    session_context = session.get("context") or {}
    # The flag may sit on the session itself or inside its "context" object.
    is_agent_run = bool(session.get("agent_run") or session_context.get("agent_run"))
    return {
        "entry_id": f"{session_path.stem}:q{index:02d}",
        "source_type": "saved_session_question",
        "source_file": relative_source,
        "source_title": session_title,
        "scenario_id": session.get("generation_id"),
        "domain": session.get("domain"),
        "reusable_in_agent_pack": False,
        "step_id": synthetic_step_id,
        "title": f"Saved session question {index}",
        "question": question,
        "criticality": "info",
        "expected_intents": [],
        "semantic_tags": derived_tags,
        "session_mode": session.get("mode"),
        "agent_run": is_agent_run,
    }


def build_source_catalog() -> dict[str, Any]:
    """Scan the repository and build the semantic source catalog.

    Truth-harness specs matching ``TRUTH_HARNESS_GLOB`` in
    ``ORCHESTRATION_DIR`` contribute one entry per step that is a dict with a
    non-empty ``question``. Every ``*.json`` file in ``SAVED_SESSIONS_DIR``
    contributes one entry per non-blank item of its ``questions`` list. Files
    are processed in sorted path order.

    Returns:
        A dict with the schema version, generation timestamp, a summary
        (entry totals and per-tag counts sorted by tag) and both entry lists.
    """
    harness_entries: list[dict[str, Any]] = []
    for spec_path in sorted(ORCHESTRATION_DIR.glob(TRUTH_HARNESS_GLOB)):
        spec = read_json(spec_path)
        harness_entries.extend(
            classify_truth_harness_step(spec_path, spec, step)
            for step in spec.get("steps") or []
            if isinstance(step, dict) and step.get("question")
        )

    session_entries: list[dict[str, Any]] = []
    for session_path in sorted(SAVED_SESSIONS_DIR.glob("*.json")):
        session = read_json(session_path)
        # Positions are 1-based and count blank questions too, so entry ids
        # reflect the position in the original list.
        for position, raw_question in enumerate(session.get("questions") or [], start=1):
            question_text = str(raw_question or "").strip()
            if not question_text:
                continue
            session_entries.append(
                classify_saved_session_question(session_path, session, question_text, position)
            )

    all_tag_counts: Counter[str] = Counter()
    reusable_tag_counts: Counter[str] = Counter()
    for entry in harness_entries + session_entries:
        entry_tags = entry.get("semantic_tags") or []
        all_tag_counts.update(entry_tags)
        if entry.get("reusable_in_agent_pack"):
            reusable_tag_counts.update(entry_tags)

    summary = {
        "truth_harness_steps_total": len(harness_entries),
        "saved_session_questions_total": len(session_entries),
        "reusable_truth_harness_tags": dict(sorted(reusable_tag_counts.items())),
        "all_tags": dict(sorted(all_tag_counts.items())),
    }
    return {
        "schema_version": CATALOG_SCHEMA_VERSION,
        "generated_at": _now_iso(),
        "summary": summary,
        "truth_harness_entries": harness_entries,
        "saved_session_entries": session_entries,
    }


def _catalog_markdown(catalog: dict[str, Any]) -> str:
    """Render the catalog as a Markdown summary.

    The document lists the entry totals, the reusable tag counts, one bullet
    per truth-harness entry (with its catalog-alignment expectations, when
    any are set) and one bullet per saved-session entry.

    Returns:
        The Markdown text, stripped and terminated by a single newline.
    """
    raw_summary = catalog.get("summary")
    summary = raw_summary if isinstance(raw_summary, dict) else {}

    def tag_list(entry: dict[str, Any]) -> str:
        return ", ".join(entry.get("semantic_tags") or []) or "none"

    out = [
        "# Agent semantic source catalog",
        "",
        f"- truth_harness_steps_total: `{summary.get('truth_harness_steps_total', 0)}`",
        f"- saved_session_questions_total: `{summary.get('saved_session_questions_total', 0)}`",
        "",
        "## Reusable truth-harness tags",
    ]
    tag_counts = summary.get("reusable_truth_harness_tags") or {}
    out.extend(f"- `{tag}`: `{count}`" for tag, count in tag_counts.items())

    out += ["", "## Reusable truth-harness steps"]
    for entry in catalog.get("truth_harness_entries") or []:
        alignment_parts = []
        status = entry.get("expected_catalog_alignment_status")
        if status:
            alignment_parts.append(f"status={status}")
        top_match = entry.get("expected_catalog_chain_top_match")
        if top_match:
            alignment_parts.append(f"top={top_match}")
        selected_matches_top = entry.get("expected_catalog_selected_matches_top")
        # Compared against None so that an explicit False is still reported.
        if selected_matches_top is not None:
            alignment_parts.append(f"selected_matches_top={selected_matches_top}")
        suffix = ""
        if alignment_parts:
            suffix = f" | catalog_alignment: {', '.join(alignment_parts)}"
        out.append(
            f"- `{entry.get('entry_id')}` | tags: {tag_list(entry)}{suffix} | question: {entry.get('question')}"
        )

    out += ["", "## Saved session questions"]
    out.extend(
        f"- `{entry.get('entry_id')}` | tags: {tag_list(entry)} | question: {entry.get('question')}"
        for entry in catalog.get("saved_session_entries") or []
    )
    return "\n".join(out).strip() + "\n"


def _find_catalog_entry(catalog: dict[str, Any], entry_id: str) -> dict[str, Any] | None:
    """Return the first truth-harness entry whose ``entry_id`` equals *entry_id*.

    Only ``truth_harness_entries`` is searched; ``None`` is returned when no
    entry matches.
    """
    harness_entries = catalog.get("truth_harness_entries") or []
    matching = (candidate for candidate in harness_entries if candidate.get("entry_id") == entry_id)
    return next(matching, None)


def _entry_matches_tags(entry: dict[str, Any], required_tags: list[str]) -> bool:
    """Return True when the entry's ``semantic_tags`` contain every required tag.

    An empty *required_tags* list always matches.
    """
    available = frozenset(entry.get("semantic_tags") or [])
    return available.issuperset(required_tags)


def _select_recipe_entry(
    catalog: dict[str, Any],
    slot: dict[str, Any],
    used_entry_ids: set[str],
) -> dict[str, Any]:
    """Pick the catalog entry that fills one recipe slot.

    The slot's ``preferred_candidate_ids`` are tried first, in order. If none
    of them qualifies, the first truth-harness entry in catalog order that
    qualifies is used. An entry qualifies when it is reusable in an agent
    pack, has not been used yet and carries every tag in ``required_tags``.

    Args:
        catalog: Catalog dict that holds ``truth_harness_entries``.
        slot: One ``step_plan`` item of a recipe.
        used_entry_ids: Ids of entries already placed in the pack; this
            function only reads the set.

    Raises:
        RuntimeError: If no entry qualifies for the slot.
    """
    stripped_ids = (str(item).strip() for item in slot.get("preferred_candidate_ids") or [])
    preferred_ids = [text for text in stripped_ids if text]
    stripped_tags = (str(item).strip() for item in slot.get("required_tags") or [])
    required_tags = [text.lower() for text in stripped_tags if text]

    for preferred_id in preferred_ids:
        preferred = _find_catalog_entry(catalog, preferred_id)
        if not preferred or not preferred.get("reusable_in_agent_pack"):
            continue
        if preferred_id in used_entry_ids:
            continue
        if _entry_matches_tags(preferred, required_tags):
            return preferred

    # No preferred candidate qualified: fall back to the first qualifying entry.
    for entry in catalog.get("truth_harness_entries") or []:
        if not entry.get("reusable_in_agent_pack"):
            continue
        if entry.get("entry_id") in used_entry_ids:
            continue
        if _entry_matches_tags(entry, required_tags):
            return entry

    raise RuntimeError(
        f"Could not resolve slot `{slot.get('slot_id')}` with tags {required_tags or ['<none>']}"
    )


def build_recipe_spec(catalog: dict[str, Any], recipe_name: str) -> dict[str, Any]:
    """Assemble a truth-harness spec from a named recipe and the source catalog.

    For every slot in the recipe's ``step_plan`` one catalog entry is chosen
    by ``_select_recipe_entry``; no entry is used twice. The chosen entry's
    step payload is shallow-copied and adjusted: the slot criticality takes
    precedence, semantic tags are merged with the entry's tags, the slot's
    ``override_fields`` are applied, and a provenance marker is appended to
    ``notes``.

    Args:
        catalog: Catalog produced by ``build_source_catalog``.
        recipe_name: Key of the recipe in ``RECIPE_LIBRARY``.

    Returns:
        A ``domain_truth_harness_spec_v1`` dict with one step per slot.

    Raises:
        RuntimeError: If the recipe name is unknown or a slot cannot be
            resolved to a catalog entry.
    """
    recipe = RECIPE_LIBRARY.get(recipe_name)
    if recipe is None:
        supported = ", ".join(sorted(RECIPE_LIBRARY))
        raise RuntimeError(f"Unknown recipe `{recipe_name}`. Supported recipes: {supported}")
    used_entry_ids: set[str] = set()
    steps: list[dict[str, Any]] = []
    for slot in recipe.get("step_plan") or []:
        entry = _select_recipe_entry(catalog, slot, used_entry_ids)
        used_entry_ids.add(str(entry["entry_id"]))
        # Shallow copy so that the catalog entry's own payload dict is not modified.
        payload = dict(entry["step_payload"])
        payload["criticality"] = str(slot.get("criticality") or payload.get("criticality") or "critical")
        # `or []` also covers a payload whose "semantic_tags" is an explicit
        # null, which a get() default would pass through and fail to unpack.
        payload["semantic_tags"] = _normalize_tags(
            [*(payload.get("semantic_tags") or []), *(entry.get("semantic_tags") or [])]
        )
        override_fields = slot.get("override_fields")
        if isinstance(override_fields, dict):
            payload.update(override_fields)
        # Record which slot and source entry produced this step.
        payload["notes"] = (
            f"{str(payload.get('notes') or '').strip()} "
            f"[mixed_pack_slot={slot.get('slot_id')} source={entry.get('entry_id')}]"
        ).strip()
        steps.append(payload)
    return {
        "schema_version": "domain_truth_harness_spec_v1",
        "scenario_id": recipe["scenario_id"],
        "domain": recipe["domain"],
        "title": recipe["title"],
        "description": recipe["description"],
        "bindings": recipe.get("bindings") or {},
        "steps": steps,
    }


def parse_args() -> argparse.Namespace:
    """Parse the command line of the script.

    Two sub-commands are defined, and one of them is required:

    * ``inventory`` with optional ``--output-json`` and ``--output-md``;
    * ``build-pack`` with required ``--recipe`` and optional ``--output-spec``.

    The chosen sub-command is stored in the ``command`` attribute.
    """
    root = argparse.ArgumentParser(
        description="Inventory reusable semantic sources and build mixed AGENT replay packs."
    )
    commands = root.add_subparsers(dest="command", required=True)

    inventory_cmd = commands.add_parser(
        "inventory",
        help="Build a source catalog from harness specs and saved sessions.",
    )
    inventory_options = (
        (
            "--output-json",
            "docs/orchestration/agent_semantic_source_catalog.json",
            "Where to write the catalog JSON.",
        ),
        (
            "--output-md",
            "docs/orchestration/agent_semantic_source_catalog.md",
            "Where to write the catalog markdown summary.",
        ),
    )
    for flag, default_path, help_text in inventory_options:
        inventory_cmd.add_argument(flag, default=default_path, help=help_text)

    pack_cmd = commands.add_parser(
        "build-pack",
        help="Build a mixed truth-harness pack from catalogued sources.",
    )
    pack_cmd.add_argument("--recipe", required=True, help="Recipe name to build.")
    pack_cmd.add_argument(
        "--output-spec",
        default="docs/orchestration/address_truth_harness_phase7_meta_domain_mix.json",
        help="Where to write the generated truth-harness spec.",
    )
    return root.parse_args()


def _resolve_repo_path(raw_path: str) -> Path:
    """Turn a command-line path into a ``Path``.

    An absolute path is returned unchanged; a relative path is joined to
    ``REPO_ROOT`` and resolved.
    """
    candidate = Path(raw_path)
    if candidate.is_absolute():
        return candidate
    return (REPO_ROOT / candidate).resolve()


def run_inventory(args: argparse.Namespace) -> int:
    """Run the ``inventory`` command.

    Builds the source catalog, writes it as JSON and as a Markdown summary to
    the paths given by ``args.output_json`` and ``args.output_md``, and prints
    both locations followed by the sorted reusable tag names.

    Returns:
        Always ``0``.
    """
    catalog = build_source_catalog()
    json_target = _resolve_repo_path(args.output_json)
    markdown_target = _resolve_repo_path(args.output_md)

    write_json(json_target, catalog)
    write_text(markdown_target, _catalog_markdown(catalog))

    print(f"Catalog written to {json_target}")
    print(f"Summary written to {markdown_target}")
    summary = catalog.get("summary") or {}
    reusable_tags = summary.get("reusable_truth_harness_tags", {})
    print("Reusable truth-harness tags:", ", ".join(sorted(reusable_tags.keys())))
    return 0


def run_build_pack(args: argparse.Namespace) -> int:
    """Run the ``build-pack`` command.

    Builds a fresh source catalog, assembles the spec for ``args.recipe``,
    writes it to ``args.output_spec`` and prints the output path, the
    scenario id and the number of steps.

    Returns:
        Always ``0``.
    """
    pack_spec = build_recipe_spec(build_source_catalog(), args.recipe)
    spec_target = _resolve_repo_path(args.output_spec)
    write_json(spec_target, pack_spec)
    report_lines = [
        f"Mixed pack written to {spec_target}",
        f"scenario_id={pack_spec['scenario_id']}",
        f"steps={len(pack_spec['steps'])}",
    ]
    print("\n".join(report_lines))
    return 0


def main() -> int:
    """Parse the command line and run the selected sub-command.

    Returns:
        The exit status returned by the sub-command handler.

    Raises:
        RuntimeError: If the parsed command has no handler.
    """
    args = parse_args()
    handlers = {
        "inventory": run_inventory,
        "build-pack": run_build_pack,
    }
    handler = handlers.get(args.command)
    if handler is None:
        raise RuntimeError(f"Unsupported command: {args.command}")
    return handler(args)


# Script entry point: exit the process with the status code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())