From 9957f82c21aecbc5e0099f27b30c93bad18745d1 Mon Sep 17 00:00:00 2001 From: dctouch Date: Sun, 24 May 2026 10:34:20 +0300 Subject: [PATCH] =?UTF-8?q?=D0=A3=D1=81=D0=B8=D0=BB=D0=B8=D1=82=D1=8C=20re?= =?UTF-8?q?liability=20gate=20=D0=B8=20=D0=BF=D1=80=D0=B8=D0=BD=D1=8F?= =?UTF-8?q?=D1=82=D1=8C=20margin=20semantic=20replay?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .codex/agents/business_answer_reviewer.toml | 36 + .../agents/evidence_field_truth_reviewer.toml | 38 + .codex/agents/regression_pack_reviewer.toml | 35 + .codex/agents/route_capability_reviewer.toml | 37 + .../references/subagent_review_protocol.md | 66 ++ ...in_profitability_reliability_20260524.json | 79 ++ .../contracts/margin_profitability_v1.json | 77 ++ docs/orchestration/issue_catalog.json | 221 ++++++ .../dist/services/addressIntentResolver.js | 5 +- .../services/addressInventoryIntentSignals.js | 21 + .../address_runtime/decomposeStage.js | 9 +- .../address_runtime/inventoryReplyBuilders.js | 45 +- .../src/services/addressIntentResolver.ts | 8 +- .../services/addressInventoryIntentSignals.ts | 30 + .../address_runtime/decomposeStage.ts | 14 +- .../address_runtime/inventoryReplyBuilders.ts | 65 +- .../addressIntentResolverRegression.test.ts | 10 + .../addressInventoryIntentSignals.test.ts | 9 + ...fitabilitySelectedObjectRegression.test.ts | 27 + .../eval_cases/eval-QvCdJw3L2F.report.json | 137 ++++ .../eval_cases/eval-_NYgFC2nU2.report.json | 112 +++ .../eval_cases/eval-lFWABdc8V1.report.json | 112 +++ .../data/presets/preset-it0w_T10.json | 16 +- .../data/presets/preset-rk8wKqPt.json | 14 +- .../data/presets/preset-splJ9OGZ.json | 16 +- scripts/agent_runtime_manifest.py | 467 ++++++++++++ scripts/domain_case_loop.py | 714 +++++++++++++++++- scripts/domain_truth_harness.py | 38 + scripts/prompt_registry_healthcheck.py | 72 ++ scripts/save_agent_semantic_run.py | 33 + scripts/stage_agent_loop.py | 14 + scripts/test_agent_runtime_manifest.py | 
158 ++++ scripts/test_domain_case_loop_lead_handoff.py | 110 +++ scripts/test_domain_case_loop_step_state.py | 117 +++ scripts/test_save_agent_semantic_run.py | 68 ++ 35 files changed, 2963 insertions(+), 67 deletions(-) create mode 100644 .codex/agents/business_answer_reviewer.toml create mode 100644 .codex/agents/evidence_field_truth_reviewer.toml create mode 100644 .codex/agents/regression_pack_reviewer.toml create mode 100644 .codex/agents/route_capability_reviewer.toml create mode 100644 .codex/skills/domain-case-loop/references/subagent_review_protocol.md create mode 100644 docs/orchestration/agent_margin_profitability_reliability_20260524.json create mode 100644 docs/orchestration/contracts/margin_profitability_v1.json create mode 100644 docs/orchestration/issue_catalog.json create mode 100644 llm_normalizer/data/eval_cases/eval-QvCdJw3L2F.report.json create mode 100644 llm_normalizer/data/eval_cases/eval-_NYgFC2nU2.report.json create mode 100644 llm_normalizer/data/eval_cases/eval-lFWABdc8V1.report.json create mode 100644 scripts/agent_runtime_manifest.py create mode 100644 scripts/prompt_registry_healthcheck.py create mode 100644 scripts/test_agent_runtime_manifest.py diff --git a/.codex/agents/business_answer_reviewer.toml b/.codex/agents/business_answer_reviewer.toml new file mode 100644 index 0000000..393c2bc --- /dev/null +++ b/.codex/agents/business_answer_reviewer.toml @@ -0,0 +1,36 @@ +name = "business_answer_reviewer" +description = "Read-only reviewer for user-facing business answers in NDC_1C semantic replay artifacts." +model = "gpt-5.4" +model_reasoning_effort = "medium" +sandbox_mode = "read-only" +developer_instructions = """ +You are a read-only business-answer reviewer for NDC_1C. + +You are a tool for Lead/Orchestrator, not a handoff owner. +You do not edit files, save autoruns, accept runs, or mutate contracts. 
+ +Read only user-facing answer surfaces: +- output.md +- baseline_output.md / rerun_output.md +- step output excerpts embedded in review bundles + +Do not rely on route ids, debug ids, or capability ids as acceptance proof. + +Return a compact JSON object: +- reviewer: business_answer_reviewer +- status: accepted | partial | blocked +- direct_answer_ok: boolean +- business_usefulness_ok: boolean +- technical_garbage_present: boolean +- issue_codes: string[] +- evidence_paths: string[] +- findings: string[] +- suggested_contract_notes: string[] + +Judge: +- whether the first line answers the user's business question directly; +- whether the answer is understandable for a manager, accountant, or operator; +- whether service/debug/runtime mechanics leak into the final answer; +- whether a limited answer clearly states what is unknown and the next useful action. +""" +nickname_candidates = ["Beacon", "Ledger", "Plain"] diff --git a/.codex/agents/evidence_field_truth_reviewer.toml b/.codex/agents/evidence_field_truth_reviewer.toml new file mode 100644 index 0000000..4195d3a --- /dev/null +++ b/.codex/agents/evidence_field_truth_reviewer.toml @@ -0,0 +1,38 @@ +name = "evidence_field_truth_reviewer" +description = "Read-only reviewer for evidence truth, field mapping, dates, amounts, selected objects, and carryover in NDC_1C replay artifacts." +model = "gpt-5.4" +model_reasoning_effort = "high" +sandbox_mode = "read-only" +developer_instructions = """ +You are a read-only evidence and field-truth reviewer for NDC_1C. + +You are a tool for Lead/Orchestrator, not a handoff owner. +You do not edit files, save autoruns, accept runs, or mutate contracts. 
+ +Read: +- turn.json +- step_state.json +- scenario_state.json +- debug/evidence payloads +- output.md only to compare surfaced claims with evidence + +Return a compact JSON object: +- reviewer: evidence_field_truth_reviewer +- status: accepted | partial | blocked +- field_truth_ok: boolean +- temporal_honesty_ok: boolean +- selected_object_carryover_ok: boolean +- evidence_sufficient: boolean +- issue_codes: string[] +- root_layers: string[] +- evidence_paths: string[] +- findings: string[] +- minimal_patch_direction: string + +Judge: +- whether surfaced fields, dates, amounts, sources, and object labels match evidence; +- whether supplier/buyer/organization/document-side roles are mislabeled; +- whether selected_object, focus_object, answer_object, and reusable bundles survived follow-ups; +- whether out-of-window evidence is clearly marked instead of presented as exact-window truth. +""" +nickname_candidates = ["Caliper", "Trace", "Sieve"] diff --git a/.codex/agents/regression_pack_reviewer.toml b/.codex/agents/regression_pack_reviewer.toml new file mode 100644 index 0000000..9d718c0 --- /dev/null +++ b/.codex/agents/regression_pack_reviewer.toml @@ -0,0 +1,35 @@ +name = "regression_pack_reviewer" +description = "Read-only reviewer that maps a proposed NDC_1C fix to rerun packs, old accepted packs, and smoke coverage." +model = "gpt-5.4" +model_reasoning_effort = "medium" +sandbox_mode = "read-only" +developer_instructions = """ +You are a read-only regression-pack reviewer for NDC_1C. + +You are a tool for Lead/Orchestrator, not a handoff owner. +You do not edit files, save autoruns, accept runs, or mutate contracts. 
+ +Read: +- issue_catalog.json +- rerun_matrix.json +- repair_targets.json +- pack_state.json +- scenario_acceptance_matrix.md +- accepted pack summaries when provided by Lead/Orchestrator + +Return a compact JSON object: +- reviewer: regression_pack_reviewer +- status: accepted | partial | blocked +- required_reruns: string[] +- smoke_tests: string[] +- manual_replay_needed: boolean +- issue_codes: string[] +- evidence_paths: string[] +- findings: string[] + +Judge: +- which failed scenario must be rerun after the fix; +- which neighbor, wrong-domain trap, selected-object, and accepted-smoke packs protect the blast radius; +- whether the proposed change is too broad for a narrow smoke and needs a manual semantic replay. +""" +nickname_candidates = ["Canary", "Sentinel", "Loop"] diff --git a/.codex/agents/route_capability_reviewer.toml b/.codex/agents/route_capability_reviewer.toml new file mode 100644 index 0000000..b2447c7 --- /dev/null +++ b/.codex/agents/route_capability_reviewer.toml @@ -0,0 +1,37 @@ +name = "route_capability_reviewer" +description = "Read-only reviewer for route, capability, domain family, and exact-contour fit in NDC_1C semantic replay artifacts." +model = "gpt-5.4" +model_reasoning_effort = "medium" +sandbox_mode = "read-only" +developer_instructions = """ +You are a read-only route and capability reviewer for NDC_1C. + +You are a tool for Lead/Orchestrator, not a handoff owner. +You do not edit files, save autoruns, accept runs, or mutate contracts. 
+ +Read: +- turn.json +- debug payloads +- scenario_state.json +- step_state.json +- route/capability traces embedded in review bundles + +Return a compact JSON object: +- reviewer: route_capability_reviewer +- status: accepted | partial | blocked | needs_exact_capability +- route_family_ok: boolean +- capability_ok: boolean +- needs_enablement: boolean +- issue_codes: string[] +- root_layers: string[] +- evidence_paths: string[] +- findings: string[] +- minimal_patch_direction: string + +Judge: +- whether the route/capability/domain family matches the user's real business question; +- whether a route candidate is ready, missing axes, or truly needs enablement; +- whether wrong-domain leakage happened, especially for margin/profitability vs accounting/bank/fixed-assets contours; +- whether the issue should become capability enablement rather than presentation cleanup. +""" +nickname_candidates = ["Switch", "Compass", "Relay"] diff --git a/.codex/skills/domain-case-loop/references/subagent_review_protocol.md b/.codex/skills/domain-case-loop/references/subagent_review_protocol.md new file mode 100644 index 0000000..7f2038d --- /dev/null +++ b/.codex/skills/domain-case-loop/references/subagent_review_protocol.md @@ -0,0 +1,66 @@ +# Read-Only Subagent Review Protocol + +Use this protocol when Lead/Orchestrator wants parallel review help for a domain pack, semantic replay, or repair handoff. + +## Rule + +Subagents are tools, not owners. + +Lead/Orchestrator owns: +- final verdict; +- issue_code selection; +- repair decision; +- merge of findings; +- code changes; +- autorun save; +- acceptance. + +Subagents must not: +- edit code; +- write artifacts except their own returned summary if the caller stores it; +- save autoruns; +- mutate `docs/orchestration/active_domain_contract.json`; +- mutate contracts, prompt registry, or capability mapping; +- mark a run accepted. + +## Roles + +`business_answer_reviewer` +- Reads user-facing `output.md` style artifacts first. 
+- Judges direct-answer-first behavior, business usefulness, and technical garbage. + +`route_capability_reviewer` +- Reads `turn.json`, debug payloads, capability traces, and route candidate traces. +- Judges route family, exact capability, missing axes, and wrong-domain leakage. + +`evidence_field_truth_reviewer` +- Reads evidence payloads, step state, scenario state, and output only for claim comparison. +- Judges field truth, dates, amounts, selected object continuity, and carryover. + +`regression_pack_reviewer` +- Reads issue catalog, rerun matrix, repair targets, pack state, and accepted-pack context. +- Suggests reruns and smoke coverage for a proposed fix. + +## Expected Summary Shape + +Each subagent returns JSON only: + +```json +{ + "reviewer": "business_answer_reviewer", + "status": "accepted|partial|blocked|needs_exact_capability", + "issue_codes": [], + "root_layers": [], + "evidence_paths": [], + "findings": [], + "minimal_patch_direction": null, + "required_reruns": [] +} +``` + +Lead/Orchestrator converts useful findings into: +- `business_audit.json`; +- `issue_catalog_snapshot.json`; +- `detector_candidates.json`; +- `rerun_matrix.json`; +- `lead_coder_handoff.json`. 
diff --git a/docs/orchestration/agent_margin_profitability_reliability_20260524.json b/docs/orchestration/agent_margin_profitability_reliability_20260524.json new file mode 100644 index 0000000..630ff04 --- /dev/null +++ b/docs/orchestration/agent_margin_profitability_reliability_20260524.json @@ -0,0 +1,79 @@ +{ + "schema_version": "domain_scenario_pack_v1", + "pack_id": "agent_margin_profitability_reliability_20260524", + "domain": "margin_profitability", + "title": "AGENT | margin profitability wrong-domain traps", + "description": "Минимальный reliability pack для проверки, что вопросы про маржинальность номенклатуры не утекают в ОС, амортизацию, банк, оплаты или взаиморасчёты.", + "source_contract_id": "margin_profitability_v1", + "bindings": { + "period": "2020 год", + "item": "товар" + }, + "analysis_context": { + "expected_business_answer_contract": "margin_profitability_v1", + "semantic_focus": [ + "direct_answer_first", + "margin_domain_purity", + "honest_unknowns", + "wrong_domain_traps" + ] + }, + "scenarios": [ + { + "scenario_id": "margin_root_wrong_domain_trap", + "title": "Root margin question must not leak into accounting domains", + "steps": [ + { + "step_id": "step_01", + "title": "Маржинальность номенклатуры", + "question": "Какая номенклатура была самой маржинальной за {{bindings.period}}?", + "semantic_tags": ["margin_profitability", "inventory", "wrong_domain_trap"], + "expected_result_mode": "ranking_or_limited_accounting_answer", + "expected_business_answer_contract": "margin_profitability_v1", + "required_answer_shape": "direct_answer_first", + "required_answer_patterns_any": [ + "(?i)(марж|прибыл|выруч|себестоим|не могу подтвердить|не хватает)" + ], + "forbidden_answer_patterns": [ + "(?i)(амортизац|основн(ые|ых)? средств|объект ОС|оплат[аы]|банк|settlement|payment_document)" + ], + "notes": "Если точного расчёта нет, допустим честный limited answer, но не уход в ОС/банк/оплаты." 
+ } + ] + }, + { + "scenario_id": "margin_followup_contract_boundary", + "title": "Follow-up must keep margin contract and state limitations", + "steps": [ + { + "step_id": "step_01", + "title": "Запрос маржинальности", + "question": "Покажи топ товаров по марже за {{bindings.period}}.", + "semantic_tags": ["margin_profitability", "inventory"], + "expected_result_mode": "ranking_or_limited_accounting_answer", + "expected_business_answer_contract": "margin_profitability_v1", + "required_answer_shape": "direct_answer_first", + "forbidden_answer_patterns": [ + "(?i)(амортизац|объект ОС|payment_document|settlement)" + ] + }, + { + "step_id": "step_02", + "title": "Почему именно так", + "question": "А из чего ты это посчитал и чего не хватает для точной маржи?", + "depends_on": ["step_01"], + "semantic_tags": ["margin_profitability", "evidence", "scope_guard"], + "expected_result_mode": "evidence_or_honest_boundary", + "expected_business_answer_contract": "margin_profitability_v1", + "required_answer_shape": "direct_answer_first", + "required_answer_patterns_any": [ + "(?i)(выруч|себестоим|валов|марж|не хватает|не подтвержден)" + ], + "forbidden_answer_patterns": [ + "(?i)(route_id|capability_id|runtime_|debug|амортизац|объект ОС)" + ] + } + ] + } + ] +} diff --git a/docs/orchestration/contracts/margin_profitability_v1.json b/docs/orchestration/contracts/margin_profitability_v1.json new file mode 100644 index 0000000..39f06f7 --- /dev/null +++ b/docs/orchestration/contracts/margin_profitability_v1.json @@ -0,0 +1,77 @@ +{ + "schema_version": "business_answer_contract_v1", + "contract_id": "margin_profitability_v1", + "domain": "margin_profitability", + "title": "Маржинальность номенклатуры", + "purpose": "Зафиксировать минимальную форму честного бизнес-ответа для вопросов о выручке, себестоимости, валовой прибыли и марже по товарам/номенклатуре.", + "answer_surface": { + "must_start_with": "direct_answer_first", + "required_fields": [ + { + "field": "period", + 
"meaning": "Период расчёта или честное указание, что период не задан/не подтверждён." + }, + { + "field": "revenue_ex_vat", + "meaning": "Выручка без НДС или честное unknown, если контур не может доказать сумму." + }, + { + "field": "cogs", + "meaning": "Себестоимость или честное unknown, если нет подтверждённой основы." + }, + { + "field": "gross_profit", + "meaning": "Валовая прибыль как revenue_ex_vat - cogs или честное unknown." + }, + { + "field": "margin_pct", + "meaning": "Процент маржи или честное unknown при недостаточной базе." + }, + { + "field": "evidence_basis", + "meaning": "Какие документы/регистры/цепочки подтверждают расчёт." + } + ], + "must_not_contain": [ + "route ids", + "capability ids", + "debug ids", + "fixed assets / ОС leak", + "amortization / амортизация leak", + "payment document as margin source" + ], + "limited_answer_rule": "Если точный расчёт невозможен, ответ должен сказать, что подтверждено, что неизвестно, и предложить следующий проверяемый шаг." + }, + "root_layers": [ + "intent", + "route", + "capability", + "field_mapping", + "answer_surface" + ], + "detectors": [ + "margin_domain_leak_accounting_route", + "margin_required_fields_missing", + "margin_next_action_missing", + "margin_payment_document_false_source", + "margin_os_amortization_leak" + ], + "acceptance": { + "must_have": [ + "direct_answer_first", + "period", + "revenue_ex_vat_or_honest_unknown", + "cogs_or_honest_unknown", + "gross_profit_or_honest_unknown", + "margin_pct_or_honest_unknown", + "evidence_basis_or_honest_boundary", + "next_action_if_limited" + ], + "must_not_have": [ + "fixed assets leak", + "amortization leak", + "payment document as margin source", + "technical garbage" + ] + } +} diff --git a/docs/orchestration/issue_catalog.json b/docs/orchestration/issue_catalog.json new file mode 100644 index 0000000..7800b8d --- /dev/null +++ b/docs/orchestration/issue_catalog.json @@ -0,0 +1,221 @@ +{ + "schema_version": "agent_issue_catalog_v1", + 
"updated_at": "2026-05-24", + "principles": [ + "No accepted save without effective_runtime.json.", + "No auto-coder without allowed issue_code, root_layers, allowed/forbidden targets, rerun_matrix, and answer contract.", + "Subagents are read-only reviewers; Lead/Orchestrator owns final verdict and repair decision." + ], + "issues": { + "runtime_manifest_missing": { + "severity": "P0", + "business_meaning": "Прогон нельзя воспроизвести, поэтому audit/repair/acceptance нельзя считать доказанными.", + "root_layers": ["run_reproducibility"], + "detectors": ["missing_effective_runtime_json"], + "allowed_patch_targets": [ + "scripts/domain_case_loop.py", + "scripts/domain_truth_harness.py", + "scripts/stage_agent_loop.py", + "scripts/save_agent_semantic_run.py", + "scripts/agent_runtime_manifest.py" + ], + "forbidden_patch_targets": [ + "business routing", + "MCP runtime", + "domain contracts" + ], + "rerun_matrix": [ + "same_spec_direct_runner", + "same_spec_stage_runner", + "accepted_smoke_pack" + ], + "acceptance": { + "must_have": [ + "effective_runtime.json", + "git_sha", + "runner", + "llm_model", + "temperature", + "prompt_source", + "prompt_hash" + ] + } + }, + "prompt_registry_opaque": { + "severity": "P0", + "business_meaning": "Неясно, какой normalizer prompt реально исполняется.", + "root_layers": ["prompt_registry", "runtime_config"], + "detectors": [ + "default_prompt_version_missing_files", + "silent_prompt_fallback", + "preset_version_mismatch" + ], + "allowed_patch_targets": [ + "llm_normalizer/backend/src/services/promptBuilder.ts", + "llm_normalizer/backend/src/config.ts", + "llm_normalizer/data/presets/*.json", + "scripts/prompt_registry_healthcheck.py", + "scripts/agent_runtime_manifest.py" + ], + "forbidden_patch_targets": [ + "domain routing rewrites", + "business answer heuristics" + ], + "rerun_matrix": [ + "prompt_healthcheck", + "normalizer_smoke", + "accepted_smoke_pack" + ] + }, + "margin_domain_leak_accounting_route": { + "severity": "P0", 
+ "business_meaning": "Запрос про маржинальность номенклатуры ушёл в чужой бухгалтерский домен вместо выручки, себестоимости, валовой прибыли и процента маржи.", + "root_layers": ["intent", "route", "capability", "domain_contract"], + "expected_answer_contract": "margin_profitability_v1", + "detectors": [ + "forbidden_margin_terms", + "missing_revenue_cogs_margin_fields", + "wrong_capability_family" + ], + "allowed_patch_targets": [ + "llm_normalizer/backend/src/services/addressIntentResolver.ts", + "llm_normalizer/backend/src/services/addressCapabilityPolicy.ts", + "llm_normalizer/backend/src/services/addressFilterExtractor.ts", + "llm_normalizer/backend/src/services/address_runtime/**", + "docs/orchestration/contracts/margin_profitability_v1.json" + ], + "forbidden_patch_targets": [ + "fake data", + "silent heuristic masking", + "global orchestration rewrite", + "MCP protocol rewrite" + ], + "rerun_matrix": [ + "failed_margin_scenario", + "margin_neighbor_pack", + "wrong_domain_trap_pack", + "selected_object_followup_pack", + "accepted_smoke_pack" + ], + "acceptance": { + "must_have": [ + "direct_answer_first", + "period", + "revenue_ex_vat_or_honest_unknown", + "cogs_or_honest_unknown", + "gross_profit_or_honest_unknown", + "margin_pct_or_honest_unknown", + "next_action_if_limited" + ], + "must_not_have": [ + "route ids", + "debug ids", + "fixed assets leak", + "amortization leak", + "payment document as margin source" + ] + } + }, + "business_direct_answer_missing": { + "severity": "P0", + "business_meaning": "Ответ не начинает с прямого бизнес-вывода, хотя пользователь задал прямой вопрос.", + "root_layers": ["answer_surface", "business_utility"], + "detectors": ["first_line_not_direct_answer", "top_level_scaffold_before_answer"], + "allowed_patch_targets": [ + "llm_normalizer/backend/src/services/address_runtime/composeStage.ts", + "llm_normalizer/backend/src/services/assistantMcpDiscoveryResponseCandidate.ts", + 
"llm_normalizer/backend/src/services/assistantService.ts" + ], + "forbidden_patch_targets": [ + "routing rewrites", + "fake evidence", + "global runtime rewrite" + ], + "rerun_matrix": [ + "failed_scenario", + "direct_answer_surface_pack", + "accepted_smoke_pack" + ] + }, + "technical_garbage_in_answer": { + "severity": "P0", + "business_meaning": "Финальный ответ протащил debug/runtime/MCP-механику в пользовательскую поверхность.", + "root_layers": ["answer_surface", "business_utility"], + "detectors": ["runtime_tokens_in_user_answer", "capability_ids_in_user_answer"], + "allowed_patch_targets": [ + "llm_normalizer/backend/src/services/address_runtime/composeStage.ts", + "llm_normalizer/backend/src/services/assistantMcpDiscoveryResponseCandidate.ts", + "llm_normalizer/backend/src/services/assistantService.ts" + ], + "forbidden_patch_targets": [ + "route masking", + "debug deletion from artifacts" + ], + "rerun_matrix": [ + "failed_scenario", + "technical_garbage_canary_pack", + "accepted_smoke_pack" + ] + }, + "accounting_contract_missing": { + "severity": "P1", + "business_meaning": "Ответ не раскрыл обязательный бухгалтерский/доказательный контракт для запрошенного расчёта.", + "root_layers": ["domain_contract", "answer_surface", "evidence"], + "detectors": ["required_contract_fields_missing"], + "allowed_patch_targets": [ + "docs/orchestration/contracts/*.json", + "llm_normalizer/backend/src/services/address_runtime/composeStage.ts", + "llm_normalizer/backend/src/services/assistantMcpDiscoveryResponseCandidate.ts" + ], + "forbidden_patch_targets": [ + "fake data", + "silent heuristic masking" + ], + "rerun_matrix": [ + "failed_scenario", + "contract_field_pack", + "accepted_smoke_pack" + ] + }, + "business_next_step_missing": { + "severity": "P2", + "business_meaning": "Ограниченный ответ не предлагает полезный следующий шаг.", + "root_layers": ["answer_surface", "business_utility"], + "detectors": ["limited_answer_without_next_action"], + 
"allowed_patch_targets": [ + "llm_normalizer/backend/src/services/address_runtime/composeStage.ts", + "llm_normalizer/backend/src/services/assistantMcpDiscoveryResponseCandidate.ts" + ], + "forbidden_patch_targets": [ + "evidence fabrication", + "route masking" + ], + "rerun_matrix": [ + "failed_scenario", + "limited_answer_pack", + "accepted_smoke_pack" + ] + }, + "route_candidate_enablement_gap": { + "severity": "P1", + "business_meaning": "Планировщик понял бизнес-запрос, но route candidate ещё не исполняется как точная возможность.", + "root_layers": ["route_candidate", "runtime_capability"], + "detectors": ["route_candidate_needs_enablement"], + "allowed_patch_targets": [ + "llm_normalizer/backend/src/services/assistantMcpDiscoveryRuntimeBridge.ts", + "llm_normalizer/backend/src/services/assistantMcpDiscoveryPlanner.ts", + "llm_normalizer/backend/src/services/assistantMcpDiscoveryPilotExecutor.ts", + "llm_normalizer/backend/src/services/addressRecipeCatalog.ts" + ], + "forbidden_patch_targets": [ + "global orchestration rewrite", + "fake fixtures" + ], + "rerun_matrix": [ + "failed_scenario", + "route_candidate_pack", + "accepted_smoke_pack" + ] + } + } +} diff --git a/llm_normalizer/backend/dist/services/addressIntentResolver.js b/llm_normalizer/backend/dist/services/addressIntentResolver.js index a1c8c26..b439303 100644 --- a/llm_normalizer/backend/dist/services/addressIntentResolver.js +++ b/llm_normalizer/backend/dist/services/addressIntentResolver.js @@ -1670,10 +1670,9 @@ function hasNomenclatureMarginRankingSignal(text) { return false; } const hasNomenclatureCue = /(?:номенклатур|товар|позици|ассортимент|sku|item|product|goods)/iu.test(normalized); - const hasRealizationCue = /(?:реализован|реализац|продан|продаж|отгруж|41(?:[.,]0?1)?|90(?:[.,]\d{1,2})?|sales?|sold)/iu.test(normalized); const hasMarginCue = /(?:прибыл|марж|рентаб|наценк|себестоим|выручк|profit|margin|profitability|gross\s+spread|cogs)/iu.test(normalized); - const hasRankingCue = 
/(?:высок|низк|топ|сам(?:ая|ый|ое|ые)|больш|меньш|ранж|рейтинг|high|low|top|rank|best|worst)/iu.test(normalized); - return hasNomenclatureCue && hasRealizationCue && hasMarginCue && hasRankingCue; + const hasRankingCue = /(?:высок|низк|топ|сам(?:ая|ый|ое|ые|ой|ого|ому|ым|ых|ую)|больш|меньш|ранж|рейтинг|max|min|high|low|top|rank|best|worst)/iu.test(normalized); + return hasNomenclatureCue && hasMarginCue && hasRankingCue; } function hasVatPeriodInspectionBridgeSignal(text) { const normalized = String(text ?? "").trim().toLowerCase(); diff --git a/llm_normalizer/backend/dist/services/addressInventoryIntentSignals.js b/llm_normalizer/backend/dist/services/addressInventoryIntentSignals.js index 3e2f43b..709f8db 100644 --- a/llm_normalizer/backend/dist/services/addressInventoryIntentSignals.js +++ b/llm_normalizer/backend/dist/services/addressInventoryIntentSignals.js @@ -21,6 +21,19 @@ function hasPlainRussianInventoryOnHandSignal(text) { const hasSnapshotCue = /(?:на\s+(?:дату|сегодня|сейчас|март|апрел|май|мая|июн|июл|август|сентябр|октябр|ноябр|декабр|январ|феврал)|\b(?:19|20)\d{2}\b)/iu.test(normalized); return hasRequestCue && (hasSnapshotCue || /остатк/iu.test(normalized)); } +function hasInventoryMarginRankingSignal(text) { + const normalized = String(text ?? 
"") + .trim() + .toLowerCase() + .replace(/ё/g, "е"); + if (!normalized) { + return false; + } + const hasNomenclatureCue = /(?:номенклатур|товар|позици|ассортимент|sku|item|product|goods)/iu.test(normalized); + const hasMarginCue = /(?:прибыл|марж|рентаб|наценк|себестоим|выручк|profit|margin|profitability|gross\s+spread|cogs)/iu.test(normalized); + const hasRankingCue = /(?:высок|низк|топ|сам(?:ая|ый|ое|ые|ой|ого|ому|ым|ых|ую)|больш|меньш|ранж|рейтинг|max|min|high|low|top|rank|best|worst)/iu.test(normalized); + return hasNomenclatureCue && hasMarginCue && hasRankingCue; +} function hasInventoryOnHandSignal(text) { const hasColloquialStockSnapshotCue = /(?:что|С‡[еёо])\s+(?:Сѓ\s+нас\s+)?РЅР°\s+склад(?:Рµ|Сѓ|РѕРј|ах)(?=$|[\s,.;:!?])/iu.test(text); const hasStockStateCue = /(?:(?:что|С‡[еёо])\s+там\s+РЅР°\s+склад(?:Рµ|Сѓ|РѕРј|ах)|(?:что|С‡[еёо]).*РїСЂРѕРёСЃС…РѕРґ(?:РёС‚|ило|ящее).*(?:РЅР°\s+)?склад(?:Рµ|Сѓ|РѕРј|ах)|РїСЂРѕРёСЃС…РѕРґ(?:РёС‚|ило|ящее)\s+РЅР°\s+склад(?:Рµ|Сѓ|РѕРј|ах)|ситуац(?:РёСЏ|РёРё)\s+РЅР°\s+склад(?:Рµ|Сѓ|РѕРј|ах)|обстановк(?:Р°|Рё)\s+РЅР°\s+склад(?:Рµ|Сѓ|РѕРј|ах)|what(?:'s| is)?\s+(?:there\s+)?(?:on|in)\s+(?:the\s+)?(?:warehouse|stock)|what(?:'s| is)?\s+happening\s+(?:on|in)\s+(?:the\s+)?(?:warehouse|stock))/iu.test(text); @@ -34,6 +47,7 @@ function hasInventoryOnHandSignal(text) { hasInventoryPurchaseDocumentsSignalV2(text) || hasInventorySaleTraceSignalV2(text) || hasInventoryAgingSignal(text) || + hasInventoryMarginRankingSignal(text) || hasInventoryPurchaseToSaleChainSignal(text)) { return false; } @@ -185,6 +199,13 @@ function resolveInventoryAddressIntent(text) { reasons: ["inventory_aging_signal_detected_strong"] }; } + if (hasInventoryMarginRankingSignal(text)) { + return { + intent: "inventory_margin_ranking_for_nomenclature", + confidence: "high", + reasons: ["inventory_margin_ranking_signal_detected"] + }; + } if (hasInventoryAccount41Anchor(text) && hasInventoryAsOfCue(text)) { return { intent: "inventory_on_hand_as_of_date", diff --git 
a/llm_normalizer/backend/dist/services/address_runtime/decomposeStage.js b/llm_normalizer/backend/dist/services/address_runtime/decomposeStage.js index c3d88a5..8bb6d84 100644 --- a/llm_normalizer/backend/dist/services/address_runtime/decomposeStage.js +++ b/llm_normalizer/backend/dist/services/address_runtime/decomposeStage.js @@ -639,10 +639,12 @@ function hasInventoryMarginRankingFollowupCue(text) { const wantsFoundRows = /(?:покажи|показать|выведи|дай|раскрой|show|list|покажи|показать|выведи|дай|раскрой)/iu.test(normalized) && /(?:найденн|строк|реализац|себестоимостн|баз|найденн|строк|реализац|себестоимостн|баз)/iu.test(normalized) && /(?:себестоимостн|реализац|марж|прибыл|номенклатур|себестоимостн|реализац|марж|прибыл|номенклат)/iu.test(normalized); + const asksMarginBasis = /(?:из\s+чего|как\s+(?:ты\s+)?(?:это\s+)?посчитал|почему|какие\s+поля|чего\s+не\s+хватает|не\s+хватает|точн(?:ой|ая|ую)?\s+марж|basis|source|fields|calculated|missing)/iu.test(normalized) && + /(?:марж|прибыл|рентаб|себестоимост|выручк|номенклатур|рейтинг|top|margin|profit|cogs|revenue)/iu.test(normalized); const account41Not01 = /\b41(?:[.,]\d{1,2})?\b/iu.test(normalized) && /\b01(?:[.,]\d{1,2})?\b/iu.test(normalized) && /(?:\bне\b|вместо|а\s+не|not|instead|РЅРµ|вместо|Р°\s+РЅРµ)/iu.test(normalized); - return wantsFoundRows || account41Not01; + return wantsFoundRows || asksMarginBasis || account41Not01; } function hasAddressFollowupContextSignal(text) { const normalized = String(text ?? 
"").trim(); @@ -1346,8 +1348,8 @@ function deriveIntentWithFollowupContext(detectedIntent, userMessage, followupCo followupContext.root_anchor_type === "item" || followupContext.current_frame_kind === "inventory_root" || followupContext.current_frame_kind === "inventory_drilldown"; - const inventorySelectedObjectFollowup = inventoryLineageActive && - (hasSelectedObjectInventorySignal(normalizedMessage) || (previousIsInventoryFamily && hasFollowupSignal)); + const hasSelectedObjectReference = hasSelectedObjectInventorySignal(normalizedMessage); + const inventorySelectedObjectFollowup = inventoryLineageActive && (hasSelectedObjectReference || (previousIsInventoryFamily && hasFollowupSignal)); const previousCounterpartyLaneActive = hasPreviousCounterparty && (followupContext.previous_anchor_type === "counterparty" || sourceIntent === "list_documents_by_counterparty" || @@ -1369,6 +1371,7 @@ function deriveIntentWithFollowupContext(detectedIntent, userMessage, followupCo detectedIntent.intent === "account_balance_snapshot" || detectedIntent.intent === "documents_forming_balance" || detectedIntent.intent === "inventory_margin_ranking_for_nomenclature" || + (detectedIntent.intent === "inventory_profitability_for_item" && !hasSelectedObjectReference) || detectedIntent.intent === sourceIntent)) { return { intent: "inventory_margin_ranking_for_nomenclature", diff --git a/llm_normalizer/backend/dist/services/address_runtime/inventoryReplyBuilders.js b/llm_normalizer/backend/dist/services/address_runtime/inventoryReplyBuilders.js index ecfb6cd..1d284a6 100644 --- a/llm_normalizer/backend/dist/services/address_runtime/inventoryReplyBuilders.js +++ b/llm_normalizer/backend/dist/services/address_runtime/inventoryReplyBuilders.js @@ -88,6 +88,10 @@ function asksForInventoryCostBaseRows(userMessage) { } return /(?:себестоимостн|себестоимост|себестоим|закупочн|закупк|90\.02|\b41\b|баз)/iu.test(text); } +function asksForInventoryMarginBasis(userMessage) { + const text = 
String(userMessage ?? "").toLowerCase(); + return (/(?:из\s+чего|как\s+(?:ты\s+)?(?:это\s+)?посчитал|какие\s+поля|чего\s+не\s+хватает|не\s+хватает|точн(?:ой|ая|ую)?\s+марж|basis|source|fields|calculated|missing)/iu.test(text) && /(?:марж|прибыл|себестоимост|выручк|margin|profit|cogs|revenue)/iu.test(text)); +} function inventoryRowItemLabel(row, deps) { return deps.summarizeInventoryTraceRows([row]).item; } @@ -454,17 +458,19 @@ function composeInventoryReply(intent, rows, options, deps) { const entries = buildInventoryMarginRankingEntries(rows, deps); const confirmedEntries = entries.filter((entry) => entry.revenue > 0 && entry.costProxy > 0); const highMargin = [...confirmedEntries] - .sort((left, right) => right.spread - left.spread || (right.marginPct ?? -Infinity) - (left.marginPct ?? -Infinity)) - .slice(0, 5); + .sort((left, right) => (right.marginPct ?? -Infinity) - (left.marginPct ?? -Infinity) || right.spread - left.spread) + .slice(0, 3); const lowMargin = [...confirmedEntries] - .sort((left, right) => left.spread - right.spread || (left.marginPct ?? Infinity) - (right.marginPct ?? Infinity)) - .slice(0, 5); + .sort((left, right) => (left.marginPct ?? Infinity) - (right.marginPct ?? Infinity) || left.spread - right.spread) + .slice(0, 3); const salesWithoutCost = entries.filter((entry) => entry.revenue > 0 && entry.costProxy <= 0); const purchasesWithoutSales = entries.filter((entry) => entry.costProxy > 0 && entry.revenue <= 0); const periodLabel = inventoryProfitabilityPeriodLabel(options, deps); const totalRevenue = entries.reduce((sum, entry) => sum + entry.revenue, 0); const totalCostProxy = entries.reduce((sum, entry) => sum + entry.costProxy, 0); const totalSpread = totalRevenue - totalCostProxy; + const topMarginEntry = highMargin[0] ?? 
null; + const marginBasisRequested = asksForInventoryMarginBasis(options.userMessage); if (confirmedEntries.length === 0) { const costBaseRowsRequested = asksForInventoryCostBaseRows(options.userMessage); const lines = [ @@ -512,12 +518,34 @@ function composeInventoryReply(intent, rows, options, deps) { ]); return (0, replyContracts_1.buildFactualSummaryReply)(lines, (0, replyContracts_1.buildConfirmedBalanceSemantics)(entries.length > 0 ? "medium" : "weak", false)); } - const directAnswerLine = confirmedEntries.length > 0 - ? `За период ${periodLabel} собран рейтинг реализованной номенклатуры по валовой маржинальности: выручка ${deps.formatMoneyRub(totalRevenue)}, себестоимостная база ${deps.formatMoneyRub(totalCostProxy)}, расчетная валовая разница ${deps.formatMoneyRub(totalSpread)}.` - : `За период ${periodLabel} не удалось подтвердить рейтинг прибыльности номенклатуры: нужны одновременно строки реализации и закупочного/себестоимостного следа по товарам.`; + const directAnswerLine = topMarginEntry && marginBasisRequested + ? `Считал маржу за период ${periodLabel} как выручку реализации минус доступную себестоимостную базу: выручка ${deps.formatMoneyRub(totalRevenue)}, себестоимостная база ${deps.formatMoneyRub(totalCostProxy)}, валовая разница ${deps.formatMoneyRub(totalSpread)}.` + : topMarginEntry + ? 
`Самая маржинальная позиция за период ${periodLabel}: ${topMarginEntry.item} — маржа ${formatInventoryPercent(topMarginEntry.marginPct, deps.formatNumberWithDots)}, выручка ${deps.formatMoneyRub(topMarginEntry.revenue)}, себестоимостная база ${deps.formatMoneyRub(topMarginEntry.costProxy)}, валовая разница ${deps.formatMoneyRub(topMarginEntry.spread)}.` + : `За период ${periodLabel} не удалось подтвердить рейтинг прибыльности номенклатуры: нужны одновременно строки реализации и закупочного/себестоимостного следа по товарам.`; const lines = [directAnswerLine]; + if (marginBasisRequested) { + (0, inventoryReplyPresentation_1.appendInventoryBulletSection)(lines, "База расчета:", [ + "выручка: подтвержденные строки реализации по номенклатуре;", + "себестоимостная база: доступные строки закупочного/себестоимостного следа по той же номенклатуре;", + "валовая маржа: (выручка - себестоимостная база) / выручка." + ]); + const basisLimitations = [ + "это управленческий расчет валовой маржи, не показатель чистой прибыли;" + ]; + if (salesWithoutCost.length > 0) { + basisLimitations.push(`по ${deps.formatNumberWithDots(salesWithoutCost.length)} позициям есть продажи без подтвержденной себестоимости реализации;`); + } + if (purchasesWithoutSales.length > 0) { + basisLimitations.push(`по ${deps.formatNumberWithDots(purchasesWithoutSales.length)} позициям есть себестоимостная база без реализации в периоде;`); + } + basisLimitations.push("для строгого бухгалтерского расчета нужны проводки 90.01 / 90.02 и проверка закрытия себестоимости."); + (0, inventoryReplyPresentation_1.appendInventoryBulletSection)(lines, "Чего не хватает для точной маржи:", basisLimitations); + lines.push("", "Следующий шаг: могу раскрыть строки выручки и себестоимостной базы по любой позиции из рейтинга."); + return (0, replyContracts_1.buildFactualSummaryReply)(lines, (0, replyContracts_1.buildConfirmedBalanceSemantics)(confirmedEntries.length > 0 ? "strong" : entries.length > 0 ? 
"medium" : "weak", confirmedEntries.length > 0)); + } if (highMargin.length > 0) { - (0, inventoryReplyPresentation_1.appendInventorySection)(lines, "Высокая валовая маржинальность:", highMargin.map((entry, index) => formatInventoryMarginRankingLine(entry, index, deps))); + (0, inventoryReplyPresentation_1.appendInventorySection)(lines, "Высокая валовая маржинальность (топ по проценту маржи):", highMargin.map((entry, index) => formatInventoryMarginRankingLine(entry, index, deps))); } if (lowMargin.length > 0) { (0, inventoryReplyPresentation_1.appendInventorySection)(lines, "Низкая или отрицательная валовая маржинальность:", lowMargin.map((entry, index) => formatInventoryMarginRankingLine(entry, index, deps))); @@ -533,6 +561,7 @@ function composeInventoryReply(intent, rows, options, deps) { boundaryLines.push(`По ${deps.formatNumberWithDots(purchasesWithoutSales.length)} позициям есть себестоимостная база без реализации в этом периоде.`); } (0, inventoryReplyPresentation_1.appendInventoryBulletSection)(lines, "Граница ответа:", boundaryLines); + lines.push("", "Следующий шаг: могу раскрыть строки выручки и себестоимостной базы по выбранной позиции из рейтинга."); return (0, replyContracts_1.buildFactualSummaryReply)(lines, (0, replyContracts_1.buildConfirmedBalanceSemantics)(confirmedEntries.length > 0 ? "strong" : entries.length > 0 ? 
"medium" : "weak", confirmedEntries.length > 0)); } if (intent === "inventory_profitability_for_item") { diff --git a/llm_normalizer/backend/src/services/addressIntentResolver.ts b/llm_normalizer/backend/src/services/addressIntentResolver.ts index 1d33c2f..1f0116a 100644 --- a/llm_normalizer/backend/src/services/addressIntentResolver.ts +++ b/llm_normalizer/backend/src/services/addressIntentResolver.ts @@ -2159,19 +2159,15 @@ function hasNomenclatureMarginRankingSignal(text: string): boolean { } const hasNomenclatureCue = /(?:номенклатур|товар|позици|ассортимент|sku|item|product|goods)/iu.test(normalized); - const hasRealizationCue = - /(?:реализован|реализац|продан|продаж|отгруж|41(?:[.,]0?1)?|90(?:[.,]\d{1,2})?|sales?|sold)/iu.test( - normalized - ); const hasMarginCue = /(?:прибыл|марж|рентаб|наценк|себестоим|выручк|profit|margin|profitability|gross\s+spread|cogs)/iu.test( normalized ); const hasRankingCue = - /(?:высок|низк|топ|сам(?:ая|ый|ое|ые)|больш|меньш|ранж|рейтинг|high|low|top|rank|best|worst)/iu.test( + /(?:высок|низк|топ|сам(?:ая|ый|ое|ые|ой|ого|ому|ым|ых|ую)|больш|меньш|ранж|рейтинг|max|min|high|low|top|rank|best|worst)/iu.test( normalized ); - return hasNomenclatureCue && hasRealizationCue && hasMarginCue && hasRankingCue; + return hasNomenclatureCue && hasMarginCue && hasRankingCue; } function hasVatPeriodInspectionBridgeSignal(text: string): boolean { diff --git a/llm_normalizer/backend/src/services/addressInventoryIntentSignals.ts b/llm_normalizer/backend/src/services/addressInventoryIntentSignals.ts index 4904ec7..dd1c9f8 100644 --- a/llm_normalizer/backend/src/services/addressInventoryIntentSignals.ts +++ b/llm_normalizer/backend/src/services/addressInventoryIntentSignals.ts @@ -35,6 +35,27 @@ function hasPlainRussianInventoryOnHandSignal(text: string): boolean { return hasRequestCue && (hasSnapshotCue || /остатк/iu.test(normalized)); } +function hasInventoryMarginRankingSignal(text: string): boolean { + const normalized = String(text ?? 
"") + .trim() + .toLowerCase() + .replace(/ё/g, "е"); + if (!normalized) { + return false; + } + const hasNomenclatureCue = + /(?:номенклатур|товар|позици|ассортимент|sku|item|product|goods)/iu.test(normalized); + const hasMarginCue = + /(?:прибыл|марж|рентаб|наценк|себестоим|выручк|profit|margin|profitability|gross\s+spread|cogs)/iu.test( + normalized + ); + const hasRankingCue = + /(?:высок|низк|топ|сам(?:ая|ый|ое|ые|ой|ого|ому|ым|ых|ую)|больш|меньш|ранж|рейтинг|max|min|high|low|top|rank|best|worst)/iu.test( + normalized + ); + return hasNomenclatureCue && hasMarginCue && hasRankingCue; +} + function hasInventoryOnHandSignal(text: string): boolean { const hasColloquialStockSnapshotCue = /(?:что|С‡[еёо])\s+(?:Сѓ\s+нас\s+)?РЅР°\s+склад(?:Рµ|Сѓ|РѕРј|ах)(?=$|[\s,.;:!?])/iu.test( text @@ -54,6 +75,7 @@ function hasInventoryOnHandSignal(text: string): boolean { hasInventoryPurchaseDocumentsSignalV2(text) || hasInventorySaleTraceSignalV2(text) || hasInventoryAgingSignal(text) || + hasInventoryMarginRankingSignal(text) || hasInventoryPurchaseToSaleChainSignal(text) ) { return false; @@ -304,6 +326,14 @@ export function resolveInventoryAddressIntent(text: string): AddressIntentResolu }; } + if (hasInventoryMarginRankingSignal(text)) { + return { + intent: "inventory_margin_ranking_for_nomenclature", + confidence: "high", + reasons: ["inventory_margin_ranking_signal_detected"] + }; + } + if (hasInventoryAccount41Anchor(text) && hasInventoryAsOfCue(text)) { return { intent: "inventory_on_hand_as_of_date", diff --git a/llm_normalizer/backend/src/services/address_runtime/decomposeStage.ts b/llm_normalizer/backend/src/services/address_runtime/decomposeStage.ts index 3476c76..2a93d9a 100644 --- a/llm_normalizer/backend/src/services/address_runtime/decomposeStage.ts +++ b/llm_normalizer/backend/src/services/address_runtime/decomposeStage.ts @@ -812,11 +812,18 @@ export function hasInventoryMarginRankingFollowupCue(text: string): boolean { 
/(?:покажи|показать|выведи|дай|раскрой|show|list|покажи|показать|выведи|дай|раскрой)/iu.test(normalized) && /(?:найденн|строк|реализац|себестоимостн|баз|найденн|строк|реализац|себестоимостн|баз)/iu.test(normalized) && /(?:себестоимостн|реализац|марж|прибыл|номенклатур|себестоимостн|реализац|марж|прибыл|номенклат)/iu.test(normalized); + const asksMarginBasis = + /(?:из\s+чего|как\s+(?:ты\s+)?(?:это\s+)?посчитал|почему|какие\s+поля|чего\s+не\s+хватает|не\s+хватает|точн(?:ой|ая|ую)?\s+марж|basis|source|fields|calculated|missing)/iu.test( + normalized + ) && + /(?:марж|прибыл|рентаб|себестоимост|выручк|номенклатур|рейтинг|top|margin|profit|cogs|revenue)/iu.test( + normalized + ); const account41Not01 = /\b41(?:[.,]\d{1,2})?\b/iu.test(normalized) && /\b01(?:[.,]\d{1,2})?\b/iu.test(normalized) && /(?:\bне\b|вместо|а\s+не|not|instead|РЅРµ|вместо|Р°\s+РЅРµ)/iu.test(normalized); - return wantsFoundRows || account41Not01; + return wantsFoundRows || asksMarginBasis || account41Not01; } export function hasAddressFollowupContextSignal(text: string): boolean { @@ -1674,9 +1681,9 @@ function deriveIntentWithFollowupContext( followupContext.root_anchor_type === "item" || followupContext.current_frame_kind === "inventory_root" || followupContext.current_frame_kind === "inventory_drilldown"; + const hasSelectedObjectReference = hasSelectedObjectInventorySignal(normalizedMessage); const inventorySelectedObjectFollowup = - inventoryLineageActive && - (hasSelectedObjectInventorySignal(normalizedMessage) || (previousIsInventoryFamily && hasFollowupSignal)); + inventoryLineageActive && (hasSelectedObjectReference || (previousIsInventoryFamily && hasFollowupSignal)); const previousCounterpartyLaneActive = hasPreviousCounterparty && (followupContext.previous_anchor_type === "counterparty" || @@ -1707,6 +1714,7 @@ function deriveIntentWithFollowupContext( detectedIntent.intent === "account_balance_snapshot" || detectedIntent.intent === "documents_forming_balance" || detectedIntent.intent 
=== "inventory_margin_ranking_for_nomenclature" || + (detectedIntent.intent === "inventory_profitability_for_item" && !hasSelectedObjectReference) || detectedIntent.intent === sourceIntent) ) { return { diff --git a/llm_normalizer/backend/src/services/address_runtime/inventoryReplyBuilders.ts b/llm_normalizer/backend/src/services/address_runtime/inventoryReplyBuilders.ts index 6c8ea84..71e71ec 100644 --- a/llm_normalizer/backend/src/services/address_runtime/inventoryReplyBuilders.ts +++ b/llm_normalizer/backend/src/services/address_runtime/inventoryReplyBuilders.ts @@ -170,6 +170,15 @@ function asksForInventoryCostBaseRows(userMessage: string | null | undefined): b return /(?:себестоимостн|себестоимост|себестоим|закупочн|закупк|90\.02|\b41\b|баз)/iu.test(text); } +function asksForInventoryMarginBasis(userMessage: string | null | undefined): boolean { + const text = String(userMessage ?? "").toLowerCase(); + return ( + /(?:из\s+чего|как\s+(?:ты\s+)?(?:это\s+)?посчитал|какие\s+поля|чего\s+не\s+хватает|не\s+хватает|точн(?:ой|ая|ую)?\s+марж|basis|source|fields|calculated|missing)/iu.test( + text + ) && /(?:марж|прибыл|себестоимост|выручк|margin|profit|cogs|revenue)/iu.test(text) + ); +} + interface InventoryMarginRankingEntry { item: string; revenue: number; @@ -627,17 +636,19 @@ export function composeInventoryReply( const entries = buildInventoryMarginRankingEntries(rows, deps); const confirmedEntries = entries.filter((entry) => entry.revenue > 0 && entry.costProxy > 0); const highMargin = [...confirmedEntries] - .sort((left, right) => right.spread - left.spread || (right.marginPct ?? -Infinity) - (left.marginPct ?? -Infinity)) - .slice(0, 5); + .sort((left, right) => (right.marginPct ?? -Infinity) - (left.marginPct ?? -Infinity) || right.spread - left.spread) + .slice(0, 3); const lowMargin = [...confirmedEntries] - .sort((left, right) => left.spread - right.spread || (left.marginPct ?? Infinity) - (right.marginPct ?? 
Infinity)) - .slice(0, 5); + .sort((left, right) => (left.marginPct ?? Infinity) - (right.marginPct ?? Infinity) || left.spread - right.spread) + .slice(0, 3); const salesWithoutCost = entries.filter((entry) => entry.revenue > 0 && entry.costProxy <= 0); const purchasesWithoutSales = entries.filter((entry) => entry.costProxy > 0 && entry.revenue <= 0); const periodLabel = inventoryProfitabilityPeriodLabel(options, deps); const totalRevenue = entries.reduce((sum, entry) => sum + entry.revenue, 0); const totalCostProxy = entries.reduce((sum, entry) => sum + entry.costProxy, 0); const totalSpread = totalRevenue - totalCostProxy; + const topMarginEntry = highMargin[0] ?? null; + const marginBasisRequested = asksForInventoryMarginBasis(options.userMessage); if (confirmedEntries.length === 0) { const costBaseRowsRequested = asksForInventoryCostBaseRows(options.userMessage); const lines: string[] = [ @@ -700,19 +711,54 @@ export function composeInventoryReply( return buildFactualSummaryReply(lines, buildConfirmedBalanceSemantics(entries.length > 0 ? "medium" : "weak", false)); } const directAnswerLine = - confirmedEntries.length > 0 - ? `За период ${periodLabel} собран рейтинг реализованной номенклатуры по валовой маржинальности: выручка ${deps.formatMoneyRub( + topMarginEntry && marginBasisRequested + ? `Считал маржу за период ${periodLabel} как выручку реализации минус доступную себестоимостную базу: выручка ${deps.formatMoneyRub( totalRevenue - )}, себестоимостная база ${deps.formatMoneyRub(totalCostProxy)}, расчетная валовая разница ${deps.formatMoneyRub( + )}, себестоимостная база ${deps.formatMoneyRub(totalCostProxy)}, валовая разница ${deps.formatMoneyRub( totalSpread )}.` - : `За период ${periodLabel} не удалось подтвердить рейтинг прибыльности номенклатуры: нужны одновременно строки реализации и закупочного/себестоимостного следа по товарам.`; + : topMarginEntry + ? 
`Самая маржинальная позиция за период ${periodLabel}: ${topMarginEntry.item} — маржа ${formatInventoryPercent( + topMarginEntry.marginPct, + deps.formatNumberWithDots + )}, выручка ${deps.formatMoneyRub(topMarginEntry.revenue)}, себестоимостная база ${deps.formatMoneyRub( + topMarginEntry.costProxy + )}, валовая разница ${deps.formatMoneyRub(topMarginEntry.spread)}.` + : `За период ${periodLabel} не удалось подтвердить рейтинг прибыльности номенклатуры: нужны одновременно строки реализации и закупочного/себестоимостного следа по товарам.`; const lines: string[] = [directAnswerLine]; + if (marginBasisRequested) { + appendInventoryBulletSection(lines, "База расчета:", [ + "выручка: подтвержденные строки реализации по номенклатуре;", + "себестоимостная база: доступные строки закупочного/себестоимостного следа по той же номенклатуре;", + "валовая маржа: (выручка - себестоимостная база) / выручка." + ]); + const basisLimitations = [ + "это управленческий расчет валовой маржи, не показатель чистой прибыли;" + ]; + if (salesWithoutCost.length > 0) { + basisLimitations.push( + `по ${deps.formatNumberWithDots(salesWithoutCost.length)} позициям есть продажи без подтвержденной себестоимости реализации;` + ); + } + if (purchasesWithoutSales.length > 0) { + basisLimitations.push( + `по ${deps.formatNumberWithDots(purchasesWithoutSales.length)} позициям есть себестоимостная база без реализации в периоде;` + ); + } + basisLimitations.push("для строгого бухгалтерского расчета нужны проводки 90.01 / 90.02 и проверка закрытия себестоимости."); + appendInventoryBulletSection(lines, "Чего не хватает для точной маржи:", basisLimitations); + lines.push("", "Следующий шаг: могу раскрыть строки выручки и себестоимостной базы по любой позиции из рейтинга."); + return buildFactualSummaryReply( + lines, + buildConfirmedBalanceSemantics(confirmedEntries.length > 0 ? "strong" : entries.length > 0 ? 
"medium" : "weak", confirmedEntries.length > 0) + ); + } + if (highMargin.length > 0) { appendInventorySection( lines, - "Высокая валовая маржинальность:", + "Высокая валовая маржинальность (топ по проценту маржи):", highMargin.map((entry, index) => formatInventoryMarginRankingLine(entry, index, deps)) ); } @@ -739,6 +785,7 @@ export function composeInventoryReply( ); } appendInventoryBulletSection(lines, "Граница ответа:", boundaryLines); + lines.push("", "Следующий шаг: могу раскрыть строки выручки и себестоимостной базы по выбранной позиции из рейтинга."); return buildFactualSummaryReply( lines, diff --git a/llm_normalizer/backend/tests/addressIntentResolverRegression.test.ts b/llm_normalizer/backend/tests/addressIntentResolverRegression.test.ts index 58b9913..7562f37 100644 --- a/llm_normalizer/backend/tests/addressIntentResolverRegression.test.ts +++ b/llm_normalizer/backend/tests/addressIntentResolverRegression.test.ts @@ -113,6 +113,16 @@ describe("addressIntentResolver regression bridges", () => { expect(result.reasons).toContain("unicode_nomenclature_margin_ranking_bridge_signal_detected"); }); + it("routes bare highest-margin nomenclature wording away from inventory snapshot", () => { + const result = resolveAddressIntent( + "\u041a\u0430\u043a\u0430\u044f \u043d\u043e\u043c\u0435\u043d\u043a\u043b\u0430\u0442\u0443\u0440\u0430 \u0431\u044b\u043b\u0430 \u0441\u0430\u043c\u043e\u0439 \u043c\u0430\u0440\u0436\u0438\u043d\u0430\u043b\u044c\u043d\u043e\u0439 \u0437\u0430 2020 \u0433\u043e\u0434?" 
+ ); + + expect(result.intent).toBe("inventory_margin_ranking_for_nomenclature"); + expect(result.intent).not.toBe("inventory_on_hand_as_of_date"); + expect(result.reasons).toContain("unicode_nomenclature_margin_ranking_bridge_signal_detected"); + }); + it("detects bare historical inventory root with explicit month-year", () => { const result = resolveAddressIntent("остатки РЅР° март 2016"); diff --git a/llm_normalizer/backend/tests/addressInventoryIntentSignals.test.ts b/llm_normalizer/backend/tests/addressInventoryIntentSignals.test.ts index fa3712e..828dfed 100644 --- a/llm_normalizer/backend/tests/addressInventoryIntentSignals.test.ts +++ b/llm_normalizer/backend/tests/addressInventoryIntentSignals.test.ts @@ -18,6 +18,15 @@ describe("addressInventoryIntentSignals", () => { expect(result?.reasons).toContain("inventory_on_hand_signal_detected"); }); + it("keeps bare highest-margin nomenclature wording out of the stock snapshot route", () => { + const result = resolveInventoryAddressIntent( + "\u041a\u0430\u043a\u0430\u044f \u043d\u043e\u043c\u0435\u043d\u043a\u043b\u0430\u0442\u0443\u0440\u0430 \u0431\u044b\u043b\u0430 \u0441\u0430\u043c\u043e\u0439 \u043c\u0430\u0440\u0436\u0438\u043d\u0430\u043b\u044c\u043d\u043e\u0439 \u0437\u0430 2020 \u0433\u043e\u0434?" 
+ ); + + expect(result?.intent).toBe("inventory_margin_ranking_for_nomenclature"); + expect(result?.reasons).toContain("inventory_margin_ranking_signal_detected"); + }); + it("classifies selected-object purchase provenance wording through the extracted inventory owner", () => { const result = resolveInventoryAddressIntent("selected object supplier provenance"); diff --git a/llm_normalizer/backend/tests/addressInventoryProfitabilitySelectedObjectRegression.test.ts b/llm_normalizer/backend/tests/addressInventoryProfitabilitySelectedObjectRegression.test.ts index 8f432ec..cc8b6f6 100644 --- a/llm_normalizer/backend/tests/addressInventoryProfitabilitySelectedObjectRegression.test.ts +++ b/llm_normalizer/backend/tests/addressInventoryProfitabilitySelectedObjectRegression.test.ts @@ -232,6 +232,33 @@ describe("inventory profitability selected-object regressions", () => { expect(result?.intent.reasons).toContain("intent_adjusted_to_inventory_margin_ranking_followup_context"); }); + it("keeps margin basis follow-up inside ranking context instead of asking for an item", () => { + const marginFollowupContext = { + previous_intent: "inventory_margin_ranking_for_nomenclature" as const, + target_intent: "inventory_margin_ranking_for_nomenclature" as const, + root_intent: "inventory_margin_ranking_for_nomenclature" as const, + previous_filters: { + organization: "OOO Alternative Plus", + period_from: "2020-01-01", + period_to: "2020-12-31" + }, + previous_anchor_type: "unknown" as const, + previous_anchor_value: null + }; + + const result = runAddressDecomposeStage( + "\u0410 \u0438\u0437 \u0447\u0435\u0433\u043e \u0442\u044b \u044d\u0442\u043e \u043f\u043e\u0441\u0447\u0438\u0442\u0430\u043b \u0438 \u0447\u0435\u0433\u043e \u043d\u0435 \u0445\u0432\u0430\u0442\u0430\u0435\u0442 \u0434\u043b\u044f \u0442\u043e\u0447\u043d\u043e\u0439 \u043c\u0430\u0440\u0436\u0438?", + marginFollowupContext + ); + + expect(result).not.toBeNull(); + 
expect(result?.intent.intent).toBe("inventory_margin_ranking_for_nomenclature"); + expect(result?.filters.extracted_filters.period_from).toBe("2020-01-01"); + expect(result?.filters.extracted_filters.period_to).toBe("2020-12-31"); + expect(result?.filters.missing_required_filters).toEqual([]); + expect(result?.intent.reasons).toContain("intent_adjusted_to_inventory_margin_ranking_followup_context"); + }); + it("does not pivot margin follow-up account-41 correction into a balance snapshot", () => { const marginFollowupContext = { previous_intent: "inventory_margin_ranking_for_nomenclature" as const, diff --git a/llm_normalizer/data/eval_cases/eval-QvCdJw3L2F.report.json b/llm_normalizer/data/eval_cases/eval-QvCdJw3L2F.report.json new file mode 100644 index 0000000..9967351 --- /dev/null +++ b/llm_normalizer/data/eval_cases/eval-QvCdJw3L2F.report.json @@ -0,0 +1,137 @@ +{ + "run_id": "eval-QvCdJw3L2F", + "timestamp": "2026-05-24T07:11:10.815Z", + "mode": "single-pass-strict", + "use_mock": true, + "prompt_version": "normalizer_v2_0_2", + "schema_version": "v2_0_2", + "dataset": { + "source": "inline_raw_questions", + "file": null, + "raw_questions_count": 3 + }, + "cases_total": 3, + "metrics": { + "schema_validation_pass_rate": 100, + "scope_detection_accuracy": null, + "scope_in_scope_rate": 33.33, + "multi_intent_detected_rate": 0, + "clarification_required_rate": 0, + "avg_fragments_per_message": 1, + "out_of_scope_fragment_rate": 33.33, + "routed_fragment_rate": 66.67, + "no_route_fragment_rate": 33.33, + "route_resolution_accuracy": null, + "no_route_precision": null, + "false_no_route_rate": null, + "execution_state_consistency_rate": 66.67, + "executable_with_soft_assumptions_rate": 100, + "soft_assumption_used_fragment_rate": 100, + "clarification_precision": null, + "clarification_recall": null, + "false_clarification_rate": null + }, + "budget": { + "requests_total": 0, + "retries_used": 0 + }, + "clarification_eval": { + "labeled_cases": 0, + 
"true_positive": 0, + "false_positive": 0, + "false_negative": 0 + }, + "route_eval": { + "labeled_cases": 0, + "correct_cases": 0, + "expected_routed_cases": 0, + "no_route_true_positive": 0, + "no_route_false_positive": 0 + }, + "scope_eval": { + "labeled_cases": 0, + "correct_cases": 0 + }, + "execution_state_eval": { + "checks_total": 3, + "checks_passed": 2 + }, + "route_distribution": { + "hybrid_store_plus_live": 1, + "no_route": 1, + "batch_refresh_then_store": 1 + }, + "fallback_distribution": { + "none": 1, + "out_of_scope": 1, + "clarification": 1 + }, + "results": [ + { + "case_id": "BQ-001", + "raw_question": "Проверь хвосты по поставщикам и разложи цепочку", + "validation_passed": true, + "message_in_scope": true, + "scope_confidence": "high", + "contains_multiple_tasks": false, + "fragments_total": 1, + "in_scope_fragments": 1, + "out_of_scope_fragments": 0, + "unclear_fragments": 0, + "fallback_type": "none", + "predicted_route_status": "routed", + "expected_route_status": null, + "predicted_no_route_reason": null, + "expected_no_route_reason": null, + "predicted_clarification_required": false, + "expected_clarification_required": null, + "executable_with_soft_assumptions_fragments": 1, + "trace_id": "6H5F0kDlkYF66l", + "request_count_for_case": 0 + }, + { + "case_id": "BQ-002", + "raw_question": "Как вообще по ФСБУ", + "validation_passed": true, + "message_in_scope": false, + "scope_confidence": "low", + "contains_multiple_tasks": false, + "fragments_total": 1, + "in_scope_fragments": 0, + "out_of_scope_fragments": 1, + "unclear_fragments": 0, + "fallback_type": "out_of_scope", + "predicted_route_status": "no_route", + "expected_route_status": null, + "predicted_no_route_reason": "out_of_scope", + "expected_no_route_reason": null, + "predicted_clarification_required": false, + "expected_clarification_required": null, + "executable_with_soft_assumptions_fragments": 0, + "trace_id": "e-Yrbn6kxhbKhv", + "request_count_for_case": 0 + }, + { + 
"case_id": "BQ-003", + "raw_question": "Покажи топ рисков за июнь 2020", + "validation_passed": true, + "message_in_scope": false, + "scope_confidence": "low", + "contains_multiple_tasks": false, + "fragments_total": 1, + "in_scope_fragments": 0, + "out_of_scope_fragments": 0, + "unclear_fragments": 1, + "fallback_type": "clarification", + "predicted_route_status": "routed", + "expected_route_status": null, + "predicted_no_route_reason": null, + "expected_no_route_reason": null, + "predicted_clarification_required": false, + "expected_clarification_required": null, + "executable_with_soft_assumptions_fragments": 0, + "trace_id": "mwO8qxdx71dFCd", + "request_count_for_case": 0 + } + ] +} \ No newline at end of file diff --git a/llm_normalizer/data/eval_cases/eval-_NYgFC2nU2.report.json b/llm_normalizer/data/eval_cases/eval-_NYgFC2nU2.report.json new file mode 100644 index 0000000..067acc8 --- /dev/null +++ b/llm_normalizer/data/eval_cases/eval-_NYgFC2nU2.report.json @@ -0,0 +1,112 @@ +{ + "run_id": "eval-_NYgFC2nU2", + "timestamp": "2026-05-24T07:11:23.663Z", + "mode": "single-pass-strict", + "use_mock": true, + "prompt_version": "normalizer_v2_0_2", + "schema_version": "v2_0_2", + "dataset": { + "source": "inline_raw_questions", + "file": null, + "raw_questions_count": 2 + }, + "cases_total": 2, + "metrics": { + "schema_validation_pass_rate": 100, + "scope_detection_accuracy": null, + "scope_in_scope_rate": 100, + "multi_intent_detected_rate": 0, + "clarification_required_rate": 0, + "avg_fragments_per_message": 1, + "out_of_scope_fragment_rate": 0, + "routed_fragment_rate": 100, + "no_route_fragment_rate": 0, + "route_resolution_accuracy": null, + "no_route_precision": null, + "false_no_route_rate": null, + "execution_state_consistency_rate": 100, + "executable_with_soft_assumptions_rate": 100, + "soft_assumption_used_fragment_rate": 100, + "clarification_precision": null, + "clarification_recall": null, + "false_clarification_rate": null + }, + "budget": { + 
"requests_total": 0, + "retries_used": 0 + }, + "clarification_eval": { + "labeled_cases": 0, + "true_positive": 0, + "false_positive": 0, + "false_negative": 0 + }, + "route_eval": { + "labeled_cases": 0, + "correct_cases": 0, + "expected_routed_cases": 0, + "no_route_true_positive": 0, + "no_route_false_positive": 0 + }, + "scope_eval": { + "labeled_cases": 0, + "correct_cases": 0 + }, + "execution_state_eval": { + "checks_total": 2, + "checks_passed": 2 + }, + "route_distribution": { + "store_feature_risk": 1, + "hybrid_store_plus_live": 1 + }, + "fallback_distribution": { + "none": 2 + }, + "results": [ + { + "case_id": "BQ-001", + "raw_question": "Проверь счет 60 за июнь 2020", + "validation_passed": true, + "message_in_scope": true, + "scope_confidence": "high", + "contains_multiple_tasks": false, + "fragments_total": 1, + "in_scope_fragments": 1, + "out_of_scope_fragments": 0, + "unclear_fragments": 0, + "fallback_type": "none", + "predicted_route_status": "routed", + "expected_route_status": null, + "predicted_no_route_reason": null, + "expected_no_route_reason": null, + "predicted_clarification_required": false, + "expected_clarification_required": null, + "executable_with_soft_assumptions_fragments": 1, + "trace_id": "A1V4KbeK6NiYJK", + "request_count_for_case": 0 + }, + { + "case_id": "BQ-002", + "raw_question": "Покажи риски по счету 97", + "validation_passed": true, + "message_in_scope": true, + "scope_confidence": "high", + "contains_multiple_tasks": false, + "fragments_total": 1, + "in_scope_fragments": 1, + "out_of_scope_fragments": 0, + "unclear_fragments": 0, + "fallback_type": "none", + "predicted_route_status": "routed", + "expected_route_status": null, + "predicted_no_route_reason": null, + "expected_no_route_reason": null, + "predicted_clarification_required": false, + "expected_clarification_required": null, + "executable_with_soft_assumptions_fragments": 1, + "trace_id": "Bg-XArIzpQzoHW", + "request_count_for_case": 0 + } + ] +} \ No newline 
at end of file diff --git a/llm_normalizer/data/eval_cases/eval-lFWABdc8V1.report.json b/llm_normalizer/data/eval_cases/eval-lFWABdc8V1.report.json new file mode 100644 index 0000000..56c5696 --- /dev/null +++ b/llm_normalizer/data/eval_cases/eval-lFWABdc8V1.report.json @@ -0,0 +1,112 @@ +{ + "run_id": "eval-lFWABdc8V1", + "timestamp": "2026-05-24T07:11:24.006Z", + "mode": "single-pass-strict", + "use_mock": true, + "prompt_version": "normalizer_v2_0_2", + "schema_version": "v2_0_2", + "dataset": { + "source": "inline_raw_questions", + "file": null, + "raw_questions_count": 2 + }, + "cases_total": 2, + "metrics": { + "schema_validation_pass_rate": 100, + "scope_detection_accuracy": null, + "scope_in_scope_rate": 100, + "multi_intent_detected_rate": 0, + "clarification_required_rate": 0, + "avg_fragments_per_message": 1, + "out_of_scope_fragment_rate": 0, + "routed_fragment_rate": 100, + "no_route_fragment_rate": 0, + "route_resolution_accuracy": null, + "no_route_precision": null, + "false_no_route_rate": null, + "execution_state_consistency_rate": 100, + "executable_with_soft_assumptions_rate": 100, + "soft_assumption_used_fragment_rate": 100, + "clarification_precision": null, + "clarification_recall": null, + "false_clarification_rate": null + }, + "budget": { + "requests_total": 0, + "retries_used": 0 + }, + "clarification_eval": { + "labeled_cases": 0, + "true_positive": 0, + "false_positive": 0, + "false_negative": 0 + }, + "route_eval": { + "labeled_cases": 0, + "correct_cases": 0, + "expected_routed_cases": 0, + "no_route_true_positive": 0, + "no_route_false_positive": 0 + }, + "scope_eval": { + "labeled_cases": 0, + "correct_cases": 0 + }, + "execution_state_eval": { + "checks_total": 2, + "checks_passed": 2 + }, + "route_distribution": { + "store_feature_risk": 1, + "hybrid_store_plus_live": 1 + }, + "fallback_distribution": { + "none": 2 + }, + "results": [ + { + "case_id": "BQ-001", + "raw_question": "Проверь счет 60 за июнь 2020", + 
"validation_passed": true, + "message_in_scope": true, + "scope_confidence": "high", + "contains_multiple_tasks": false, + "fragments_total": 1, + "in_scope_fragments": 1, + "out_of_scope_fragments": 0, + "unclear_fragments": 0, + "fallback_type": "none", + "predicted_route_status": "routed", + "expected_route_status": null, + "predicted_no_route_reason": null, + "expected_no_route_reason": null, + "predicted_clarification_required": false, + "expected_clarification_required": null, + "executable_with_soft_assumptions_fragments": 1, + "trace_id": "9NqELSDEJeKcZl", + "request_count_for_case": 0 + }, + { + "case_id": "BQ-002", + "raw_question": "Покажи риски по НДС и по закрытию", + "validation_passed": true, + "message_in_scope": true, + "scope_confidence": "high", + "contains_multiple_tasks": false, + "fragments_total": 1, + "in_scope_fragments": 1, + "out_of_scope_fragments": 0, + "unclear_fragments": 0, + "fallback_type": "none", + "predicted_route_status": "routed", + "expected_route_status": null, + "predicted_no_route_reason": null, + "expected_no_route_reason": null, + "predicted_clarification_required": false, + "expected_clarification_required": null, + "executable_with_soft_assumptions_fragments": 1, + "trace_id": "AWGkBh8taBraRd", + "request_count_for_case": 0 + } + ] +} \ No newline at end of file diff --git a/llm_normalizer/data/presets/preset-it0w_T10.json b/llm_normalizer/data/presets/preset-it0w_T10.json index b7e5156..69eec43 100644 --- a/llm_normalizer/data/presets/preset-it0w_T10.json +++ b/llm_normalizer/data/presets/preset-it0w_T10.json @@ -2,11 +2,11 @@ "id": "preset-it0w_T10", "name": "NDC custom preset", "createdAt": "2026-03-23T13:37:13.324Z", - "updatedAt": "2026-03-23T13:37:13.324Z", - "prompt_version": "normalizer_v1", - "systemPrompt": "Ты semantic-normalizer для бухгалтерического ассистента NDC.\n\nТвоя задача — НЕ отвечать на бухгалтерский вопрос по сути.\nТы должен только преобразовать сырой человеческий запрос в строго 
структурированный JSON по схеме normalized_query_v1.\n\nЖесткие правила:\n1. Возвращай только валидный JSON.\n2. Не добавляй пояснений вне JSON.\n3. Не выдумывай факты, которых нет в вопросе.\n4. Если период не указан явно, допускается inferred period только при наличии явного контекста.\n5. Если вопрос причинно-следственный, поднимай causal признаки.\n6. Если вопрос требует связать документы, оплаты, проводки, договоры, регистры, даты или подтверждение цепочки — считай его cross-entity causal, а не simple factual.\n7. Если вопрос касается множества кейсов, не путай это с exact object trace.\n8. Если вопрос про один конкретный документ, проводку, строку, ref, номер или объект — это exact object trace.\n9. Поле route_hint должно быть одним из:\nstore_canonical, store_feature_risk, hybrid_store_plus_live, live_mcp_drilldown, batch_refresh_then_store.\n10. Поле schema_version должно быть normalized_query_v1.", - "developerPrompt": "Классифицируй вопрос в один из intent_class:\nheavy_analytical, cross_entity, drilldown_explain, rule_based_account_control, anomaly_probe, period_close_risk, ambiguous_human_query, simple_factual.\n\nЗаполняй обязательно:\n- schema_version\n- user_question_raw\n- normalized_question\n- intent_class\n- business_problem_type\n- domain_entities\n- accounts_mentioned\n- documents_mentioned\n- registers_mentioned\n- period_scope\n- requires\n- expected_output_shape\n- route_hint\n- ambiguities\n- confidence\n\nЛогика нормализации:\n1. Если вопрос про рейтинг, полный обзор, полный риск-срез, обзор периода, топ проблемных зон, приоритизацию проверки — это heavy_analytical.\n2. Если вопрос требует связать несколько сущностей через причинную цепочку (например документ -> оплата -> проводка, реализация -> приход -> поставщик, контрагент -> договор -> проводка) — это cross_entity.\n3. Если вопрос про конкретный документ/проводку/объект/номер и требуется объяснить происхождение или цепочку — это drilldown_explain.\n4. 
Если вопрос про правила учета, сроки, амортизацию, неверные даты, неверные параметры, РБП, ОС, инварианты счетов — это rule_based_account_control.\n5. Если вопрос про подозрительные, аномальные, рискованные случаи без точечного drilldown — это anomaly_probe или heavy_analytical, в зависимости от масштаба.\n6. Если вопрос явно про конец периода, закрытие месяца, предзакрытие, хвосты периода — поднимай period_close_risk.\n7. Если вопрос звучит по-человечески расплывчато, но смысл понятен, допускается ambiguous_human_query, но не злоупотребляй этим классом.\n\nПравила route_hint:\n- exact object trace -> live_mcp_drilldown\n- heavy whole-period aggregation / ranking / overview -> batch_refresh_then_store\n- causal cross-entity multi-entity questions -> hybrid_store_plus_live\n- trend / risk / anomaly / rule-based account control without causal chain -> store_feature_risk\n- simple factual within loaded slice -> store_canonical\n\nВажно:\n- Если в вопросе есть слова \"не бьется\", \"не сходится\", \"не видно\", \"не собралось в цепочку\", \"разложи по документам и оплатам\", \"чем подтверждается\", \"почему висит хвост\", это обычно causal cross-entity.\n- Не отправляй causal cross-entity вопрос в store_canonical только потому, что он звучит как обычный факт.\n- Не отправляй causal cross-entity вопрос в store_feature_risk только потому, что в нем есть слова \"риск\", \"аномалия\", \"проблема\".", - "domainPrompt": "Контур: бухгалтерический ассистент 1С/NDC.\n\nДоменные ориентиры:\n- счета: 01, 02, 10, 41, 51, 60, 62, 68, 90, 97\n- типовые сущности:\n - контрагент\n - договор\n - документ\n - реализация\n - поступление\n - оплата\n - банковская выписка\n - проводка\n - регистр\n - товар\n - склад\n - основное средство\n - расход будущих периодов\n - взаиморасчеты\n - хвост периода\n - закрывающие документы\n - акт сверки\n\nСеманика живого языка:\n- \"не бьется\" = reconciliation mismatch\n- \"хвост\" = unresolved residual / unclosed balance\n- \"не собралось в 
цепочку\" = missing causal chain\n- \"чем подтверждается\" = evidence required\n- \"что проверить первым\" = prioritized review list\n- \"зависло\" = unresolved accounting case\n- \"продажа раньше прихода\" = sales before supply pattern\n- \"ошибка по дате\" = period/date inconsistency\n- \"реализация без оплаты\" = receivable not closed\n- \"не видно прихода под реализацию\" = causal join between sale and supply required\n\nЕсли вопрос связывает документы, оплаты, проводки, даты, договоры и контрагентов, это обычно causal cross-entity сценарий.", - "schemaNotes": "schema_version: normalized_query_v1\nВозвращай только JSON.\nНикаких дополнительных полей вне схемы.\nВсе булевы requires-поля должны быть заполнены явно.\nЕсли поле неизвестно, используй пустой массив, null или missing/inferred по смыслу.", - "fewShotExamples": "[EXAMPLE 1]\nQ: По каким поставщикам не бьются взаиморасчеты по 60 счету?\nA:\n{\n \"schema_version\": \"normalized_query_v1\",\n \"user_question_raw\": \"По каким поставщикам не бьются взаиморасчеты по 60 счету?\",\n \"normalized_question\": \"Показать поставщиков с расхождениями по взаиморасчетам на счете 60 с объяснимой связкой документов и оплат.\",\n \"intent_class\": \"cross_entity\",\n \"business_problem_type\": \"supplier_reconciliation_mismatch\",\n \"domain_entities\": [\"supplier\", \"settlements\", \"documents\", \"payments\", \"postings\"],\n \"accounts_mentioned\": [\"60\"],\n \"documents_mentioned\": [],\n \"registers_mentioned\": [],\n \"period_scope\": { \"type\": \"missing\", \"value\": null, \"confidence\": \"low\" },\n \"requires\": {\n \"needs_cross_entity_join\": true,\n \"needs_causal_chain\": true,\n \"needs_exact_object_trace\": false,\n \"needs_ranking\": false,\n \"needs_anomaly_summary\": false,\n \"needs_runtime_truth\": false,\n \"needs_period_cut\": false,\n \"needs_evidence\": true\n },\n \"expected_output_shape\": \"reconciliation_report\",\n \"route_hint\": \"hybrid_store_plus_live\",\n \"ambiguities\": [],\n 
\"confidence\": { \"overall\": \"high\", \"intent_class\": \"high\", \"route_hint\": \"high\" }\n}\n\n[EXAMPLE 2]\nQ: Сделай рейтинг самых проблемных хвостов на конец июня.\nA:\n{\n \"schema_version\": \"normalized_query_v1\",\n \"user_question_raw\": \"Сделай рейтинг самых проблемных хвостов на конец июня.\",\n \"normalized_question\": \"Построить рейтинг наиболее проблемных незакрытых хвостов на конец июня.\",\n \"intent_class\": \"heavy_analytical\",\n \"business_problem_type\": \"period_close_risk_prioritization\",\n \"domain_entities\": [\"period_close\", \"risk_cases\"],\n \"accounts_mentioned\": [],\n \"documents_mentioned\": [],\n \"registers_mentioned\": [],\n \"period_scope\": { \"type\": \"explicit\", \"value\": \"июнь\", \"confidence\": \"high\" },\n \"requires\": {\n \"needs_cross_entity_join\": false,\n \"needs_causal_chain\": false,\n \"needs_exact_object_trace\": false,\n \"needs_ranking\": true,\n \"needs_anomaly_summary\": true,\n \"needs_runtime_truth\": false,\n \"needs_period_cut\": true,\n \"needs_evidence\": false\n },\n \"expected_output_shape\": \"ranked_list\",\n \"route_hint\": \"batch_refresh_then_store\",\n \"ambiguities\": [],\n \"confidence\": { \"overall\": \"high\", \"intent_class\": \"high\", \"route_hint\": \"high\" }\n}\n\n[EXAMPLE 3]\nQ: Почему эта проводка вообще появилась?\nA:\n{\n \"schema_version\": \"normalized_query_v1\",\n \"user_question_raw\": \"Почему эта проводка вообще появилась?\",\n \"normalized_question\": \"Объяснить происхождение конкретной проводки и ее source-of-record цепочку.\",\n \"intent_class\": \"drilldown_explain\",\n \"business_problem_type\": \"posting_origin_trace\",\n \"domain_entities\": [\"posting\", \"document\", \"source_record\"],\n \"accounts_mentioned\": [],\n \"documents_mentioned\": [],\n \"registers_mentioned\": [],\n \"period_scope\": { \"type\": \"missing\", \"value\": null, \"confidence\": \"low\" },\n \"requires\": {\n \"needs_cross_entity_join\": false,\n \"needs_causal_chain\": 
true,\n \"needs_exact_object_trace\": true,\n \"needs_ranking\": false,\n \"needs_anomaly_summary\": false,\n \"needs_runtime_truth\": true,\n \"needs_period_cut\": false,\n \"needs_evidence\": true\n },\n \"expected_output_shape\": \"evidence_chain\",\n \"route_hint\": \"live_mcp_drilldown\",\n \"ambiguities\": [],\n \"confidence\": { \"overall\": \"medium\", \"intent_class\": \"high\", \"route_hint\": \"high\" }\n}" -} \ No newline at end of file + "updatedAt": "2026-05-24T07:01:44Z", + "prompt_version": "normalizer_v2_0_2", + "systemPrompt": "Ты semantic-normalizer для бухгалтерского ассистента NDC.\nТвоя роль: только нормализация запроса пользователя в строгий JSON-контракт.\n\nЖесткие правила:\n1) Не давай бухгалтерский ответ по сути вопроса.\n2) Возвращай только JSON без markdown и пояснений.\n3) JSON обязан соответствовать переданной schema normalized_query_v1.\n4) Если период не указан, не выдумывай его; отмечай ambiguity.\n5) Для цепочек документов/проводок/оплат поднимай causal и cross-entity признаки.\n6) Для точечного object trace (номер/строка/ref) поднимай needs_exact_object_trace=true.\n7) Используй терминологию NDC.", + "developerPrompt": "You are semantic-normalizer for accounting assistant NDC.\nReturn strict JSON only, no markdown, no comments.\n\nTarget schema: normalized_query_v2_0_2.\n\nCore behavior (v2.0.2):\n1. Decompose message into semantic fragments.\n2. Classify fragment domain relevance and business scope.\n3. Fill route-critical flags and candidate labels.\n4. For each fragment set execution state fields:\n - execution_readiness\n - clarification_reason\n - soft_assumption_used\n - route_status\n - no_route_reason\n5. For each fragment set semantic_hints so downstream routing can use meaning instead of literal string anchors.\n6. 
Clarification must be rare and justified.\n\nExecution-state policy:\n- Every in-scope fragment must produce a consistent execution state.\n- If a fragment is routable, mark it as executable or executable_with_soft_assumptions.\n- Do not leave routable fragments in unresolved state.\n- If a fragment cannot be routed, set route_status=no_route and provide explicit no_route_reason.\n\nReadiness values:\n- executable\n- executable_with_soft_assumptions\n- needs_clarification\n- no_route\n\nRoute status values:\n- routed\n- no_route\n\nNo-route reason values:\n- out_of_scope\n- insufficient_specificity\n- missing_mapping\n- unsupported_fragment_type\n\nDo not over-require formality:\n- Do not require document IDs, exact periods, or exact object references for scan/review/anomaly/rule-check requests.\n- Colloquial accounting phrases like \"что висит\", \"что подозрительно\", \"что не сходится\", \"что криво\", \"что аукнется\" are executable if accounting area is understandable.\n\nFragment required fields:\n- fragment_id\n- raw_fragment_text\n- normalized_fragment_text\n- domain_relevance\n- business_scope\n- entity_hints\n- account_hints\n- document_hints\n- register_hints\n- time_scope\n- flags\n- semantic_hints\n- candidate_labels\n- confidence\n- execution_readiness\n- clarification_reason\n- soft_assumption_used\n- route_status\n- no_route_reason\n\nSoft assumptions (`soft_assumption_used`) allowed values:\n- period_from_session_context\n- company_scope_defaulted\n- problem_scan_mode_enabled\n\nsemantic_hints fields:\n- scope_target_kind: none | self_scope | selected_object | organization | warehouse | counterparty | contract | item\n- scope_target_text: short user-facing mention when scope_target_kind is organization/warehouse/counterparty/contract/item\n- date_scope_kind: explicit | implicit_current | missing\n- self_scope_detected: true when wording means \"our own scope\" or \"this connected company\"\n- selected_object_scope_detected: true when wording refers 
to currently selected object/item\n\nSemantic-hints policy:\n- Use semantic_hints to preserve meaning of colloquial or elliptical wording.\n- Do not convert vague possessive wording into a fake literal anchor.\n- If user means \"our company / our connected base / current selected scope\", prefer self_scope_detected=true and scope_target_kind=self_scope.\n- If user refers to a company or organization colloquially, prefer scope_target_kind=organization, not warehouse.\n- If user refers to the selected row/object/item, prefer selected_object_scope_detected=true and scope_target_kind=selected_object or item when item text is explicit.\n- Do not invent exact database names. Use short text from the user in scope_target_text.\n\nExamples:\n- \"что на складе у нас\" -> semantic_hints.scope_target_kind=self_scope; self_scope_detected=true; date_scope_kind=implicit_current\n- \"что на складе конторы альтернатива\" -> semantic_hints.scope_target_kind=organization; scope_target_text=\"альтернатива\"; date_scope_kind=implicit_current\n- \"по выбранному объекту ... 
кто поставщик\" -> semantic_hints.scope_target_kind=selected_object; selected_object_scope_detected=true\n- \"по ней какие документы\" -> semantic_hints.scope_target_kind=selected_object; selected_object_scope_detected=true\n\nGlobal notes:\n- global_notes.needs_clarification should be true only when execution is truly blocked for all in-scope fragments.\n- global_notes.clarification_reason must explain the blocker.\n\nSchema version must be:\n- \"schema_version\": \"normalized_query_v2_0_2\"", + "domainPrompt": "Контекст домена: бухгалтерия 1С/NDC.\n\nКлючевые счета:\n- 01, 02, 10, 41, 51, 60, 62, 68.02, 90, 97.\n\nТиповые сущности:\n- контрагент, договор, документ реализации, документ поступления, оплата, проводка, регистр, закрывающий документ.\n\nЛексика causal и сверки (сильные сигналы для cross_entity):\n- \"не бьется\", \"не сходится\", \"не видно\", \"не собралось\", \"повисло\", \"хвост\";\n- \"разложи по документам/оплатам/закрывающим\";\n- \"чем подтверждается\", \"где ошибка в цепочке\", \"что пошло криво\".\n\nЛексика точечного drilldown:\n- \"документ №...\", \"ref\", \"строка проводки\", \"покажи конкретную операцию\", \"точный source-of-record\".\n\nЛексика rule-based контроля:\n- \"проверь настройки\", \"ошибка срока/даты\", \"контроль 97/10/ОС\", \"нарушение правила учета\".\n\nЛексика обзорной аналитики:\n- \"рейтинг\", \"топ рисков\", \"в целом по компании\", \"перед закрытием периода\", \"приоритизация проверок\".\n\nВажное правило:\nЕсли в одном вопросе есть и риск-лексика, и цепочка document/payment/posting, не понижать задачу до чистого `store_feature_risk`.\nПриоритет у causal cross-entity семантики.\n\nНеформальные scope-формулировки:\n- \"у нас\", \"у себя\", \"по нашей базе\", \"в нашей конторе\" обычно означают self/company scope, а не буквальный якорь склада;\n- \"контора альтернатива\", \"альтернатива\", \"по фирме альтернатива\" обычно означают organization scope, а не склад;\n- \"по выбранному объекту\", \"по ней\", \"по этой 
позиции\", \"по этому товару\" обычно означают selected object scope.\n\nДля semantic_hints:\n- если речь про текущую подключенную компанию/нашу базу -> scope_target_kind=self_scope;\n- если речь про организацию/фирму/контору -> scope_target_kind=organization;\n- если речь про выбранную позицию/объект -> scope_target_kind=selected_object;\n- для складских snapshot-вопросов без даты обычно date_scope_kind=implicit_current.", + "schemaNotes": "v2.0.2: execution-state hardening + explicit route_status/no_route_reason. Схема normalized_query_v2_0_2.", + "fewShotExamples": "Q: По каким поставщикам висят хвосты по 60, что подозрительно по цепочке оплат?\nA: in_scope fragment, execution_readiness=executable_with_soft_assumptions, route_status=routed, no_route_reason=null.\n\nQ: Покажи записи по 97, которые повисли и могут аукнуться.\nA: in_scope fragment, execution_readiness=executable_with_soft_assumptions, route_status=routed, no_route_reason=null.\n\nQ: Чекни, что у нас не так.\nA: unclear/in_scope fragment, execution_readiness=needs_clarification, route_status=no_route, no_route_reason=insufficient_specificity.\n\nQ: Как вообще по ФСБУ правильно?\nA: out_of_scope fragment, execution_readiness=no_route, route_status=no_route, no_route_reason=out_of_scope." 
+} diff --git a/llm_normalizer/data/presets/preset-rk8wKqPt.json b/llm_normalizer/data/presets/preset-rk8wKqPt.json index 711049b..6d58aab 100644 --- a/llm_normalizer/data/presets/preset-rk8wKqPt.json +++ b/llm_normalizer/data/presets/preset-rk8wKqPt.json @@ -2,11 +2,11 @@ "id": "preset-rk8wKqPt", "name": "NDC", "createdAt": "2026-03-23T13:41:04.687Z", - "updatedAt": "2026-03-23T13:41:04.687Z", - "prompt_version": "normalizer_v1", + "updatedAt": "2026-05-24T07:01:44Z", + "prompt_version": "normalizer_v2_0_2", "systemPrompt": "Ты semantic-normalizer для бухгалтерского ассистента NDC.\nТвоя роль: только нормализация запроса пользователя в строгий JSON-контракт.\n\nЖесткие правила:\n1) Не давай бухгалтерский ответ по сути вопроса.\n2) Возвращай только JSON без markdown и пояснений.\n3) JSON обязан соответствовать переданной schema normalized_query_v1.\n4) Если период не указан, не выдумывай его; отмечай ambiguity.\n5) Для цепочек документов/проводок/оплат поднимай causal и cross-entity признаки.\n6) Для точечного object trace (номер/строка/ref) поднимай needs_exact_object_trace=true.\n7) Используй терминологию NDC.", - "developerPrompt": "Классификация intent_class:\n- heavy_analytical: общий агрегированный риск-срез, рейтинг, приоритизация.\n- cross_entity: связки между документами/проводками/оплатами/договорами/контрагентами.\n- drilldown_explain: точечное объяснение причин по объекту или малому набору объектов.\n- rule_based_account_control: контрольные правила по счетам (ОС, 97, 10 и т.п.).\n- anomaly_probe: поиск нетипичных паттернов.\n- period_close_risk: фокус на предзакрытии периода.\n- ambiguous_human_query: широкая человеческая формулировка без точного scope.\n- simple_factual: простой факт без сложной аналитики.\n\nПравила route_hint:\n- live_mcp_drilldown: если точечный object trace.\n- hybrid_store_plus_live: если cross_entity + causal explain.\n- batch_refresh_then_store: если full-period heavy aggregate/ranking без готовой агрегации.\n- 
store_feature_risk: если тренд/аномалии/контроли, когда точечный runtime не обязателен.\n- store_canonical: простые факты и легкие запросы при достаточном контексте.\n\nПравила requires:\n- needs_cross_entity_join=true для связок между разными сущностями.\n- needs_causal_chain=true для формулировок \"почему\", \"чем подтверждается\", \"разложи цепочку\".\n- needs_exact_object_trace=true для конкретного документа/проводки/строки/номера/ref.\n- needs_period_cut=true если вопрос про конец периода или периодную сверку.\n- needs_evidence=true если требуется подтверждение документами/движениями/проводками.", - "domainPrompt": "Домен бухгалтерии:\n- ключевые счета: 01, 02, 10, 41, 51, 60, 62, 68.02, 90, 97;\n- сущности: контрагент, договор, реализация, поступление, оплата, проводка, регистр;\n- типовые паттерны: \"не бьется\", \"хвост\", \"акт сверки\", \"закрывающие\", \"реализация без оплаты\";\n- товарные аномалии: \"продажа раньше прихода\", \"подозрительный остаток\";\n- ОС: \"амортизационная группа\", \"срок амортизации\", \"карточка ОС\";\n- банк: \"выписка\", \"движение по 51\", \"разрыв цепочки документ-проводка\";\n- периодная аналитика: предзакрытие, риск-срез, приоритизация ручных проверок.\n\nЕсли присутствуют одновременно риск-слова и document/payment/posting chain,\nне понижать сценарий до чистого risk-route автоматически.", - "schemaNotes": "schema_version: normalized_query_v1\nВозвращай только JSON.\nНикаких дополнительных полей вне схемы.\nВсе булевы requires-поля должны быть заполнены явно.\nЕсли поле неизвестно, используй пустой массив, null или missing/inferred по смыслу.", - "fewShotExamples": "Q: По каким покупателям у нас отгрузки без оплаты на конец июня, свяжи с реализациями, договорами и проводками.\nExpected:\n{\n \"intent_class\": \"cross_entity\",\n \"requires\": {\n \"needs_cross_entity_join\": true,\n \"needs_causal_chain\": true,\n \"needs_exact_object_trace\": false\n },\n \"expected_output_shape\": \"reconciliation_report\",\n 
\"route_hint\": \"hybrid_store_plus_live\"\n}\n\nQ: Сделай рейтинг самых рисковых счетов перед закрытием июня.\nExpected:\n{\n \"intent_class\": \"heavy_analytical\",\n \"requires\": {\n \"needs_ranking\": true,\n \"needs_period_cut\": true\n },\n \"expected_output_shape\": \"ranked_list\",\n \"route_hint\": \"batch_refresh_then_store\"\n}\n\nQ: Покажи документ №123 и проводку по нему, нужна точная строка.\nExpected:\n{\n \"intent_class\": \"drilldown_explain\",\n \"requires\": {\n \"needs_exact_object_trace\": true,\n \"needs_runtime_truth\": true\n },\n \"expected_output_shape\": \"evidence_chain\",\n \"route_hint\": \"live_mcp_drilldown\"\n}" -} \ No newline at end of file + "developerPrompt": "You are semantic-normalizer for accounting assistant NDC.\nReturn strict JSON only, no markdown, no comments.\n\nTarget schema: normalized_query_v2_0_2.\n\nCore behavior (v2.0.2):\n1. Decompose message into semantic fragments.\n2. Classify fragment domain relevance and business scope.\n3. Fill route-critical flags and candidate labels.\n4. For each fragment set execution state fields:\n - execution_readiness\n - clarification_reason\n - soft_assumption_used\n - route_status\n - no_route_reason\n5. For each fragment set semantic_hints so downstream routing can use meaning instead of literal string anchors.\n6. 
Clarification must be rare and justified.\n\nExecution-state policy:\n- Every in-scope fragment must produce a consistent execution state.\n- If a fragment is routable, mark it as executable or executable_with_soft_assumptions.\n- Do not leave routable fragments in unresolved state.\n- If a fragment cannot be routed, set route_status=no_route and provide explicit no_route_reason.\n\nReadiness values:\n- executable\n- executable_with_soft_assumptions\n- needs_clarification\n- no_route\n\nRoute status values:\n- routed\n- no_route\n\nNo-route reason values:\n- out_of_scope\n- insufficient_specificity\n- missing_mapping\n- unsupported_fragment_type\n\nDo not over-require formality:\n- Do not require document IDs, exact periods, or exact object references for scan/review/anomaly/rule-check requests.\n- Colloquial accounting phrases like \"что висит\", \"что подозрительно\", \"что не сходится\", \"что криво\", \"что аукнется\" are executable if accounting area is understandable.\n\nFragment required fields:\n- fragment_id\n- raw_fragment_text\n- normalized_fragment_text\n- domain_relevance\n- business_scope\n- entity_hints\n- account_hints\n- document_hints\n- register_hints\n- time_scope\n- flags\n- semantic_hints\n- candidate_labels\n- confidence\n- execution_readiness\n- clarification_reason\n- soft_assumption_used\n- route_status\n- no_route_reason\n\nSoft assumptions (`soft_assumption_used`) allowed values:\n- period_from_session_context\n- company_scope_defaulted\n- problem_scan_mode_enabled\n\nsemantic_hints fields:\n- scope_target_kind: none | self_scope | selected_object | organization | warehouse | counterparty | contract | item\n- scope_target_text: short user-facing mention when scope_target_kind is organization/warehouse/counterparty/contract/item\n- date_scope_kind: explicit | implicit_current | missing\n- self_scope_detected: true when wording means \"our own scope\" or \"this connected company\"\n- selected_object_scope_detected: true when wording refers 
to currently selected object/item\n\nSemantic-hints policy:\n- Use semantic_hints to preserve meaning of colloquial or elliptical wording.\n- Do not convert vague possessive wording into a fake literal anchor.\n- If user means \"our company / our connected base / current selected scope\", prefer self_scope_detected=true and scope_target_kind=self_scope.\n- If user refers to a company or organization colloquially, prefer scope_target_kind=organization, not warehouse.\n- If user refers to the selected row/object/item, prefer selected_object_scope_detected=true and scope_target_kind=selected_object or item when item text is explicit.\n- Do not invent exact database names. Use short text from the user in scope_target_text.\n\nExamples:\n- \"что на складе у нас\" -> semantic_hints.scope_target_kind=self_scope; self_scope_detected=true; date_scope_kind=implicit_current\n- \"что на складе конторы альтернатива\" -> semantic_hints.scope_target_kind=organization; scope_target_text=\"альтернатива\"; date_scope_kind=implicit_current\n- \"по выбранному объекту ... 
кто поставщик\" -> semantic_hints.scope_target_kind=selected_object; selected_object_scope_detected=true\n- \"по ней какие документы\" -> semantic_hints.scope_target_kind=selected_object; selected_object_scope_detected=true\n\nGlobal notes:\n- global_notes.needs_clarification should be true only when execution is truly blocked for all in-scope fragments.\n- global_notes.clarification_reason must explain the blocker.\n\nSchema version must be:\n- \"schema_version\": \"normalized_query_v2_0_2\"", + "domainPrompt": "Контекст домена: бухгалтерия 1С/NDC.\n\nКлючевые счета:\n- 01, 02, 10, 41, 51, 60, 62, 68.02, 90, 97.\n\nТиповые сущности:\n- контрагент, договор, документ реализации, документ поступления, оплата, проводка, регистр, закрывающий документ.\n\nЛексика causal и сверки (сильные сигналы для cross_entity):\n- \"не бьется\", \"не сходится\", \"не видно\", \"не собралось\", \"повисло\", \"хвост\";\n- \"разложи по документам/оплатам/закрывающим\";\n- \"чем подтверждается\", \"где ошибка в цепочке\", \"что пошло криво\".\n\nЛексика точечного drilldown:\n- \"документ №...\", \"ref\", \"строка проводки\", \"покажи конкретную операцию\", \"точный source-of-record\".\n\nЛексика rule-based контроля:\n- \"проверь настройки\", \"ошибка срока/даты\", \"контроль 97/10/ОС\", \"нарушение правила учета\".\n\nЛексика обзорной аналитики:\n- \"рейтинг\", \"топ рисков\", \"в целом по компании\", \"перед закрытием периода\", \"приоритизация проверок\".\n\nВажное правило:\nЕсли в одном вопросе есть и риск-лексика, и цепочка document/payment/posting, не понижать задачу до чистого `store_feature_risk`.\nПриоритет у causal cross-entity семантики.\n\nНеформальные scope-формулировки:\n- \"у нас\", \"у себя\", \"по нашей базе\", \"в нашей конторе\" обычно означают self/company scope, а не буквальный якорь склада;\n- \"контора альтернатива\", \"альтернатива\", \"по фирме альтернатива\" обычно означают organization scope, а не склад;\n- \"по выбранному объекту\", \"по ней\", \"по этой 
позиции\", \"по этому товару\" обычно означают selected object scope.\n\nДля semantic_hints:\n- если речь про текущую подключенную компанию/нашу базу -> scope_target_kind=self_scope;\n- если речь про организацию/фирму/контору -> scope_target_kind=organization;\n- если речь про выбранную позицию/объект -> scope_target_kind=selected_object;\n- для складских snapshot-вопросов без даты обычно date_scope_kind=implicit_current.", + "schemaNotes": "v2.0.2: execution-state hardening + explicit route_status/no_route_reason. Схема normalized_query_v2_0_2.", + "fewShotExamples": "Q: По каким поставщикам висят хвосты по 60, что подозрительно по цепочке оплат?\nA: in_scope fragment, execution_readiness=executable_with_soft_assumptions, route_status=routed, no_route_reason=null.\n\nQ: Покажи записи по 97, которые повисли и могут аукнуться.\nA: in_scope fragment, execution_readiness=executable_with_soft_assumptions, route_status=routed, no_route_reason=null.\n\nQ: Чекни, что у нас не так.\nA: unclear/in_scope fragment, execution_readiness=needs_clarification, route_status=no_route, no_route_reason=insufficient_specificity.\n\nQ: Как вообще по ФСБУ правильно?\nA: out_of_scope fragment, execution_readiness=no_route, route_status=no_route, no_route_reason=out_of_scope." 
+} diff --git a/llm_normalizer/data/presets/preset-splJ9OGZ.json b/llm_normalizer/data/presets/preset-splJ9OGZ.json index 80316a5..ba190ab 100644 --- a/llm_normalizer/data/presets/preset-splJ9OGZ.json +++ b/llm_normalizer/data/presets/preset-splJ9OGZ.json @@ -2,11 +2,11 @@ "id": "preset-splJ9OGZ", "name": "NDC custom preset", "createdAt": "2026-03-23T13:37:11.819Z", - "updatedAt": "2026-03-23T13:37:11.819Z", - "prompt_version": "normalizer_v1", - "systemPrompt": "Ты semantic-normalizer для бухгалтерического ассистента NDC.\n\nТвоя задача — НЕ отвечать на бухгалтерский вопрос по сути.\nТы должен только преобразовать сырой человеческий запрос в строго структурированный JSON по схеме normalized_query_v1.\n\nЖесткие правила:\n1. Возвращай только валидный JSON.\n2. Не добавляй пояснений вне JSON.\n3. Не выдумывай факты, которых нет в вопросе.\n4. Если период не указан явно, допускается inferred period только при наличии явного контекста.\n5. Если вопрос причинно-следственный, поднимай causal признаки.\n6. Если вопрос требует связать документы, оплаты, проводки, договоры, регистры, даты или подтверждение цепочки — считай его cross-entity causal, а не simple factual.\n7. Если вопрос касается множества кейсов, не путай это с exact object trace.\n8. Если вопрос про один конкретный документ, проводку, строку, ref, номер или объект — это exact object trace.\n9. Поле route_hint должно быть одним из:\nstore_canonical, store_feature_risk, hybrid_store_plus_live, live_mcp_drilldown, batch_refresh_then_store.\n10. 
Поле schema_version должно быть normalized_query_v1.", - "developerPrompt": "Классифицируй вопрос в один из intent_class:\nheavy_analytical, cross_entity, drilldown_explain, rule_based_account_control, anomaly_probe, period_close_risk, ambiguous_human_query, simple_factual.\n\nЗаполняй обязательно:\n- schema_version\n- user_question_raw\n- normalized_question\n- intent_class\n- business_problem_type\n- domain_entities\n- accounts_mentioned\n- documents_mentioned\n- registers_mentioned\n- period_scope\n- requires\n- expected_output_shape\n- route_hint\n- ambiguities\n- confidence\n\nЛогика нормализации:\n1. Если вопрос про рейтинг, полный обзор, полный риск-срез, обзор периода, топ проблемных зон, приоритизацию проверки — это heavy_analytical.\n2. Если вопрос требует связать несколько сущностей через причинную цепочку (например документ -> оплата -> проводка, реализация -> приход -> поставщик, контрагент -> договор -> проводка) — это cross_entity.\n3. Если вопрос про конкретный документ/проводку/объект/номер и требуется объяснить происхождение или цепочку — это drilldown_explain.\n4. Если вопрос про правила учета, сроки, амортизацию, неверные даты, неверные параметры, РБП, ОС, инварианты счетов — это rule_based_account_control.\n5. Если вопрос про подозрительные, аномальные, рискованные случаи без точечного drilldown — это anomaly_probe или heavy_analytical, в зависимости от масштаба.\n6. Если вопрос явно про конец периода, закрытие месяца, предзакрытие, хвосты периода — поднимай period_close_risk.\n7. 
Если вопрос звучит по-человечески расплывчато, но смысл понятен, допускается ambiguous_human_query, но не злоупотребляй этим классом.\n\nПравила route_hint:\n- exact object trace -> live_mcp_drilldown\n- heavy whole-period aggregation / ranking / overview -> batch_refresh_then_store\n- causal cross-entity multi-entity questions -> hybrid_store_plus_live\n- trend / risk / anomaly / rule-based account control without causal chain -> store_feature_risk\n- simple factual within loaded slice -> store_canonical\n\nВажно:\n- Если в вопросе есть слова \"не бьется\", \"не сходится\", \"не видно\", \"не собралось в цепочку\", \"разложи по документам и оплатам\", \"чем подтверждается\", \"почему висит хвост\", это обычно causal cross-entity.\n- Не отправляй causal cross-entity вопрос в store_canonical только потому, что он звучит как обычный факт.\n- Не отправляй causal cross-entity вопрос в store_feature_risk только потому, что в нем есть слова \"риск\", \"аномалия\", \"проблема\".", - "domainPrompt": "Контур: бухгалтерический ассистент 1С/NDC.\n\nДоменные ориентиры:\n- счета: 01, 02, 10, 41, 51, 60, 62, 68, 90, 97\n- типовые сущности:\n - контрагент\n - договор\n - документ\n - реализация\n - поступление\n - оплата\n - банковская выписка\n - проводка\n - регистр\n - товар\n - склад\n - основное средство\n - расход будущих периодов\n - взаиморасчеты\n - хвост периода\n - закрывающие документы\n - акт сверки\n\nСеманика живого языка:\n- \"не бьется\" = reconciliation mismatch\n- \"хвост\" = unresolved residual / unclosed balance\n- \"не собралось в цепочку\" = missing causal chain\n- \"чем подтверждается\" = evidence required\n- \"что проверить первым\" = prioritized review list\n- \"зависло\" = unresolved accounting case\n- \"продажа раньше прихода\" = sales before supply pattern\n- \"ошибка по дате\" = period/date inconsistency\n- \"реализация без оплаты\" = receivable not closed\n- \"не видно прихода под реализацию\" = causal join between sale and supply required\n\nЕсли 
вопрос связывает документы, оплаты, проводки, даты, договоры и контрагентов, это обычно causal cross-entity сценарий.", - "schemaNotes": "schema_version: normalized_query_v1\nВозвращай только JSON.\nНикаких дополнительных полей вне схемы.\nВсе булевы requires-поля должны быть заполнены явно.\nЕсли поле неизвестно, используй пустой массив, null или missing/inferred по смыслу.", - "fewShotExamples": "[EXAMPLE 1]\nQ: По каким поставщикам не бьются взаиморасчеты по 60 счету?\nA:\n{\n \"schema_version\": \"normalized_query_v1\",\n \"user_question_raw\": \"По каким поставщикам не бьются взаиморасчеты по 60 счету?\",\n \"normalized_question\": \"Показать поставщиков с расхождениями по взаиморасчетам на счете 60 с объяснимой связкой документов и оплат.\",\n \"intent_class\": \"cross_entity\",\n \"business_problem_type\": \"supplier_reconciliation_mismatch\",\n \"domain_entities\": [\"supplier\", \"settlements\", \"documents\", \"payments\", \"postings\"],\n \"accounts_mentioned\": [\"60\"],\n \"documents_mentioned\": [],\n \"registers_mentioned\": [],\n \"period_scope\": { \"type\": \"missing\", \"value\": null, \"confidence\": \"low\" },\n \"requires\": {\n \"needs_cross_entity_join\": true,\n \"needs_causal_chain\": true,\n \"needs_exact_object_trace\": false,\n \"needs_ranking\": false,\n \"needs_anomaly_summary\": false,\n \"needs_runtime_truth\": false,\n \"needs_period_cut\": false,\n \"needs_evidence\": true\n },\n \"expected_output_shape\": \"reconciliation_report\",\n \"route_hint\": \"hybrid_store_plus_live\",\n \"ambiguities\": [],\n \"confidence\": { \"overall\": \"high\", \"intent_class\": \"high\", \"route_hint\": \"high\" }\n}\n\n[EXAMPLE 2]\nQ: Сделай рейтинг самых проблемных хвостов на конец июня.\nA:\n{\n \"schema_version\": \"normalized_query_v1\",\n \"user_question_raw\": \"Сделай рейтинг самых проблемных хвостов на конец июня.\",\n \"normalized_question\": \"Построить рейтинг наиболее проблемных незакрытых хвостов на конец июня.\",\n \"intent_class\": 
\"heavy_analytical\",\n \"business_problem_type\": \"period_close_risk_prioritization\",\n \"domain_entities\": [\"period_close\", \"risk_cases\"],\n \"accounts_mentioned\": [],\n \"documents_mentioned\": [],\n \"registers_mentioned\": [],\n \"period_scope\": { \"type\": \"explicit\", \"value\": \"июнь\", \"confidence\": \"high\" },\n \"requires\": {\n \"needs_cross_entity_join\": false,\n \"needs_causal_chain\": false,\n \"needs_exact_object_trace\": false,\n \"needs_ranking\": true,\n \"needs_anomaly_summary\": true,\n \"needs_runtime_truth\": false,\n \"needs_period_cut\": true,\n \"needs_evidence\": false\n },\n \"expected_output_shape\": \"ranked_list\",\n \"route_hint\": \"batch_refresh_then_store\",\n \"ambiguities\": [],\n \"confidence\": { \"overall\": \"high\", \"intent_class\": \"high\", \"route_hint\": \"high\" }\n}\n\n[EXAMPLE 3]\nQ: Почему эта проводка вообще появилась?\nA:\n{\n \"schema_version\": \"normalized_query_v1\",\n \"user_question_raw\": \"Почему эта проводка вообще появилась?\",\n \"normalized_question\": \"Объяснить происхождение конкретной проводки и ее source-of-record цепочку.\",\n \"intent_class\": \"drilldown_explain\",\n \"business_problem_type\": \"posting_origin_trace\",\n \"domain_entities\": [\"posting\", \"document\", \"source_record\"],\n \"accounts_mentioned\": [],\n \"documents_mentioned\": [],\n \"registers_mentioned\": [],\n \"period_scope\": { \"type\": \"missing\", \"value\": null, \"confidence\": \"low\" },\n \"requires\": {\n \"needs_cross_entity_join\": false,\n \"needs_causal_chain\": true,\n \"needs_exact_object_trace\": true,\n \"needs_ranking\": false,\n \"needs_anomaly_summary\": false,\n \"needs_runtime_truth\": true,\n \"needs_period_cut\": false,\n \"needs_evidence\": true\n },\n \"expected_output_shape\": \"evidence_chain\",\n \"route_hint\": \"live_mcp_drilldown\",\n \"ambiguities\": [],\n \"confidence\": { \"overall\": \"medium\", \"intent_class\": \"high\", \"route_hint\": \"high\" }\n}" -} \ No newline at 
end of file + "updatedAt": "2026-05-24T07:01:44Z", + "prompt_version": "normalizer_v2_0_2", + "systemPrompt": "Ты semantic-normalizer для бухгалтерского ассистента NDC.\nТвоя роль: только нормализация запроса пользователя в строгий JSON-контракт.\n\nЖесткие правила:\n1) Не давай бухгалтерский ответ по сути вопроса.\n2) Возвращай только JSON без markdown и пояснений.\n3) JSON обязан соответствовать переданной schema normalized_query_v1.\n4) Если период не указан, не выдумывай его; отмечай ambiguity.\n5) Для цепочек документов/проводок/оплат поднимай causal и cross-entity признаки.\n6) Для точечного object trace (номер/строка/ref) поднимай needs_exact_object_trace=true.\n7) Используй терминологию NDC.", + "developerPrompt": "You are semantic-normalizer for accounting assistant NDC.\nReturn strict JSON only, no markdown, no comments.\n\nTarget schema: normalized_query_v2_0_2.\n\nCore behavior (v2.0.2):\n1. Decompose message into semantic fragments.\n2. Classify fragment domain relevance and business scope.\n3. Fill route-critical flags and candidate labels.\n4. For each fragment set execution state fields:\n - execution_readiness\n - clarification_reason\n - soft_assumption_used\n - route_status\n - no_route_reason\n5. For each fragment set semantic_hints so downstream routing can use meaning instead of literal string anchors.\n6. 
Clarification must be rare and justified.\n\nExecution-state policy:\n- Every in-scope fragment must produce a consistent execution state.\n- If a fragment is routable, mark it as executable or executable_with_soft_assumptions.\n- Do not leave routable fragments in unresolved state.\n- If a fragment cannot be routed, set route_status=no_route and provide explicit no_route_reason.\n\nReadiness values:\n- executable\n- executable_with_soft_assumptions\n- needs_clarification\n- no_route\n\nRoute status values:\n- routed\n- no_route\n\nNo-route reason values:\n- out_of_scope\n- insufficient_specificity\n- missing_mapping\n- unsupported_fragment_type\n\nDo not over-require formality:\n- Do not require document IDs, exact periods, or exact object references for scan/review/anomaly/rule-check requests.\n- Colloquial accounting phrases like \"что висит\", \"что подозрительно\", \"что не сходится\", \"что криво\", \"что аукнется\" are executable if accounting area is understandable.\n\nFragment required fields:\n- fragment_id\n- raw_fragment_text\n- normalized_fragment_text\n- domain_relevance\n- business_scope\n- entity_hints\n- account_hints\n- document_hints\n- register_hints\n- time_scope\n- flags\n- semantic_hints\n- candidate_labels\n- confidence\n- execution_readiness\n- clarification_reason\n- soft_assumption_used\n- route_status\n- no_route_reason\n\nSoft assumptions (`soft_assumption_used`) allowed values:\n- period_from_session_context\n- company_scope_defaulted\n- problem_scan_mode_enabled\n\nsemantic_hints fields:\n- scope_target_kind: none | self_scope | selected_object | organization | warehouse | counterparty | contract | item\n- scope_target_text: short user-facing mention when scope_target_kind is organization/warehouse/counterparty/contract/item\n- date_scope_kind: explicit | implicit_current | missing\n- self_scope_detected: true when wording means \"our own scope\" or \"this connected company\"\n- selected_object_scope_detected: true when wording refers 
to currently selected object/item\n\nSemantic-hints policy:\n- Use semantic_hints to preserve meaning of colloquial or elliptical wording.\n- Do not convert vague possessive wording into a fake literal anchor.\n- If user means \"our company / our connected base / current selected scope\", prefer self_scope_detected=true and scope_target_kind=self_scope.\n- If user refers to a company or organization colloquially, prefer scope_target_kind=organization, not warehouse.\n- If user refers to the selected row/object/item, prefer selected_object_scope_detected=true and scope_target_kind=selected_object or item when item text is explicit.\n- Do not invent exact database names. Use short text from the user in scope_target_text.\n\nExamples:\n- \"что на складе у нас\" -> semantic_hints.scope_target_kind=self_scope; self_scope_detected=true; date_scope_kind=implicit_current\n- \"что на складе конторы альтернатива\" -> semantic_hints.scope_target_kind=organization; scope_target_text=\"альтернатива\"; date_scope_kind=implicit_current\n- \"по выбранному объекту ... 
кто поставщик\" -> semantic_hints.scope_target_kind=selected_object; selected_object_scope_detected=true\n- \"по ней какие документы\" -> semantic_hints.scope_target_kind=selected_object; selected_object_scope_detected=true\n\nGlobal notes:\n- global_notes.needs_clarification should be true only when execution is truly blocked for all in-scope fragments.\n- global_notes.clarification_reason must explain the blocker.\n\nSchema version must be:\n- \"schema_version\": \"normalized_query_v2_0_2\"", + "domainPrompt": "Контекст домена: бухгалтерия 1С/NDC.\n\nКлючевые счета:\n- 01, 02, 10, 41, 51, 60, 62, 68.02, 90, 97.\n\nТиповые сущности:\n- контрагент, договор, документ реализации, документ поступления, оплата, проводка, регистр, закрывающий документ.\n\nЛексика causal и сверки (сильные сигналы для cross_entity):\n- \"не бьется\", \"не сходится\", \"не видно\", \"не собралось\", \"повисло\", \"хвост\";\n- \"разложи по документам/оплатам/закрывающим\";\n- \"чем подтверждается\", \"где ошибка в цепочке\", \"что пошло криво\".\n\nЛексика точечного drilldown:\n- \"документ №...\", \"ref\", \"строка проводки\", \"покажи конкретную операцию\", \"точный source-of-record\".\n\nЛексика rule-based контроля:\n- \"проверь настройки\", \"ошибка срока/даты\", \"контроль 97/10/ОС\", \"нарушение правила учета\".\n\nЛексика обзорной аналитики:\n- \"рейтинг\", \"топ рисков\", \"в целом по компании\", \"перед закрытием периода\", \"приоритизация проверок\".\n\nВажное правило:\nЕсли в одном вопросе есть и риск-лексика, и цепочка document/payment/posting, не понижать задачу до чистого `store_feature_risk`.\nПриоритет у causal cross-entity семантики.\n\nНеформальные scope-формулировки:\n- \"у нас\", \"у себя\", \"по нашей базе\", \"в нашей конторе\" обычно означают self/company scope, а не буквальный якорь склада;\n- \"контора альтернатива\", \"альтернатива\", \"по фирме альтернатива\" обычно означают organization scope, а не склад;\n- \"по выбранному объекту\", \"по ней\", \"по этой 
позиции\", \"по этому товару\" обычно означают selected object scope.\n\nДля semantic_hints:\n- если речь про текущую подключенную компанию/нашу базу -> scope_target_kind=self_scope;\n- если речь про организацию/фирму/контору -> scope_target_kind=organization;\n- если речь про выбранную позицию/объект -> scope_target_kind=selected_object;\n- для складских snapshot-вопросов без даты обычно date_scope_kind=implicit_current.", + "schemaNotes": "v2.0.2: execution-state hardening + explicit route_status/no_route_reason. Схема normalized_query_v2_0_2.", + "fewShotExamples": "Q: По каким поставщикам висят хвосты по 60, что подозрительно по цепочке оплат?\nA: in_scope fragment, execution_readiness=executable_with_soft_assumptions, route_status=routed, no_route_reason=null.\n\nQ: Покажи записи по 97, которые повисли и могут аукнуться.\nA: in_scope fragment, execution_readiness=executable_with_soft_assumptions, route_status=routed, no_route_reason=null.\n\nQ: Чекни, что у нас не так.\nA: unclear/in_scope fragment, execution_readiness=needs_clarification, route_status=no_route, no_route_reason=insufficient_specificity.\n\nQ: Как вообще по ФСБУ правильно?\nA: out_of_scope fragment, execution_readiness=no_route, route_status=no_route, no_route_reason=out_of_scope." 
+} diff --git a/scripts/agent_runtime_manifest.py b/scripts/agent_runtime_manifest.py new file mode 100644 index 0000000..aa02412 --- /dev/null +++ b/scripts/agent_runtime_manifest.py @@ -0,0 +1,467 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import hashlib +import json +import os +import re +import subprocess +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + + +REPO_ROOT = Path(__file__).resolve().parents[1] +EFFECTIVE_RUNTIME_SCHEMA_VERSION = "agent_effective_runtime_v1" +PROMPT_REGISTRY_HEALTH_SCHEMA_VERSION = "prompt_registry_health_v1" +EFFECTIVE_RUNTIME_FILE_NAME = "effective_runtime.json" + +CONFIG_TS = REPO_ROOT / "llm_normalizer" / "backend" / "src" / "config.ts" +PROMPT_BUILDER_TS = REPO_ROOT / "llm_normalizer" / "backend" / "src" / "services" / "promptBuilder.ts" +PROMPTS_DIR = REPO_ROOT / "llm_normalizer" / "prompts" +PRESETS_DIR = REPO_ROOT / "llm_normalizer" / "data" / "presets" +SHARED_LLM_CONNECTION_CONFIG = REPO_ROOT / "llm_normalizer" / "data" / "shared_llm_connection.json" +DEFAULT_MCP_PROXY_URL = "http://127.0.0.1:6003" +ASSISTANT_RUNTIME_PROMPT_VERSIONS = {"address_query_runtime_v1"} + +BUILTIN_PROMPT_FILES: dict[str, dict[str, str]] = { + "normalizer_v1": { + "system": "system/default.txt", + "developer": "developer/default.txt", + "domain": "domain/default.txt", + "fewshot": "fewshot/default.txt", + }, + "normalizer_v1_1": { + "system": "system/default.txt", + "developer": "developer/normalizer_v1_1.txt", + "domain": "domain/normalizer_domain_v1_1.txt", + "fewshot": "fewshot/normalizer_fewshot_v1_1.txt", + }, + "normalizer_v1_1_1": { + "system": "system/default.txt", + "developer": "developer/normalizer_v1_1_1.txt", + "domain": "domain/normalizer_domain_v1_1.txt", + "fewshot": "fewshot/normalizer_fewshot_v1_1_1.txt", + }, + "normalizer_v1_1_2": { + "system": "system/default.txt", + "developer": "developer/normalizer_v1_1_2.txt", + "domain": 
"domain/normalizer_domain_v1_1.txt", + "fewshot": "fewshot/normalizer_fewshot_v1_1_2.txt", + }, + "normalizer_v1_1_2_1": { + "system": "system/default.txt", + "developer": "developer/normalizer_v1_1_2_1.txt", + "domain": "domain/normalizer_domain_v1_1.txt", + "fewshot": "fewshot/normalizer_fewshot_v1_1_2_1.txt", + }, + "normalizer_v2": { + "system": "system/default.txt", + "developer": "developer/normalizer_v2.txt", + "domain": "domain/normalizer_domain_v1_1.txt", + "fewshot": "fewshot/normalizer_v2.txt", + }, + "normalizer_v2_0_1": { + "system": "system/default.txt", + "developer": "developer/normalizer_v2_0_1.txt", + "domain": "domain/normalizer_domain_v1_1.txt", + "fewshot": "fewshot/normalizer_v2_0_1.txt", + }, + "normalizer_v2_0_2": { + "system": "system/default.txt", + "developer": "developer/normalizer_v2_0_2.txt", + "domain": "domain/normalizer_domain_v1_1.txt", + "fewshot": "fewshot/normalizer_v2_0_2.txt", + }, +} + + +def now_utc_iso() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat() + + +def repo_relative(path: Path, repo_root: Path = REPO_ROOT) -> str: + try: + return str(path.resolve().relative_to(repo_root.resolve())).replace("\\", "/") + except ValueError: + return str(path.resolve()) + + +def read_json_object(path: Path) -> dict[str, Any]: + parsed = json.loads(path.read_text(encoding="utf-8")) + return parsed if isinstance(parsed, dict) else {} + + +def write_json(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8", newline="\n") + + +def git_sha(repo_root: Path = REPO_ROOT) -> str: + try: + result = subprocess.run( + ["git", "rev-parse", "HEAD"], + cwd=str(repo_root), + text=True, + encoding="utf-8", + errors="replace", + capture_output=True, + check=False, + timeout=10, + ) + except (OSError, subprocess.SubprocessError): + return "unknown" + if result.returncode != 0: + return "unknown" + 
return result.stdout.strip() or "unknown" + + +def read_default_prompt_version(repo_root: Path = REPO_ROOT) -> str | None: + config_path = repo_root / "llm_normalizer" / "backend" / "src" / "config.ts" + if not config_path.exists(): + return None + text = config_path.read_text(encoding="utf-8", errors="replace") + match = re.search(r"DEFAULT_PROMPT_VERSION\s*=\s*process\.env\.DEFAULT_PROMPT_VERSION\s*\?\?\s*\"([^\"]+)\"", text) + return match.group(1) if match else None + + +def load_shared_llm_connection(repo_root: Path = REPO_ROOT) -> dict[str, Any]: + config_path = repo_root / "llm_normalizer" / "data" / "shared_llm_connection.json" + if not config_path.exists(): + return {} + try: + raw = read_json_object(config_path) + except (OSError, json.JSONDecodeError): + return {} + connection = raw.get("connection") + return dict(connection) if isinstance(connection, dict) else {} + + +def _env_bool(value: str | None, default_value: bool) -> bool: + if value is None or value.strip() == "": + return default_value + lowered = value.strip().lower() + return lowered not in {"0", "false", "off", "no"} + + +def collect_feature_flags(repo_root: Path = REPO_ROOT) -> dict[str, Any]: + config_path = repo_root / "llm_normalizer" / "backend" / "src" / "config.ts" + if not config_path.exists(): + return {} + text = config_path.read_text(encoding="utf-8", errors="replace") + pattern = re.compile( + r"export\s+const\s+(FEATURE_[A-Z0-9_]+)\s*=\s*toBooleanFlag\(\s*" + r"process\.env\.\1\s*,\s*(true|false)\s*\)", + re.DOTALL, + ) + flags: dict[str, Any] = {} + for name, default_raw in pattern.findall(text): + default_value = default_raw == "true" + flags[name] = { + "value": _env_bool(os.environ.get(name), default_value), + "source": "env" if name in os.environ else "default", + "default": default_value, + } + return flags + + +def _hash_prompt_files(files: list[dict[str, Any]], repo_root: Path) -> str | None: + present_files = [item for item in files if item.get("exists") is True] + if 
not present_files: + return None + digest = hashlib.sha256() + for item in sorted(present_files, key=lambda entry: str(entry.get("relative_path") or "")): + path = repo_root / str(item["relative_path"]) + digest.update(str(item["relative_path"]).replace("\\", "/").encode("utf-8")) + digest.update(b"\0") + digest.update(path.read_bytes()) + digest.update(b"\0") + return digest.hexdigest() + + +def _prompt_files_for_version(repo_root: Path, prompt_version: str) -> list[dict[str, Any]]: + definitions = BUILTIN_PROMPT_FILES.get(prompt_version) + if not definitions: + return [] + files: list[dict[str, Any]] = [] + for slot, relative_prompt_path in definitions.items(): + relative_path = Path("llm_normalizer") / "prompts" / Path(relative_prompt_path) + file_path = repo_root / relative_path + files.append( + { + "slot": slot, + "prompt_path": relative_prompt_path.replace("\\", "/"), + "relative_path": relative_path.as_posix(), + "exists": file_path.exists(), + "size_bytes": file_path.stat().st_size if file_path.exists() else None, + } + ) + return files + + +def _preset_prompt_versions(repo_root: Path) -> list[dict[str, Any]]: + presets_dir = repo_root / "llm_normalizer" / "data" / "presets" + if not presets_dir.exists(): + return [] + presets: list[dict[str, Any]] = [] + for path in sorted(presets_dir.glob("*.json")): + try: + payload = read_json_object(path) + except (OSError, json.JSONDecodeError): + presets.append( + { + "path": repo_relative(path, repo_root), + "prompt_version": None, + "status": "invalid_json", + } + ) + continue + presets.append( + { + "path": repo_relative(path, repo_root), + "prompt_version": str(payload.get("prompt_version") or "").strip() or None, + "status": "ok", + } + ) + return presets + + +def build_prompt_registry_health( + repo_root: Path = REPO_ROOT, + *, + prompt_version: str | None = None, + strict_preset_match: bool = True, +) -> dict[str, Any]: + active_prompt_version = prompt_version or read_default_prompt_version(repo_root) or 
"unknown" + default_prompt_version = read_default_prompt_version(repo_root) + files = _prompt_files_for_version(repo_root, active_prompt_version) + failures: list[str] = [] + warnings: list[str] = [] + + if active_prompt_version not in BUILTIN_PROMPT_FILES: + failures.append(f"unknown_prompt_version:{active_prompt_version}") + + missing_files = [ + str(item.get("relative_path")) + for item in files + if item.get("exists") is not True + ] + if missing_files: + failures.append("prompt_files_missing:" + ",".join(missing_files)) + + prompt_hash = _hash_prompt_files(files, repo_root) + if not prompt_hash: + failures.append("prompt_hash_unavailable") + + preset_versions = _preset_prompt_versions(repo_root) + mismatched_presets = [ + item + for item in preset_versions + if item.get("status") == "ok" + and item.get("prompt_version") + and default_prompt_version + and item.get("prompt_version") != default_prompt_version + ] + if mismatched_presets: + message = "preset_version_mismatch:" + ",".join( + f"{item['path']}={item['prompt_version']}" for item in mismatched_presets + ) + if strict_preset_match: + failures.append(message) + else: + warnings.append(message) + + invalid_presets = [item for item in preset_versions if item.get("status") != "ok"] + if invalid_presets: + failures.append("preset_json_invalid:" + ",".join(str(item.get("path")) for item in invalid_presets)) + + source = "file" if files and not missing_files else ("unknown" if not files else "partial_file") + status = "pass" if not failures else "fail" + return { + "schema_version": PROMPT_REGISTRY_HEALTH_SCHEMA_VERSION, + "status": status, + "default_prompt_version": default_prompt_version, + "active_prompt_version": active_prompt_version, + "prompt_source": source, + "prompt_hash": prompt_hash, + "prompt_files": files, + "prompt_builder": repo_relative(PROMPT_BUILDER_TS, repo_root), + "config": repo_relative(CONFIG_TS, repo_root), + "preset_versions": preset_versions, + "failures": failures, + "warnings": 
warnings, + "checked_at": now_utc_iso(), + } + + +def resolve_effective_prompt_version(repo_root: Path, requested_prompt_version: str | None) -> tuple[str, dict[str, Any]]: + requested = str(requested_prompt_version or "").strip() + default_prompt_version = read_default_prompt_version(repo_root) + if not requested: + resolved = default_prompt_version or "unknown" + return resolved, { + "mode": "default_prompt_version", + "requested_prompt_version": None, + "resolved_prompt_version": resolved, + } + if requested in BUILTIN_PROMPT_FILES: + return requested, { + "mode": "requested_prompt_version", + "requested_prompt_version": requested, + "resolved_prompt_version": requested, + } + if requested in ASSISTANT_RUNTIME_PROMPT_VERSIONS: + resolved = default_prompt_version or "unknown" + return resolved, { + "mode": "assistant_runtime_schema_uses_default_normalizer_prompt", + "requested_prompt_version": requested, + "resolved_prompt_version": resolved, + "assistant_runtime_prompt_version": requested, + } + return requested, { + "mode": "unknown_prompt_version", + "requested_prompt_version": requested, + "resolved_prompt_version": requested, + } + + +def _get_arg(args: argparse.Namespace | None, name: str, default: Any = None) -> Any: + if args is None: + return default + return getattr(args, name, default) + + +def build_effective_runtime_manifest( + *, + runner: str, + args: argparse.Namespace | None = None, + repo_root: Path = REPO_ROOT, + spec_path: Path | None = None, + output_dir: Path | None = None, + run_id: str | None = None, + extra: dict[str, Any] | None = None, +) -> dict[str, Any]: + requested_prompt_version = str(_get_arg(args, "prompt_version", "") or "").strip() or None + prompt_version, prompt_resolution = resolve_effective_prompt_version(repo_root, requested_prompt_version) + prompt_health = build_prompt_registry_health(repo_root, prompt_version=prompt_version, strict_preset_match=False) + shared_llm = load_shared_llm_connection(repo_root) + llm_provider = 
str(_get_arg(args, "llm_provider", "") or shared_llm.get("llmProvider") or "unknown") + llm_model = str(_get_arg(args, "llm_model", "") or shared_llm.get("model") or "unknown") + llm_base_url = str(_get_arg(args, "llm_base_url", "") or shared_llm.get("baseUrl") or "") + temperature = _get_arg(args, "temperature", shared_llm.get("temperature")) + max_output_tokens = _get_arg(args, "max_output_tokens", shared_llm.get("maxOutputTokens")) + + manifest: dict[str, Any] = { + "schema_version": EFFECTIVE_RUNTIME_SCHEMA_VERSION, + "git_sha": git_sha(repo_root), + "runner": runner, + "run_id": run_id, + "spec_path": repo_relative(spec_path, repo_root) if spec_path else None, + "output_dir": repo_relative(output_dir, repo_root) if output_dir else None, + "backend_url": _get_arg(args, "backend_url"), + "mcp_proxy_url": _get_arg(args, "mcp_proxy_url", os.environ.get("MCP_PROXY_URL") or DEFAULT_MCP_PROXY_URL), + "mcp_channel": _get_arg(args, "mcp_channel", os.environ.get("MCP_CHANNEL")), + "llm_provider": llm_provider, + "llm_model": llm_model, + "llm_base_url": llm_base_url or None, + "temperature": temperature, + "max_output_tokens": max_output_tokens, + "requested_prompt_version": prompt_resolution.get("requested_prompt_version"), + "prompt_version": prompt_version, + "prompt_resolution": prompt_resolution, + "assistant_runtime_prompt_version": prompt_resolution.get("assistant_runtime_prompt_version"), + "prompt_source": prompt_health.get("prompt_source"), + "prompt_hash": prompt_health.get("prompt_hash"), + "prompt_registry_status": prompt_health.get("status"), + "prompt_registry_failures": prompt_health.get("failures") or [], + "prompt_registry_warnings": prompt_health.get("warnings") or [], + "prompt_files": prompt_health.get("prompt_files") or [], + "feature_flags": collect_feature_flags(repo_root), + "shared_llm_connection": { + "path": repo_relative(repo_root / "llm_normalizer" / "data" / "shared_llm_connection.json", repo_root), + "exists": (repo_root / 
"llm_normalizer" / "data" / "shared_llm_connection.json").exists(), + "connection": shared_llm, + }, + "use_mock": bool(_get_arg(args, "use_mock", False)), + "created_at": now_utc_iso(), + } + if extra: + manifest["extra"] = extra + return manifest + + +def write_effective_runtime_manifest(output_dir: Path, manifest: dict[str, Any]) -> Path: + manifest_path = output_dir / EFFECTIVE_RUNTIME_FILE_NAME + write_json(manifest_path, manifest) + return manifest_path + + +def write_effective_runtime( + output_dir: Path, + *, + runner: str, + args: argparse.Namespace | None = None, + repo_root: Path = REPO_ROOT, + spec_path: Path | None = None, + run_id: str | None = None, + extra: dict[str, Any] | None = None, +) -> dict[str, Any]: + manifest = build_effective_runtime_manifest( + runner=runner, + args=args, + repo_root=repo_root, + spec_path=spec_path, + output_dir=output_dir, + run_id=run_id, + extra=extra, + ) + write_effective_runtime_manifest(output_dir, manifest) + return manifest + + +def validate_effective_runtime_manifest(manifest: dict[str, Any], *, manifest_path: Path | None = None) -> None: + location = f": {manifest_path}" if manifest_path else "" + required_fields = ( + "git_sha", + "runner", + "llm_model", + "temperature", + "max_output_tokens", + "prompt_version", + "prompt_source", + "prompt_hash", + ) + missing_fields = [ + field_name + for field_name in required_fields + if manifest.get(field_name) is None or str(manifest.get(field_name)).strip() == "" + ] + if missing_fields: + raise RuntimeError( + f"{EFFECTIVE_RUNTIME_FILE_NAME} is incomplete{location}: missing " + + ", ".join(missing_fields) + ) + if manifest.get("prompt_registry_status") != "pass": + failures = manifest.get("prompt_registry_failures") + failure_text = ",".join(str(item) for item in failures) if isinstance(failures, list) else str(failures or "") + raise RuntimeError( + f"{EFFECTIVE_RUNTIME_FILE_NAME} has failing prompt registry status{location}: " + 
f"{manifest.get('prompt_registry_status')}; {failure_text}" + ) + + +def load_effective_runtime_manifest(run_dir: Path) -> dict[str, Any]: + manifest_path = run_dir / EFFECTIVE_RUNTIME_FILE_NAME + if not manifest_path.exists(): + raise RuntimeError(f"{EFFECTIVE_RUNTIME_FILE_NAME} not found: {manifest_path}") + try: + manifest = read_json_object(manifest_path) + except json.JSONDecodeError as exc: + raise RuntimeError(f"{EFFECTIVE_RUNTIME_FILE_NAME} is invalid JSON: {manifest_path}") from exc + if manifest.get("schema_version") != EFFECTIVE_RUNTIME_SCHEMA_VERSION: + raise RuntimeError( + f"{EFFECTIVE_RUNTIME_FILE_NAME} has unsupported schema_version={manifest.get('schema_version')!r}" + ) + validate_effective_runtime_manifest(manifest, manifest_path=manifest_path) + return manifest diff --git a/scripts/domain_case_loop.py b/scripts/domain_case_loop.py index 08e5fc2..af396a4 100644 --- a/scripts/domain_case_loop.py +++ b/scripts/domain_case_loop.py @@ -13,12 +13,14 @@ from typing import Any from urllib.error import HTTPError, URLError from urllib.request import Request, urlopen +import agent_runtime_manifest as runtime_manifest REPO_ROOT = Path(__file__).resolve().parent.parent DEFAULT_ARTIFACTS_ROOT = REPO_ROOT / "artifacts" / "domain_runs" DEFAULT_SESSIONS_DIR = REPO_ROOT / "llm_normalizer" / "data" / "assistant_sessions" DEFAULT_REPORTS_DIR = REPO_ROOT / "llm_normalizer" / "reports" DEFAULT_LOOP_SCHEMA_DIR = REPO_ROOT / "docs" / "orchestration" / "schemas" +ISSUE_CATALOG_PATH = REPO_ROOT / "docs" / "orchestration" / "issue_catalog.json" SHARED_LLM_CONNECTION_CONFIG = REPO_ROOT / "llm_normalizer" / "data" / "shared_llm_connection.json" DEFAULT_BACKEND_URL = "http://127.0.0.1:8787" DEFAULT_PROMPT_VERSION = "address_query_runtime_v1" @@ -37,6 +39,11 @@ ACTIVE_DOMAIN_CONTRACT_SCHEMA_VERSION = "active_domain_contract_v1" AUTONOMOUS_LOOP_SCHEMA_VERSION = "domain_autonomous_loop_v1" REPAIR_MODE_LEAD_HANDOFF = "lead-handoff" REPAIR_MODE_AUTO_CODER = "auto-coder" 
+AUTO_CODER_ALLOWED_ISSUE_CODES = { + "business_direct_answer_missing", + "business_next_step_missing", + "technical_garbage_in_answer", +} def load_shared_local_llm_defaults(config_path: Path | None = None) -> dict[str, Any]: @@ -217,6 +224,13 @@ GUARDED_INSUFFICIENCY_LIMITATION_MARKERS = ( GUARDED_INSUFFICIENCY_RESULT_MODES = {"heuristic_candidates"} GUARDED_INSUFFICIENCY_TRUTH_MODES = {"limited"} GUARDED_INSUFFICIENCY_ANSWER_SHAPES = {"limited_with_reason"} +BUSINESS_EXPECTED_RESULT_MODES = { + "clarification_required", + "limited_accounting_answer", + "evidence_or_honest_boundary", + "ranking_or_limited_accounting_answer", + "same_inventory_margin_context_or_clarification", +} MCP_DISCOVERY_CHAIN_INTENT_ALIASES: dict[str, tuple[str, ...]] = { "business_overview": ("business_overview",), @@ -860,6 +874,90 @@ def read_json_file(file_path: Path) -> dict[str, Any]: return json.loads(read_text_file(file_path)) +def load_issue_catalog(path: Path = ISSUE_CATALOG_PATH) -> dict[str, Any]: + if not path.exists(): + return {"schema_version": "agent_issue_catalog_v1", "issues": {}} + try: + payload = read_json_file(path) + except (OSError, json.JSONDecodeError): + return {"schema_version": "agent_issue_catalog_v1", "issues": {}} + return payload if isinstance(payload, dict) else {"schema_version": "agent_issue_catalog_v1", "issues": {}} + + +def issue_catalog_entry(issue_code: str, catalog: dict[str, Any] | None = None) -> dict[str, Any]: + source = catalog if isinstance(catalog, dict) else load_issue_catalog() + issues = source.get("issues") if isinstance(source.get("issues"), dict) else {} + entry = issues.get(issue_code) + return dict(entry) if isinstance(entry, dict) else {} + + +def default_rerun_matrix_for_problem(problem_type: str) -> list[str]: + if problem_type in {"route_gap", "capability_gap", "route_candidate_enablement_gap"}: + return ["failed_scenario", "route_neighbor_pack", "accepted_smoke_pack"] + if problem_type in {"answer_shape_mismatch", 
"presentation_gap", "business_utility_gap"}: + return ["failed_scenario", "answer_surface_pack", "accepted_smoke_pack"] + if problem_type in {"field_mapping_gap", "evidence_gap", "domain_anchor_gap"}: + return ["failed_scenario", "field_truth_pack", "accepted_smoke_pack"] + return ["failed_scenario", "accepted_smoke_pack"] + + +def is_margin_profitability_step(step_output: dict[str, Any]) -> bool: + question = str(step_output.get("question_resolved") or step_output.get("question_template") or "") + if is_nomenclature_margin_context(step_output, question): + return True + tokens = [ + str(step_output.get("expected_business_answer_contract") or ""), + str(step_output.get("required_answer_contract") or ""), + *normalize_string_list(step_output.get("semantic_tags")), + ] + return any("margin" in token or "марж" in token.casefold() for token in tokens) + + +def derive_repair_issue_code(step_output: dict[str, Any], problem_type: str) -> str: + violated = normalize_string_list(step_output.get("violated_invariants")) + if "domain_leak_accounting_route" in violated and is_margin_profitability_step(step_output): + return "margin_domain_leak_accounting_route" + for issue_code in ( + "technical_garbage_in_answer", + "business_direct_answer_missing", + "accounting_contract_missing", + "business_next_step_missing", + ): + if issue_code in violated: + return issue_code + if problem_type == "route_candidate_enablement_gap": + return "route_candidate_enablement_gap" + if problem_type == "capability_gap": + return "capability_gap" + return problem_type or "other" + + +def expected_answer_contract_for_issue(issue_code: str, step_output: dict[str, Any], catalog_entry: dict[str, Any]) -> str | None: + explicit_contract = ( + str(step_output.get("expected_business_answer_contract") or step_output.get("required_answer_contract") or "").strip() + or None + ) + if explicit_contract: + return explicit_contract + catalog_contract = str(catalog_entry.get("expected_answer_contract") or 
"").strip() + if catalog_contract: + return catalog_contract + if issue_code.startswith("margin_") or is_margin_profitability_step(step_output): + return "margin_profitability_v1" + return None + + +def evidence_paths_for_step(scenario_dir: Path, step_id: str) -> list[str]: + step_dir = scenario_dir / "steps" / step_id + candidates = [ + step_dir / "output.md", + step_dir / "turn.json", + step_dir / "step_state.json", + step_dir / "debug.json", + ] + return [repo_relative(path) for path in candidates] + + def extract_conversation_from_session(session_record: dict[str, Any]) -> list[dict[str, Any]]: items = session_record.get("items") if isinstance(items, list) and items: @@ -2188,6 +2286,75 @@ def is_validated_guarded_insufficiency_answer( ) +def _business_review_is_clean(step_state: dict[str, Any]) -> bool: + business_review = step_state.get("business_first_review") + if not isinstance(business_review, dict): + return True + return len(normalize_string_list(business_review.get("issue_codes"))) == 0 + + +def business_expected_result_mode_matches(expected_result_mode: str, step_state: dict[str, Any]) -> bool: + reply_type = str(step_state.get("reply_type") or "").strip() + response_type = str(step_state.get("response_type") or "").strip() + truth_mode = str(step_state.get("truth_mode") or "").strip() + answer_shape = str(step_state.get("answer_shape") or "").strip() + detected_intent = str(step_state.get("detected_intent") or "").strip() + capability_id = str(step_state.get("capability_id") or "").strip() + assistant_text = str(step_state.get("assistant_text") or "").strip() + clean_business_review = _business_review_is_clean(step_state) + in_margin_context = ( + detected_intent == "inventory_margin_ranking_for_nomenclature" + or capability_id == "inventory_inventory_margin_ranking_for_nomenclature" + ) + + if expected_result_mode == "clarification_required": + return ( + clean_business_review + and ( + truth_mode == "clarification_required" + or answer_shape == 
"clarification_required" + or (reply_type == "partial_coverage" and response_type == "LIMITED_WITH_REASON") + ) + ) + + if expected_result_mode == "limited_accounting_answer": + return ( + clean_business_review + and in_margin_context + and bool(assistant_text) + and reply_type in {"partial_coverage", "factual", "factual_with_explanation"} + ) + + if expected_result_mode == "evidence_or_honest_boundary": + return ( + clean_business_review + and bool(assistant_text) + and reply_type in {"partial_coverage", "factual", "factual_with_explanation"} + ) + + if expected_result_mode == "ranking_or_limited_accounting_answer": + return ( + clean_business_review + and in_margin_context + and bool(assistant_text) + and reply_type in {"partial_coverage", "factual", "factual_with_explanation"} + ) + + if expected_result_mode == "same_inventory_margin_context_or_clarification": + return ( + clean_business_review + and bool(assistant_text) + and ( + in_margin_context + or truth_mode == "clarification_required" + or answer_shape == "clarification_required" + ) + and reply_type in {"partial_coverage", "factual", "factual_with_explanation"} + ) + + return False + + def acceptance_status_from_execution(execution_status: str, hard_fail: bool, semantic_validated: bool = False) -> str: if execution_status == "blocked": return "blocked" @@ -2232,7 +2399,11 @@ def validate_step_contract(step_state: dict[str, Any]) -> dict[str, Any]: expected_result_mode = str(state.get("expected_result_mode") or "").strip() actual_result_mode = str(state.get("result_mode") or "").strip() - if expected_result_mode and actual_result_mode and normalize_identifier(actual_result_mode) != normalize_identifier(expected_result_mode): + normalized_expected_result_mode = normalize_identifier(expected_result_mode) + if normalized_expected_result_mode in BUSINESS_EXPECTED_RESULT_MODES: + if not business_expected_result_mode_matches(normalized_expected_result_mode, state): + 
violated_invariants.append("wrong_result_mode") + elif expected_result_mode and actual_result_mode and normalize_identifier(actual_result_mode) != normalize_identifier(expected_result_mode): violated_invariants.append("wrong_result_mode") for forbidden_capability in normalize_string_list(state.get("forbidden_capabilities")): @@ -2716,6 +2887,23 @@ def execute_scenario_manifest( write_json(scenario_dir / "scenario_manifest.json", manifest) if manifest_source_label: write_text(scenario_dir / "manifest_source.txt", f"{manifest_source_label}\n") + source_path = None + if manifest_source_label: + raw_source_path = str(manifest_source_label).split("#", 1)[0].strip() + if raw_source_path: + source_path = Path(raw_source_path).resolve() + runtime_manifest.write_effective_runtime( + scenario_dir, + runner="domain_case_loop.run-scenario", + args=args, + spec_path=source_path, + run_id=manifest["scenario_id"], + extra={ + "domain": manifest["domain"], + "title": manifest["title"], + "manifest_source_label": manifest_source_label, + }, + ) ensure_scenario_brief(scenario_dir, manifest) scenario_state: dict[str, Any] = { @@ -2845,6 +3033,19 @@ def handle_run_case(args: argparse.Namespace) -> int: case_id = slugify_case_id(args.domain, args.case_id) case_dir = Path(args.output_root).resolve() / case_id case_dir.mkdir(parents=True, exist_ok=True) + runtime_manifest.write_effective_runtime( + case_dir, + runner="domain_case_loop.run-case", + args=args, + run_id=case_id, + extra={ + "domain": args.domain, + "case_id": case_id, + "slot": args.slot, + "expected_capability": args.expected_capability, + "expected_result_mode": args.expected_result_mode, + }, + ) ensure_case_brief( case_dir, domain=args.domain, @@ -3809,6 +4010,7 @@ def build_step_repair_target( signals.append(f"route_candidate_missing_axes={','.join(missing_axes)}") target = { + "issue_code": derive_repair_issue_code(step_output, problem_type), "target_id": f"{scenario_id}:{step_id}", "scenario_id": scenario_id, 
"scenario_title": scenario_title, @@ -3829,6 +4031,17 @@ def build_step_repair_target( "step_state_json": str(step_state_path), }, } + catalog_entry = issue_catalog_entry(str(target["issue_code"])) + target["expected_business_answer_contract"] = expected_answer_contract_for_issue( + str(target["issue_code"]), + step_output, + catalog_entry, + ) + target["evidence_paths"] = evidence_paths_for_step(scenario_dir, step_id) + target["allowed_patch_targets"] = normalize_string_list(catalog_entry.get("allowed_patch_targets")) or target["candidate_files"] + target["forbidden_patch_targets"] = normalize_string_list(catalog_entry.get("forbidden_patch_targets")) + target["rerun_matrix"] = normalize_string_list(catalog_entry.get("rerun_matrix")) or default_rerun_matrix_for_problem(problem_type) + target["minimal_patch_direction"] = target["fix_goal"] route_candidate = compact_route_candidate_handoff(scenario_id=scenario_id, step_id=step_id, step_output=step_output) if route_candidate: target["route_candidate"] = route_candidate @@ -3856,10 +4069,21 @@ def build_priority_repair_foci(targets: list[dict[str, Any]]) -> list[dict[str, "problem_type": str(target.get("problem_type") or "other"), "root_cause_layers": normalize_string_list(target.get("root_cause_layers")), "candidate_files": normalize_string_list(target.get("candidate_files")), + "issue_codes": [], + "allowed_patch_targets": [], + "forbidden_patch_targets": [], + "rerun_matrix": [], "target_ids": [], "scenario_ids": set(), }, ) + issue_code = str(target.get("issue_code") or "").strip() + if issue_code and issue_code not in focus["issue_codes"]: + focus["issue_codes"].append(issue_code) + for field_name in ("allowed_patch_targets", "forbidden_patch_targets", "rerun_matrix"): + for item in normalize_string_list(target.get(field_name)): + if item not in focus[field_name]: + focus[field_name].append(item) focus["target_ids"].append(str(target.get("target_id") or "")) scenario_id = str(target.get("scenario_id") or 
"").strip() if scenario_id: @@ -4024,7 +4248,10 @@ def normalize_analyst_priority_repair_target(raw_target: dict[str, Any], index: fix_goal = f"Resolve the analyst-identified `{problem_type}` on `{scenario_id}:{step_id}` without masking partial evidence as accepted." if not root_cause_layers: root_cause_layers = [problem_type] + issue_code = str(raw_target.get("issue_code") or problem_type or "other").strip() + catalog_entry = issue_catalog_entry(issue_code) return { + "issue_code": issue_code, "target_id": f"{scenario_id}:{step_id}", "scenario_id": scenario_id, "scenario_title": str(raw_target.get("scenario_title") or "").strip() or None, @@ -4039,6 +4266,23 @@ def normalize_analyst_priority_repair_target(raw_target: dict[str, Any], index: "violated_invariants": normalize_string_list(raw_target.get("violated_invariants")), "fix_goal": fix_goal, "candidate_files": candidate_files, + "expected_business_answer_contract": str( + raw_target.get("expected_business_answer_contract") + or raw_target.get("expected_answer_contract") + or catalog_entry.get("expected_answer_contract") + or "" + ).strip() + or None, + "evidence_paths": normalize_string_list(raw_target.get("evidence_paths")), + "allowed_patch_targets": normalize_string_list(raw_target.get("allowed_patch_targets")) + or normalize_string_list(catalog_entry.get("allowed_patch_targets")) + or candidate_files, + "forbidden_patch_targets": normalize_string_list(raw_target.get("forbidden_patch_targets")) + or normalize_string_list(catalog_entry.get("forbidden_patch_targets")), + "rerun_matrix": normalize_string_list(raw_target.get("rerun_matrix")) + or normalize_string_list(catalog_entry.get("rerun_matrix")) + or default_rerun_matrix_for_problem(problem_type), + "minimal_patch_direction": str(raw_target.get("minimal_patch_direction") or fix_goal).strip() or fix_goal, "signals": ["analyst_priority_target"], "target_source": "analyst_verdict.priority_targets", } @@ -4253,6 +4497,80 @@ def 
select_primary_repair_focus(repair_targets: dict[str, Any]) -> dict[str, Any return primary_focus if isinstance(primary_focus, dict) else None +def evaluate_auto_coder_gate( + repair_targets: dict[str, Any], + assigned_focus: dict[str, Any] | None, +) -> dict[str, Any]: + issue_codes = normalize_string_list((assigned_focus or {}).get("issue_codes")) + root_layers = normalize_string_list((assigned_focus or {}).get("root_cause_layers")) + allowed_patch_targets = normalize_string_list((assigned_focus or {}).get("allowed_patch_targets")) + forbidden_patch_targets = normalize_string_list((assigned_focus or {}).get("forbidden_patch_targets")) + rerun_matrix = normalize_string_list((assigned_focus or {}).get("rerun_matrix")) + focus_id = str((assigned_focus or {}).get("focus_id") or "").strip() or None + blocking_reasons: list[str] = [] + + if not assigned_focus: + blocking_reasons.append("missing_assigned_focus") + if not issue_codes: + blocking_reasons.append("missing_issue_code") + for issue_code in issue_codes: + if issue_code not in AUTO_CODER_ALLOWED_ISSUE_CODES: + blocking_reasons.append(f"issue_code_not_allowlisted:{issue_code}") + if not root_layers: + blocking_reasons.append("missing_root_layers") + if not allowed_patch_targets: + blocking_reasons.append("missing_allowed_patch_targets") + if not forbidden_patch_targets: + blocking_reasons.append("missing_forbidden_patch_targets") + if not rerun_matrix: + blocking_reasons.append("missing_rerun_matrix") + if rerun_matrix and "accepted_smoke_pack" not in rerun_matrix: + blocking_reasons.append("missing_accepted_smoke_pack") + + target_items = repair_targets.get("targets") if isinstance(repair_targets.get("targets"), list) else [] + focus_target_ids = set(normalize_string_list((assigned_focus or {}).get("target_ids"))) + focus_targets = [ + target + for target in target_items + if isinstance(target, dict) and str(target.get("target_id") or "").strip() in focus_target_ids + ] + if not focus_targets and 
assigned_focus: + blocking_reasons.append("missing_focus_targets") + for target in focus_targets: + target_id = str(target.get("target_id") or "").strip() or "unknown_target" + target_issue = str(target.get("issue_code") or "").strip() + if not target_issue: + blocking_reasons.append(f"target_missing_issue_code:{target_id}") + if not normalize_string_list(target.get("allowed_patch_targets")): + blocking_reasons.append(f"target_missing_allowed_patch_targets:{target_id}") + if not normalize_string_list(target.get("forbidden_patch_targets")): + blocking_reasons.append(f"target_missing_forbidden_patch_targets:{target_id}") + if not normalize_string_list(target.get("rerun_matrix")): + blocking_reasons.append(f"target_missing_rerun_matrix:{target_id}") + + allowed = not blocking_reasons + return { + "schema_version": "auto_coder_gate_v1", + "allowed": allowed, + "mode": REPAIR_MODE_AUTO_CODER, + "focus_id": focus_id, + "issue_codes": issue_codes, + "root_layers": root_layers, + "allowed_patch_targets": allowed_patch_targets, + "forbidden_patch_targets": forbidden_patch_targets, + "rerun_matrix": rerun_matrix, + "allowlisted_issue_codes": sorted(AUTO_CODER_ALLOWED_ISSUE_CODES), + "blocking_reasons": blocking_reasons, + "reason": "auto_coder_gate_passed" if allowed else ";".join(blocking_reasons), + "policy": { + "auto_coder_default": False, + "requires_issue_catalog_contract": True, + "requires_accepted_smoke_pack": True, + "lead_owns_merge_and_acceptance": True, + }, + } + + def build_repair_targets_summary(repair_targets: dict[str, Any]) -> str: lines = [ "# Repair targets", @@ -4535,6 +4853,7 @@ def build_coder_loop_prompt( assigned_focus: dict[str, Any] | None, analyst_verdict_path: Path, analyst_verdict_json: str, + auto_coder_gate_json: str | None = None, ) -> str: assigned_focus_block = ( textwrap.dedent( @@ -4548,6 +4867,17 @@ def build_coder_loop_prompt( if assigned_focus else "Assigned deterministic repair focus for this iteration: none" ) + 
auto_coder_gate_block = "" + if auto_coder_gate_json: + auto_coder_gate_block = textwrap.dedent( + f"""\ + + Auto-coder gate: + ```json + {auto_coder_gate_json} + ``` + """ + ) return textwrap.dedent( f"""\ You are the `domain_coder` for NDC_1C. @@ -4576,6 +4906,8 @@ def build_coder_loop_prompt( - use `root_cause_layers`, `broken_edge_ids`, `violated_invariants`, and business-utility scores from the analyst verdict to choose the smallest fix; - use the deterministic repair targets to choose the highest-leverage repair focus first; within that focus, patch the narrowest shared layer that can clear the most `P0`/`P1` targets without architecture drift; - the assigned deterministic repair focus below is mandatory for this iteration; do not switch to a lower-priority focus unless you are blocked from making a safe patch for the assigned focus; + - auto-coder mode is allowed only for the issue codes and patch targets passed by the auto-coder gate; do not edit outside `allowed_patch_targets`; + - do not touch any `forbidden_patch_targets`; if the fix needs one, return `blocked` instead of patching; - if the analyst verdict is optimistic but deterministic repair targets still contain `P0` or `P1`, trust the deterministic repair targets and keep fixing the pack; - prioritize state continuity, selected-object persistence, stable `focus_object`, stable `answer_object`, reusable `provenance_bundle` / `sale_trace_bundle`, action-first answer behavior, compact micro-action answers, answer layering, temporal honesty, and field-truth mapping when those are the blocking layers; - do not broaden scope when the analyst says the defect is mainly `object_memory_gap`, `followup_action_resolution_gap`, `bundle_reuse_gap`, `field_mapping_gap`, `temporal_honesty_gap`, `answer_shape_mismatch`, or `business_utility_gap`; @@ -4596,6 +4928,7 @@ def build_coder_loop_prompt( ``` {assigned_focus_block} + {auto_coder_gate_block} - then return JSON only and follow the schema exactly. 
""" @@ -4638,6 +4971,191 @@ def _limited_dict_items(raw_items: Any, limit: int = 5) -> list[dict[str, Any]]: return [item for item in raw_items[:limit] if isinstance(item, dict)] +def normalize_business_audit_status(loop_decision: str) -> str: + normalized = str(loop_decision or "").strip() + if normalized in {"accepted", "partial", "blocked", "needs_exact_capability"}: + return normalized + if normalized == "continue": + return "partial" + return "partial" + + +def collect_rerun_matrix(repair_targets: dict[str, Any]) -> list[str]: + matrix: list[str] = [] + catalog = load_issue_catalog() + for target in repair_targets.get("targets") if isinstance(repair_targets.get("targets"), list) else []: + if not isinstance(target, dict): + continue + issue_code = str(target.get("issue_code") or "").strip() + catalog_entry = issue_catalog_entry(issue_code, catalog) if issue_code else {} + target_matrix = normalize_string_list(target.get("rerun_matrix")) or normalize_string_list( + catalog_entry.get("rerun_matrix") + ) + for item in target_matrix: + if item not in matrix: + matrix.append(item) + return matrix or ["failed_scenario", "accepted_smoke_pack"] + + +def build_issue_catalog_snapshot(repair_targets: dict[str, Any], catalog: dict[str, Any] | None = None) -> dict[str, Any]: + source = catalog if isinstance(catalog, dict) else load_issue_catalog() + issues = source.get("issues") if isinstance(source.get("issues"), dict) else {} + observed_codes = sorted( + { + str(target.get("issue_code") or "").strip() + for target in repair_targets.get("targets", []) + if isinstance(target, dict) and str(target.get("issue_code") or "").strip() + } + ) + return { + "schema_version": "issue_catalog_snapshot_v1", + "source_catalog": repo_relative(ISSUE_CATALOG_PATH), + "observed_issue_codes": observed_codes, + "issues": { + issue_code: issues.get(issue_code) + for issue_code in observed_codes + if isinstance(issues.get(issue_code), dict) + }, + } + + +def 
build_detector_candidates(repair_targets: dict[str, Any], catalog: dict[str, Any] | None = None) -> dict[str, Any]: + source = catalog if isinstance(catalog, dict) else load_issue_catalog() + issues = source.get("issues") if isinstance(source.get("issues"), dict) else {} + candidates: list[dict[str, Any]] = [] + seen: set[tuple[str, str]] = set() + for target in repair_targets.get("targets") if isinstance(repair_targets.get("targets"), list) else []: + if not isinstance(target, dict): + continue + issue_code = str(target.get("issue_code") or "").strip() + entry = issues.get(issue_code) if isinstance(issues.get(issue_code), dict) else {} + detectors = normalize_string_list(entry.get("detectors")) + if not detectors and issue_code: + detectors = [f"{issue_code}_detector"] + for detector in detectors: + key = (issue_code, detector) + if key in seen: + continue + seen.add(key) + candidates.append( + { + "issue_code": issue_code, + "detector": detector, + "severity": target.get("severity"), + "sample_target_id": target.get("target_id"), + "evidence_paths": target.get("evidence_paths") or [], + } + ) + return { + "schema_version": "detector_candidates_v1", + "candidate_count": len(candidates), + "candidates": candidates, + } + + +def build_blocking_issue_contract(target: dict[str, Any], catalog: dict[str, Any]) -> dict[str, Any]: + issue_code = str(target.get("issue_code") or target.get("problem_type") or "other").strip() + entry = issue_catalog_entry(issue_code, catalog) + return { + "issue_code": issue_code, + "severity": target.get("severity"), + "domain": target.get("scenario_id") or target.get("domain"), + "scenario_id": target.get("scenario_id"), + "step_id": target.get("step_id"), + "user_question": target.get("question_resolved"), + "expected_business_answer_contract": target.get("expected_business_answer_contract") + or entry.get("expected_answer_contract"), + "actual_answer_path": (target.get("evidence_paths") or [None])[0], + "evidence_paths": 
target.get("evidence_paths") or [], + "root_layers": target.get("root_cause_layers") or entry.get("root_layers") or [], + "business_mismatch": target.get("fix_goal") or entry.get("business_meaning"), + "minimal_patch_direction": target.get("minimal_patch_direction") or target.get("fix_goal"), + "allowed_patch_targets": target.get("allowed_patch_targets") + or entry.get("allowed_patch_targets") + or target.get("candidate_files") + or [], + "forbidden_patch_targets": target.get("forbidden_patch_targets") or entry.get("forbidden_patch_targets") or [], + "rerun_matrix": target.get("rerun_matrix") or entry.get("rerun_matrix") or [], + } + + +def build_business_audit_contract( + *, + analyst_verdict: dict[str, Any], + repair_targets: dict[str, Any], + target_score: int, + loop_decision: str, + analyst_accepted_gate: bool, + accepted_gate: bool, + deterministic_gate_ok: bool, + deterministic_gate_reason: str, + business_audit_markdown_path: Path, + analyst_verdict_path: Path, + repair_targets_path: Path, + business_audit_json_path: Path | None = None, + issue_catalog_snapshot_path: Path | None = None, + rerun_matrix_path: Path | None = None, + detector_candidates_path: Path | None = None, +) -> dict[str, Any]: + catalog = load_issue_catalog() + targets = repair_targets.get("targets") if isinstance(repair_targets.get("targets"), list) else [] + blocking_issues = [ + build_blocking_issue_contract(target, catalog) + for target in targets + if isinstance(target, dict) and str(target.get("severity") or "").upper() in {"P0", "P1"} + ] + rerun_matrix = collect_rerun_matrix(repair_targets) + result = { + "schema_version": "business_audit_contract_v1", + "created_at": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), + "overall_status": normalize_business_audit_status(loop_decision), + "quality_score": int(analyst_verdict.get("quality_score") or 0), + "target_score": target_score, + "loop_decision": loop_decision, + "analyst_accepted_gate": analyst_accepted_gate, + 
"accepted_gate": accepted_gate, + "deterministic_gate_ok": deterministic_gate_ok, + "deterministic_gate_reason": deterministic_gate_reason, + "human_meaning": { + "user_intent_summary": analyst_verdict.get("user_intent_summary"), + "expected_direct_answer": analyst_verdict.get("expected_direct_answer"), + "actual_direct_answer": analyst_verdict.get("actual_direct_answer"), + }, + "quality_flags": { + "direct_answer_ok": bool(analyst_verdict.get("direct_answer_ok", True)), + "business_usefulness_ok": bool(analyst_verdict.get("business_usefulness_ok", True)), + "temporal_honesty_ok": bool(analyst_verdict.get("temporal_honesty_ok", True)), + "field_truth_ok": bool(analyst_verdict.get("field_truth_ok", True)), + "answer_layering_ok": bool(analyst_verdict.get("answer_layering_ok", True)), + "regression_detected": bool(analyst_verdict.get("regression_detected")), + }, + "root_layers": normalize_string_list(analyst_verdict.get("root_cause_layers")), + "violated_invariants": normalize_string_list(analyst_verdict.get("violated_invariants")), + "blocking_issues": blocking_issues, + "repair_targets_summary": { + "target_count": repair_targets.get("target_count"), + "severity_counts": repair_targets.get("severity_counts") or {}, + "priority_foci": _limited_dict_items(repair_targets.get("priority_foci"), limit=8), + }, + "rerun_matrix": rerun_matrix, + "artifact_refs": { + "business_audit_md": repo_relative(business_audit_markdown_path), + "analyst_verdict_json": repo_relative(analyst_verdict_path), + "repair_targets_json": repo_relative(repair_targets_path), + }, + } + artifact_refs = result["artifact_refs"] + if business_audit_json_path is not None: + artifact_refs["business_audit_json"] = repo_relative(business_audit_json_path) + if issue_catalog_snapshot_path is not None: + artifact_refs["issue_catalog_snapshot_json"] = repo_relative(issue_catalog_snapshot_path) + if rerun_matrix_path is not None: + artifact_refs["rerun_matrix_json"] = repo_relative(rerun_matrix_path) + if 
detector_candidates_path is not None: + artifact_refs["detector_candidates_json"] = repo_relative(detector_candidates_path) + return result + + def build_business_audit_markdown( *, analyst_verdict: dict[str, Any], @@ -4724,6 +5242,10 @@ def build_lead_coder_handoff( analyst_verdict_path: Path, repair_targets_path: Path, business_audit_path: Path, + business_audit_json_path: Path | None = None, + issue_catalog_snapshot_path: Path | None = None, + rerun_matrix_path: Path | None = None, + detector_candidates_path: Path | None = None, analyst_verdict: dict[str, Any], repair_targets: dict[str, Any], target_score: int, @@ -4738,7 +5260,16 @@ def build_lead_coder_handoff( ) -> dict[str, Any]: assigned_focus = select_primary_repair_focus(repair_targets) priority_foci = _limited_dict_items(repair_targets.get("priority_foci") if isinstance(repair_targets, dict) else []) - repair_items = _limited_dict_items(repair_targets.get("targets") if isinstance(repair_targets, dict) else [], limit=8) + repair_target_items = repair_targets.get("targets") if isinstance(repair_targets.get("targets"), list) else [] + repair_items = _limited_dict_items(repair_target_items, limit=8) + issue_codes = sorted( + { + str(target.get("issue_code") or "").strip() + for target in repair_target_items + if isinstance(target, dict) and str(target.get("issue_code") or "").strip() + } + ) + rerun_matrix = collect_rerun_matrix(repair_targets) route_candidate_groups = _limited_dict_items( repair_targets.get("route_candidate_groups") if isinstance(repair_targets, dict) else [], limit=8, @@ -4749,6 +5280,24 @@ def build_lead_coder_handoff( if isinstance(item, dict) and str(item.get("target_source") or "") == "route_candidate_enablement" ] candidate_files = [repo_relative(path) for path in build_coder_snapshot_paths(repair_targets)] + artifact_refs = { + "pack_dir": repo_relative(pack_dir), + "business_audit": repo_relative(business_audit_path), + "analyst_verdict": repo_relative(analyst_verdict_path), + 
"repair_targets": repo_relative(repair_targets_path), + "pack_summary": repo_relative(pack_dir / "pack_summary.md"), + "pack_state": repo_relative(pack_dir / "pack_state.json"), + "scenario_acceptance_matrix": repo_relative(pack_dir / "scenario_acceptance_matrix.md"), + } + if business_audit_json_path is not None: + artifact_refs["business_audit_json"] = repo_relative(business_audit_json_path) + if issue_catalog_snapshot_path is not None: + artifact_refs["issue_catalog_snapshot"] = repo_relative(issue_catalog_snapshot_path) + if rerun_matrix_path is not None: + artifact_refs["rerun_matrix"] = repo_relative(rerun_matrix_path) + if detector_candidates_path is not None: + artifact_refs["detector_candidates"] = repo_relative(detector_candidates_path) + return { "schema_version": "domain_loop_lead_coder_handoff_v1", "repair_mode": REPAIR_MODE_LEAD_HANDOFF, @@ -4767,15 +5316,9 @@ def build_lead_coder_handoff( "requires_user_decision": requires_user_decision, "user_decision_type": user_decision_type, "user_decision_prompt": user_decision_prompt, - "artifact_refs": { - "pack_dir": repo_relative(pack_dir), - "business_audit": repo_relative(business_audit_path), - "analyst_verdict": repo_relative(analyst_verdict_path), - "repair_targets": repo_relative(repair_targets_path), - "pack_summary": repo_relative(pack_dir / "pack_summary.md"), - "pack_state": repo_relative(pack_dir / "pack_state.json"), - "scenario_acceptance_matrix": repo_relative(pack_dir / "scenario_acceptance_matrix.md"), - }, + "artifact_refs": artifact_refs, + "issue_codes": issue_codes, + "rerun_matrix": rerun_matrix, "human_meaning": { "user_intent_summary": analyst_verdict.get("user_intent_summary"), "expected_direct_answer": analyst_verdict.get("expected_direct_answer"), @@ -4792,7 +5335,10 @@ def build_lead_coder_handoff( "candidate_files": candidate_files, "lead_instructions": [ "Read business_audit.md first and judge the user-facing answer before debug metadata.", + "Use business_audit.json, 
issue_catalog_snapshot.json, rerun_matrix.json, and detector_candidates.json as the repair contract.", "Inspect analyst_verdict.json and repair_targets.json only after the semantic defect is clear.", + "Patch only inside allowed_patch_targets for the issue_code unless Lead Codex explicitly expands scope.", + "Do not touch forbidden_patch_targets and do not repair by masking detector symptoms.", "Use route_candidate_groups to distinguish missing user scope from a reviewed-route enablement gap before patching.", "Patch code manually in the main Codex context; do not launch a weak autonomous coder by default.", "Keep the patch narrow, preserve UTF-8 without BOM, run targeted tests/build, rebuild graphify after code edits, then rerun the same semantic pack.", @@ -4816,10 +5362,19 @@ def build_lead_coder_handoff_markdown(handoff: dict[str, Any]) -> str: "", "## Read First", f"- business_audit: `{artifact_refs.get('business_audit')}`", + f"- business_audit_json: `{artifact_refs.get('business_audit_json') or 'n/a'}`", f"- analyst_verdict: `{artifact_refs.get('analyst_verdict')}`", f"- repair_targets: `{artifact_refs.get('repair_targets')}`", + f"- issue_catalog_snapshot: `{artifact_refs.get('issue_catalog_snapshot') or 'n/a'}`", + f"- rerun_matrix: `{artifact_refs.get('rerun_matrix') or 'n/a'}`", + f"- detector_candidates: `{artifact_refs.get('detector_candidates') or 'n/a'}`", + f"- auto_coder_gate: `{artifact_refs.get('auto_coder_gate') or 'n/a'}`", f"- pack_dir: `{artifact_refs.get('pack_dir')}`", "", + "## Repair Contract", + f"- issue_codes: `{', '.join(normalize_string_list(handoff.get('issue_codes'))) or 'n/a'}`", + f"- rerun_matrix: `{', '.join(normalize_string_list(handoff.get('rerun_matrix'))) or 'n/a'}`", + "", "## Human Meaning", f"- user_intent_summary: {human_meaning.get('user_intent_summary') or 'n/a'}", f"- expected_direct_answer: {human_meaning.get('expected_direct_answer') or 'n/a'}", @@ -4853,10 +5408,15 @@ def build_lead_coder_handoff_markdown(handoff: 
dict[str, Any]) -> str: lines.extend( [ f"- `{target.get('target_id') or 'n/a'}`", + f" issue_code: `{target.get('issue_code') or 'n/a'}`", f" severity: `{target.get('severity') or 'n/a'}`", f" problem_type: `{target.get('problem_type') or 'n/a'}`", + f" expected_contract: `{target.get('expected_business_answer_contract') or 'n/a'}`", f" source: `{target.get('target_source') or 'n/a'}`", f" fix_goal: {target.get('fix_goal') or 'n/a'}", + f" allowed_patch_targets: `{', '.join(normalize_string_list(target.get('allowed_patch_targets'))) or 'n/a'}`", + f" forbidden_patch_targets: `{', '.join(normalize_string_list(target.get('forbidden_patch_targets'))) or 'n/a'}`", + f" rerun_matrix: `{', '.join(normalize_string_list(target.get('rerun_matrix'))) or 'n/a'}`", f" candidate_files: `{', '.join(candidate_files) if candidate_files else 'n/a'}`", ] ) @@ -4948,6 +5508,19 @@ def handle_run_pack(args: argparse.Namespace) -> int: scenarios_dir.mkdir(parents=True, exist_ok=True) write_json(pack_dir / "pack_manifest.json", pack) write_text(pack_dir / "manifest_source.txt", f"{pack_path}\n") + runtime_manifest.write_effective_runtime( + pack_dir, + runner="domain_case_loop.run-pack", + args=args, + spec_path=pack_path, + run_id=pack["pack_id"], + extra={ + "domain": pack["domain"], + "title": pack["title"], + "scenario_count": len(pack.get("scenarios") or []), + "max_scenarios": getattr(args, "max_scenarios", None), + }, + ) scenario_results: list[dict[str, Any]] = [] max_scenarios = max(0, int(args.max_scenarios)) if args.max_scenarios is not None else None @@ -5037,7 +5610,11 @@ def build_loop_summary(loop_state: dict[str, Any]) -> str: f" coder_workspace_hygiene_restored_files: `{', '.join(item.get('coder_workspace_hygiene_restored_files') or []) or 'none'}`", f" analyst_verdict: `{item.get('analyst_verdict_path') or 'n/a'}`", f" business_audit: `{item.get('business_audit_path') or 'n/a'}`", + f" business_audit_json: `{item.get('business_audit_json_path') or 'n/a'}`", f" 
repair_targets: `{item.get('repair_targets_path') or 'n/a'}`", + f" rerun_matrix: `{item.get('rerun_matrix_path') or 'n/a'}`", + f" detector_candidates: `{item.get('detector_candidates_path') or 'n/a'}`", + f" auto_coder_gate: `{item.get('auto_coder_gate_path') or 'n/a'}`", f" lead_coder_handoff: `{item.get('lead_coder_handoff_path') or 'n/a'}`", f" repair_target_count: `{item.get('repair_target_count')}`", f" repair_target_severity_counts: `{dump_json(item.get('repair_target_severity_counts') or {})}`", @@ -5057,6 +5634,8 @@ def build_loop_final_status(loop_state: dict[str, Any]) -> str: - target_score: `{loop_state['target_score']}` - iterations_ran: `{len(loop_state.get('iterations', []))}` - last_analyst_decision: `{loop_state.get('last_analyst_decision') or 'n/a'}` + - latest_business_audit_json: `{loop_state.get('latest_business_audit_json_path') or 'n/a'}` + - latest_rerun_matrix: `{loop_state.get('latest_rerun_matrix_path') or 'n/a'}` - latest_lead_coder_handoff: `{loop_state.get('latest_lead_coder_handoff_path') or 'n/a'}` - stop_reason: {loop_state.get('stop_reason') or 'n/a'} """ @@ -5070,6 +5649,18 @@ def handle_run_pack_loop(args: argparse.Namespace) -> int: iterations_dir = loop_dir / "iterations" iterations_dir.mkdir(parents=True, exist_ok=True) write_text(loop_dir / "manifest_source.txt", f"{manifest_path}\n") + runtime_manifest.write_effective_runtime( + loop_dir, + runner="domain_case_loop.run-pack-loop", + args=args, + spec_path=manifest_path, + run_id=loop_id, + extra={ + "target_score": args.target_score, + "max_iterations": args.max_iterations, + "repair_mode": getattr(args, "repair_mode", REPAIR_MODE_LEAD_HANDOFF), + }, + ) target_score = int(args.target_score) max_iterations = int(args.max_iterations) @@ -5166,6 +5757,10 @@ def handle_run_pack_loop(args: argparse.Namespace) -> int: ) deterministic_gate_ok, deterministic_gate_reason = evaluate_deterministic_loop_gate(pack_state, repair_targets) business_audit_path = iteration_dir / 
"business_audit.md" + business_audit_json_path = iteration_dir / "business_audit.json" + issue_catalog_snapshot_path = iteration_dir / "issue_catalog_snapshot.json" + rerun_matrix_path = iteration_dir / "rerun_matrix.json" + detector_candidates_path = iteration_dir / "detector_candidates.json" write_text( business_audit_path, build_business_audit_markdown( @@ -5175,6 +5770,34 @@ def handle_run_pack_loop(args: argparse.Namespace) -> int: ), ) accepted_gate = analyst_accepted_gate and deterministic_gate_ok + issue_catalog_snapshot = build_issue_catalog_snapshot(repair_targets) + rerun_matrix_contract = { + "schema_version": "rerun_matrix_v1", + "source_repair_targets": repo_relative(repair_targets_path), + "items": collect_rerun_matrix(repair_targets), + } + detector_candidates = build_detector_candidates(repair_targets) + business_audit_contract = build_business_audit_contract( + analyst_verdict=analyst_verdict, + repair_targets=repair_targets, + target_score=target_score, + loop_decision=loop_decision, + analyst_accepted_gate=analyst_accepted_gate, + accepted_gate=accepted_gate, + deterministic_gate_ok=deterministic_gate_ok, + deterministic_gate_reason=deterministic_gate_reason, + business_audit_markdown_path=business_audit_path, + analyst_verdict_path=analyst_verdict_path, + repair_targets_path=repair_targets_path, + business_audit_json_path=business_audit_json_path, + issue_catalog_snapshot_path=issue_catalog_snapshot_path, + rerun_matrix_path=rerun_matrix_path, + detector_candidates_path=detector_candidates_path, + ) + write_json(business_audit_json_path, business_audit_contract) + write_json(issue_catalog_snapshot_path, issue_catalog_snapshot) + write_json(rerun_matrix_path, rerun_matrix_contract) + write_json(detector_candidates_path, detector_candidates) repair_target_count = int(repair_targets.get("target_count") or 0) if isinstance(repair_targets, dict) else 0 repair_target_severity_counts = ( repair_targets.get("severity_counts") @@ -5184,6 +5807,11 @@ def 
handle_run_pack_loop(args: argparse.Namespace) -> int: loop_state["last_analyst_decision"] = loop_decision loop_state["last_user_decision_type"] = user_decision_type loop_state["last_user_decision_prompt"] = user_decision_prompt + loop_state["latest_business_audit_path"] = str(business_audit_path) + loop_state["latest_business_audit_json_path"] = str(business_audit_json_path) + loop_state["latest_issue_catalog_snapshot_path"] = str(issue_catalog_snapshot_path) + loop_state["latest_rerun_matrix_path"] = str(rerun_matrix_path) + loop_state["latest_detector_candidates_path"] = str(detector_candidates_path) iteration_record: dict[str, Any] = { "iteration_id": iteration_id, @@ -5199,7 +5827,11 @@ def handle_run_pack_loop(args: argparse.Namespace) -> int: "user_decision_prompt": user_decision_prompt, "analyst_verdict_path": str(analyst_verdict_path), "business_audit_path": str(business_audit_path), + "business_audit_json_path": str(business_audit_json_path), "repair_targets_path": str(repair_targets_path), + "issue_catalog_snapshot_path": str(issue_catalog_snapshot_path), + "rerun_matrix_path": str(rerun_matrix_path), + "detector_candidates_path": str(detector_candidates_path), "repair_target_count": repair_target_count, "repair_target_severity_counts": repair_target_severity_counts, "coder_status": None, @@ -5241,6 +5873,10 @@ def handle_run_pack_loop(args: argparse.Namespace) -> int: analyst_verdict_path=analyst_verdict_path, repair_targets_path=repair_targets_path, business_audit_path=business_audit_path, + business_audit_json_path=business_audit_json_path, + issue_catalog_snapshot_path=issue_catalog_snapshot_path, + rerun_matrix_path=rerun_matrix_path, + detector_candidates_path=detector_candidates_path, analyst_verdict=analyst_verdict, repair_targets=repair_targets, target_score=target_score, @@ -5278,8 +5914,61 @@ def handle_run_pack_loop(args: argparse.Namespace) -> int: write_json(loop_dir / "loop_state.json", loop_state) break - coder_result_path = iteration_dir 
/ "coder_result.json" assigned_focus = select_primary_repair_focus(repair_targets) + auto_coder_gate_path = iteration_dir / "auto_coder_gate.json" + auto_coder_gate = evaluate_auto_coder_gate(repair_targets, assigned_focus) + write_json(auto_coder_gate_path, auto_coder_gate) + iteration_record["auto_coder_gate_path"] = str(auto_coder_gate_path) + if not bool(auto_coder_gate.get("allowed")): + handoff = build_lead_coder_handoff( + loop_state=loop_state, + iteration_id=iteration_id, + pack_dir=pack_dir, + analyst_verdict_path=analyst_verdict_path, + repair_targets_path=repair_targets_path, + business_audit_path=business_audit_path, + business_audit_json_path=business_audit_json_path, + issue_catalog_snapshot_path=issue_catalog_snapshot_path, + rerun_matrix_path=rerun_matrix_path, + detector_candidates_path=detector_candidates_path, + analyst_verdict=analyst_verdict, + repair_targets=repair_targets, + target_score=target_score, + loop_decision=loop_decision, + analyst_accepted_gate=analyst_accepted_gate, + accepted_gate=accepted_gate, + deterministic_gate_ok=deterministic_gate_ok, + deterministic_gate_reason=deterministic_gate_reason, + requires_user_decision=requires_user_decision, + user_decision_type=user_decision_type, + user_decision_prompt=user_decision_prompt, + ) + handoff["status"] = "auto_coder_gate_blocked_lead_handoff_required" + handoff["reason"] = "auto-coder was explicitly requested, but the issue catalog repair contract did not pass the auto-coder gate" + handoff["auto_coder_gate"] = auto_coder_gate + if isinstance(handoff.get("artifact_refs"), dict): + handoff["artifact_refs"]["auto_coder_gate"] = repo_relative(auto_coder_gate_path) + handoff_paths = save_lead_coder_handoff( + loop_dir=loop_dir, + iteration_dir=iteration_dir, + handoff=handoff, + ) + iteration_record["coder_status"] = "auto_coder_gate_blocked" + iteration_record.update(handoff_paths) + if assigned_focus: + iteration_record["assigned_repair_focus_id"] = 
str(assigned_focus.get("focus_id") or "") + loop_state["iterations"].append(iteration_record) + loop_state["latest_lead_coder_handoff_path"] = handoff_paths["latest_lead_coder_handoff_path"] + loop_state["latest_lead_coder_handoff_markdown_path"] = handoff_paths[ + "latest_lead_coder_handoff_markdown_path" + ] + loop_state["final_status"] = loop_decision if loop_decision in {"needs_exact_capability", "partial"} else "partial" + loop_state["stop_reason"] = f"auto_coder_gate_blocked at {iteration_id}: {auto_coder_gate.get('reason')}" + loop_state["updated_at"] = datetime.now(timezone.utc).replace(microsecond=0).isoformat() + write_json(loop_dir / "loop_state.json", loop_state) + break + + coder_result_path = iteration_dir / "coder_result.json" coder_prompt = build_coder_loop_prompt( loop_dir=loop_dir, iteration_dir=iteration_dir, @@ -5289,6 +5978,7 @@ def handle_run_pack_loop(args: argparse.Namespace) -> int: assigned_focus=assigned_focus, analyst_verdict_path=analyst_verdict_path, analyst_verdict_json=dump_json(analyst_verdict), + auto_coder_gate_json=dump_json(auto_coder_gate), ) write_text(iteration_dir / "coder_prompt.md", coder_prompt + "\n") coder_snapshot_paths = build_coder_snapshot_paths(repair_targets) diff --git a/scripts/domain_truth_harness.py b/scripts/domain_truth_harness.py index 7fc3fa9..cc5bf9b 100644 --- a/scripts/domain_truth_harness.py +++ b/scripts/domain_truth_harness.py @@ -9,6 +9,7 @@ from types import SimpleNamespace from typing import Any import domain_case_loop as dcl +import agent_runtime_manifest as runtime_manifest import check_mcp_live_readiness as mcp_readiness import scenario_acceptance_policy as sap @@ -1522,6 +1523,18 @@ def run_live(spec: dict[str, Any], output_dir: Path, args: argparse.Namespace) - dcl.ensure_backend_health(runner_args.backend_url, runner_args.timeout_seconds) output_dir.mkdir(parents=True, exist_ok=True) + runtime_manifest.write_effective_runtime( + output_dir, + runner="domain_truth_harness.run-live", + 
args=args, + spec_path=Path(args.spec).resolve() if getattr(args, "spec", None) else None, + run_id=spec["scenario_id"], + extra={ + "domain": spec["domain"], + "title": spec["title"], + "require_mcp_live_readiness": bool(getattr(args, "require_mcp_live_readiness", False)), + }, + ) manifest = build_generated_manifest(spec) write_json(output_dir / "truth_harness_spec.json", spec) write_json(output_dir / "scenario_manifest.json", manifest) @@ -1671,6 +1684,18 @@ def handle_review_export(args: argparse.Namespace) -> int: output_dir = Path(args.output_dir).resolve() if args.output_dir else default_output_dir( f"{spec['scenario_id']}_review" ) + runtime_manifest.write_effective_runtime( + output_dir, + runner="domain_truth_harness.review-export", + args=args, + spec_path=spec_path, + run_id=spec["scenario_id"], + extra={ + "domain": spec["domain"], + "title": spec["title"], + "source_export": str(export_path), + }, + ) result = review_export(spec, export_path, output_dir) print(f"[truth-harness] review-export overall_status={result['review_summary']['overall_status']}") print(f"[truth-harness] review-export final_status={result['pack_state']['final_status']}") @@ -1684,6 +1709,19 @@ def handle_run_live(args: argparse.Namespace) -> int: output_dir = Path(args.output_dir).resolve() if args.output_dir else default_output_dir( f"{spec['scenario_id']}_live" ) + runtime_manifest.write_effective_runtime( + output_dir, + runner="domain_truth_harness.run-live", + args=args, + spec_path=spec_path, + run_id=spec["scenario_id"], + extra={ + "domain": spec["domain"], + "title": spec["title"], + "require_mcp_live_readiness": bool(getattr(args, "require_mcp_live_readiness", False)), + "preflight_manifest": True, + }, + ) if args.require_mcp_live_readiness: output_dir.mkdir(parents=True, exist_ok=True) readiness = mcp_readiness.check_readiness( diff --git a/scripts/prompt_registry_healthcheck.py b/scripts/prompt_registry_healthcheck.py new file mode 100644 index 0000000..c837d16 --- 
/dev/null +++ b/scripts/prompt_registry_healthcheck.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +from pathlib import Path + +import agent_runtime_manifest as runtime_manifest + + +REPO_ROOT = Path(__file__).resolve().parents[1] + + +def build_markdown_summary(health: dict[str, object]) -> str: + lines = [ + "# Prompt registry healthcheck", + "", + f"- status: `{health.get('status')}`", + f"- default_prompt_version: `{health.get('default_prompt_version') or 'n/a'}`", + f"- active_prompt_version: `{health.get('active_prompt_version') or 'n/a'}`", + f"- prompt_source: `{health.get('prompt_source') or 'n/a'}`", + f"- prompt_hash: `{health.get('prompt_hash') or 'n/a'}`", + "", + "## Prompt files", + ] + files = health.get("prompt_files") if isinstance(health.get("prompt_files"), list) else [] + if not files: + lines.append("- no prompt files resolved") + else: + for item in files: + if not isinstance(item, dict): + continue + exists = "yes" if item.get("exists") is True else "no" + lines.append(f"- `{item.get('slot')}` `{item.get('relative_path')}` exists=`{exists}`") + + failures = health.get("failures") if isinstance(health.get("failures"), list) else [] + warnings = health.get("warnings") if isinstance(health.get("warnings"), list) else [] + lines.extend(["", "## Failures"]) + lines.extend([f"- {item}" for item in failures] if failures else ["- none"]) + lines.extend(["", "## Warnings"]) + lines.extend([f"- {item}" for item in warnings] if warnings else ["- none"]) + return "\n".join(lines).strip() + "\n" + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Fail-loud prompt registry healthcheck for AGENT semantic runs.") + parser.add_argument("--prompt-version", help="Override active prompt version. 
Defaults to backend DEFAULT_PROMPT_VERSION.") + parser.add_argument("--json", action="store_true", help="Print machine-readable JSON instead of markdown.") + parser.add_argument( + "--allow-preset-mismatch", + action="store_true", + help="Downgrade saved preset prompt-version mismatch to warning for exploratory local runs.", + ) + return parser + + +def main() -> int: + args = build_parser().parse_args() + health = runtime_manifest.build_prompt_registry_health( + REPO_ROOT, + prompt_version=args.prompt_version, + strict_preset_match=not bool(args.allow_preset_mismatch), + ) + if args.json: + print(json.dumps(health, ensure_ascii=False, indent=2)) + else: + print(build_markdown_summary(health), end="") + return 0 if health.get("status") == "pass" else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/save_agent_semantic_run.py b/scripts/save_agent_semantic_run.py index 3633d0e..9681553 100644 --- a/scripts/save_agent_semantic_run.py +++ b/scripts/save_agent_semantic_run.py @@ -9,6 +9,7 @@ from datetime import datetime, timezone from pathlib import Path from typing import Any +import agent_runtime_manifest as runtime_manifest REPO_ROOT = Path(__file__).resolve().parents[1] HISTORY_FILE = REPO_ROOT / "llm_normalizer" / "data" / "autorun_generators" / "history.json" @@ -113,8 +114,37 @@ def assert_status(value: Any, expected: str, label: str, problems: list[str]) -> problems.append(f"{label}={actual or 'missing'}") +def require_effective_runtime_manifest(run_dir: Path) -> dict[str, Any]: + try: + return runtime_manifest.load_effective_runtime_manifest(run_dir) + except RuntimeError as exc: + raise RuntimeError( + "Refusing to save AGENT autorun because the accepted replay has no reproducibility manifest: " + f"{exc}" + ) from exc + + +def build_effective_runtime_save_summary(manifest: dict[str, Any], run_dir: Path) -> dict[str, Any]: + return { + "manifest_path": repo_relative(run_dir / runtime_manifest.EFFECTIVE_RUNTIME_FILE_NAME), + 
"runner": manifest.get("runner"), + "git_sha": manifest.get("git_sha"), + "backend_url": manifest.get("backend_url"), + "mcp_proxy_url": manifest.get("mcp_proxy_url"), + "llm_provider": manifest.get("llm_provider"), + "llm_model": manifest.get("llm_model"), + "temperature": manifest.get("temperature"), + "max_output_tokens": manifest.get("max_output_tokens"), + "prompt_version": manifest.get("prompt_version"), + "prompt_source": manifest.get("prompt_source"), + "prompt_hash": manifest.get("prompt_hash"), + "prompt_registry_status": manifest.get("prompt_registry_status"), + } + + def validate_truth_harness_run_dir(run_dir: Path) -> dict[str, Any]: run_dir = run_dir.resolve() + effective_runtime = require_effective_runtime_manifest(run_dir) pack_state = load_json_object(run_dir / "pack_state.json", "Validated run pack_state.json") truth_review = load_json_object(run_dir / "truth_review.json", "Validated run truth_review.json") business_review = load_json_object(run_dir / "business_review.json", "Validated run business_review.json") @@ -153,12 +183,14 @@ def validate_truth_harness_run_dir(run_dir: Path) -> dict[str, Any]: "steps_with_business_failures": business_review.get("steps_with_business_failures"), "steps_with_business_warnings": business_review.get("steps_with_business_warnings"), "acceptance_gate_passed": pack_state.get("acceptance_gate_passed"), + "effective_runtime": build_effective_runtime_save_summary(effective_runtime, run_dir), "saved_after_validated_replay": True, } def validate_domain_pack_loop_dir(loop_dir: Path) -> dict[str, Any]: loop_dir = loop_dir.resolve() + effective_runtime = require_effective_runtime_manifest(loop_dir) loop_state = load_json_object(loop_dir / "loop_state.json", "Validated loop_state.json") iterations = loop_state.get("iterations") if not isinstance(iterations, list) or not iterations: @@ -225,6 +257,7 @@ def validate_domain_pack_loop_dir(loop_dir: Path) -> dict[str, Any]: "repair_target_count": 
last_iteration.get("repair_target_count"), "repair_target_severity_counts": last_iteration.get("repair_target_severity_counts"), "accepted_gate": last_iteration.get("accepted_gate"), + "effective_runtime": build_effective_runtime_save_summary(effective_runtime, loop_dir), "saved_after_validated_replay": True, } diff --git a/scripts/stage_agent_loop.py b/scripts/stage_agent_loop.py index bdbb491..26a945d 100644 --- a/scripts/stage_agent_loop.py +++ b/scripts/stage_agent_loop.py @@ -12,6 +12,7 @@ from pathlib import Path from typing import Any import domain_case_loop as dcl +import agent_runtime_manifest as runtime_manifest import review_assistant_stage1_run as gui_review @@ -2008,6 +2009,19 @@ def handle_run(args: argparse.Namespace) -> int: stage_dir.mkdir(parents=True, exist_ok=True) write_json(stage_dir / "stage_manifest.json", stage_manifest) write_text(stage_dir / "stage_manifest_source.txt", repo_relative(stage_manifest_path) + "\n") + runtime_manifest.write_effective_runtime( + stage_dir, + runner="stage_agent_loop.run", + args=args, + spec_path=stage_manifest_path, + run_id=stage_manifest["stage_id"], + extra={ + "module_name": stage_manifest.get("module_name"), + "title": stage_manifest.get("title"), + "pack_manifest": stage_manifest.get("pack_manifest"), + "repair_mode": dcl.normalize_repair_mode(getattr(args, "repair_mode", None) or stage_manifest.get("repair_mode")), + }, + ) save_stage_context_capsule(stage_manifest, stage_dir) command = build_domain_pack_loop_command(args, stage_manifest, stage_dir) diff --git a/scripts/test_agent_runtime_manifest.py b/scripts/test_agent_runtime_manifest.py new file mode 100644 index 0000000..363ee48 --- /dev/null +++ b/scripts/test_agent_runtime_manifest.py @@ -0,0 +1,158 @@ +from __future__ import annotations + +import argparse +import json +import sys +import tempfile +import unittest +from pathlib import Path + + +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +import agent_runtime_manifest as 
runtime_manifest + + +def write_text(path: Path, text: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(text, encoding="utf-8") + + +def write_json(path: Path, payload: object) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + + +def create_prompt_registry(repo_root: Path, *, preset_version: str = "normalizer_v2_0_2") -> None: + write_text( + repo_root / "llm_normalizer" / "backend" / "src" / "config.ts", + 'export const DEFAULT_PROMPT_VERSION = process.env.DEFAULT_PROMPT_VERSION ?? "normalizer_v2_0_2";\n' + "export const FEATURE_ASSISTANT_ADDRESS_QUERY_V1 = toBooleanFlag(\n" + " process.env.FEATURE_ASSISTANT_ADDRESS_QUERY_V1,\n" + " true\n" + ");\n", + ) + for relative_path in runtime_manifest.BUILTIN_PROMPT_FILES["normalizer_v2_0_2"].values(): + write_text(repo_root / "llm_normalizer" / "prompts" / relative_path, f"{relative_path}\n") + write_json( + repo_root / "llm_normalizer" / "data" / "presets" / "preset-current.json", + {"prompt_version": preset_version}, + ) + + +class AgentRuntimeManifestTests(unittest.TestCase): + def test_prompt_registry_health_passes_for_complete_matching_registry(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + repo_root = Path(tmp) + create_prompt_registry(repo_root) + + health = runtime_manifest.build_prompt_registry_health(repo_root) + + self.assertEqual(health["status"], "pass") + self.assertEqual(health["prompt_source"], "file") + self.assertEqual(health["active_prompt_version"], "normalizer_v2_0_2") + self.assertTrue(health["prompt_hash"]) + self.assertFalse(health["failures"]) + + def test_prompt_registry_health_fails_on_preset_mismatch_when_strict(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + repo_root = Path(tmp) + create_prompt_registry(repo_root, preset_version="normalizer_v1") + + health = runtime_manifest.build_prompt_registry_health(repo_root) + + 
self.assertEqual(health["status"], "fail") + self.assertTrue(any(str(item).startswith("preset_version_mismatch:") for item in health["failures"])) + + def test_effective_runtime_manifest_records_runner_and_llm_settings(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + repo_root = Path(tmp) + create_prompt_registry(repo_root) + args = argparse.Namespace( + backend_url="http://127.0.0.1:8787", + mcp_proxy_url="http://127.0.0.1:6003", + mcp_channel="default", + llm_provider="local", + llm_model="test-model", + llm_base_url="http://127.0.0.1:1234/v1", + temperature=0.0, + max_output_tokens=2048, + prompt_version="normalizer_v2_0_2", + use_mock=False, + ) + + manifest = runtime_manifest.build_effective_runtime_manifest( + runner="domain_truth_harness.run-live", + args=args, + repo_root=repo_root, + spec_path=repo_root / "docs" / "orchestration" / "spec.json", + output_dir=repo_root / "artifacts" / "domain_runs" / "run", + run_id="run", + ) + + self.assertEqual(manifest["runner"], "domain_truth_harness.run-live") + self.assertEqual(manifest["llm_model"], "test-model") + self.assertEqual(manifest["temperature"], 0.0) + self.assertEqual(manifest["max_output_tokens"], 2048) + self.assertEqual(manifest["prompt_registry_status"], "pass") + + def test_effective_runtime_manifest_resolves_address_runtime_prompt_to_default_registry(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + repo_root = Path(tmp) + create_prompt_registry(repo_root) + args = argparse.Namespace( + backend_url="http://127.0.0.1:8787", + mcp_proxy_url="http://127.0.0.1:6003", + mcp_channel="default", + llm_provider="local", + llm_model="test-model", + llm_base_url="http://127.0.0.1:1234/v1", + temperature=0.0, + max_output_tokens=2048, + prompt_version="address_query_runtime_v1", + use_mock=False, + ) + + manifest = runtime_manifest.build_effective_runtime_manifest( + runner="domain_case_loop.run-pack", + args=args, + repo_root=repo_root, + spec_path=repo_root / "docs" / "orchestration" 
/ "spec.json", + output_dir=repo_root / "artifacts" / "domain_runs" / "run", + run_id="run", + ) + + self.assertEqual(manifest["requested_prompt_version"], "address_query_runtime_v1") + self.assertEqual(manifest["assistant_runtime_prompt_version"], "address_query_runtime_v1") + self.assertEqual(manifest["prompt_version"], "normalizer_v2_0_2") + self.assertEqual(manifest["prompt_resolution"]["mode"], "assistant_runtime_schema_uses_default_normalizer_prompt") + self.assertEqual(manifest["prompt_source"], "file") + self.assertTrue(manifest["prompt_hash"]) + self.assertEqual(manifest["prompt_registry_status"], "pass") + + def test_load_effective_runtime_manifest_refuses_failing_prompt_registry(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + run_dir = Path(tmp) + runtime_manifest.write_json( + run_dir / runtime_manifest.EFFECTIVE_RUNTIME_FILE_NAME, + { + "schema_version": runtime_manifest.EFFECTIVE_RUNTIME_SCHEMA_VERSION, + "git_sha": "test-sha", + "runner": "domain_case_loop.run-pack", + "llm_model": "test-model", + "temperature": 0.0, + "max_output_tokens": 2048, + "prompt_version": "address_query_runtime_v1", + "prompt_source": "unknown", + "prompt_hash": None, + "prompt_registry_status": "fail", + "prompt_registry_failures": ["prompt_hash_unavailable"], + }, + ) + + with self.assertRaisesRegex(RuntimeError, "failing prompt registry status|missing prompt_hash"): + runtime_manifest.load_effective_runtime_manifest(run_dir) + + +if __name__ == "__main__": + unittest.main() diff --git a/scripts/test_domain_case_loop_lead_handoff.py b/scripts/test_domain_case_loop_lead_handoff.py index 7bcf48b..cfaca09 100644 --- a/scripts/test_domain_case_loop_lead_handoff.py +++ b/scripts/test_domain_case_loop_lead_handoff.py @@ -65,6 +65,10 @@ class DomainCaseLoopLeadHandoffTests(unittest.TestCase): analyst_verdict_path=analyst_verdict_path, repair_targets_path=repair_targets_path, business_audit_path=business_audit_path, + business_audit_json_path=iteration_dir / 
"business_audit.json", + issue_catalog_snapshot_path=iteration_dir / "issue_catalog_snapshot.json", + rerun_matrix_path=iteration_dir / "rerun_matrix.json", + detector_candidates_path=iteration_dir / "detector_candidates.json", analyst_verdict=analyst_verdict, repair_targets=repair_targets, target_score=88, @@ -90,8 +94,114 @@ class DomainCaseLoopLeadHandoffTests(unittest.TestCase): self.assertEqual(saved["status"], "lead_coder_repair_required") self.assertEqual(saved["assigned_primary_focus"]["focus_id"], "answer_shape") self.assertIn("business_audit", saved["artifact_refs"]) + self.assertIn("business_audit_json", saved["artifact_refs"]) + self.assertIn("issue_catalog_snapshot", saved["artifact_refs"]) + self.assertIn("business_direct_answer_missing", saved["issue_codes"]) + self.assertIn("failed_scenario", saved["rerun_matrix"]) self.assertTrue(latest_handoff_exists) + def test_business_audit_contract_exposes_repair_issue_contract(self) -> None: + repair_targets = { + "target_count": 1, + "severity_counts": {"P0": 1}, + "priority_foci": [], + "targets": [ + { + "target_id": "margin_pack:s01", + "scenario_id": "margin_pack", + "step_id": "s01", + "severity": "P0", + "issue_code": "margin_domain_leak_accounting_route", + "question_resolved": "Which item had the best margin?", + "fix_goal": "Route the question to margin profitability instead of accounting noise.", + "evidence_paths": ["artifacts/domain_runs/margin_pack/steps/s01/output.md"], + } + ], + } + contract = dcl.build_business_audit_contract( + analyst_verdict={ + "quality_score": 31, + "loop_decision": "partial", + "user_intent_summary": "User needs item margin ranking.", + "expected_direct_answer": "Best item by gross margin.", + "actual_direct_answer": "Accounting route answer.", + }, + repair_targets=repair_targets, + target_score=88, + loop_decision="partial", + analyst_accepted_gate=False, + accepted_gate=False, + deterministic_gate_ok=False, + deterministic_gate_reason="P0 repair target remains", + 
business_audit_markdown_path=Path("business_audit.md"), + analyst_verdict_path=Path("analyst_verdict.json"), + repair_targets_path=Path("repair_targets.json"), + business_audit_json_path=Path("business_audit.json"), + issue_catalog_snapshot_path=Path("issue_catalog_snapshot.json"), + rerun_matrix_path=Path("rerun_matrix.json"), + detector_candidates_path=Path("detector_candidates.json"), + ) + + self.assertEqual(contract["overall_status"], "partial") + self.assertEqual(contract["blocking_issues"][0]["issue_code"], "margin_domain_leak_accounting_route") + self.assertEqual(contract["blocking_issues"][0]["expected_business_answer_contract"], "margin_profitability_v1") + self.assertIn("failed_margin_scenario", contract["rerun_matrix"]) + self.assertIn("detector_candidates_json", contract["artifact_refs"]) + + def test_auto_coder_gate_blocks_non_allowlisted_issue_codes(self) -> None: + repair_targets = { + "targets": [ + { + "target_id": "margin_pack:s01", + "issue_code": "margin_domain_leak_accounting_route", + "allowed_patch_targets": ["llm_normalizer/backend/src/services/addressIntentResolver.ts"], + "forbidden_patch_targets": ["global orchestration rewrite"], + "rerun_matrix": ["failed_margin_scenario", "accepted_smoke_pack"], + } + ], + } + assigned_focus = { + "focus_id": "route|addressIntentResolver", + "issue_codes": ["margin_domain_leak_accounting_route"], + "root_cause_layers": ["intent", "route"], + "allowed_patch_targets": ["llm_normalizer/backend/src/services/addressIntentResolver.ts"], + "forbidden_patch_targets": ["global orchestration rewrite"], + "rerun_matrix": ["failed_margin_scenario", "accepted_smoke_pack"], + "target_ids": ["margin_pack:s01"], + } + + gate = dcl.evaluate_auto_coder_gate(repair_targets, assigned_focus) + + self.assertFalse(gate["allowed"]) + self.assertIn("issue_code_not_allowlisted:margin_domain_leak_accounting_route", gate["blocking_reasons"]) + + def test_auto_coder_gate_allows_complete_answer_surface_contract(self) -> None: + 
repair_targets = { + "targets": [ + { + "target_id": "pack:s01", + "issue_code": "business_direct_answer_missing", + "allowed_patch_targets": ["llm_normalizer/backend/src/services/address_runtime/composeStage.ts"], + "forbidden_patch_targets": ["routing rewrites"], + "rerun_matrix": ["failed_scenario", "direct_answer_surface_pack", "accepted_smoke_pack"], + } + ], + } + assigned_focus = { + "focus_id": "answer_shape|composeStage", + "issue_codes": ["business_direct_answer_missing"], + "root_cause_layers": ["answer_surface"], + "allowed_patch_targets": ["llm_normalizer/backend/src/services/address_runtime/composeStage.ts"], + "forbidden_patch_targets": ["routing rewrites"], + "rerun_matrix": ["failed_scenario", "direct_answer_surface_pack", "accepted_smoke_pack"], + "target_ids": ["pack:s01"], + } + + gate = dcl.evaluate_auto_coder_gate(repair_targets, assigned_focus) + + self.assertTrue(gate["allowed"]) + self.assertEqual(gate["reason"], "auto_coder_gate_passed") + def test_analyst_priority_targets_become_lead_repair_targets(self) -> None: repair_targets = { "pack_id": "demo_pack", diff --git a/scripts/test_domain_case_loop_step_state.py b/scripts/test_domain_case_loop_step_state.py index 079a62f..da4bbb3 100644 --- a/scripts/test_domain_case_loop_step_state.py +++ b/scripts/test_domain_case_loop_step_state.py @@ -574,6 +574,123 @@ class DomainCaseLoopStepStateTests(unittest.TestCase): self.assertTrue(step_state["runtime_factual_answer_validated"]) self.assertEqual(step_state["acceptance_status"], "validated") + def test_business_result_mode_accepts_clean_margin_confirmed_balance(self) -> None: + question = ( + "\u043a\u0430\u043a\u0430\u044f " + "\u043d\u043e\u043c\u0435\u043d\u043a\u043b\u0430\u0442\u0443\u0440\u0430 " + "\u0431\u044b\u043b\u0430 \u0441\u0430\u043c\u043e\u0439 " + "\u043c\u0430\u0440\u0436\u0438\u043d\u0430\u043b\u044c\u043d\u043e\u0439 " + "\u0432 2020" + ) + answer_text = ( + "\u0421\u0430\u043c\u0430\u044f " + 
"\u043c\u0430\u0440\u0436\u0438\u043d\u0430\u043b\u044c\u043d\u0430\u044f " + "\u043f\u043e\u0437\u0438\u0446\u0438\u044f \u0437\u0430 " + "\u043f\u0435\u0440\u0438\u043e\u0434 2020: " + "\u0422\u043e\u0432\u0430\u0440 A \u2014 " + "\u043c\u0430\u0440\u0436\u0430 42%, " + "\u0432\u044b\u0440\u0443\u0447\u043a\u0430 100 000 " + "\u0440\u0443\u0431., " + "\u0441\u0435\u0431\u0435\u0441\u0442\u043e\u0438\u043c\u043e\u0441\u0442\u043d\u0430\u044f " + "\u0431\u0430\u0437\u0430 58 000 " + "\u0440\u0443\u0431., " + "\u0432\u0430\u043b\u043e\u0432\u0430\u044f " + "\u0440\u0430\u0437\u043d\u0438\u0446\u0430 42 000 " + "\u0440\u0443\u0431.\n" + "\u0421\u043b\u0435\u0434\u0443\u044e\u0449\u0438\u0439 " + "\u0448\u0430\u0433: \u043c\u043e\u0433\u0443 " + "\u0440\u0430\u0441\u043a\u0440\u044b\u0442\u044c " + "\u0441\u0442\u0440\u043e\u043a\u0438 " + "\u0432\u044b\u0440\u0443\u0447\u043a\u0438 \u0438 " + "\u0441\u0435\u0431\u0435\u0441\u0442\u043e\u0438\u043c\u043e\u0441\u0442\u043d\u043e\u0439 " + "\u0431\u0430\u0437\u044b." 
+ ) + step_state = dcl.build_scenario_step_state( + scenario_id="margin_result_mode_demo", + domain="margin_profitability", + step={ + "step_id": "step_01", + "title": "Margin ranking", + "depends_on": [], + "question_template": question, + "expected_intents": ["inventory_margin_ranking_for_nomenclature"], + "expected_capability": "inventory_inventory_margin_ranking_for_nomenclature", + "expected_recipe": "address_inventory_margin_ranking_for_nomenclature_v1", + "expected_result_mode": "ranking_or_limited_accounting_answer", + "required_answer_shape": "direct_answer_first", + }, + step_index=1, + question_resolved=question, + analysis_context={}, + turn_artifact={ + "assistant_message": { + "reply_type": "factual", + "text": answer_text, + "message_id": "msg-1", + "trace_id": "trace-1", + }, + "technical_debug_payload": { + "detected_mode": "address_query", + "detected_intent": "inventory_margin_ranking_for_nomenclature", + "selected_recipe": "address_inventory_margin_ranking_for_nomenclature_v1", + "capability_id": "inventory_inventory_margin_ranking_for_nomenclature", + "capability_route_mode": "exact", + "fallback_type": "none", + "mcp_call_status": "matched_non_empty", + "response_type": "FACTUAL_SUMMARY", + "result_mode": "confirmed_balance", + "truth_mode": "confirmed", + "answer_shape": "confirmed_factual", + "balance_confirmed": True, + }, + "session_summary": {}, + }, + entries=[], + ) + + self.assertEqual(step_state["execution_status"], "exact") + self.assertNotIn("wrong_result_mode", step_state["violated_invariants"]) + self.assertEqual(step_state["business_first_review"]["issue_codes"], []) + self.assertEqual(step_state["acceptance_status"], "validated") + + def test_literal_result_mode_contract_still_rejects_mismatch(self) -> None: + step_state = dcl.build_scenario_step_state( + scenario_id="literal_result_mode_demo", + domain="inventory", + step={ + "step_id": "step_01", + "title": "Literal result mode", + "depends_on": [], + "question_template": 
"show stock", + "expected_result_mode": "exact_inventory_balance", + }, + step_index=1, + question_resolved="show stock", + analysis_context={}, + turn_artifact={ + "assistant_message": { + "reply_type": "factual", + "text": "Short: stock is confirmed.", + "message_id": "msg-1", + "trace_id": "trace-1", + }, + "technical_debug_payload": { + "detected_mode": "address_query", + "fallback_type": "none", + "mcp_call_status": "matched_non_empty", + "response_type": "FACTUAL_SUMMARY", + "result_mode": "confirmed_balance", + "truth_mode": "confirmed", + "answer_shape": "confirmed_factual", + "balance_confirmed": True, + }, + "session_summary": {}, + }, + entries=[], + ) + + self.assertIn("wrong_result_mode", step_state["violated_invariants"]) + def test_exact_confirmed_document_followup_sets_runtime_factual_validation(self) -> None: step_state = dcl.build_scenario_step_state( scenario_id="svk_pivot", diff --git a/scripts/test_save_agent_semantic_run.py b/scripts/test_save_agent_semantic_run.py index 15dbfe4..a2b7b3f 100644 --- a/scripts/test_save_agent_semantic_run.py +++ b/scripts/test_save_agent_semantic_run.py @@ -1,16 +1,84 @@ from __future__ import annotations +import json import sys import unittest from pathlib import Path +import tempfile sys.path.insert(0, str(Path(__file__).resolve().parent)) +import agent_runtime_manifest as runtime_manifest import save_agent_semantic_run as saver +def write_json(path: Path, payload: object) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + + class SaveAgentSemanticRunTests(unittest.TestCase): + def write_clean_truth_run(self, run_dir: Path, *, include_runtime: bool) -> None: + write_json( + run_dir / "pack_state.json", + { + "final_status": "accepted", + "review_overall_status": "pass", + "acceptance_gate_passed": True, + "no_unresolved_p0": True, + "unresolved_p0_count": 0, + "steps_total": 1, + "steps_passed": 1, + 
"steps_failed": 0, + }, + ) + write_json(run_dir / "truth_review.json", {"summary": {"overall_status": "pass"}}) + write_json( + run_dir / "business_review.json", + { + "overall_business_status": "pass", + "steps_with_business_failures": 0, + "steps_with_business_warnings": 0, + }, + ) + if include_runtime: + write_json( + run_dir / runtime_manifest.EFFECTIVE_RUNTIME_FILE_NAME, + { + "schema_version": runtime_manifest.EFFECTIVE_RUNTIME_SCHEMA_VERSION, + "runner": "domain_truth_harness.run-live", + "git_sha": "test-sha", + "llm_provider": "local", + "llm_model": "test-model", + "temperature": 0.0, + "max_output_tokens": 2048, + "prompt_version": "normalizer_v2_0_2", + "prompt_source": "file", + "prompt_hash": "abc123", + "prompt_registry_status": "pass", + }, + ) + + def test_validate_truth_harness_run_refuses_missing_effective_runtime(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + run_dir = Path(tmp) + self.write_clean_truth_run(run_dir, include_runtime=False) + + with self.assertRaisesRegex(RuntimeError, "reproducibility manifest"): + saver.validate_truth_harness_run_dir(run_dir) + + def test_validate_truth_harness_run_includes_effective_runtime_summary(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + run_dir = Path(tmp) + self.write_clean_truth_run(run_dir, include_runtime=True) + + metadata = saver.validate_truth_harness_run_dir(run_dir) + + self.assertEqual(metadata["validation_status"], "accepted_live_replay") + self.assertEqual(metadata["effective_runtime"]["runner"], "domain_truth_harness.run-live") + self.assertEqual(metadata["effective_runtime"]["llm_model"], "test-model") + def test_extract_questions_resolves_scenario_pack_bindings(self) -> None: spec = { "schema_version": "domain_scenario_pack_v1",