From 97b2a9b028fee88e61e7d9a86991eacaaa79ad7f Mon Sep 17 00:00:00 2001 From: dctouch Date: Tue, 14 Apr 2026 18:47:38 +0300 Subject: [PATCH] =?UTF-8?q?=D0=9E=D0=A0=D0=A0=D0=9A=D0=95=D0=A1=D0=A2?= =?UTF-8?q?=D0=A0=D0=90=D0=A6=D0=98=D0=AF=20-=20=D0=A3=D1=81=D0=B8=D0=BB?= =?UTF-8?q?=D0=B8=D1=82=D1=8C=20agent=20loop=20object-centric=20=D0=B0?= =?UTF-8?q?=D1=83=D0=B4=D0=B8=D1=82=D0=BE=D0=BC=20=D0=B8=20=D0=B4=D0=BE?= =?UTF-8?q?=D0=B1=D0=B8=D1=82=D1=8C=20pronoun=20follow-up=20=D0=BF=D0=BE?= =?UTF-8?q?=20=D0=B4=D0=BE=D0=BA=D1=83=D0=BC=D0=B5=D0=BD=D1=82=D0=B0=D0=BC?= =?UTF-8?q?=20=D0=B7=D0=B0=D0=BA=D1=83=D0=BF=D0=BA=D0=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .codex/agents/domain_analyst.toml | 8 +- .codex/agents/orchestrator.toml | 4 + .codex/skills/domain-case-loop/SKILL.md | 4 +- .../business_first_analyst_rubric.md | 28 +++++ .../references/case_brief_template.md | 10 ++ .../scenario_tree_acceptance_canon.md | 14 ++- .../references/verdict_template.md | 6 + AGENTS.md | 1 + .../orchestration/active_domain_contract.json | 81 +++++++++--- .../domain_loop_analyst_verdict.schema.json | 23 ++++ .../dist/services/addressIntentResolver.js | 2 +- .../services/address_runtime/composeStage.js | 22 ++++ .../address_runtime/decomposeStage.js | 13 +- .../src/services/addressIntentResolver.ts | 4 +- .../services/address_runtime/composeStage.ts | 26 ++++ .../address_runtime/decomposeStage.ts | 15 ++- .../tests/addressQueryRuntimeM23.test.ts | 25 ++++ scripts/domain_case_loop.py | 117 +++++++++++++++--- tests/test_domain_case_loop.py | 110 +++++++++++++++- 19 files changed, 469 insertions(+), 44 deletions(-) diff --git a/.codex/agents/domain_analyst.toml b/.codex/agents/domain_analyst.toml index f72bf02..6465a6b 100644 --- a/.codex/agents/domain_analyst.toml +++ b/.codex/agents/domain_analyst.toml @@ -54,14 +54,20 @@ Rules: - Explicitly state what the first line of the answer should have been for the user. - If the answer is technically grounded but business-useless, say so directly and lower the score. - Treat selected-object continuity and reusable answer-object memory as first-class analysis objects. +- Treat focus-object continuity, provenance-bundle reuse, and follow-up action resolution as first-class analysis objects. - Call out when the runtime found the underlying document/trace but failed to retain the resolved business object for the next follow-up. +- Call out when the runtime retained the item but resolved the wrong action over that item, for example `покажи документы по этой позиции` -> `documents_by_counterparty`. +- Call out when the runtime recomputed a supplier/date/document lookup from scratch instead of reusing an already resolved provenance bundle. - Distinguish `object_memory_gap`, `field_mapping_gap`, `business_utility_gap`, and `domain_anchor_gap` from pure route gaps. +- Distinguish `followup_action_resolution_gap` and `bundle_reuse_gap` from both `object_memory_gap` and pure route gaps. - Check field truth explicitly: supplier must not be mislabeled as organization, buyer must not be mislabeled as organization, and document-side fields must not be presented as business truth without evidence. - Under the scenario-tree section, explicitly name the root node, critical child nodes, critical edges, and the primary user path. - Under the acceptance matrix, list at least the critical nodes/edges and mark each one by wording family: `canonical`, `colloquial`, `ui_selected_object`. -- Distinguish these defect classes explicitly when relevant: `semantic_understanding_gap`, `edge_carryover_gap`, `object_memory_gap`, `field_mapping_gap`, `answer_shape_mismatch`, `ordering_semantics_mismatch`, `runtime_capability_gap`, `business_utility_gap`, `loop_coverage_gap`, `domain_anchor_gap`. +- Under the state continuity section, explicitly say whether the scenario behaved as if it had a stable `focus_object` and reusable bundles such as `provenance_bundle` or `sale_trace_bundle`. +- Distinguish these defect classes explicitly when relevant: `semantic_understanding_gap`, `edge_carryover_gap`, `object_memory_gap`, `followup_action_resolution_gap`, `bundle_reuse_gap`, `field_mapping_gap`, `answer_shape_mismatch`, `ordering_semantics_mismatch`, `runtime_capability_gap`, `business_utility_gap`, `loop_coverage_gap`, `domain_anchor_gap`. - If the root node works but the primary user path is broken at the first selected-object drilldown, treat that as a real failure of domain hardening. - If the runtime nearly supports the path but the loop never validated the realistic wording family, call it `loop_coverage_gap`, not product success. +- If short pronoun follow-ups like `по ней`, `по этой позиции`, `эта`, `ее` are product-relevant, evaluate them as first-class coverage rather than as optional polish. Quality score: - Output one integer score from 0 to 100. diff --git a/.codex/agents/orchestrator.toml b/.codex/agents/orchestrator.toml index 080eeb1..6f8e316 100644 --- a/.codex/agents/orchestrator.toml +++ b/.codex/agents/orchestrator.toml @@ -49,11 +49,15 @@ Hard rules: - Require the analyst to judge business usefulness, not only technical groundedness. - Require the analyst to judge whether the direct answer appears in the first line when the user asked a direct lookup question. - Treat selected-object continuity, pronoun resolution, and reusable resolved-object state as mandatory audit targets for follow-up-heavy domains. +- Treat stable `focus_object` state and reusable bundles such as `provenance_bundle` / `sale_trace_bundle` as mandatory audit targets for follow-up-heavy domains. +- If a short follow-up like `по ней`, `по этой позиции`, `когда купили ее`, `покажи документы по этой позиции` exists in the realistic flow, validate it explicitly instead of only validating quoted-object variants. - Distinguish runtime capability gaps from state-layer continuity gaps and from business-presentation gaps before choosing coder tasks. +- Distinguish wrong follow-up action resolution over the same object from missing-object defects; for example item-follow-up drifting into counterparty documents is not the same problem as losing the item entirely. - If the root node works but the first critical selected-object or drilldown edge is still broken, do not treat the scenario as hardened. - Require an explicit `scenario_acceptance_matrix.md` artifact for follow-up-heavy domains and packs. - Use the matrix to drive coder tasks: patch the narrowest broken edge or wording family first, not the whole domain at once. - Distinguish `runtime_capability_gap` from `loop_coverage_gap`; do not confuse “not validated in the loop” with “product already works”. +- When the analyst says the main gap is object-centric dialog state, prefer the smallest state-layer fix over prompt inflation or broad intent rewrites. Acceptance gate: - accepted requires analyst quality_score >= 80 diff --git a/.codex/skills/domain-case-loop/SKILL.md b/.codex/skills/domain-case-loop/SKILL.md index f4f72de..b0e0dd0 100644 --- a/.codex/skills/domain-case-loop/SKILL.md +++ b/.codex/skills/domain-case-loop/SKILL.md @@ -137,7 +137,7 @@ The verdict must explicitly say whether the case is: - a missing route/intent/capability inside project scope; - a true out-of-scope request. - a `runtime_capability_gap`, `semantic_understanding_gap`, `edge_carryover_gap`, `answer_shape_mismatch`, `ordering_semantics_mismatch`, or `loop_coverage_gap`. -- an `object_memory_gap`, `field_mapping_gap`, `business_utility_gap`, or `domain_anchor_gap` when that is the real blocker. +- an `object_memory_gap`, `followup_action_resolution_gap`, `bundle_reuse_gap`, `field_mapping_gap`, `business_utility_gap`, or `domain_anchor_gap` when that is the real blocker. ### Step 4 - Domain patch @@ -213,6 +213,8 @@ Accepted requires: - Treat direct-answer-first behavior as part of correctness: if the user asked a direct lookup question, the first line must contain the direct answer before the evidence blocks. - Treat business usefulness as part of correctness: factual-but-business-useless output is not acceptance-quality output. - Treat stable follow-up object memory as part of correctness: when the prior turn already resolved the relevant item/object, the next turn must not re-ask for it. +- Treat object-centric dialog state as part of correctness: short follow-ups like `по ней`, `по этой позиции`, `когда купили ее`, `покажи документы по этой позиции` must resolve against the active selected item before broader routing guesses. +- Treat reusable supplier/date/document bundles as part of correctness: adjacent follow-ups over the same item should reuse a resolved provenance bundle when available. ## Domain-specific framing diff --git a/.codex/skills/domain-case-loop/references/business_first_analyst_rubric.md b/.codex/skills/domain-case-loop/references/business_first_analyst_rubric.md index 86f330f..7608d2e 100644 --- a/.codex/skills/domain-case-loop/references/business_first_analyst_rubric.md +++ b/.codex/skills/domain-case-loop/references/business_first_analyst_rubric.md @@ -9,6 +9,7 @@ The analyst must not stop at route/debug correctness. The analyst must judge whe The analyst evaluates five layers at once: - user intent; - scenario tree and state continuity; +- object-centric dialog continuity; - business usefulness of the answer; - evidence and field truthfulness; - root cause and smallest defensible fix direction. @@ -30,6 +31,8 @@ For every critical turn or critical edge, answer these questions explicitly: - originating date or period; - warehouse or organization scope when still relevant; - reusable resolved bundle, for example provenance trace or sale trace. +- stable focus object, for example `focus_object` for a selected inventory item; +- reusable resolved bundle, for example `provenance_bundle` or `sale_trace_bundle`. 4. Did the answer stay on the same business object? - item question -> item answer; @@ -39,6 +42,14 @@ For every critical turn or critical edge, answer these questions explicitly: If the system silently switched to raw documents, movements, or another lower-level object, call it an answer-shape defect. +6. Did the runtime resolve the correct follow-up action on the same object? +- `кто это поставил` should stay on item -> supplier provenance; +- `когда купили ее` should stay on item -> purchase date; +- `покажи документы по этой позиции` should stay on item -> purchase documents; +- `покажи все закупки по ней` should stay on item -> receipts / provenance documents. + +If the selected item stayed known but the action was reinterpreted as a different drilldown such as `documents_by_counterparty`, call that a `followup_action_resolution_gap`. + 5. Are the surfaced fields truthful and correctly labeled? - do not confuse supplier with organization; - do not confuse buyer with organization; @@ -62,6 +73,7 @@ The analyst must verify: - date/period continuity; - reusable evidence continuity; - pronoun resolution continuity. +- follow-up action resolution continuity on the active business object. Important pronoun examples: - `эту позицию` @@ -72,6 +84,12 @@ Important pronoun examples: If the previous turn already resolved a concrete object, the next turn must reuse it instead of asking for the anchor again. +Short follow-up examples that should first resolve against the active object: +- `по этой позиции` +- `покажи документы по ней` +- `когда купили ее` +- `это тот же поставщик?` + ## Reusable answer-object cache For follow-up-heavy domains, the analyst should explicitly look for evidence that the product behaves as if it had a reusable resolved object bundle. @@ -81,11 +99,14 @@ Examples: - `current_as_of_date` - `current_provenance_trace` - `current_sale_trace` +- `focus_object` +- `provenance_bundle` - `first_purchase_date` - `supplier_if_known` - `source_document_if_known` If the runtime recomputes everything from scratch and loses the already resolved object, call that out as a state-layer defect. +If the runtime retains the object but fails to reuse a resolved supplier/date/document bundle for the next adjacent lookup, call that out as a `bundle_reuse_gap`. ## Root-cause layers @@ -94,6 +115,8 @@ Use one or more of these root-cause layers explicitly: - `runtime_capability_gap` - `edge_carryover_gap` - `object_memory_gap` +- `followup_action_resolution_gap` +- `bundle_reuse_gap` - `field_mapping_gap` - `answer_shape_mismatch` - `ordering_semantics_mismatch` @@ -117,6 +140,10 @@ The analyst verdict should expose at least: - `root_cause_layers` - `broken_edge_ids` - `violated_invariants` +- `focus_object_continuity_ok` +- `bundle_reuse_ok` +- `followup_action_resolution_ok` +- `recommended_state_objects` ## Inventory-specific reminders @@ -124,5 +151,6 @@ For inventory follow-up chains, verify all of these: - the selected item remains the current focus object after the user clicks a result; - provenance questions answer supplier/date/document first, not only raw movement rows; - `когда купили` can reuse the already resolved provenance bundle; +- `покажи документы по этой позиции` stays in item-level purchase documents instead of falling into counterparty documents; - supplier and organization are not mixed up in the surfaced answer; - `на эту дату` keeps the original stock date unless the user explicitly changed it. diff --git a/.codex/skills/domain-case-loop/references/case_brief_template.md b/.codex/skills/domain-case-loop/references/case_brief_template.md index 6d0e899..8d6cb1e 100644 --- a/.codex/skills/domain-case-loop/references/case_brief_template.md +++ b/.codex/skills/domain-case-loop/references/case_brief_template.md @@ -28,21 +28,30 @@ - canonical - colloquial - ui_selected_object +- pronoun_followup when the active item can be referenced indirectly ## Required carryover invariants - selected object / item +- focus object / active business object - date or period - warehouse if relevant - organization if relevant - expected answer shape - direct-answer-first when the user asked a direct lookup question - reusable resolved-object continuity when the user asks a follow-up about the same selected object +- bundle reuse when the previous turn already resolved supplier/date/document details +- follow-up action resolution on the same selected object ## Field truth constraints - do not confuse supplier with organization - do not confuse buyer with organization - do not surface technical document-side fields as business truth without proof +## Recommended state objects +- focus_object +- provenance_bundle when the scenario contains item purchase trace +- sale_trace_bundle when the scenario contains buyer / sale follow-ups + ## Contour status - in_contour / outside_current_contour / unknown @@ -64,5 +73,6 @@ - root node works - critical edges on the primary user path work - colloquial and UI-generated follow-up variants work +- pronoun-only follow-up variants work when the UX already established a selected object - direct answer is placed first where expected - output is business-useful, not only technically grounded diff --git a/.codex/skills/domain-case-loop/references/scenario_tree_acceptance_canon.md b/.codex/skills/domain-case-loop/references/scenario_tree_acceptance_canon.md index 5365a2d..c230b19 100644 --- a/.codex/skills/domain-case-loop/references/scenario_tree_acceptance_canon.md +++ b/.codex/skills/domain-case-loop/references/scenario_tree_acceptance_canon.md @@ -36,6 +36,7 @@ Example for inventory: - child: selected item -> purchase documents - child: selected item -> aging on the same date - child: selected item -> sale trace +- child: selected item -> pronoun follow-up purchase documents The primary user path is the path a real user is most likely to take first, not the prettiest canonical wording. @@ -60,12 +61,14 @@ Each critical edge must define its required carryover invariants. Typical invariants: - selected object survives from previous assistant output +- stable focus object survives as the active business object - originating date / period survives into follow-up filters - warehouse survives if the follow-up still targets the same stock slice - organization survives if the previous slice was organization-bound - route family remains in the same business contour unless the user clearly changed intent - reusable resolved-object state survives when the previous turn already answered a closely related lookup - pronoun references can reuse the active focus object when the wording supports it +- follow-up action resolution stays on the same business object, for example item -> purchase documents rather than counterparty -> documents If an edge loses a required invariant, that is a real regression even if the target node works in isolation. @@ -80,6 +83,7 @@ Examples: - resolved purchase document bundle If turn N already resolved such an object and turn N+1 asks a natural follow-up about the same object, the system should reuse that state instead of demanding the same anchor again. +If turn N already resolved supplier/date/document provenance and turn N+1 asks for one adjacent field such as `когда купили ее` or `покажи документы по этой позиции`, the system should prefer bundle reuse before re-entering a broad generic router. ## Mandatory paraphrase families @@ -89,8 +93,9 @@ Minimum family: - `canonical` - `colloquial` - `ui_selected_object` +- `pronoun_followup` when the UX already established a selected object or active item -If canonical works but colloquial or UI-generated follow-up fails, the node/edge is not accepted. +If canonical works but colloquial, UI-generated, or pronoun-only follow-up fails, the node/edge is not accepted. ## Acceptance matrix @@ -118,6 +123,8 @@ Use these classes explicitly: - `semantic_understanding_gap` - `edge_carryover_gap` - `object_memory_gap` +- `followup_action_resolution_gap` +- `bundle_reuse_gap` - `field_mapping_gap` - `answer_shape_mismatch` - `ordering_semantics_mismatch` @@ -130,6 +137,8 @@ Definitions: - `semantic_understanding_gap`: the system did not understand the real user meaning - `edge_carryover_gap`: the follow-up lost date / object / scope across steps - `object_memory_gap`: the system resolved the object once but failed to retain it for the next follow-up +- `followup_action_resolution_gap`: the system kept the business object but resolved the wrong action over that object, for example item-follow-up -> counterparty-documents +- `bundle_reuse_gap`: the system resolved a reusable supplier/date/document bundle once but failed to reuse it for an adjacent follow-up - `field_mapping_gap`: the answer surfaced the wrong business field or mislabeled a field - `answer_shape_mismatch`: the business object in the answer does not match the requested object - `ordering_semantics_mismatch`: ranking / chronology semantics are wrong @@ -149,6 +158,7 @@ The analyst must: - verify business usefulness explicitly, not only technical validity; - verify field truthfulness for surfaced supplier / buyer / organization labels; - verify selected-object continuity and reusable object memory; +- verify focus-object continuity, pronoun follow-up continuity, and follow-up action resolution on the active business object; - verify answer granularity and ordering semantics; - lower the score when any critical edge or paraphrase family is broken. @@ -158,6 +168,7 @@ The orchestrator must: - define the tree before iterating deeply; - prioritize the primary user path first; - rerun at least one colloquial variant and one UI-selected-object variant for each critical branch; +- rerun at least one short pronoun follow-up such as `по ней` / `по этой позиции` when the product UX already established a selected object; - treat a broken critical edge as an unfinished scenario even if the root node works; - route coder work to the narrowest broken edge or node rather than issuing broad “improve the domain” tasks. @@ -167,6 +178,7 @@ Do not accept a domain when: - only the root node works; - only one curated phrasing works; - selected-object follow-up is broken; +- pronoun-only selected-object follow-up is broken or misrouted to another business object; - `на эту дату` / `на ту дату` loses the originating date; - the answer shape is wrong for the business question; - chronology / ranking semantics are inverted; diff --git a/.codex/skills/domain-case-loop/references/verdict_template.md b/.codex/skills/domain-case-loop/references/verdict_template.md index f830e7b..273b028 100644 --- a/.codex/skills/domain-case-loop/references/verdict_template.md +++ b/.codex/skills/domain-case-loop/references/verdict_template.md @@ -26,9 +26,12 @@ ## 7. State continuity and selected-object memory - selected object continuity: +- focus object continuity: - date/period continuity: - reusable answer-object continuity: +- provenance or sale bundle reuse: - pronoun resolution continuity: +- follow-up action resolution continuity: ## 8. Field truth and evidence quality - supplier vs organization: @@ -53,10 +56,12 @@ - Canonical wording: - Colloquial wording: - UI-generated selected-object wording: +- Pronoun-only follow-up wording: - Carryover invariants: - Expected answer shape: - Expected direct answer: - Business usefulness: +- Recommended state objects: - Defect class: ## 14. Acceptance criteria for rerun @@ -66,6 +71,7 @@ - Require direct-answer-first behavior on direct lookup questions. - Require business-useful output rather than technically-grounded-but-noisy output. - Require selected-object continuity and reusable answer-object continuity on follow-up chains. +- Require focus-object continuity, bundle reuse, and correct action resolution for short follow-ups like `по ней` / `по этой позиции` when they are part of the business flow. ## 15. Quality score - integer from 0 to 100 diff --git a/AGENTS.md b/AGENTS.md index 15156f6..43ec7c3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -27,6 +27,7 @@ Rules: - For critical branches, validate at least canonical wording, colloquial wording, and UI-generated selected-object wording when that UX exists. - Treat temporal carryover, selected-object carryover, answer-shape match, and ordering semantics as first-class acceptance invariants rather than optional polish. - Treat direct-answer-first behavior, business usefulness, selected-object memory, and field truthfulness as first-class analyst criteria rather than optional presentation polish. +- Treat stable `focus_object`, reusable bundles such as `provenance_bundle`, and pronoun-style follow-up resolution (`по ней`, `по этой позиции`) as first-class analyst criteria in follow-up-heavy domains. - If a case falls outside the current routed contour because the route/intent/capability is not wired yet, treat it as domain enablement work for this project, not as automatic out-of-scope rejection. - For new unmarked domains, `needs_exact_capability` means "bootstrap or extend the contour" rather than "close the case as unsupported". - A case can be marked `accepted` only when analyst verdict is at least `80/100`, no unresolved `P0` remains, and the rerun does not mask heuristic output as confirmed. diff --git a/docs/orchestration/active_domain_contract.json b/docs/orchestration/active_domain_contract.json index 6d7e3c4..e948fbc 100644 --- a/docs/orchestration/active_domain_contract.json +++ b/docs/orchestration/active_domain_contract.json @@ -41,9 +41,9 @@ "buyer_candidate": "Департамент капитального ремонта города Москвы" }, "question_pool": { - "total_questions": 20, + "total_questions": 21, "core_questions_total": 17, - "followup_checkpoints_total": 3, + "followup_checkpoints_total": 4, "questions": [ { "question_id": "Q01", @@ -224,6 +224,15 @@ "role": "critical_child", "wording_family": "ui_selected_object_colloquial", "semantic_goal": "проверить selected-object follow-up в закупочные документы без ручного переписывания item" + }, + { + "question_id": "Q21", + "text": "покажи документы по этой позиции", + "layer": "selected_item_provenance", + "node_id": "N05_selected_item_purchase_documents", + "role": "critical_child", + "wording_family": "pronoun_followup", + "semantic_goal": "проверить короткий местоименный follow-up по активному товару без съезда в counterparty drilldown" } ] }, @@ -247,6 +256,10 @@ { "family_id": "followup_date_carryover", "description": "follow-up с фразой `на эту дату` или `на ту дату`, где дата обязана тянуться из предыдущего шага" + }, + { + "family_id": "pronoun_followup", + "description": "короткий follow-up по активному объекту через местоимение или указатель типа `по ней`, `по этой позиции`, `ее`" } ], "scenario_tree": { @@ -277,8 +290,8 @@ "covers_question_ids": ["Q06", "Q19"], "expected_intents": ["inventory_purchase_provenance_for_item"], "expected_answer_shape": "direct_supplier_answer_first_then_evidence", - "required_wording_families": ["canonical", "colloquial", "ui_selected_object", "ui_selected_object_colloquial"], - "required_carryover_invariants": ["selected_object", "date_scope", "warehouse_scope", "organization_scope"], + "required_wording_families": ["canonical", "colloquial", "ui_selected_object", "ui_selected_object_colloquial", "pronoun_followup"], + "required_carryover_invariants": ["selected_object", "focus_object", "date_scope", "warehouse_scope", "organization_scope", "reusable_bundle"], "children": ["N04_selected_item_purchase_date", "N05_selected_item_purchase_documents", "N09_old_purchase_aging"] }, { @@ -293,11 +306,11 @@ { "node_id": "N05_selected_item_purchase_documents", "title": "Закупочные документы выбранного товара", - "covers_question_ids": ["Q10", "Q20"], + "covers_question_ids": ["Q10", "Q20", "Q21"], "expected_intents": ["inventory_purchase_documents_for_item"], "expected_answer_shape": "document_list_for_selected_item", - "required_wording_families": ["canonical", "ui_selected_object", "ui_selected_object_colloquial"], - "required_carryover_invariants": ["selected_object", "date_scope", "warehouse_scope"] + "required_wording_families": ["canonical", "ui_selected_object", "ui_selected_object_colloquial", "pronoun_followup"], + "required_carryover_invariants": ["selected_object", "focus_object", "date_scope", "warehouse_scope", "reusable_bundle", "followup_action_resolution"] }, { "node_id": "N09_old_purchase_aging", @@ -381,7 +394,7 @@ "to_node": "N05_selected_item_purchase_documents", "transition_type": "selected_object_deeper_trace", "primary_user_path": true, - "required_carryover_invariants": ["selected_object", "date_scope"], + "required_carryover_invariants": ["selected_object", "focus_object", "date_scope", "reusable_bundle", "followup_action_resolution"], "failure_means": "сломано углубление из поставщика в документы закупки" }, { @@ -527,13 +540,13 @@ { "scenario_id": "inventory_selected_item_provenance", "title": "Selected-item supplier provenance", - "question_ids": ["Q02", "Q06", "Q09", "Q10", "Q19", "Q20"], + "question_ids": ["Q02", "Q06", "Q09", "Q10", "Q19", "Q20", "Q21"], "node_ids": ["N01_stock_snapshot", "N03_selected_item_supplier", "N04_selected_item_purchase_date", "N05_selected_item_purchase_documents"], "acceptance_canon": { "root_step_id": "step_01_snapshot_historical", - "primary_user_path": ["step_01_snapshot_historical", "step_02_selected_item_supplier_colloquial", "step_05_selected_item_documents_ui"], - "required_paraphrase_families": ["canonical", "colloquial", "ui_selected_object", "ui_selected_object_colloquial"], - "required_carryover_invariants": ["selected_object", "date_scope", "warehouse_scope", "organization_scope", "answer_shape"] + "primary_user_path": ["step_01_snapshot_historical", "step_02_selected_item_supplier_colloquial", "step_06_selected_item_documents_pronoun"], + "required_paraphrase_families": ["canonical", "colloquial", "ui_selected_object", "ui_selected_object_colloquial", "pronoun_followup"], + "required_carryover_invariants": ["selected_object", "focus_object", "date_scope", "warehouse_scope", "organization_scope", "answer_shape", "reusable_bundle", "followup_action_resolution"] }, "steps": [ { @@ -606,10 +619,26 @@ "source": "binding_target_date_historical" }, "expected_capability": "inventory_purchase_documents_for_item", - "required_carryover_invariants": ["selected_object", "date_scope"] + "required_carryover_invariants": ["selected_object", "focus_object", "date_scope", "reusable_bundle", "followup_action_resolution"] }, { - "step_id": "step_06_selected_item_documents_canonical", + "step_id": "step_06_selected_item_documents_pronoun", + "question_id": "Q21", + "node_id": "N05_selected_item_purchase_documents", + "node_role": "critical_child", + "paraphrase_family": "pronoun_followup", + "title": "Selected item purchase documents pronoun follow-up", + "question": "покажи документы по этой позиции", + "depends_on": ["step_01_snapshot_historical", "step_02_selected_item_supplier_colloquial"], + "analysis_context": { + "as_of_date": "2019-03-31", + "source": "binding_target_date_historical" + }, + "expected_capability": "inventory_purchase_documents_for_item", + "required_carryover_invariants": ["selected_object", "focus_object", "date_scope", "reusable_bundle", "followup_action_resolution"] + }, + { + "step_id": "step_07_selected_item_documents_canonical", "question_id": "Q10", "node_id": "N05_selected_item_purchase_documents", "node_role": "critical_child", @@ -841,10 +870,18 @@ "business_utility_required": true, "state_continuity_required": true, "selected_object_memory_required": true, + "focus_object_required": true, + "pronoun_followup_resolution_required": true, + "followup_action_resolution_required": true, + "bundle_reuse_required": true, "field_truth_checks": [ "supplier_vs_organization", "buyer_vs_organization" ], + "required_state_objects": [ + "focus_object", + "provenance_bundle" + ], "reusable_answer_object_expectations": [ "current_item", "current_as_of_date", @@ -857,6 +894,7 @@ "do_not_accept_if": [ "работает только root snapshot, но ломается critical selected-object edge", "работает только canonical wording, но ломается colloquial или ui_selected_object wording", + "работает только quoted selected-object wording, но ломается короткий местоименный follow-up по активной позиции", "теряется date_scope на follow-up с `на эту дату` или `на ту дату`", "ответ меняет business object, например вместо item-level ответа отдаёт dump документов", "нарушается ordering semantics, например `старые закупки` идут не oldest-first" @@ -866,11 +904,14 @@ "critical edges on primary_user_paths", "canonical coverage on critical nodes", "colloquial coverage on critical nodes", - "ui_selected_object coverage where UI supports object selection" + "ui_selected_object coverage where UI supports object selection", + "pronoun_followup coverage where the UX already established an active selected object" ], "required_defect_classes": [ "semantic_understanding_gap", "edge_carryover_gap", + "followup_action_resolution_gap", + "bundle_reuse_gap", "answer_shape_mismatch", "ordering_semantics_mismatch", "runtime_capability_gap", @@ -903,6 +944,16 @@ "pattern_id": "F05_oldest_first_violation", "symptom": "`старые закупки` are listed newest-first or in another non-business order", "defect_class": "ordering_semantics_mismatch" + }, + { + "pattern_id": "F06_pronoun_item_documents_misroute", + "symptom": "short follow-up like `покажи документы по этой позиции` drifts into `documents_by_counterparty` instead of selected-item purchase documents", + "defect_class": "followup_action_resolution_gap" + }, + { + "pattern_id": "F07_provenance_bundle_not_reused", + "symptom": "supplier/date/document lookup was already resolved for the selected item but adjacent follow-up recomputes broadly or loses the reusable bundle", + "defect_class": "bundle_reuse_gap" } ], "legacy_references": [ diff --git a/docs/orchestration/schemas/domain_loop_analyst_verdict.schema.json b/docs/orchestration/schemas/domain_loop_analyst_verdict.schema.json index 5790869..7bbf060 100644 --- a/docs/orchestration/schemas/domain_loop_analyst_verdict.schema.json +++ b/docs/orchestration/schemas/domain_loop_analyst_verdict.schema.json @@ -16,6 +16,10 @@ "state_continuity_score", "answer_shape_score", "evidence_clarity_score", + "focus_object_continuity_ok", + "bundle_reuse_ok", + "followup_action_resolution_ok", + "recommended_state_objects", "loop_decision", "requires_user_decision", "user_decision_type", @@ -78,6 +82,21 @@ "minimum": 0, "maximum": 100 }, + "focus_object_continuity_ok": { + "type": "boolean" + }, + "bundle_reuse_ok": { + "type": "boolean" + }, + "followup_action_resolution_ok": { + "type": "boolean" + }, + "recommended_state_objects": { + "type": "array", + "items": { + "type": "string" + } + }, "loop_decision": { "type": "string", "enum": ["accepted", "continue", "partial", "blocked", "needs_exact_capability"] @@ -121,6 +140,8 @@ "runtime_capability_gap", "edge_carryover_gap", "object_memory_gap", + "followup_action_resolution_gap", + "bundle_reuse_gap", "field_mapping_gap", "answer_shape_mismatch", "ordering_semantics_mismatch", @@ -170,6 +191,8 @@ "semantic_understanding_gap", "edge_carryover_gap", "object_memory_gap", + "followup_action_resolution_gap", + "bundle_reuse_gap", "field_mapping_gap", "answer_shape_mismatch", "ordering_semantics_mismatch", diff --git a/llm_normalizer/backend/dist/services/addressIntentResolver.js b/llm_normalizer/backend/dist/services/addressIntentResolver.js index bf6706f..ad640cc 100644 --- a/llm_normalizer/backend/dist/services/addressIntentResolver.js +++ b/llm_normalizer/backend/dist/services/addressIntentResolver.js @@ -1341,7 +1341,7 @@ function hasInventorySaleTraceSignal(text) { return /(?:продаж|покупател|buyer|sale trace|purchase[\s-]?to[\s-]?sale|purchase -> warehouse -> sale|закупка.*продаж)/iu.test(text); } function hasSelectedObjectInventoryCue(text) { - return /(?:по\s+выбранному\s+объекту|selected\s+object)/iu.test(text); + return /(?:по\s+выбранному\s+объекту|по\s+этой\s+позиции|по\s+этому\s+товару|по\s+нему|по\s+ней|по\s+нему\s+же|по\s+ней\s+же|selected\s+object)/iu.test(text); } function hasSelectedObjectInventoryProvenanceSignal(text) { return (hasSelectedObjectInventoryCue(text) && diff --git a/llm_normalizer/backend/dist/services/address_runtime/composeStage.js b/llm_normalizer/backend/dist/services/address_runtime/composeStage.js index d71fab8..bdc00c0 100644 --- a/llm_normalizer/backend/dist/services/address_runtime/composeStage.js +++ b/llm_normalizer/backend/dist/services/address_runtime/composeStage.js @@ -3107,7 +3107,13 @@ function composeFactualReply(intent, rows, options = {}) { const purchaseRows = rows.filter((row) => isInventoryPurchaseMovement(row)); const summary = summarizeInventoryTraceRows(purchaseRows); const itemLabel = summary.item ?? "товар не определен"; + const directAnswerLine = summary.counterparties.length === 1 + ? `По товару ${itemLabel} документы поступления связаны с поставщиком: ${summary.counterparties[0]}.` + : summary.counterparties.length > 1 + ? `По товару ${itemLabel} документы поступления ведут к нескольким поставщикам: ${summary.counterparties.slice(0, 4).join("; ")}.` + : `По товару ${itemLabel} найдены документы поступления, но поставщик не материализован отдельным полем в текущем exact-контуре.`; const lines = [ + directAnswerLine, `Собран подтвержденный список документов поступления по товару ${itemLabel} до ${formatDateRu(asOfDate)}.`, "", "Блок 1. Статус результата", @@ -3191,7 +3197,13 @@ function composeFactualReply(intent, rows, options = {}) { const summary = summarizeInventoryTraceRows(purchaseRows); const unresolvedRows = purchaseRows.filter((row) => extractInventoryCounterpartyCandidates(row).length === 0); const warehouseLabel = summary.warehouses[0] ?? "не указанного склада"; + const directAnswerLine = summary.counterparties.length === 1 + ? `По складскому остатку ${warehouseLabel} выявлен поставщик: ${summary.counterparties[0]}.` + : summary.counterparties.length > 1 + ? `По складскому остатку ${warehouseLabel} найдено несколько поставщиков: ${summary.counterparties.slice(0, 6).join("; ")}.` + : `По складскому остатку ${warehouseLabel} поставщик в текущем exact-контуре не материализован.`; const lines = [ + directAnswerLine, `Собран exact-срез supplier overlap для складского остатка до ${formatDateRu(asOfDate)}.`, "", "Блок 1. Статус результата", @@ -3287,7 +3299,13 @@ function composeFactualReply(intent, rows, options = {}) { const saleRows = rows.filter((row) => isInventorySaleMovement(row)); const summary = summarizeInventoryTraceRows(saleRows); const itemLabel = summary.item ?? "товар не определен"; + const directAnswerLine = summary.counterparties.length === 1 + ? `По товару ${itemLabel} покупатель определен: ${summary.counterparties[0]}.` + : summary.counterparties.length > 1 + ? `По товару ${itemLabel} найдено несколько покупателей: ${summary.counterparties.slice(0, 4).join("; ")}.` + : `По товару ${itemLabel} покупатель в текущем exact-контуре не материализован.`; const lines = [ + directAnswerLine, `Собран подтвержденный след выбытия по товару ${itemLabel} до ${formatDateRu(asOfDate)}.`, "", "Блок 1. Статус результата", @@ -3332,7 +3350,11 @@ function composeFactualReply(intent, rows, options = {}) { const purchaseSummary = summarizeInventoryTraceRows(purchaseRows); const saleSummary = summarizeInventoryTraceRows(saleRows); const itemLabel = purchaseSummary.item ?? saleSummary.item ?? "товар не определен"; + const directAnswerLine = purchaseSummary.counterparties.length === 1 && saleSummary.counterparties.length === 1 + ? `По товару ${itemLabel} цепочка поставки и продажи связана с поставщиком ${purchaseSummary.counterparties[0]} и покупателем ${saleSummary.counterparties[0]}.` + : `По товару ${itemLabel} цепочка поставки и продажи подтверждена частично или разнообразно: детали идут следом.`; const lines = [ + directAnswerLine, `Собрана документальная цепочка по товару ${itemLabel} до ${formatDateRu(asOfDate)}.`, "", "Блок 1. Статус результата", diff --git a/llm_normalizer/backend/dist/services/address_runtime/decomposeStage.js b/llm_normalizer/backend/dist/services/address_runtime/decomposeStage.js index a349a0a..1b7415f 100644 --- a/llm_normalizer/backend/dist/services/address_runtime/decomposeStage.js +++ b/llm_normalizer/backend/dist/services/address_runtime/decomposeStage.js @@ -260,7 +260,7 @@ function hasInventorySupplierFollowupCue(text) { return /(?:кто\s+(?:(?:это|этот\s+товар|эту\s+позицию)\s+)?(?:нам\s+)?поставил|кто\s+(?:нам\s+)?поставил\s+(?:это|этот\s+товар|эту\s+позицию)|от\s+какого\s+поставщика|у\s+какого\s+поставщика|от\s+кого\s+куплен|supplier|vendor|поставщик)/iu.test(String(text ?? "")); } function hasInventoryPurchaseDocumentsFollowupCue(text) { - return /(?:по\s+каким\s+документам\s+(?:это|его|этот\s+товар|эту\s+позицию)\s+купили|по\s+каким\s+документам\s+(?:был\s+)?куплен|какими\s+документами\s+(?:это|его|этот\s+товар|эту\s+позицию)\s+купили|какими\s+документами\s+(?:был\s+)?куплен|purchase\s+documents|documents\s+of\s+purchase|through\s+which\s+documents)/iu.test(String(text ?? "")); + return /(?:по\s+каким\s+документам\s+(?:это|его|этот\s+товар|эту\s+позицию)\s+купили|по\s+каким\s+документам\s+(?:был\s+)?куплен|какими\s+документами\s+(?:это|его|этот\s+товар|эту\s+позицию)\s+купили|какими\s+документами\s+(?:был\s+)?куплен|покажи\s+документы\s+по\s+(?:этой\s+позиции|этому\s+товару|ней|нему)|документы\s+по\s+(?:этой\s+позиции|этому\s+товару|ней|нему)|purchase\s+documents|documents\s+of\s+purchase|through\s+which\s+documents)/iu.test(String(text ?? "")); } function hasAddressFollowupContextSignal(text) { const normalized = String(text ?? "").trim(); @@ -328,6 +328,7 @@ function mergeFollowupFilters(current, intent, userMessage, followupContext) { const previousCounterparty = toNonEmptyString(previous.counterparty); const previousContract = toNonEmptyString(previous.contract); const previousAccount = toNonEmptyString(previous.account); + const previousItem = toNonEmptyString(previous.item); const previousOrganization = toNonEmptyString(previous.organization); const previousAsOfDate = toNonEmptyString(previous.as_of_date); const previousPeriodFrom = toNonEmptyString(previous.period_from); @@ -440,6 +441,16 @@ function mergeFollowupFilters(current, intent, userMessage, followupContext) { merged.counterparty = inheritedCounterparty; reasons.push(currentCounterparty ? "counterparty_replaced_from_followup_context" : "counterparty_from_followup_context"); } + if ((intent === "inventory_purchase_provenance_for_item" || + intent === "inventory_purchase_documents_for_item" || + intent === "inventory_sale_trace_for_item" || + intent === "inventory_purchase_to_sale_chain" || + intent === "inventory_aging_by_purchase_date") && + !toNonEmptyString(merged.item) && + previousItem) { + merged.item = previousItem; + reasons.push("item_from_followup_context"); + } if (sameDateRequested) { const inheritedAsOfDate = previousAsOfDate ?? previousPeriodTo ?? previousPeriodFrom; if (inheritedAsOfDate && merged.as_of_date !== inheritedAsOfDate) { diff --git a/llm_normalizer/backend/src/services/addressIntentResolver.ts b/llm_normalizer/backend/src/services/addressIntentResolver.ts index 4d7895c..d658398 100644 --- a/llm_normalizer/backend/src/services/addressIntentResolver.ts +++ b/llm_normalizer/backend/src/services/addressIntentResolver.ts @@ -1604,7 +1604,9 @@ function hasInventorySaleTraceSignal(text: string): boolean { } function hasSelectedObjectInventoryCue(text: string): boolean { - return /(?:по\s+выбранному\s+объекту|selected\s+object)/iu.test(text); + return /(?:по\s+выбранному\s+объекту|по\s+этой\s+позиции|по\s+этому\s+товару|по\s+нему|по\s+ней|по\s+нему\s+же|по\s+ней\s+же|selected\s+object)/iu.test( + text + ); } function hasSelectedObjectInventoryProvenanceSignal(text: string): boolean { diff --git a/llm_normalizer/backend/src/services/address_runtime/composeStage.ts b/llm_normalizer/backend/src/services/address_runtime/composeStage.ts index 30d48b8..bde3807 100644 --- a/llm_normalizer/backend/src/services/address_runtime/composeStage.ts +++ b/llm_normalizer/backend/src/services/address_runtime/composeStage.ts @@ -4020,7 +4020,14 @@ export function composeFactualReply( const purchaseRows = rows.filter((row) => isInventoryPurchaseMovement(row)); const summary = summarizeInventoryTraceRows(purchaseRows); const itemLabel = summary.item ?? "товар не определен"; + const directAnswerLine = + summary.counterparties.length === 1 + ? `По товару ${itemLabel} документы поступления связаны с поставщиком: ${summary.counterparties[0]}.` + : summary.counterparties.length > 1 + ? `По товару ${itemLabel} документы поступления ведут к нескольким поставщикам: ${summary.counterparties.slice(0, 4).join("; ")}.` + : `По товару ${itemLabel} найдены документы поступления, но поставщик не материализован отдельным полем в текущем exact-контуре.`; const lines: string[] = [ + directAnswerLine, `Собран подтвержденный список документов поступления по товару ${itemLabel} до ${formatDateRu(asOfDate)}.`, "", "Блок 1. Статус результата", @@ -4105,7 +4112,14 @@ export function composeFactualReply( const summary = summarizeInventoryTraceRows(purchaseRows); const unresolvedRows = purchaseRows.filter((row) => extractInventoryCounterpartyCandidates(row).length === 0); const warehouseLabel = summary.warehouses[0] ?? "не указанного склада"; + const directAnswerLine = + summary.counterparties.length === 1 + ? `По складскому остатку ${warehouseLabel} выявлен поставщик: ${summary.counterparties[0]}.` + : summary.counterparties.length > 1 + ? `По складскому остатку ${warehouseLabel} найдено несколько поставщиков: ${summary.counterparties.slice(0, 6).join("; ")}.` + : `По складскому остатку ${warehouseLabel} поставщик в текущем exact-контуре не материализован.`; const lines: string[] = [ + directAnswerLine, `Собран exact-срез supplier overlap для складского остатка до ${formatDateRu(asOfDate)}.`, "", "Блок 1. Статус результата", @@ -4201,7 +4215,14 @@ export function composeFactualReply( const saleRows = rows.filter((row) => isInventorySaleMovement(row)); const summary = summarizeInventoryTraceRows(saleRows); const itemLabel = summary.item ?? "товар не определен"; + const directAnswerLine = + summary.counterparties.length === 1 + ? `По товару ${itemLabel} покупатель определен: ${summary.counterparties[0]}.` + : summary.counterparties.length > 1 + ? `По товару ${itemLabel} найдено несколько покупателей: ${summary.counterparties.slice(0, 4).join("; ")}.` + : `По товару ${itemLabel} покупатель в текущем exact-контуре не материализован.`; const lines: string[] = [ + directAnswerLine, `Собран подтвержденный след выбытия по товару ${itemLabel} до ${formatDateRu(asOfDate)}.`, "", "Блок 1. Статус результата", @@ -4244,7 +4265,12 @@ export function composeFactualReply( const purchaseSummary = summarizeInventoryTraceRows(purchaseRows); const saleSummary = summarizeInventoryTraceRows(saleRows); const itemLabel = purchaseSummary.item ?? saleSummary.item ?? "товар не определен"; + const directAnswerLine = + purchaseSummary.counterparties.length === 1 && saleSummary.counterparties.length === 1 + ? `По товару ${itemLabel} цепочка поставки и продажи связана с поставщиком ${purchaseSummary.counterparties[0]} и покупателем ${saleSummary.counterparties[0]}.` + : `По товару ${itemLabel} цепочка поставки и продажи подтверждена частично или разнообразно: детали идут следом.`; const lines: string[] = [ + directAnswerLine, `Собрана документальная цепочка по товару ${itemLabel} до ${formatDateRu(asOfDate)}.`, "", "Блок 1. Статус результата", diff --git a/llm_normalizer/backend/src/services/address_runtime/decomposeStage.ts b/llm_normalizer/backend/src/services/address_runtime/decomposeStage.ts index 3736920..144a404 100644 --- a/llm_normalizer/backend/src/services/address_runtime/decomposeStage.ts +++ b/llm_normalizer/backend/src/services/address_runtime/decomposeStage.ts @@ -329,7 +329,7 @@ function hasInventorySupplierFollowupCue(text: string): boolean { } function hasInventoryPurchaseDocumentsFollowupCue(text: string): boolean { - return /(?:по\s+каким\s+документам\s+(?:это|его|этот\s+товар|эту\s+позицию)\s+купили|по\s+каким\s+документам\s+(?:был\s+)?куплен|какими\s+документами\s+(?:это|его|этот\s+товар|эту\s+позицию)\s+купили|какими\s+документами\s+(?:был\s+)?куплен|purchase\s+documents|documents\s+of\s+purchase|through\s+which\s+documents)/iu.test( + return /(?:по\s+каким\s+документам\s+(?:это|его|этот\s+товар|эту\s+позицию)\s+купили|по\s+каким\s+документам\s+(?:был\s+)?куплен|какими\s+документами\s+(?:это|его|этот\s+товар|эту\s+позицию)\s+купили|какими\s+документами\s+(?:был\s+)?куплен|покажи\s+документы\s+по\s+(?:этой\s+позиции|этому\s+товару|ней|нему)|документы\s+по\s+(?:этой\s+позиции|этому\s+товару|ней|нему)|purchase\s+documents|documents\s+of\s+purchase|through\s+which\s+documents)/iu.test( String(text ?? "") ); } @@ -424,6 +424,7 @@ function mergeFollowupFilters( const previousCounterparty = toNonEmptyString(previous.counterparty); const previousContract = toNonEmptyString(previous.contract); const previousAccount = toNonEmptyString(previous.account); + const previousItem = toNonEmptyString(previous.item); const previousOrganization = toNonEmptyString(previous.organization); const previousAsOfDate = toNonEmptyString(previous.as_of_date); const previousPeriodFrom = toNonEmptyString(previous.period_from); @@ -554,6 +555,18 @@ function mergeFollowupFilters( merged.counterparty = inheritedCounterparty; reasons.push(currentCounterparty ? "counterparty_replaced_from_followup_context" : "counterparty_from_followup_context"); } + if ( + (intent === "inventory_purchase_provenance_for_item" || + intent === "inventory_purchase_documents_for_item" || + intent === "inventory_sale_trace_for_item" || + intent === "inventory_purchase_to_sale_chain" || + intent === "inventory_aging_by_purchase_date") && + !toNonEmptyString(merged.item) && + previousItem + ) { + merged.item = previousItem; + reasons.push("item_from_followup_context"); + } if (sameDateRequested) { const inheritedAsOfDate = previousAsOfDate ?? previousPeriodTo ?? previousPeriodFrom; if (inheritedAsOfDate && merged.as_of_date !== inheritedAsOfDate) { diff --git a/llm_normalizer/backend/tests/addressQueryRuntimeM23.test.ts b/llm_normalizer/backend/tests/addressQueryRuntimeM23.test.ts index 8ee8c0e..9d5efad 100644 --- a/llm_normalizer/backend/tests/addressQueryRuntimeM23.test.ts +++ b/llm_normalizer/backend/tests/addressQueryRuntimeM23.test.ts @@ -293,6 +293,7 @@ describe("address query shape classifier", () => { useRubCurrency: true } ); + expect(reply.text.split("\n")[0]).toContain("поставщиком"); expect(reply.text).toContain("Шкаф картотечный"); expect(reply.text).toContain("Поступление товаров и услуг 0001"); expect(reply.semantics?.result_mode).toBe("confirmed_balance"); @@ -319,6 +320,7 @@ describe("address query shape classifier", () => { useRubCurrency: true } ); + expect(reply.text.split("\n")[0]).toContain("поставщиком"); expect(reply.text).toContain("закупочный след"); expect(reply.text).toContain("Гамма-мебель, ООО"); expect(reply.semantics?.balance_confirmed).toBe(true); @@ -345,6 +347,7 @@ describe("address query shape classifier", () => { useRubCurrency: true } ); + expect(reply.text.split("\n")[0]).toContain("покупатель"); expect(reply.text).toContain("след выбытия"); expect(reply.text).toContain("Реализация товаров и услуг 0007"); expect(reply.text).toContain("Департамент капитального ремонта города Москвы"); @@ -3947,6 +3950,28 @@ describe("address decompose stage follow-up carryover", () => { ).toBe(true); }); + it("promotes pronoun selected-item purchase-doc follow-up into inventory purchase documents with inherited date context", () => { + const result = runAddressDecomposeStage('покажи документы по этой позиции', { + previous_intent: "inventory_purchase_provenance_for_item", + previous_filters: { + as_of_date: "2019-03-31", + period_from: "2019-03-01", + period_to: "2019-03-31", + item: "Столешница 600*3050*26 дуб ниагара" + }, + previous_anchor_type: "unknown", + previous_anchor_value: null + }); + expect(result).not.toBeNull(); + expect(result?.intent.intent).toBe("inventory_purchase_documents_for_item"); + expect(result?.filters.extracted_filters.item).toBe("Столешница 600*3050*26 дуб ниагара"); + expect(result?.filters.extracted_filters.as_of_date).toBe("2019-03-31"); + expect( + result?.baseReasons?.includes("intent_adjusted_to_inventory_followup_context") || + result?.intent.reasons.includes("inventory_selected_object_purchase_documents_signal_detected") + ).toBe(true); + }); + it("keeps slang all-customers-all-time wording in address lane via resolved intent fallback", () => { const result = runAddressDecomposeStage("выведи всех заков за все время", null); expect(result).not.toBeNull(); diff --git a/scripts/domain_case_loop.py b/scripts/domain_case_loop.py index 053e368..744ac3f 100644 --- a/scripts/domain_case_loop.py +++ b/scripts/domain_case_loop.py @@ -171,6 +171,8 @@ def merge_analysis_context(base_context: Any, override_context: Any) -> dict[str def carry_forward_analysis_context( scenario_state: dict[str, Any], analysis_context: dict[str, Any], + *, + prefer_carryover: bool = False, ) -> dict[str, Any]: carried = dict(analysis_context) @@ -179,10 +181,23 @@ def carry_forward_analysis_context( date_scope = semantic_memory.get("date_scope") if isinstance(date_scope, dict): carried_as_of_date = normalize_iso_date(date_scope.get("as_of_date")) - if carried_as_of_date and not carried.get("as_of_date"): + if carried_as_of_date and (prefer_carryover or not carried.get("as_of_date")): carried["as_of_date"] = carried_as_of_date if not carried.get("source"): carried["source"] = "scenario_state_carryover" + for key in ( + "focus_object", + "selected_object_ref", + "warehouse_scope", + "organization_scope", + "provenance_bundle", + "sale_trace_bundle", + "purchase_documents_bundle", + "supplier_if_known", + "first_purchase_date", + ): + if (prefer_carryover or key not in carried) and semantic_memory.get(key) is not None: + carried[key] = semantic_memory.get(key) return carried @@ -1386,7 +1401,11 @@ def execute_scenario_manifest( for step_index, step in enumerate(manifest["steps"], start=1): step_dir = steps_dir / step["step_id"] step_analysis_context = merge_analysis_context(manifest.get("analysis_context"), step.get("analysis_context")) - step_analysis_context = carry_forward_analysis_context(scenario_state, step_analysis_context) + step_analysis_context = carry_forward_analysis_context( + scenario_state, + step_analysis_context, + prefer_carryover=bool(step.get("depends_on")), + ) try: resolved_question = resolve_question_template(step["question_template"], scenario_state) result = run_assistant_step( @@ -1690,6 +1709,24 @@ def derive_coverage_status(statuses: list[str]) -> str: return "partial" +def derive_pack_final_status(pack: dict[str, Any], scenario_results: list[dict[str, Any]]) -> str: + aggregate_statuses = [item["final_status"] for item in scenario_results] + if not aggregate_statuses: + return "blocked" + if any(status == "blocked" for status in aggregate_statuses): + return "blocked" + if any(status == "needs_exact_capability" for status in aggregate_statuses): + return "needs_exact_capability" + if any(status == "partial" for status in aggregate_statuses): + return "partial" + + acceptance_matrix = build_scenario_acceptance_matrix(pack, scenario_results) + if "| partial |" in acceptance_matrix: + return "partial" + + return "accepted" if len(scenario_results) == len(pack.get("scenarios") or []) else "partial" + + def build_scenario_acceptance_matrix(pack: dict[str, Any], scenario_results: list[dict[str, Any]]) -> str: scenario_status_map = { str(item.get("scenario_id") or ""): str(item.get("final_status") or "unknown") @@ -1709,6 +1746,7 @@ def build_scenario_acceptance_matrix(pack: dict[str, Any], scenario_results: lis scenario_questions_map: dict[str, list[str]] = {} scenario_nodes_map: dict[str, list[str]] = {} + scenario_wording_map: dict[str, list[str]] = {} for scenario in scenarios: if not isinstance(scenario, dict): continue @@ -1737,9 +1775,16 @@ def build_scenario_acceptance_matrix(pack: dict[str, Any], scenario_results: lis node_ids.append(node_id) scenario_questions_map[scenario_id] = question_ids scenario_nodes_map[scenario_id] = list(dict.fromkeys(node_ids)) + scenario_wording_map[scenario_id] = _scenario_observed_wording_families(scenario) scenario_tree = pack.get("scenario_tree") if isinstance(pack.get("scenario_tree"), dict) else {} source_contract = pack.get("source_contract") if isinstance(pack.get("source_contract"), dict) else {} + all_nodes: list[dict[str, Any]] = [] + for section_key in ("root_nodes", "critical_nodes", "supporting_nodes"): + raw_nodes = scenario_tree.get(section_key) + if isinstance(raw_nodes, list): + all_nodes.extend(node for node in raw_nodes if isinstance(node, dict)) + lines = [ "# Scenario acceptance matrix", "", @@ -1796,15 +1841,23 @@ def build_scenario_acceptance_matrix(pack: dict[str, Any], scenario_results: lis scenario_id for scenario_id, node_ids in scenario_nodes_map.items() if node_id in node_ids ) statuses = [scenario_status_map.get(scenario_id, "not_run") for scenario_id in backed_by] + required_wording_families = normalize_string_list(node.get("required_wording_families")) + observed_wording_families = sorted( + {family for scenario_id in backed_by for family in scenario_wording_map.get(scenario_id, [])} + ) + missing_wording_families = [family for family in required_wording_families if family not in observed_wording_families] + status = derive_coverage_status(statuses) + if status == "green" and missing_wording_families: + status = "partial" lines.append( "| " + " | ".join( [ node_id, - derive_coverage_status(statuses), + status, ", ".join(backed_by) or "-", ", ".join(normalize_string_list(node.get("covers_question_ids"))) or "-", - ", ".join(normalize_string_list(node.get("required_wording_families"))) or "-", + ", ".join(required_wording_families) or "-", ] ) + " |" @@ -1839,12 +1892,28 @@ def build_scenario_acceptance_matrix(pack: dict[str, Any], scenario_results: lis if from_node in node_ids and to_node in node_ids ) statuses = [scenario_status_map.get(scenario_id, "not_run") for scenario_id in backed_by] + from_required = [] + to_required = [] + for node in all_nodes: + node_id = str(node.get("node_id") or "").strip() + if node_id == from_node: + from_required = normalize_string_list(node.get("required_wording_families")) + elif node_id == to_node: + to_required = normalize_string_list(node.get("required_wording_families")) + observed_wording_families = sorted( + {family for scenario_id in backed_by for family in scenario_wording_map.get(scenario_id, [])} + ) + edge_required_families = list(dict.fromkeys(from_required + [family for family in to_required if family not in from_required])) + missing_wording_families = [family for family in edge_required_families if family not in observed_wording_families] + status = derive_coverage_status(statuses) + if status == "green" and missing_wording_families: + status = "partial" lines.append( "| " + " | ".join( [ edge_id, - derive_coverage_status(statuses), + status, from_node or "-", to_node or "-", ", ".join(backed_by) or "-", @@ -2031,6 +2100,8 @@ def compact_step_output_for_review(step_output: Any) -> dict[str, Any]: "selected_recipe": step_output.get("selected_recipe"), "capability_id": step_output.get("capability_id"), "result_mode": step_output.get("result_mode"), + "answer_shape": step_output.get("answer_shape"), + "actual_direct_answer": step_output.get("actual_direct_answer"), "fallback_type": step_output.get("fallback_type"), "mcp_call_status": step_output.get("mcp_call_status"), "failure_type": step_output.get("failure_type"), @@ -2080,6 +2151,20 @@ def build_pack_review_bundle(pack_dir: Path) -> str: return dump_json(bundle) +def _scenario_observed_wording_families(scenario: dict[str, Any]) -> list[str]: + families: list[str] = [] + steps = scenario.get("steps") + if not isinstance(steps, list): + return families + for step in steps: + if not isinstance(step, dict): + continue + family = str(step.get("paraphrase_family") or step.get("wording_family") or "").strip() + if family: + families.append(family) + return list(dict.fromkeys(families)) + + def build_analyst_loop_prompt( *, loop_dir: Path, @@ -2137,6 +2222,7 @@ def build_analyst_loop_prompt( Goal: - evaluate current domain-pack correctness for business meaning, route/capability quality, evidence quality, and absence of silent heuristic masking; - evaluate business usefulness, direct-answer-first behavior, state continuity, and field truthfulness, not only technical groundedness; + - evaluate object-centric dialog continuity: stable `focus_object`, reusable bundles such as `provenance_bundle`, and correct action resolution for pronoun-style follow-ups; - determine whether the gate `quality_score >= {target_score}` is reached; - if not, provide the smallest high-value fix targets for the coder. @@ -2155,8 +2241,10 @@ def build_analyst_loop_prompt( - if `requires_user_decision = true`, fill `user_decision_type` and `user_decision_prompt`; - if the pack is below {target_score} but there is still safe autonomous implementation work, keep `requires_user_decision = false`; - do not request user input merely because the score is still below {target_score}; request it only when the loop would otherwise guess, overfit, or risk architecture drift. - - return machine-readable fields for: `user_intent_summary`, `expected_direct_answer`, `actual_direct_answer`, `direct_answer_ok`, `business_usefulness_ok`, `business_utility_score`, `direct_answer_priority_score`, `state_continuity_score`, `answer_shape_score`, `evidence_clarity_score`, `root_cause_layers`, `broken_edge_ids`, `violated_invariants`; + - return machine-readable fields for: `user_intent_summary`, `expected_direct_answer`, `actual_direct_answer`, `direct_answer_ok`, `business_usefulness_ok`, `business_utility_score`, `direct_answer_priority_score`, `state_continuity_score`, `answer_shape_score`, `evidence_clarity_score`, `focus_object_continuity_ok`, `bundle_reuse_ok`, `followup_action_resolution_ok`, `recommended_state_objects`, `root_cause_layers`, `broken_edge_ids`, `violated_invariants`; - if the product found the evidence but failed to retain the selected object, provenance bundle, or another reusable resolved object across turns, classify that as `object_memory_gap` or `edge_carryover_gap`, not as a generic route problem; + - if the product retained the item but resolved the wrong action over that item, for example `покажи документы по этой позиции` -> `documents_by_counterparty`, classify that as `followup_action_resolution_gap`; + - if the product already resolved supplier/date/document details for the active item but failed to reuse that bundle for adjacent follow-ups, classify that as `bundle_reuse_gap`; - if the surfaced business field looks mislabeled, for example supplier vs organization, classify that as `field_mapping_gap`; - if the answer is technically grounded but still weak for a manager/accountant/operator, classify that as `business_utility_gap`. @@ -2204,8 +2292,9 @@ def build_coder_loop_prompt( - do not touch unrelated files; - preserve already successful baseline flows. - use `root_cause_layers`, `broken_edge_ids`, `violated_invariants`, and business-utility scores from the analyst verdict to choose the smallest fix; - - prioritize state continuity, selected-object persistence, direct-answer-first behavior, and field-truth mapping when those are the blocking layers; - - do not broaden scope when the analyst says the defect is mainly `object_memory_gap`, `field_mapping_gap`, `answer_shape_mismatch`, or `business_utility_gap`. + - prioritize state continuity, selected-object persistence, stable `focus_object`, reusable `provenance_bundle` / `sale_trace_bundle`, direct-answer-first behavior, and field-truth mapping when those are the blocking layers; + - do not broaden scope when the analyst says the defect is mainly `object_memory_gap`, `followup_action_resolution_gap`, `bundle_reuse_gap`, `field_mapping_gap`, `answer_shape_mismatch`, or `business_utility_gap`; + - when the verdict points to pronoun follow-ups or item-centric drilldowns, prefer a narrow object-state or follow-up-action fix over prompt inflation. Required outputs: - create `{iteration_dir / 'coder_plan.md'}` with a short plan; @@ -2291,17 +2380,7 @@ def handle_run_pack(args: argparse.Namespace) -> int: } ) - aggregate_statuses = [item["final_status"] for item in scenario_results] - if not aggregate_statuses: - final_status = "blocked" - elif any(status == "blocked" for status in aggregate_statuses): - final_status = "blocked" - elif any(status == "needs_exact_capability" for status in aggregate_statuses): - final_status = "needs_exact_capability" - elif any(status == "partial" for status in aggregate_statuses): - final_status = "partial" - else: - final_status = "accepted" if len(scenario_results) == len(pack.get("scenarios") or []) else "partial" + final_status = derive_pack_final_status(pack, scenario_results) pack_state = { "schema_version": SCENARIO_PACK_SCHEMA_VERSION, diff --git a/tests/test_domain_case_loop.py b/tests/test_domain_case_loop.py index 07a7088..2b0fb6c 100644 --- a/tests/test_domain_case_loop.py +++ b/tests/test_domain_case_loop.py @@ -9,6 +9,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[1])) from scripts.domain_case_loop import ( build_scenario_acceptance_matrix, carry_forward_analysis_context, + derive_pack_final_status, load_scenario_pack, merge_scenario_date_scope, ) @@ -148,7 +149,7 @@ def test_build_scenario_acceptance_matrix_marks_green_edge_when_covering_scenari { "node_id": "N03_selected_item_supplier", "covers_question_ids": ["Q19"], - "required_wording_families": ["canonical", "ui_selected_object_colloquial"], + "required_wording_families": ["canonical"], } ], "critical_edges": [ @@ -168,8 +169,18 @@ def test_build_scenario_acceptance_matrix_marks_green_edge_when_covering_scenari "scenario_id": "inventory_selected_item_provenance", "question_ids": ["Q01", "Q19"], "steps": [ - {"step_id": "step_01_snapshot", "question_id": "Q01", "node_id": "N01_stock_snapshot"}, - {"step_id": "step_02_supplier", "question_id": "Q19", "node_id": "N03_selected_item_supplier"}, + { + "step_id": "step_01_snapshot", + "question_id": "Q01", + "node_id": "N01_stock_snapshot", + "paraphrase_family": "canonical", + }, + { + "step_id": "step_02_supplier", + "question_id": "Q19", + "node_id": "N03_selected_item_supplier", + "paraphrase_family": "canonical", + }, ], } ], @@ -188,3 +199,96 @@ def test_build_scenario_acceptance_matrix_marks_green_edge_when_covering_scenari assert "E01_snapshot_to_selected_item_supplier" in matrix assert "| E01_snapshot_to_selected_item_supplier | green |" in matrix assert "| P01_snapshot_to_supplier | green |" in matrix + + +def test_build_scenario_acceptance_matrix_marks_partial_when_wording_family_is_missing() -> None: + pack = { + "pack_id": "inventory_active_contract_smoke", + "domain": "inventory_stock", + "source_contract": {"domain_id": "inventory_stock_supplier_provenance", "title": "Warehouse domain"}, + "question_pool": { + "questions": [ + {"question_id": "Q19", "node_id": "N03_selected_item_supplier"}, + ] + }, + "scenario_tree": { + "critical_nodes": [ + { + "node_id": "N03_selected_item_supplier", + "covers_question_ids": ["Q19"], + "required_wording_families": ["canonical", "ui_selected_object_colloquial"], + } + ] + }, + "scenarios": [ + { + "scenario_id": "inventory_selected_item_provenance", + "question_ids": ["Q19"], + "steps": [ + { + "step_id": "step_01_supplier", + "question_id": "Q19", + "node_id": "N03_selected_item_supplier", + "paraphrase_family": "canonical", + } + ], + } + ], + } + scenario_results = [ + { + "scenario_id": "inventory_selected_item_provenance", + "final_status": "accepted", + "session_id": "asst-demo", + "artifact_dir": "artifacts/domain_runs/demo", + } + ] + + matrix = build_scenario_acceptance_matrix(pack, scenario_results) + + assert "| N03_selected_item_supplier | partial |" in matrix + + +def test_derive_pack_final_status_downgrades_accepted_when_matrix_contains_partial_coverage() -> None: + pack = { + "pack_id": "inventory_active_contract_smoke", + "domain": "inventory_stock", + "scenarios": [ + { + "scenario_id": "inventory_selected_item_provenance", + "question_ids": ["Q19"], + "steps": [ + { + "step_id": "step_01_supplier", + "question_id": "Q19", + "node_id": "N03_selected_item_supplier", + "paraphrase_family": "canonical", + } + ], + }, + ], + "scenario_tree": { + "critical_nodes": [ + { + "node_id": "N03_selected_item_supplier", + "covers_question_ids": ["Q19"], + "required_wording_families": ["canonical", "ui_selected_object_colloquial"], + } + ] + }, + "question_pool": { + "questions": [ + {"question_id": "Q19", "node_id": "N03_selected_item_supplier"}, + ] + }, + } + scenario_results = [ + { + "scenario_id": "inventory_selected_item_provenance", + "final_status": "accepted", + "session_id": "asst-demo", + "artifact_dir": "artifacts/domain_runs/demo", + } + ] + + assert derive_pack_final_status(pack, scenario_results) == "partial"