From c2ac0c610bf072bf824aa2bb0bcdb746d9e5f2e7 Mon Sep 17 00:00:00 2001 From: dctouch Date: Mon, 13 Apr 2026 20:04:43 +0300 Subject: [PATCH] =?UTF-8?q?=D0=94=D0=9E=D0=9C=D0=95=D0=9D=D0=AB=20-=20?= =?UTF-8?q?=D0=92=D0=9E=D0=9F=D0=A0=D0=9E=D0=A1=D0=AB=20-=20=D0=9E=D0=A0?= =?UTF-8?q?=D0=A0=D0=9A=D0=95=D0=A1=D0=A2=D0=A0=D0=90=D0=A6=D0=98=D0=AF=20?= =?UTF-8?q?-=20=D0=91=D0=90=D0=97=D0=90=20-=D0=97=D0=B0=D1=84=D0=B8=D0=BA?= =?UTF-8?q?=D1=81=D0=B8=D1=80=D0=BE=D0=B2=D0=B0=D1=82=D1=8C=20=D0=B2=20orc?= =?UTF-8?q?hestration=20loop=20=D0=BF=D1=80=D0=B0=D0=B2=D0=B8=D0=BB=D0=BE?= =?UTF-8?q?=20domain=20enablement=20=D0=B4=D0=BB=D1=8F=20=D0=BD=D0=BE?= =?UTF-8?q?=D0=B2=D1=8B=D1=85=20=D0=BD=D0=B5=D1=80=D0=B0=D0=B7=D0=BC=D0=B5?= =?UTF-8?q?=D1=87=D0=B5=D0=BD=D0=BD=D1=8B=D1=85=20=D0=B4=D0=BE=D0=BC=D0=B5?= =?UTF-8?q?=D0=BD=D0=BE=D0=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .codex/skills/domain-case-loop/SKILL.md | 12 ++++++++++ .../references/case_brief_template.md | 6 +++++ .../references/domain_constraints.md | 2 ++ AGENTS.md | 2 ++ .../domain_case_loop_repo_adapter.md | 23 +++++++++++++++++++ ...istant_autogen_runtime_job-Bv_o7Vircc.json | 22 ++++++++++++++++++ ...istant_autogen_runtime_job-vHloW0L_3W.json | 22 ++++++++++++++++++ scripts/domain_case_loop.py | 23 +++++++++++++++++++ 8 files changed, 112 insertions(+) create mode 100644 llm_normalizer/data/eval_cases/assistant_autogen_runtime_job-Bv_o7Vircc.json create mode 100644 llm_normalizer/data/eval_cases/assistant_autogen_runtime_job-vHloW0L_3W.json diff --git a/.codex/skills/domain-case-loop/SKILL.md b/.codex/skills/domain-case-loop/SKILL.md index a862162..afd1523 100644 --- a/.codex/skills/domain-case-loop/SKILL.md +++ b/.codex/skills/domain-case-loop/SKILL.md @@ -29,6 +29,8 @@ Read `references/repo_runtime_map.md` before the first real cycle. 
Use these repo-native capture paths: - automated capture: `python scripts/domain_case_loop.py run-case ...` - import existing technical export: `python scripts/domain_case_loop.py import-export ...` +- `run-case` defaults to the repo's live local profile: `local / qwen2.5-14b-instruct-1m / http://127.0.0.1:1234/v1` +- override with `--llm-provider`, `--llm-model`, `--llm-base-url`, `--llm-api-key` when needed ## Workflow @@ -69,6 +71,11 @@ Spawn `domain_analyst` and provide: Require a full verdict using `references/verdict_template.md`. +The verdict must explicitly say whether the case is: +- an existing in-contour regression; +- a missing route/intent/capability inside project scope; +- a true out-of-scope request. + ### Step 4 - Domain patch Spawn `domain_coder` with: @@ -80,6 +87,7 @@ Require: - a minimal patch - zero architecture drift - rerun after changes +- if the domain is in project scope but outside the current contour, convert the verdict into capability enablement work instead of closing the case as unsupported ### Step 5 - Rerun @@ -104,6 +112,8 @@ Write `final_status.md` with one of: - blocked - needs_exact_capability +`needs_exact_capability` is the default status when the business/domain request is valid for the project, but the current contour is missing the route, intent, capability, or domain bootstrap needed to answer it. + Accepted requires: - quality score >= 80 - no unresolved P0 defects @@ -114,6 +124,7 @@ Accepted requires: - Do not count heuristic candidates as confirmed business answers. - If exact data should exist in 1C/MCP, prefer exact route work over prompt cosmetics. - If exact data does not exist yet in the reachable contour, return a technical insufficiency with a crisp blocker. +- If the user case belongs to a project-relevant domain but is outside the current contour, do not treat that as a terminal rejection. Treat it as domain enablement work and record the missing route/intent/capability explicitly. - Never fabricate 1C data. 
- Keep domain fixes minimal and localized. - Preserve successful baseline scenarios. @@ -127,6 +138,7 @@ For this repository: - analyst output must be detailed and business-readable; - answers should be suitable for product hardening, not just debugging notes; - machine-readable turn artifacts are first-class inputs for analysis. +- New user domains may be unmarked in the current repo. Missing markup is expected and should be handled as enablement, not as a reason to stop the loop. ## Recommended artifact set diff --git a/.codex/skills/domain-case-loop/references/case_brief_template.md b/.codex/skills/domain-case-loop/references/case_brief_template.md index 882041e..8504fff 100644 --- a/.codex/skills/domain-case-loop/references/case_brief_template.md +++ b/.codex/skills/domain-case-loop/references/case_brief_template.md @@ -15,6 +15,12 @@ ## Expected result mode - confirmed_balance / confirmed_tax_liability / partial / technical_insufficiency / other +## Contour status +- in_contour / outside_current_contour / unknown + +## Enablement hypothesis +- missing_route / missing_intent / missing_capability / missing_data_access / unknown + ## Constraints - no architecture changes - 1C/MCP first diff --git a/.codex/skills/domain-case-loop/references/domain_constraints.md b/.codex/skills/domain-case-loop/references/domain_constraints.md index ea9b414..540cecf 100644 --- a/.codex/skills/domain-case-loop/references/domain_constraints.md +++ b/.codex/skills/domain-case-loop/references/domain_constraints.md @@ -6,4 +6,6 @@ - Не считать heuristic ответ продуктовым успехом. - Математика вне 1С допустима только как детерминированный постпроцесс над уже подтвержденными фактами. - Analyst read-only, Coder implementation-focused. +- Если домен или маршрут еще не прокинут, но кейс лежит внутри целевого project scope, это считается работой на enablement, а не поводом закрыть кейс как unsupported. 
+- Для новых неразмеченных доменов outer loop должен явно фиксировать, чего не хватает: intent, route, capability, data access или bootstrap слоя. - Accepted требует score >= 80, zero unresolved P0 и отсутствия silent fallback masking. diff --git a/AGENTS.md b/AGENTS.md index 819891a..12969f7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -18,4 +18,6 @@ Rules: - Use `.codex/skills/domain-case-loop` for repeatable domain hardening loops on one concrete case. - Preserve current architecture: domain loop may automate capture, review, rerun, and artifact storage, but must not rewrite runtime foundations. - Prefer machine-readable case artifacts in `artifacts/domain_runs/<case_id>/`, especially `baseline_turn.json` / `rerun_turn.json`, over ad hoc prose-only summaries. +- If a case falls outside the current routed contour because the route/intent/capability is not wired yet, treat it as domain enablement work for this project, not as automatic out-of-scope rejection. +- For new unmarked domains, `needs_exact_capability` means "bootstrap or extend the contour" rather than "close the case as unsupported". - A case can be marked `accepted` only when analyst verdict is at least `80/100`, no unresolved `P0` remains, and the rerun does not mask heuristic output as confirmed. diff --git a/docs/orchestration/domain_case_loop_repo_adapter.md b/docs/orchestration/domain_case_loop_repo_adapter.md index 3f3da1c..f54600a 100644 --- a/docs/orchestration/domain_case_loop_repo_adapter.md +++ b/docs/orchestration/domain_case_loop_repo_adapter.md @@ -46,6 +46,9 @@ Использует живой backend: +- по умолчанию helper запускает кейс через `local / qwen2.5-14b-instruct-1m / http://127.0.0.1:1234/v1`; +- для переопределения доступны `--llm-provider`, `--llm-model`, `--llm-base-url`, `--llm-api-key`. 
+ ```powershell python scripts/domain_case_loop.py run-case ` --domain open_contracts ` @@ -103,3 +106,23 @@ python scripts/domain_case_loop.py import-export ` - `partial` - `blocked` - `needs_exact_capability` + +`needs_exact_capability` здесь означает не "закрыть кейс как чужой", а "доменный запрос валиден для проекта, но текущий контур еще не умеет его отрабатывать точно". + +## Политика для новых и неразмеченных доменов + +Outer loop должен считать нормальным, что пользователь будет приносить кейсы из доменов, которые еще не размечены в текущем runtime. + +Если baseline показывает, что: + +1. вопрос лежит внутри целевого project scope; +2. 1С/MCP/данные проекта концептуально относятся к этому домену; +3. ответ не получен из-за отсутствующего intent/route/capability/bootstrap; + +то кейс нельзя автоматически считать `out_of_scope`. + +Такой кейс должен переходить в режим domain enablement: + +- явно зафиксировать, чего не хватает: `intent`, `route`, `capability`, `data access`, `contour bootstrap`; +- сформировать минимальную задачу на прокидывание контура; +- после этого повторно прогонять baseline/rerun уже как продуктовый кейс. 
diff --git a/llm_normalizer/data/eval_cases/assistant_autogen_runtime_job-Bv_o7Vircc.json b/llm_normalizer/data/eval_cases/assistant_autogen_runtime_job-Bv_o7Vircc.json new file mode 100644 index 0000000..409cd66 --- /dev/null +++ b/llm_normalizer/data/eval_cases/assistant_autogen_runtime_job-Bv_o7Vircc.json @@ -0,0 +1,22 @@ +{ + "suite_id": "assistant_autogen_runtime_job-Bv_o7Vircc", + "suite_version": "0.1.0", + "schema_version": "assistant_autogen_runtime_v0_1", + "scenario_count": 1, + "case_ids": [ + "AUTO-001" + ], + "cases": [ + { + "case_id": "AUTO-001", + "scenario_tag": "autogen_runtime", + "question_type": "direct", + "broadness_level": "medium", + "turns": [ + { + "user_message": "Какие товары сейчас лежат на складе" + } + ] + } + ] +} \ No newline at end of file diff --git a/llm_normalizer/data/eval_cases/assistant_autogen_runtime_job-vHloW0L_3W.json b/llm_normalizer/data/eval_cases/assistant_autogen_runtime_job-vHloW0L_3W.json new file mode 100644 index 0000000..a5bdcf8 --- /dev/null +++ b/llm_normalizer/data/eval_cases/assistant_autogen_runtime_job-vHloW0L_3W.json @@ -0,0 +1,22 @@ +{ + "suite_id": "assistant_autogen_runtime_job-vHloW0L_3W", + "suite_version": "0.1.0", + "schema_version": "assistant_autogen_runtime_v0_1", + "scenario_count": 1, + "case_ids": [ + "AUTO-001" + ], + "cases": [ + { + "case_id": "AUTO-001", + "scenario_tag": "autogen_runtime", + "question_type": "direct", + "broadness_level": "medium", + "turns": [ + { + "user_message": "Какие товары лежат на складе сейчас или на любую выбранную дату" + } + ] + } + ] +} \ No newline at end of file diff --git a/scripts/domain_case_loop.py b/scripts/domain_case_loop.py index e367d00..bf0c102 100644 --- a/scripts/domain_case_loop.py +++ b/scripts/domain_case_loop.py @@ -18,6 +18,13 @@ DEFAULT_ARTIFACTS_ROOT = REPO_ROOT / "artifacts" / "domain_runs" DEFAULT_SESSIONS_DIR = REPO_ROOT / "llm_normalizer" / "data" / "assistant_sessions" DEFAULT_REPORTS_DIR = REPO_ROOT / "llm_normalizer" / "reports" 
DEFAULT_BACKEND_URL = "http://127.0.0.1:8787" +DEFAULT_PROMPT_VERSION = "address_query_runtime_v1" +DEFAULT_LLM_PROVIDER = "local" +DEFAULT_LLM_MODEL = "qwen2.5-14b-instruct-1m" +DEFAULT_LLM_BASE_URL = "http://127.0.0.1:1234/v1" +DEFAULT_LLM_API_KEY = "" +DEFAULT_TEMPERATURE = 0.0 +DEFAULT_MAX_OUTPUT_TOKENS = 900 TECH_SECTION_HEADER = "### technical_debug_payload_json" @@ -444,6 +451,15 @@ def handle_run_case(args: argparse.Namespace) -> int: ) payload: dict[str, Any] = { + "normalizeConfig": { + "llmProvider": args.llm_provider, + "apiKey": args.llm_api_key, + "model": args.llm_model, + "baseUrl": args.llm_base_url, + "temperature": args.temperature, + "maxOutputTokens": args.max_output_tokens, + "promptVersion": args.prompt_version, + }, "eval_target": "assistant_stage1", "questions": [args.question], "useMock": bool(args.use_mock), @@ -564,6 +580,13 @@ def build_parser() -> argparse.ArgumentParser: run_case.add_argument("--output-root", default=str(DEFAULT_ARTIFACTS_ROOT)) run_case.add_argument("--sessions-dir", default=str(DEFAULT_SESSIONS_DIR)) run_case.add_argument("--reports-dir", default=str(DEFAULT_REPORTS_DIR)) + run_case.add_argument("--prompt-version", default=DEFAULT_PROMPT_VERSION) + run_case.add_argument("--llm-provider", default=DEFAULT_LLM_PROVIDER, choices=["openai", "local"]) + run_case.add_argument("--llm-model", default=DEFAULT_LLM_MODEL) + run_case.add_argument("--llm-base-url", default=DEFAULT_LLM_BASE_URL) + run_case.add_argument("--llm-api-key", default=DEFAULT_LLM_API_KEY) + run_case.add_argument("--temperature", type=float, default=DEFAULT_TEMPERATURE) + run_case.add_argument("--max-output-tokens", type=int, default=DEFAULT_MAX_OUTPUT_TOKENS) run_case.add_argument("--timeout-seconds", type=int, default=300) run_case.add_argument("--poll-interval-seconds", type=float, default=1.5) run_case.add_argument("--expected-capability")