ARCH: проверить и расширить gate ответа MCP discovery
This commit is contained in:
parent
c744308223
commit
72804557aa
|
|
@ -1003,6 +1003,48 @@ Validation:
|
|||
- `npm test -- assistantMcpDiscoveryResponsePolicy.test.ts assistantLivingChatRuntimeAdapter.test.ts assistantMcpDiscoveryResponseCandidate.test.ts assistantMcpDiscoveryDebugAttachment.test.ts` passed 19/19;
|
||||
- `npm run build` passed.
|
||||
|
||||
## Progress Update - 2026-04-20 MCP Discovery Response Gate Replay
|
||||
|
||||
The fourteenth implementation slice of Big Block 5 added and executed a targeted AGENT semantic replay for the first guarded MCP discovery answer-replacement gate:
|
||||
|
||||
- `docs/orchestration/address_truth_harness_phase19_mcp_discovery_response_gate.json`
|
||||
- `assistantMcpDiscoveryResponsePolicy.ts`
|
||||
- `assistantMcpDiscoveryResponsePolicy.test.ts`
|
||||
- `assistantLivingChatRuntimeAdapter.test.ts`
|
||||
|
||||
The first live replay exposed a real semantic/runtime gap:
|
||||
|
||||
- the MCP discovery entry point executed successfully;
|
||||
- the response candidate was `ready_for_guarded_use`;
|
||||
- the final visible answer still came from `llm_chat` because the policy only applied to deterministic `unsupported_current_turn_meaning_boundary` replies;
|
||||
- the LLM answer was more fluent but less contract-safe because it used a stale runtime date and did not clearly distinguish inferred 1C activity duration from legal registration age.
|
||||
|
||||
The policy gate was expanded safely.
|
||||
|
||||
It can now apply a guarded candidate when either:
|
||||
|
||||
- the current reply is the deterministic unsupported-current-turn boundary; or
|
||||
- the current reply source is `llm_chat`, the MCP discovery entry point is `bridge_executed`, `discovery_attempted=true`, `turn_input.should_run_discovery=true`, and the candidate passes all guarded response checks.
|
||||
|
||||
The gate still does not apply to ordinary chat or exact supported routes, because those turns do not have both a valid discovery-ready entry point and an eligible candidate.
|
||||
|
||||
Replay result:
|
||||
|
||||
- first run: `address_truth_harness_phase19_mcp_discovery_response_gate_live` scored 3/4 and failed on the lifecycle step;
|
||||
- rerun after the policy fix: `address_truth_harness_phase19_mcp_discovery_response_gate_live_rerun1` passed 4/4 with final status `accepted`;
|
||||
- the lifecycle answer source became `mcp_discovery_response_candidate_guarded`;
|
||||
- debug confirmed `mcp_discovery_response_applied=true`, `mcp_discovery_entry_status=bridge_executed`, and `mcp_discovery_answer_mode=confirmed_with_bounded_inference`.
|
||||
|
||||
Validation:
|
||||
|
||||
- `npm test -- assistantMcpDiscoveryResponsePolicy.test.ts assistantLivingChatRuntimeAdapter.test.ts assistantMcpDiscoveryResponseCandidate.test.ts assistantMcpDiscoveryDebugAttachment.test.ts` passed 21/21;
|
||||
- `npm run build` passed;
|
||||
- `python scripts/domain_truth_harness.py run-live --spec docs/orchestration/address_truth_harness_phase19_mcp_discovery_response_gate.json --output-dir artifacts/domain_runs/address_truth_harness_phase19_mcp_discovery_response_gate_live_rerun1 --timeout-seconds 180` passed 4/4, final status `accepted`.
|
||||
|
||||
Known next quality gap:
|
||||
|
||||
- the guarded candidate is now honest and safe, but it still does not compute and verbalize the exact activity duration such as "5 years N months" from first/latest confirmed rows. That belongs to the next evidence-derivation slice, not to the response gate itself.
|
||||
|
||||
## Execution Rule
|
||||
|
||||
Do not implement this plan as:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,106 @@
|
|||
{
|
||||
"schema_version": "domain_truth_harness_spec_v1",
|
||||
"scenario_id": "address_truth_harness_phase19_mcp_discovery_response_gate",
|
||||
"domain": "address_phase19_mcp_discovery_response_gate",
|
||||
"title": "Phase 19 MCP discovery response gate replay",
|
||||
"description": "Targeted AGENT replay for the first guarded MCP discovery answer-replacement gate. The scenario validates that exact supported routes remain authoritative, unsupported-but-understood counterparty lifecycle questions can use guarded discovered evidence, off-domain living chat is not hijacked, and internal MCP/runtime mechanics do not leak into the user-facing answer.",
|
||||
"bindings": {},
|
||||
"steps": [
|
||||
{
|
||||
"step_id": "step_01_human_smalltalk_sanity",
|
||||
"title": "Human smalltalk remains living chat and does not expose discovery internals",
|
||||
"question": "привет, ты на связи?",
|
||||
"required_answer_patterns_any": [
|
||||
"(?i)привет|на связи|готов|помочь"
|
||||
],
|
||||
"forbidden_answer_patterns": [
|
||||
"(?i)mcp",
|
||||
"(?i)runtime_",
|
||||
"(?i)query_documents",
|
||||
"(?i)primitive"
|
||||
],
|
||||
"criticality": "important",
|
||||
"semantic_tags": [
|
||||
"human_answer",
|
||||
"mcp_discovery_gate_sanity"
|
||||
]
|
||||
},
|
||||
{
|
||||
"step_id": "step_02_supported_counterparty_documents_stays_exact",
|
||||
"title": "Supported counterparty documents route is not replaced by MCP discovery",
|
||||
"question": "покажи документы по свк за 2020",
|
||||
"allowed_reply_types": [
|
||||
"factual",
|
||||
"factual_with_explanation"
|
||||
],
|
||||
"expected_intents": [
|
||||
"list_documents_by_counterparty"
|
||||
],
|
||||
"required_direct_answer_patterns_any": [
|
||||
"(?i)свк|группа свк",
|
||||
"(?i)документ|поступление|счет|счёт"
|
||||
],
|
||||
"forbidden_direct_answer_patterns": [
|
||||
"(?i)mcp",
|
||||
"(?i)runtime_",
|
||||
"(?i)query_documents",
|
||||
"(?i)primitive",
|
||||
"(?i)точный маршрут.*не подключ"
|
||||
],
|
||||
"criticality": "critical",
|
||||
"semantic_tags": [
|
||||
"counterparty_documents",
|
||||
"supported_route_not_hijacked_by_mcp_discovery"
|
||||
]
|
||||
},
|
||||
{
|
||||
"step_id": "step_03_counterparty_lifecycle_uses_guarded_discovery",
|
||||
"title": "Unsupported-but-understood counterparty lifecycle question uses guarded discovery answer",
|
||||
"question": "сколько лет мы работаем с Группа СВК?",
|
||||
"required_answer_patterns_all": [
|
||||
"(?i)свк",
|
||||
"(?i)1с|активност|подтвержд",
|
||||
"(?i)вывод|оцен|инфер|можно оцен",
|
||||
"(?i)юридическ|регистрац|не подтвержд|не доказ"
|
||||
],
|
||||
"forbidden_answer_patterns": [
|
||||
"(?i)точный маршрут.*не подключ",
|
||||
"(?i)не буду подставлять",
|
||||
"(?i)query_documents",
|
||||
"(?i)query_movements",
|
||||
"(?i)runtime_",
|
||||
"(?i)planner_",
|
||||
"(?i)catalog_",
|
||||
"(?i)primitive"
|
||||
],
|
||||
"criticality": "critical",
|
||||
"semantic_tags": [
|
||||
"mcp_discovery_response_gate",
|
||||
"counterparty_lifecycle",
|
||||
"unsupported_current_turn_meaning_boundary"
|
||||
]
|
||||
},
|
||||
{
|
||||
"step_id": "step_04_off_domain_living_chat_not_hijacked",
|
||||
"title": "Off-domain living chat remains human and is not hijacked by discovery carryover",
|
||||
"question": "а чем капибара отличается от утки?",
|
||||
"required_answer_patterns_any": [
|
||||
"(?i)капибар.*утк|утк.*капибар",
|
||||
"(?i)млекопита|птиц|грызун"
|
||||
],
|
||||
"forbidden_answer_patterns": [
|
||||
"(?i)свк",
|
||||
"(?i)контрагент",
|
||||
"(?i)mcp",
|
||||
"(?i)query_documents",
|
||||
"(?i)runtime_",
|
||||
"(?i)primitive"
|
||||
],
|
||||
"criticality": "critical",
|
||||
"semantic_tags": [
|
||||
"off_domain_living_chat",
|
||||
"stale_replay_forbidden"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -65,18 +65,30 @@ function isUnsupportedCurrentTurnBoundary(input) {
|
|||
input.livingChatSource === "deterministic_unsupported_current_turn_boundary" ||
|
||||
input.currentReplySource === "deterministic_unsupported_current_turn_boundary");
|
||||
}
|
||||
/**
 * Tells whether the current reply is an `llm_chat` answer produced on a turn
 * where MCP discovery actually ran.
 *
 * All of the following must hold for a `true` result:
 *  - `entryPoint.entry_status` is exactly "bridge_executed";
 *  - `entryPoint.discovery_attempted` is strictly `true`;
 *  - `entryPoint.turn_input.should_run_discovery` is strictly `true`;
 *  - `input.livingChatSource` or `input.currentReplySource` is "llm_chat".
 *
 * A missing (null/undefined) entry point yields `false`.
 */
function isDiscoveryReadyChatCandidate(input, entryPoint) {
    // Normalise turn_input up front, exactly once, regardless of the outcome.
    const discoveryTurnInput = toRecordObject(entryPoint?.turn_input);
    if (entryPoint?.entry_status !== "bridge_executed") {
        return false;
    }
    if (entryPoint.discovery_attempted !== true) {
        return false;
    }
    if (discoveryTurnInput?.should_run_discovery !== true) {
        return false;
    }
    // Either source field may carry the llm_chat marker.
    return input.livingChatSource === "llm_chat" || input.currentReplySource === "llm_chat";
}
|
||||
function applyAssistantMcpDiscoveryResponsePolicy(input) {
|
||||
const currentReply = String(input.currentReply ?? "");
|
||||
const currentReplySource = toNonEmptyString(input.currentReplySource) ?? toNonEmptyString(input.livingChatSource) ?? "unknown";
|
||||
const entryPoint = resolveEntryPoint(input);
|
||||
const candidate = (0, assistantMcpDiscoveryResponseCandidate_1.buildAssistantMcpDiscoveryResponseCandidate)(entryPoint);
|
||||
const reasonCodes = [...candidate.reason_codes];
|
||||
const unsupportedBoundary = isUnsupportedCurrentTurnBoundary(input);
|
||||
const discoveryReadyChatCandidate = isDiscoveryReadyChatCandidate(input, entryPoint);
|
||||
if (!entryPoint) {
|
||||
pushReason(reasonCodes, "mcp_discovery_response_policy_no_entry_point");
|
||||
}
|
||||
if (!isUnsupportedCurrentTurnBoundary(input)) {
|
||||
if (!unsupportedBoundary) {
|
||||
pushReason(reasonCodes, "mcp_discovery_response_policy_not_unsupported_boundary");
|
||||
}
|
||||
if (!discoveryReadyChatCandidate) {
|
||||
pushReason(reasonCodes, "mcp_discovery_response_policy_not_discovery_ready_chat_candidate");
|
||||
}
|
||||
if (!ALLOWED_CANDIDATE_STATUSES.has(candidate.candidate_status)) {
|
||||
pushReason(reasonCodes, "mcp_discovery_response_policy_candidate_status_not_allowed");
|
||||
}
|
||||
|
|
@ -90,7 +102,7 @@ function applyAssistantMcpDiscoveryResponsePolicy(input) {
|
|||
pushReason(reasonCodes, "mcp_discovery_response_policy_candidate_contains_internal_mechanics");
|
||||
}
|
||||
const canApply = Boolean(entryPoint) &&
|
||||
isUnsupportedCurrentTurnBoundary(input) &&
|
||||
(unsupportedBoundary || discoveryReadyChatCandidate) &&
|
||||
ALLOWED_CANDIDATE_STATUSES.has(candidate.candidate_status) &&
|
||||
candidate.eligible_for_future_hot_runtime &&
|
||||
Boolean(toNonEmptyString(candidate.reply_text)) &&
|
||||
|
|
|
|||
|
|
@ -109,6 +109,19 @@ function isUnsupportedCurrentTurnBoundary(input: ApplyAssistantMcpDiscoveryRespo
|
|||
);
|
||||
}
|
||||
|
||||
/**
 * Decides whether an `llm_chat` reply belongs to a turn whose MCP discovery
 * entry point was really executed.
 *
 * @param input - Policy input; only `livingChatSource` and `currentReplySource` are read.
 * @param entryPoint - Resolved discovery runtime entry point, or `null` when none exists.
 * @returns `true` only when `entry_status` is "bridge_executed", `discovery_attempted`
 *   is strictly `true`, `turn_input.should_run_discovery` is strictly `true`, and at
 *   least one of the two source fields equals "llm_chat"; otherwise `false`.
 */
function isDiscoveryReadyChatCandidate(
  input: ApplyAssistantMcpDiscoveryResponsePolicyInput,
  entryPoint: AssistantMcpDiscoveryRuntimeEntryPointContract | null
): boolean {
  // turn_input is normalised first, whether or not an entry point exists.
  const discoveryTurnInput = toRecordObject(entryPoint?.turn_input);

  if (entryPoint?.entry_status !== "bridge_executed") {
    return false;
  }
  if (entryPoint.discovery_attempted !== true) {
    return false;
  }
  if (discoveryTurnInput?.should_run_discovery !== true) {
    return false;
  }

  // The llm_chat marker is accepted from either source field.
  const cameFromLivingChat = input.livingChatSource === "llm_chat";
  return cameFromLivingChat || input.currentReplySource === "llm_chat";
}
|
||||
|
||||
export function applyAssistantMcpDiscoveryResponsePolicy(
|
||||
input: ApplyAssistantMcpDiscoveryResponsePolicyInput
|
||||
): AssistantMcpDiscoveryResponsePolicyResult {
|
||||
|
|
@ -118,13 +131,18 @@ export function applyAssistantMcpDiscoveryResponsePolicy(
|
|||
const entryPoint = resolveEntryPoint(input);
|
||||
const candidate = buildAssistantMcpDiscoveryResponseCandidate(entryPoint);
|
||||
const reasonCodes = [...candidate.reason_codes];
|
||||
const unsupportedBoundary = isUnsupportedCurrentTurnBoundary(input);
|
||||
const discoveryReadyChatCandidate = isDiscoveryReadyChatCandidate(input, entryPoint);
|
||||
|
||||
if (!entryPoint) {
|
||||
pushReason(reasonCodes, "mcp_discovery_response_policy_no_entry_point");
|
||||
}
|
||||
if (!isUnsupportedCurrentTurnBoundary(input)) {
|
||||
if (!unsupportedBoundary) {
|
||||
pushReason(reasonCodes, "mcp_discovery_response_policy_not_unsupported_boundary");
|
||||
}
|
||||
if (!discoveryReadyChatCandidate) {
|
||||
pushReason(reasonCodes, "mcp_discovery_response_policy_not_discovery_ready_chat_candidate");
|
||||
}
|
||||
if (!ALLOWED_CANDIDATE_STATUSES.has(candidate.candidate_status)) {
|
||||
pushReason(reasonCodes, "mcp_discovery_response_policy_candidate_status_not_allowed");
|
||||
}
|
||||
|
|
@ -140,7 +158,7 @@ export function applyAssistantMcpDiscoveryResponsePolicy(
|
|||
|
||||
const canApply =
|
||||
Boolean(entryPoint) &&
|
||||
isUnsupportedCurrentTurnBoundary(input) &&
|
||||
(unsupportedBoundary || discoveryReadyChatCandidate) &&
|
||||
ALLOWED_CANDIDATE_STATUSES.has(candidate.candidate_status) &&
|
||||
candidate.eligible_for_future_hot_runtime &&
|
||||
Boolean(toNonEmptyString(candidate.reply_text)) &&
|
||||
|
|
|
|||
|
|
@ -229,6 +229,54 @@ describe("assistant living chat runtime adapter", () => {
|
|||
expect(executeLlmChat).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("replaces discovery-ready llm chat business answer with guarded MCP discovery response", async () => {
  // The LLM executor returns text the guarded candidate is expected to displace.
  const llmChatSpy = vi.fn(async () => "stale llm answer with old date");

  // Chat-mode turn carrying an executed discovery entry point with a ready answer draft.
  const runtimeInput = buildRuntimeInput({
    userMessage: "how long has svk been active",
    modeDecision: { mode: "chat", reason: "non_domain_query_indexed" },
    addressRuntimeMeta: {
      mcpDiscoveryRuntimeEntryPoint: {
        schema_version: "assistant_mcp_discovery_runtime_entry_point_v1",
        policy_owner: "assistantMcpDiscoveryRuntimeEntryPoint",
        entry_status: "bridge_executed",
        hot_runtime_wired: false,
        discovery_attempted: true,
        turn_input: {
          adapter_status: "ready",
          should_run_discovery: true
        },
        bridge: {
          bridge_status: "answer_draft_ready",
          user_facing_response_allowed: true,
          business_fact_answer_allowed: true,
          requires_user_clarification: false,
          answer_draft: {
            answer_mode: "confirmed_with_bounded_inference",
            headline: "Confirmed scoped answer.",
            confirmed_lines: ["Confirmed fact"],
            inference_lines: ["Bounded inference"],
            unknown_lines: ["Unconfirmed legal fact"],
            limitation_lines: [],
            next_step_line: null
          }
        },
        reason_codes: ["runtime_entry_point_bridge_executed"]
      }
    },
    executeLlmChat: llmChatSpy
  });

  const runtimeOutput = await runAssistantLivingChatRuntime(runtimeInput);
  const { handled, chatText, debug } = runtimeOutput;

  // The visible text comes from the discovery draft, not from the LLM reply.
  expect(handled).toBe(true);
  expect(chatText).toContain("Confirmed fact");
  expect(chatText).not.toContain("old date");

  // Debug payload records that the guarded candidate was applied.
  expect(debug?.living_chat_response_source).toBe("mcp_discovery_response_candidate_guarded");
  expect(debug?.mcp_discovery_response_applied).toBe(true);
  expect(debug?.mcp_discovery_entry_status).toBe("bridge_executed");

  // The LLM executor is still invoked exactly once on this path.
  expect(llmChatSpy).toHaveBeenCalledTimes(1);
});
|
||||
|
||||
it("adds proactive organization offer on first smalltalk turn when multiple organizations are available", async () => {
|
||||
const resolveDataScopeProbe = vi.fn(async () => ({
|
||||
status: "resolved",
|
||||
|
|
|
|||
|
|
@ -63,6 +63,30 @@ describe("assistant MCP discovery response policy", () => {
|
|||
expect(result.reply_text).toBe("regular chat");
|
||||
expect(result.reply_source).toBe("llm_chat");
|
||||
expect(result.reason_codes).toContain("mcp_discovery_response_policy_not_unsupported_boundary");
|
||||
expect(result.reason_codes).toContain("mcp_discovery_response_policy_not_discovery_ready_chat_candidate");
|
||||
});
|
||||
|
||||
it("applies a guarded candidate for discovery-ready llm chat business answers", () => {
  // llm_chat reply on a turn whose entry point says discovery should run.
  const policyResult = applyAssistantMcpDiscoveryResponsePolicy({
    currentReply: "stale llm business answer",
    currentReplySource: "llm_chat",
    modeDecisionReason: "non_domain_query_indexed",
    addressRuntimeMeta: {
      mcpDiscoveryRuntimeEntryPoint: entryPoint({
        turn_input: {
          adapter_status: "ready",
          should_run_discovery: true
        }
      })
    }
  });

  const { applied, decision, reply_source: replySource, reply_text: replyText } = policyResult;
  expect(applied).toBe(true);
  expect(decision).toBe("apply_candidate");
  expect(replySource).toBe("mcp_discovery_response_candidate_guarded");
  expect(replyText).toContain("Confirmed fact");

  // The boundary reason code is still recorded when the candidate is applied
  // through the llm_chat path; only the chat-candidate reason must be absent.
  const reasonCodes = policyResult.reason_codes;
  expect(reasonCodes).toContain("mcp_discovery_response_policy_not_unsupported_boundary");
  expect(reasonCodes).not.toContain("mcp_discovery_response_policy_not_discovery_ready_chat_candidate");
});
|
||||
|
||||
it("keeps the current reply when the candidate has no grounded text", () => {
|
||||
|
|
|
|||
Loading…
Reference in New Issue