diff --git a/agents/Aevatar.GAgents.ChannelRuntime/AgentBuilderTool.cs b/agents/Aevatar.GAgents.ChannelRuntime/AgentBuilderTool.cs index e5372af32..6bc01fd7c 100644 --- a/agents/Aevatar.GAgents.ChannelRuntime/AgentBuilderTool.cs +++ b/agents/Aevatar.GAgents.ChannelRuntime/AgentBuilderTool.cs @@ -232,6 +232,22 @@ private async Task CreateDailyReportAgentAsync( if (!TryParseApiKeyCreateResponse(createKeyResponse, out var apiKeyId, out var apiKeyValue, out var apiKeyError)) return JsonSerializer.Serialize(new { error = apiKeyError }); + // Issue #411: the new agent API key is allowed_service_ids=api-github but might lack a + // bound GitHub credential, in which case every scheduled run hits 401/403 from + // proxy/s/api-github and the user sees no useful daily report. Preflight a safe + // GitHub endpoint with the freshly minted key and fail the create with an actionable + // error rather than persisting an agent that will never produce a usable report. + // + // Reviewer (PR #412 r3141699756): on the fail-fast path the freshly created NyxID API + // key would be left behind — repeated `/daily` attempts that hit GitHub preflight + // accumulate orphan proxy keys. Best-effort revoke before returning. + var preflight = await PreflightGitHubProxyAsync(nyxClient, apiKeyValue!, providerSlug, ct); + if (preflight is not null) + { + await BestEffortRevokeApiKeyAsync(nyxClient, token, apiKeyId!, "github_preflight_failed", ct); + return preflight; + } + var actor = await actorRuntime.GetAsync(agentId) ?? await actorRuntime.CreateAsync(agentId, ct); @@ -257,8 +273,10 @@ private async Task CreateDailyReportAgentAsync( NyxApiKey = apiKeyValue!, OwnerNyxUserId = ownerNyxUserId!, ApiKeyId = apiKeyId!, - LarkReceiveId = deliveryTarget.ReceiveId, - LarkReceiveIdType = deliveryTarget.ReceiveIdType, + LarkReceiveId = deliveryTarget.Primary.ReceiveId, + LarkReceiveIdType = deliveryTarget.Primary.ReceiveIdType, + LarkReceiveIdFallback = deliveryTarget.Fallback?.ReceiveId ?? string.Empty, + LarkReceiveIdTypeFallback = deliveryTarget.Fallback?.ReceiveIdType ?? string.Empty, }, }; @@ -396,8 +414,10 @@ private async Task CreateSocialMediaAgentAsync( ApiKeyId = apiKeyId!, Enabled = true, ScopeId = scopeId.Trim(), - LarkReceiveId = deliveryTarget.ReceiveId, - LarkReceiveIdType = deliveryTarget.ReceiveIdType, + LarkReceiveId = deliveryTarget.Primary.ReceiveId, + LarkReceiveIdType = deliveryTarget.Primary.ReceiveIdType, + LarkReceiveIdFallback = deliveryTarget.Fallback?.ReceiveId ?? string.Empty, + LarkReceiveIdTypeFallback = deliveryTarget.Fallback?.ReceiveIdType ?? string.Empty, }; await actor.HandleEventAsync(BuildDirectEnvelope(actor.Id, initialize), ct); @@ -1412,31 +1432,162 @@ private static string NormalizeScopeId(string? value) => } /// - /// Builds the typed Lark delivery target from the current AgentToolRequestContext and emits - /// a LogDebug breadcrumb when falls - /// back from the cross-app safe pair (union_id / chat_id) to the legacy open_id / - /// conversation_id path. The fallback flag is intentionally NOT persisted on - /// SkillRunnerOutboundConfig / InitializeWorkflowAgentCommand because the - /// downstream path treats any populated typed - /// pair as authoritative — so this is the only place the cross-app risk surfaces. Operators - /// correlating Lark code:99992361 open_id cross app rejections need this log line to - /// confirm whether the relay surfaced union_id at agent-create time. + /// Builds the typed Lark delivery target (primary + optional fallback) from the current + /// AgentToolRequestContext, and emits a LogDebug breadcrumb when the primary fell back from + /// the cross-app safe pair (chat_id / union_id) to the legacy open_id / conversation_id + /// path. The primary is what + /// returns; the fallback (when the primary is a DM chat_id and we also have a union_id at + /// ingress) is captured so the runtime can retry once on a Lark + /// 230002 bot not in chat rejection — the failure mode for cross-app same-tenant + /// deployments where the outbound app is not in the inbound DM. Operators correlating Lark + /// 99992361 open_id cross app rejections need the log line to confirm whether the + /// relay surfaced union_id at agent-create time. + /// + /// + /// Preflights GitHub proxy access using the newly created agent API key against + /// /rate_limit — an unauthenticated-friendly read endpoint that returns 401/403 when + /// the key lacks a bound GitHub credential. Returns a structured error JSON suitable for + /// returning verbatim from the tool when access is denied; returns null when access + /// works (or when the daily-report's required service list does not include GitHub, in + /// which case there's nothing to preflight). Issue aevatarAI/aevatar#411. /// - private LarkReceiveTarget ResolveDeliveryTarget(string conversationId, string agentId) + private async Task PreflightGitHubProxyAsync( + NyxIdApiClient nyxClient, + string apiKey, + string nyxProviderSlug, + CancellationToken ct) + { + // Cheap read-only endpoint; succeeds even with a rate-limited token, fails with 401/403 + // when the proxy can't resolve a bound GitHub credential. + var probe = await nyxClient.ProxyRequestAsync( + apiKey, + "api-github", + "/rate_limit", + "GET", + body: null, + extraHeaders: null, + ct); + + if (string.IsNullOrWhiteSpace(probe)) + return null; + + // `NyxIdApiClient.SendAsync` (NyxIdApiClient.cs:680) wraps HTTP non-2xx as + // `{"error": true, "status": , "body": ""}` — `status`, + // not `code`. Reviewer (PR #412 r3141699476): the previous parser only read `code`, + // so for the actual #411 production failures (HTTP 403 from /api/v1/proxy/s/api-github + // /rate_limit) it set status=0, returned null, and persisted a daily_report agent + // that would fail at runtime. Read both `status` (the SendAsync envelope) AND `code` + // (any future inverted-naming envelope or top-level Lark code). Treat 401/403 as the + // signal to fail-fast; let other shapes flow through (rate limits, 5xx etc are + // operational and not "agent fundamentally broken"). + try + { + using var doc = JsonDocument.Parse(probe); + var root = doc.RootElement; + if (root.ValueKind != JsonValueKind.Object) + return null; + + if (!root.TryGetProperty("error", out var errorProp)) + return null; + if (errorProp.ValueKind != JsonValueKind.True && errorProp.ValueKind != JsonValueKind.String) + return null; + + var status = TryReadInt32Property(root, "status") + ?? TryReadInt32Property(root, "code") + ?? 0; + if (status != (int)HttpStatusCode.Unauthorized && status != (int)HttpStatusCode.Forbidden) + return null; + + var detail = root.TryGetProperty("message", out var msgProp) && msgProp.ValueKind == JsonValueKind.String + ? msgProp.GetString() + : null; + var body = root.TryGetProperty("body", out var bodyProp) && bodyProp.ValueKind == JsonValueKind.String + ? bodyProp.GetString() + : null; + + return JsonSerializer.Serialize(new + { + error = "github_proxy_access_denied", + detail = string.IsNullOrWhiteSpace(detail) ? "GitHub proxy returned 401/403 for the new agent API key." : detail, + http_status = status, + proxy_body = string.IsNullOrWhiteSpace(body) ? null : body, + hint = "The new agent API key was created with `allowed_service_ids=api-github` but cannot reach GitHub via NyxID. Verify the GitHub OAuth provider is connected at NyxID and that the key picks up the binding (NyxID `api-keys/{id}/bindings`). Until this is resolved the daily report will return empty/degraded output every run.", + nyx_provider_slug = nyxProviderSlug, + }); + } + catch (JsonException) + { + // Non-JSON probe response: don't pretend we know what's going on; let creation + // proceed so the agent can at least be created (operator can debug from logs). + return null; + } + } + + private static int? TryReadInt32Property(JsonElement element, string propertyName) + { + if (!element.TryGetProperty(propertyName, out var property) || + property.ValueKind != JsonValueKind.Number || + !property.TryGetInt32(out var value)) + { + return null; + } + return value; + } + + /// + /// Best-effort revoke of an API key minted earlier in the create flow. Used when a + /// preflight (e.g. GitHub proxy access) detects that the agent will be DOA at runtime, so + /// we don't leave orphan proxy-scoped keys behind on every retry. Failures here are + /// logged at Warning but do NOT propagate — the structured create-time error is the + /// user-facing signal; an orphan key is an ops cleanup concern, not a hard failure. + /// + private async Task BestEffortRevokeApiKeyAsync( + NyxIdApiClient nyxClient, + string sessionToken, + string apiKeyId, + string reason, + CancellationToken ct) + { + if (string.IsNullOrWhiteSpace(apiKeyId)) + return; + + try + { + var response = await nyxClient.DeleteApiKeyAsync(sessionToken, apiKeyId, ct); + if (LarkProxyResponse.TryGetError(response, out _, out var detail)) + { + _logger?.LogWarning( + "Failed to revoke orphan agent API key {ApiKeyId} after {Reason}: {Detail}", + apiKeyId, + reason, + detail); + } + } + catch (Exception ex) + { + _logger?.LogWarning( + ex, + "Exception revoking orphan agent API key {ApiKeyId} after {Reason}", + apiKeyId, + reason); + } + } + + private LarkReceiveTargetWithFallback ResolveDeliveryTarget(string conversationId, string agentId) { var chatType = AgentToolRequestContext.TryGet(ChannelMetadataKeys.ChatType); var senderId = AgentToolRequestContext.TryGet(ChannelMetadataKeys.SenderId); var unionId = AgentToolRequestContext.TryGet(ChannelMetadataKeys.LarkUnionId); var chatId = AgentToolRequestContext.TryGet(ChannelMetadataKeys.LarkChatId); - var target = LarkConversationTargets.BuildFromInbound( + var target = LarkConversationTargets.BuildFromInboundWithFallback( chatType, conversationId, senderId, unionId, chatId); - if (target.FellBackToPrefixInference) + if (target.Primary.FellBackToPrefixInference) { _logger?.LogDebug( "Agent builder fell back to legacy delivery target inference for {AgentId}: chatType={ChatType}, hasUnionId={HasUnionId}, hasLarkChatId={HasLarkChatId}, hasSenderId={HasSenderId}, resolvedReceiveIdType={ReceiveIdType}. Cross-app outbound (e.g. customer api-lark-bot) may surface Lark `99992361 open_id cross app` until the relay propagates union_id.", @@ -1445,7 +1596,7 @@ private LarkReceiveTarget ResolveDeliveryTarget(string conversationId, string ag !string.IsNullOrWhiteSpace(unionId), !string.IsNullOrWhiteSpace(chatId), !string.IsNullOrWhiteSpace(senderId), - target.ReceiveIdType); + target.Primary.ReceiveIdType); } return target; diff --git a/agents/Aevatar.GAgents.ChannelRuntime/ChannelConversationTurnRunner.cs b/agents/Aevatar.GAgents.ChannelRuntime/ChannelConversationTurnRunner.cs index d55996b75..5f9121ec4 100644 --- a/agents/Aevatar.GAgents.ChannelRuntime/ChannelConversationTurnRunner.cs +++ b/agents/Aevatar.GAgents.ChannelRuntime/ChannelConversationTurnRunner.cs @@ -496,18 +496,32 @@ private async Task SendReplyAsync( relayDelivery); } + // The dispatcher has already consumed the relay reply token via NyxID's + // `channel-relay/reply` endpoint — even when the upstream returns 5xx, NyxID's + // single-use semantics mark the token as used before the failure surfaces. A second + // call with the same token (the previous "degrade to text" retry) lands as + // `401 Reply token already used`, which then escapes as a hard relay failure and + // queues an inbound turn retry that re-consumes the (already gone) token forever + // — observed in production after PR #409 introduced interactive cards: NyxID + // returned 502 for the card payload, the legacy fallback re-sent as text and got + // 401, and the bot looked silent on every subsequent DM. + // + // Use the distinct `relay_reply_token_consumed` error code so `ToRelayFailure` maps + // it to `PermanentFailure` (vs. transient). Without this, `ConversationGAgent + // .HandleInboundTurnTransientFailureAsync` would queue an `InboundTurnRetryScheduled + // Event` and re-run the same inbound turn with the same already-consumed token — + // shifting the 401 cascade from in-turn replay (fixed) to grain-level replay (still + // broken). The token is single-use, so we get exactly one attempt per inbound; if + // that fails, the only correct recovery is to NOT replay it. _logger.LogWarning( - "Interactive relay reply rejected; degrading to text. messageId={MessageId}, detail={Detail}", + "Interactive relay reply rejected; reply token consumed, not retrying. messageId={MessageId}, detail={Detail}", relayDelivery.ReplyMessageId, dispatch.Detail); - return await SendRelayTextFallbackAsync( - fallbackText, - sentActivitySeed, - conversation, - inbound, - relayDelivery, - relayToken, - ct); + return ToRelayFailure(EmitResult.Failed( + "relay_reply_token_consumed", + string.IsNullOrWhiteSpace(dispatch.Detail) + ? "Interactive relay reply rejected; reply token consumed." + : dispatch.Detail)); } private async Task SendRelayTextFallbackAsync( @@ -1025,6 +1039,12 @@ private static ConversationTurnResult ToRelayFailure(EmitResult emit) return errorCode switch { + // The reply token has already been consumed (single-use). Re-running the inbound + // turn at grain level (`ConversationGAgent.HandleInboundTurnTransientFailureAsync`) + // would replay the same token and get `401 Reply token already used` forever, so + // route to PermanentFailure to short-circuit the retry queue. The user-facing + // recovery is to send a fresh inbound message which carries a fresh token. + "relay_reply_token_consumed" or "reply_token_missing_or_expired" or "missing_reply_message_id" or "empty_reply" => ConversationTurnResult.PermanentFailure(errorCode, errorMessage), _ when emit.RetryAfterTimeSpan is { } retryAfter => diff --git a/agents/Aevatar.GAgents.ChannelRuntime/FeishuCardHumanInteractionPort.cs b/agents/Aevatar.GAgents.ChannelRuntime/FeishuCardHumanInteractionPort.cs index 289a67633..e3b5e3fb9 100644 --- a/agents/Aevatar.GAgents.ChannelRuntime/FeishuCardHumanInteractionPort.cs +++ b/agents/Aevatar.GAgents.ChannelRuntime/FeishuCardHumanInteractionPort.cs @@ -338,29 +338,90 @@ private async Task SendMessageAsync( deliveryTarget.ReceiveIdType); } + var outcome = await TrySendWithFallbackAsync( + target, + messageType, + contentJson, + deliveryTarget, + emptyResponseMessage, + cancellationToken); + + if (!outcome.Succeeded) + { + throw new InvalidOperationException(BuildLarkRejectionMessage(failurePrefix, outcome.LarkCode, outcome.Detail)); + } + } + + private readonly record struct SendOutcome(bool Succeeded, int? LarkCode, string Detail) + { + public static SendOutcome Success() => new(true, null, string.Empty); + public static SendOutcome Failed(int? larkCode, string detail) => new(false, larkCode, detail); + } + + /// + /// Mirrors SkillRunnerGAgent.TrySendWithFallbackAsync: tries the typed primary + /// delivery target, then on a Lark 230002 bot not in chat rejection retries once + /// with the fallback target persisted on . + /// Returns success vs. failure (with Lark code+detail) so the caller can throw cleanly. + /// + private async Task TrySendWithFallbackAsync( + UserAgentCatalogEntry target, + string messageType, + string contentJson, + LarkReceiveTarget primary, + string emptyResponseMessage, + CancellationToken cancellationToken) + { + var primaryResult = await SendOutboundAsync(target, messageType, contentJson, primary, cancellationToken); + if (string.IsNullOrWhiteSpace(primaryResult)) + throw new InvalidOperationException(emptyResponseMessage); + if (!LarkProxyResponse.TryGetError(primaryResult, out var larkCode, out var detail)) + return SendOutcome.Success(); + + if (larkCode != LarkBotErrorCodes.BotNotInChat) + return SendOutcome.Failed(larkCode, detail); + + var fallbackId = target.LarkReceiveIdFallback?.Trim(); + var fallbackType = target.LarkReceiveIdTypeFallback?.Trim(); + if (string.IsNullOrEmpty(fallbackId) || string.IsNullOrEmpty(fallbackType)) + return SendOutcome.Failed(larkCode, detail); + + _logger.LogInformation( + "Feishu human interaction port primary delivery target rejected as `bot not in chat` (code 230002); retrying with fallback typed pair: agent={AgentId}, fallbackType={FallbackType}", + target.AgentId, + fallbackType); + + var fallbackTarget = new LarkReceiveTarget(fallbackId, fallbackType, FellBackToPrefixInference: false); + var fallbackResult = await SendOutboundAsync(target, messageType, contentJson, fallbackTarget, cancellationToken); + if (string.IsNullOrWhiteSpace(fallbackResult)) + throw new InvalidOperationException(emptyResponseMessage); + if (!LarkProxyResponse.TryGetError(fallbackResult, out var fallbackCode, out var fallbackDetail)) + return SendOutcome.Success(); + return SendOutcome.Failed(fallbackCode, fallbackDetail); + } + + private async Task SendOutboundAsync( + UserAgentCatalogEntry target, + string messageType, + string contentJson, + LarkReceiveTarget receiveTarget, + CancellationToken cancellationToken) + { var body = JsonSerializer.Serialize(new { - receive_id = deliveryTarget.ReceiveId, + receive_id = receiveTarget.ReceiveId, msg_type = messageType, content = contentJson, }); - var result = await _nyxIdApiClient.ProxyRequestAsync( + return await _nyxIdApiClient.ProxyRequestAsync( target.NyxApiKey, target.NyxProviderSlug, - $"open-apis/im/v1/messages?receive_id_type={deliveryTarget.ReceiveIdType}", + $"open-apis/im/v1/messages?receive_id_type={receiveTarget.ReceiveIdType}", "POST", body, extraHeaders: null, cancellationToken); - - if (string.IsNullOrWhiteSpace(result)) - throw new InvalidOperationException(emptyResponseMessage); - - if (LarkProxyResponse.TryGetError(result, out var larkCode, out var detail)) - { - throw new InvalidOperationException(BuildLarkRejectionMessage(failurePrefix, larkCode, detail)); - } } private static string BuildLarkRejectionMessage(string failurePrefix, int? larkCode, string detail) @@ -378,6 +439,21 @@ private static string BuildLarkRejectionMessage(string failurePrefix, int? larkC "delete and recreate it (`/agents` → Delete → `/social-media`) to pick up the cross-app safe target."; } + if (larkCode == LarkBotErrorCodes.UserIdCrossTenant) + { + // Cross-tenant variant of the open_id case — even union_id fails. Same recovery + // shape: recreate the agent so the chat_id-preferred outbound takes effect, or + // align the NyxID `s/api-lark-bot` proxy with the channel-bot that received the + // inbound event so the apps share a tenant. + return + $"{failurePrefix} (code={larkCode}): {detail}. " + + "The outbound Lark app is in a different tenant than the inbound app, so " + + "user-id translation is impossible. Delete and recreate the workflow agent " + + "(`/agents` → Delete → `/social-media`) so the new chat_id-preferred outbound " + + "path takes effect, or align the NyxID `s/api-lark-bot` proxy with the channel-bot " + + "that received the inbound event."; + } + return larkCode is { } code ? $"{failurePrefix} (code={code}): {detail}" : $"{failurePrefix}: {detail}"; diff --git a/agents/Aevatar.GAgents.ChannelRuntime/LarkBotErrorCodes.cs b/agents/Aevatar.GAgents.ChannelRuntime/LarkBotErrorCodes.cs index a546384f2..4db37ee85 100644 --- a/agents/Aevatar.GAgents.ChannelRuntime/LarkBotErrorCodes.cs +++ b/agents/Aevatar.GAgents.ChannelRuntime/LarkBotErrorCodes.cs @@ -23,4 +23,26 @@ internal static class LarkBotErrorCodes /// cross-app safe pair. /// public const int OpenIdCrossApp = 99992361; + + /// + /// "user id cross tenant" — Lark union_id is tenant-scoped. When the relay-side + /// ingress Lark app and the outbound proxy Lark app live in different Lark tenants (e.g. + /// NyxID-administered api-lark-bot proxy bound to a different tenant than the user's + /// own bot that subscribed to events), receive_id_type=union_id is rejected. + /// Resolution is configuration-side: align the NyxID proxy's downstream Lark app with the + /// channel-bot that received the inbound event, OR rebuild the agent so the new + /// chat_id-preferred path takes effect (chat_id traverses no user-id translation). + /// + public const int UserIdCrossTenant = 99992364; + + /// + /// "Bot is not in the chat" — the outbound app is not a member of the chat referenced by + /// receive_id_type=chat_id. For DMs, each Lark app has its own DM thread with the + /// user, so a chat_id captured by the relay-side ingress app is rejected by a different + /// outbound app even within the same tenant. Triggers the runtime fallback to the + /// secondary delivery target (typically union_id) in + /// SkillRunnerGAgent.SendOutputAsync and + /// FeishuCardHumanInteractionPort.SendMessageAsync. + /// + public const int BotNotInChat = 230002; } diff --git a/agents/Aevatar.GAgents.ChannelRuntime/LarkConversationTargets.cs b/agents/Aevatar.GAgents.ChannelRuntime/LarkConversationTargets.cs index ded90680b..e19de2c2f 100644 --- a/agents/Aevatar.GAgents.ChannelRuntime/LarkConversationTargets.cs +++ b/agents/Aevatar.GAgents.ChannelRuntime/LarkConversationTargets.cs @@ -60,29 +60,30 @@ public static string ResolveReceiveIdType(string? conversationId) /// Builds the typed receive-target for a Lark inbound captured at agent creation. /// /// - /// p2p (DM): prefer the tenant-stable union_id (on_*) when the relay - /// surfaces it. union_id is cross-app safe within the tenant — Lark accepts it as a - /// receive_id_type=union_id target regardless of whether the relay-side ingress app - /// matches the customer's outbound app. Without union_id we fall back to the sender - /// open_id (ou_*), which is app-scoped and produces - /// code:99992361 open_id cross app when the two apps differ; the fallback flips - /// FellBackToPrefixInference=true so the call site emits a Debug breadcrumb and - /// operators can correlate Lark rejections with missing-union_id ingress. + /// Priority order (all conversation types): chat_id > union_id > + /// open_id. chat_id (oc_*) is the most direct identifier — for DMs it is + /// the literal chat thread between the user and the bot that received the inbound event, so + /// when the outbound proxy authenticates as the SAME Lark app, sending back via + /// receive_id_type=chat_id targets the same chat without traversing any user-id + /// resolution. union_id is tenant-scoped, valid across apps in one tenant but rejected + /// cross-tenant (code:99992364 user id cross tenant). open_id is app-scoped and + /// rejected even cross-app within the same tenant (code:99992361 open_id cross app). /// /// /// - /// group / channel / thread: prefer the inbound Lark chat_id (oc_*) which - /// is tenant-scoped — any app added to the chat can address it via - /// receive_id_type=chat_id. Without an explicit Lark chat_id the helper falls back to - /// the routing , which works only when the routing id is - /// itself a Lark oc_*; otherwise the outbound proxy will surface a Lark validation - /// failure that the call site logs and retries. + /// Earlier revisions inverted this for p2p (preferring union_id) on the assumption + /// that DM chat_id is bot-specific and the relay-side ingress bot might differ from + /// the outbound app. Production logs from PR #409 showed the opposite failure mode in this + /// deployment (NyxID's s/api-lark-bot proxy and the relay-side ingress are in + /// different tenants), so union_id hits cross tenant for the typical case. + /// chat_id works whenever the outbound app matches the ingress app — the most common + /// real configuration — and degrades cleanly to union_id / open_id otherwise. /// /// /// - /// If the inbound is p2p but the relay omitted both union_id and senderId, - /// returning a typed pair would silently re-create the original /daily 400 (typing the user - /// open_id as chat_id). Instead, return an empty typed pair with + /// If none of the typed identifiers are available (no chat_id, no union_id, no senderId), + /// returning a typed pair would silently re-create the original /daily 400 (typing the + /// conversation_id as chat_id). Instead return an empty typed pair with /// FellBackToPrefixInference=true so falls back to the legacy /// prefix path and call sites emit a Debug breadcrumb. /// @@ -94,16 +95,25 @@ public static LarkReceiveTarget BuildFromInbound( string? larkUnionId = null, string? larkChatId = null) { + // Most-direct first: the actual Lark chat the inbound was received in. Tenant-scoped + // and survives cross-app-within-tenant configurations as long as the outbound app is + // also a member of the chat — which the relay-side ingress bot is by construction (it + // received the message there). + var trimmedChat = (larkChatId ?? string.Empty).Trim(); + if (!string.IsNullOrEmpty(trimmedChat)) + return new LarkReceiveTarget(trimmedChat, DefaultReceiveIdType, FellBackToPrefixInference: false); + if (IsDirectMessage(chatType)) { - // Cross-app safe: tenant-stable user identifier, accepted by any Lark app. + // Tenant-stable user identifier. Surfaces `code:99992364 user id cross tenant` when + // the relay-side ingress and outbound apps are in different tenants — flag the + // fallback so call sites can LogDebug for incident correlation. var trimmedUnion = (larkUnionId ?? string.Empty).Trim(); if (!string.IsNullOrEmpty(trimmedUnion)) - return new LarkReceiveTarget(trimmedUnion, UnionIdReceiveIdType, FellBackToPrefixInference: false); + return new LarkReceiveTarget(trimmedUnion, UnionIdReceiveIdType, FellBackToPrefixInference: true); - // Fallback: app-scoped open_id. Will surface `code:99992361 open_id cross app` from - // Lark when the relay-side ingress app does not match the customer's outbound app. - // Flag the fallback so call sites can LogDebug for incident correlation. + // App-scoped open_id. Surfaces `code:99992361 open_id cross app` when the apps + // differ even within the same tenant. var trimmedSender = (senderId ?? string.Empty).Trim(); if (!string.IsNullOrEmpty(trimmedSender)) return new LarkReceiveTarget(trimmedSender, OpenIdReceiveIdType, FellBackToPrefixInference: true); @@ -111,14 +121,9 @@ public static LarkReceiveTarget BuildFromInbound( return new LarkReceiveTarget(string.Empty, string.Empty, FellBackToPrefixInference: true); } - // group / channel / thread: prefer the inbound Lark chat_id (cross-app within tenant). - var trimmedChat = (larkChatId ?? string.Empty).Trim(); - if (!string.IsNullOrEmpty(trimmedChat)) - return new LarkReceiveTarget(trimmedChat, DefaultReceiveIdType, FellBackToPrefixInference: false); - - // Fallback: assume the routing conversation_id is a Lark `oc_*` (legacy behavior pre - // ingress-side chat_id capture). If it is not, the proxy will reject and the call site - // logs the surfaced Lark error. + // Non-DM with no Lark chat_id surfaced: assume the routing conversation_id is a Lark + // `oc_*` (legacy behavior pre ingress-side chat_id capture). If it is not, the proxy + // will reject and the call site logs the surfaced Lark error. var trimmedConversation = (conversationId ?? string.Empty).Trim(); return new LarkReceiveTarget(trimmedConversation, DefaultReceiveIdType, FellBackToPrefixInference: false); } @@ -128,9 +133,61 @@ public static LarkReceiveTarget BuildFromInbound( // narrow until a second emitter (e.g. a Telegram bridge) actually lands. private static bool IsDirectMessage(string? chatType) => string.Equals(chatType?.Trim(), "p2p", StringComparison.Ordinal); + + /// + /// Builds the primary outbound delivery target plus a secondary fallback. The primary + /// follows 's priority (chat_id > union_id > open_id). + /// The fallback is the next-best identifier we have at ingress time so the runtime can + /// retry once on a Lark 230002 bot not in chat rejection without needing a fresh + /// ingress event: + /// + /// For p2p: when the primary is chat_id (`oc_*`), the fallback is union_id (`on_*`) when the relay surfaced one. This recovers cross-app same-tenant deployments where the outbound app is not in the DM chat. + /// For groups: no fallback — chat_id is tenant-scoped and either works (any app in the chat) or fails for reasons that union_id wouldn't fix. + /// + /// + public static LarkReceiveTargetWithFallback BuildFromInboundWithFallback( + string? chatType, + string? conversationId, + string? senderId, + string? larkUnionId = null, + string? larkChatId = null) + { + var primary = BuildFromInbound(chatType, conversationId, senderId, larkUnionId, larkChatId); + + // Only useful when the primary is chat_id AND we still have a tenant-stable user + // identifier to try in cross-app same-tenant scenarios. Skip when the primary is + // already union_id or open_id — those don't need a fallback because they are NOT + // app-specific in the way DM chat_id is. + if (!IsDirectMessage(chatType)) + return new LarkReceiveTargetWithFallback(primary, Fallback: null); + + if (!string.Equals(primary.ReceiveIdType, DefaultReceiveIdType, StringComparison.Ordinal)) + return new LarkReceiveTargetWithFallback(primary, Fallback: null); + + var trimmedUnion = (larkUnionId ?? string.Empty).Trim(); + if (!string.IsNullOrEmpty(trimmedUnion)) + return new LarkReceiveTargetWithFallback( + primary, + new LarkReceiveTarget(trimmedUnion, UnionIdReceiveIdType, FellBackToPrefixInference: false)); + + return new LarkReceiveTargetWithFallback(primary, Fallback: null); + } } internal readonly record struct LarkReceiveTarget( string ReceiveId, string ReceiveIdType, bool FellBackToPrefixInference); + +/// +/// Primary outbound delivery target plus a secondary fallback. Captured at agent-create time +/// when the inbound surfaces both a chat_id (primary) and a union_id (fallback). The runtime +/// tries the primary first; on a Lark 230002 bot not in chat rejection — the failure +/// mode for cross-app same-tenant deployments where the outbound app is not a member of the +/// inbound DM chat — it retries once with the fallback. Without the fallback, switching to +/// chat_id-first would regress those deployments because chat_id is bot-specific for DMs and +/// only valid when the same Lark app received the inbound. +/// +internal readonly record struct LarkReceiveTargetWithFallback( + LarkReceiveTarget Primary, + LarkReceiveTarget? Fallback); diff --git a/agents/Aevatar.GAgents.ChannelRuntime/LarkProxyResponse.cs b/agents/Aevatar.GAgents.ChannelRuntime/LarkProxyResponse.cs index 7d823b469..b705a7d0b 100644 --- a/agents/Aevatar.GAgents.ChannelRuntime/LarkProxyResponse.cs +++ b/agents/Aevatar.GAgents.ChannelRuntime/LarkProxyResponse.cs @@ -4,19 +4,50 @@ namespace Aevatar.GAgents.ChannelRuntime; /// /// Inspects response bodies returned by NyxIdApiClient.ProxyRequestAsync for downstream -/// Lark API calls. The proxy is two-layered: HTTP non-2xx from NyxID gets packaged into a -/// Nyx error envelope ({"error": true, "message": "..."} or {"error": "..."}), -/// and HTTP 200 from NyxID may still carry a Lark business error -/// ({"code": <non-zero>, "msg": "..."}). Callers that ignore the result silently -/// drop both classes of failure, which is what motivates this helper. +/// Lark API calls. The proxy is two-layered: +/// +/// +/// HTTP 200 from NyxID may still carry a Lark business error at the top +/// level: {"code": <non-zero>, "msg": "..."}. +/// HTTP non-2xx from NyxID is wrapped by NyxIdApiClient.SendAsync +/// (NyxIdApiClient.cs:680) as {"error": true, "status": <http>, "body": "<raw +/// downstream body>"}. The body is a STRING containing the raw Lark JSON, so +/// the Lark business code (e.g. 99992364 user id cross tenant on HTTP 400, or +/// 230002 bot not in chat on HTTP 400) is nested INSIDE the string and must be parsed +/// from there before larkCode-gated branches can fire. +/// Network/exception path uses {"error": true, "message": "..."} +/// (no status, no body). +/// +/// +/// Callers that ignore the result silently drop all three classes of failure, which is what +/// motivates this helper. Reviewer (PR #412 r3141700469) verbatim: +/// +/// production failures arrive through NyxIdApiClient.SendAsync as an HTTP-400 Nyx envelope: +/// {"error": true, "status": 400, "body": "{\"code\":99992364,...}"}. +/// LarkProxyResponse.TryGetError currently returns true for that shape but leaves +/// larkCode=null because it does not parse the nested body. +/// /// internal static class LarkProxyResponse { /// /// Returns true when the response body indicates a downstream failure. - /// is set only for the Lark business-error path so callers can selectively gate logging on - /// known recurring config gaps (e.g. 231002 = no permission to react). - /// is a short human-readable summary suitable for log lines or exception messages. + /// is set whenever a Lark business code can be extracted — top-level for HTTP-200 + /// responses, OR nested in the body string for HTTP-non-2xx wrapped envelopes. + /// is a short human-readable summary suitable for log lines or + /// exception messages. + /// + /// Branch order (invariant): top-level code is checked BEFORE the + /// error envelope. The two production shapes are mutually exclusive today + /// (HTTP-200 → top-level code only; HTTP-non-2xx → {error:true,status,body} + /// only — SendAsync never emits both at the same level), so for every observed + /// response either order yields the same result. The order is fixed deliberately for + /// forward-compat: if NyxID ever wraps a successful Lark business rejection as a hybrid + /// {"error":true,"status":200,"code":230002,...}, this prioritizes the explicit + /// Lark business code over the generic "nyx says error" envelope. The branch ordering + /// reversed in PR #412 (was: error first, then top-level code); reviewer + /// (PR #412 long-form review §4) flagged the implicit reversal — capturing the rationale + /// here so future readers do not "fix" it back. /// public static bool TryGetError(string? response, out int? larkCode, out string detail) { @@ -32,36 +63,52 @@ public static bool TryGetError(string? response, out int? larkCode, out string d if (root.ValueKind != JsonValueKind.Object) return false; + // Top-level Lark business error: HTTP 200 from NyxID, but Lark embedded a non-zero + // code (e.g. `code:231002 no permission to react` on a successful HTTP transport). + if (root.TryGetProperty("code", out var topCodeProperty) && + topCodeProperty.ValueKind == JsonValueKind.Number && + topCodeProperty.TryGetInt32(out var topCode) && + topCode != 0) + { + larkCode = topCode; + detail = TryReadString(root, "msg") ?? $"code={topCode}"; + return true; + } + if (root.TryGetProperty("error", out var errorProperty)) { + var hasErrorFlag = errorProperty.ValueKind == JsonValueKind.True || + (errorProperty.ValueKind == JsonValueKind.String && !string.IsNullOrWhiteSpace(errorProperty.GetString())); + if (!hasErrorFlag) + return false; + + // Nyx HTTP-non-2xx wrapper: try to recover the nested Lark code/msg from the + // `body` string. Reviewer (PR #412 r3141700469) called this out — without + // nested parsing, every HTTP-400 Lark business error (which is the common + // production case) hits this path with `larkCode=null` and the gated branches + // (BotNotInChat retry, UserIdCrossTenant hint) never fire. + if (TryParseNestedLarkBody(root, out var nestedCode, out var nestedDetail)) + { + larkCode = nestedCode; + detail = nestedDetail; + return true; + } + if (errorProperty.ValueKind == JsonValueKind.True) { detail = TryReadString(root, "message") ?? TryReadString(root, "body") + ?? FormatStatusFallback(root) ?? "proxy_error"; return true; } if (errorProperty.ValueKind == JsonValueKind.String) { - var error = errorProperty.GetString()?.Trim(); - if (!string.IsNullOrWhiteSpace(error)) - { - detail = error; - return true; - } + detail = errorProperty.GetString()!.Trim(); + return true; } } - - if (root.TryGetProperty("code", out var codeProperty) && - codeProperty.ValueKind == JsonValueKind.Number && - codeProperty.TryGetInt32(out var code) && - code != 0) - { - larkCode = code; - detail = TryReadString(root, "msg") ?? $"code={code}"; - return true; - } } catch (JsonException) { @@ -72,6 +119,67 @@ public static bool TryGetError(string? response, out int? larkCode, out string d return false; } + /// + /// Parses {"error":true,"status":400,"body":"<raw json string>"} shapes — the + /// envelope NyxIdApiClient.SendAsync produces for HTTP-non-2xx responses. When the + /// nested body is a JSON object with code != 0, returns the Lark code and a + /// `code=N: msg` detail so callers see the actual upstream rejection. + /// + private static bool TryParseNestedLarkBody(JsonElement root, out int? larkCode, out string detail) + { + larkCode = null; + detail = string.Empty; + + var rawBody = TryReadString(root, "body"); + if (string.IsNullOrEmpty(rawBody)) + return false; + + try + { + using var nested = JsonDocument.Parse(rawBody); + var nestedRoot = nested.RootElement; + if (nestedRoot.ValueKind != JsonValueKind.Object) + return false; + + if (!nestedRoot.TryGetProperty("code", out var codeProperty) || + codeProperty.ValueKind != JsonValueKind.Number || + !codeProperty.TryGetInt32(out var code) || + code == 0) + return false; + + larkCode = code; + var msg = TryReadString(nestedRoot, "msg") ?? $"code={code}"; + // Carry the Nyx HTTP status alongside the Lark code so log lines and exception + // messages preserve enough context to correlate with NyxIdApiClient warnings. + var status = TryReadInt32(root, "status"); + detail = status is { } s + ? $"nyx_status={s} lark_code={code} msg={msg}" + : $"lark_code={code} msg={msg}"; + return true; + } + catch (JsonException) + { + return false; + } + } + + private static string? FormatStatusFallback(JsonElement root) + { + var status = TryReadInt32(root, "status"); + return status is { } s ? $"nyx_status={s}" : null; + } + + private static int? TryReadInt32(JsonElement element, string propertyName) + { + if (!element.TryGetProperty(propertyName, out var property) || + property.ValueKind != JsonValueKind.Number || + !property.TryGetInt32(out var value)) + { + return null; + } + return value; + } + private static string? TryReadString(JsonElement element, string propertyName) { if (!element.TryGetProperty(propertyName, out var property) || diff --git a/agents/Aevatar.GAgents.ChannelRuntime/SkillRunnerGAgent.cs b/agents/Aevatar.GAgents.ChannelRuntime/SkillRunnerGAgent.cs index 60a7e78fa..baa86249c 100644 --- a/agents/Aevatar.GAgents.ChannelRuntime/SkillRunnerGAgent.cs +++ b/agents/Aevatar.GAgents.ChannelRuntime/SkillRunnerGAgent.cs @@ -284,20 +284,9 @@ private async Task SendOutputAsync(string output, CancellationToken ct) deliveryTarget.ReceiveIdType); } - var body = JsonSerializer.Serialize(new - { - receive_id = deliveryTarget.ReceiveId, - msg_type = "text", - content = JsonSerializer.Serialize(new { text = output }), - }); + var outcome = await TrySendWithFallbackAsync(client, output, deliveryTarget, ct); - var response = await client.ProxyRequestAsync( - State.OutboundConfig.NyxApiKey, - State.OutboundConfig.NyxProviderSlug, - $"open-apis/im/v1/messages?receive_id_type={deliveryTarget.ReceiveIdType}", - "POST", body, null, ct); - - if (LarkProxyResponse.TryGetError(response, out var larkCode, out var detail)) + if (!outcome.Succeeded) { // Surface downstream rejection so HandleTriggerAsync sees a real failure instead of // persisting SkillRunnerExecutionCompletedEvent on a silently-dropped Lark response. @@ -306,10 +295,79 @@ private async Task SendOutputAsync(string output, CancellationToken ct) // message into actionable recovery guidance — otherwise the user sees a cryptic // `99992361 open_id cross app` and has no way to know they need to rebuild the // agent. - throw new InvalidOperationException(BuildLarkRejectionMessage(larkCode, detail)); + throw new InvalidOperationException(BuildLarkRejectionMessage(outcome.LarkCode, outcome.Detail)); } } + private readonly record struct SendOutcome(bool Succeeded, int? LarkCode, string Detail) + { + public static SendOutcome Success() => new(true, null, string.Empty); + public static SendOutcome Failed(int? larkCode, string detail) => new(false, larkCode, detail); + } + + /// + /// Sends the outbound text via the typed primary delivery target, then on a Lark + /// 230002 bot not in chat rejection retries once with the fallback target if one + /// was captured at agent-create time. The fallback covers cross-app same-tenant + /// deployments where the outbound app is not a member of the inbound DM chat — without + /// it, the chat_id-first priority would regress those deployments. Returns success vs. + /// failure (with Lark code+detail) so the caller can throw cleanly without re-parsing + /// the response. + /// + private async Task TrySendWithFallbackAsync( + NyxIdApiClient client, + string output, + LarkReceiveTarget primary, + CancellationToken ct) + { + var primaryResponse = await SendOutboundAsync(client, output, primary, ct); + if (!LarkProxyResponse.TryGetError(primaryResponse, out var larkCode, out var detail)) + return SendOutcome.Success(); + + // Only Lark `bot not in chat` triggers the fallback. Nyx envelope errors (no Lark + // code) and other Lark business errors propagate directly so the user sees actionable + // recovery guidance for the actual failure mode. + if (larkCode != LarkBotErrorCodes.BotNotInChat) + return SendOutcome.Failed(larkCode, detail); + + var fallbackId = State.OutboundConfig.LarkReceiveIdFallback?.Trim(); + var fallbackType = State.OutboundConfig.LarkReceiveIdTypeFallback?.Trim(); + if (string.IsNullOrEmpty(fallbackId) || string.IsNullOrEmpty(fallbackType)) + return SendOutcome.Failed(larkCode, detail); + + Logger.LogInformation( + "Skill runner {ActorId} primary delivery target rejected as `bot not in chat` (code 230002); retrying with fallback typed pair (receive_id_type={FallbackType})", + Id, + fallbackType); + + var fallbackTarget = new LarkReceiveTarget(fallbackId, fallbackType, FellBackToPrefixInference: false); + var fallbackResponse = await SendOutboundAsync(client, output, fallbackTarget, ct); + if (!LarkProxyResponse.TryGetError(fallbackResponse, out var fallbackCode, out var fallbackDetail)) + return SendOutcome.Success(); + + return SendOutcome.Failed(fallbackCode, fallbackDetail); + } + + private async Task SendOutboundAsync( + NyxIdApiClient client, + string output, + LarkReceiveTarget target, + CancellationToken ct) + { + var body = JsonSerializer.Serialize(new + { + receive_id = target.ReceiveId, + msg_type = "text", + content = JsonSerializer.Serialize(new { text = output }), + }); + + return await client.ProxyRequestAsync( + State.OutboundConfig.NyxApiKey, + State.OutboundConfig.NyxProviderSlug, + $"open-apis/im/v1/messages?receive_id_type={target.ReceiveIdType}", + "POST", body, null, ct); + } + private static string BuildLarkRejectionMessage(int? larkCode, string detail) { if (larkCode == LarkBotErrorCodes.OpenIdCrossApp) @@ -324,6 +382,21 @@ private static string BuildLarkRejectionMessage(int? larkCode, string detail) "delete and recreate it (`/agents` → Delete → `/daily`) to pick up the cross-app safe target."; } + if (larkCode == LarkBotErrorCodes.UserIdCrossTenant) + { + // Even union_id is rejected — the relay-side ingress and outbound apps are in + // different Lark tenants. No user-id-based identifier survives that boundary; + // recreating the agent makes the new chat_id-preferred path take effect (chat_id + // bypasses user-id translation entirely as long as the same app is on both ends). + return + $"Lark message delivery rejected (code={larkCode}): {detail}. " + + "The outbound Lark app is in a different tenant than the inbound app, so " + + "user-id translation is impossible. Delete and recreate the agent " + + "(`/agents` → Delete → `/daily`) so the new chat_id-preferred outbound path " + + "takes effect, or align the NyxID `s/api-lark-bot` proxy with the channel-bot that " + + "received the inbound event."; + } + return larkCode is { } code ? $"Lark message delivery rejected (code={code}): {detail}" : $"Lark message delivery rejected: {detail}"; @@ -383,6 +456,8 @@ private async Task UpsertRegistryAsync(string status, CancellationToken ct) Status = status, LarkReceiveId = State.OutboundConfig?.LarkReceiveId ?? string.Empty, LarkReceiveIdType = State.OutboundConfig?.LarkReceiveIdType ?? string.Empty, + LarkReceiveIdFallback = State.OutboundConfig?.LarkReceiveIdFallback ?? string.Empty, + LarkReceiveIdTypeFallback = State.OutboundConfig?.LarkReceiveIdTypeFallback ?? string.Empty, }; await actor.HandleEventAsync(BuildDirectEnvelope(actor.Id, command), ct); diff --git a/agents/Aevatar.GAgents.ChannelRuntime/UserAgentCatalogGAgent.cs b/agents/Aevatar.GAgents.ChannelRuntime/UserAgentCatalogGAgent.cs index 64724a3cf..27cb56430 100644 --- a/agents/Aevatar.GAgents.ChannelRuntime/UserAgentCatalogGAgent.cs +++ b/agents/Aevatar.GAgents.ChannelRuntime/UserAgentCatalogGAgent.cs @@ -55,6 +55,8 @@ public async Task HandleUpsertAsync(UserAgentCatalogUpsertCommand command) LastError = existing?.LastError ?? string.Empty, LarkReceiveId = MergeNonEmpty(command.LarkReceiveId, existing?.LarkReceiveId), LarkReceiveIdType = MergeNonEmpty(command.LarkReceiveIdType, existing?.LarkReceiveIdType), + LarkReceiveIdFallback = MergeNonEmpty(command.LarkReceiveIdFallback, existing?.LarkReceiveIdFallback), + LarkReceiveIdTypeFallback = MergeNonEmpty(command.LarkReceiveIdTypeFallback, existing?.LarkReceiveIdTypeFallback), }; await PersistDomainEventAsync(new UserAgentCatalogUpsertedEvent diff --git a/agents/Aevatar.GAgents.ChannelRuntime/UserAgentCatalogProjector.cs b/agents/Aevatar.GAgents.ChannelRuntime/UserAgentCatalogProjector.cs index 0beada63b..018df56a5 100644 --- a/agents/Aevatar.GAgents.ChannelRuntime/UserAgentCatalogProjector.cs +++ b/agents/Aevatar.GAgents.ChannelRuntime/UserAgentCatalogProjector.cs @@ -58,5 +58,7 @@ protected override UserAgentCatalogDocument Materialize( CreatedAt = entry.CreatedAt != null ? entry.CreatedAt.ToDateTimeOffset() : updatedAt, LarkReceiveId = entry.LarkReceiveId ?? string.Empty, LarkReceiveIdType = entry.LarkReceiveIdType ?? string.Empty, + LarkReceiveIdFallback = entry.LarkReceiveIdFallback ?? string.Empty, + LarkReceiveIdTypeFallback = entry.LarkReceiveIdTypeFallback ?? string.Empty, }; } diff --git a/agents/Aevatar.GAgents.ChannelRuntime/UserAgentCatalogQueryPort.cs b/agents/Aevatar.GAgents.ChannelRuntime/UserAgentCatalogQueryPort.cs index 03b01bb4e..b60a5f80a 100644 --- a/agents/Aevatar.GAgents.ChannelRuntime/UserAgentCatalogQueryPort.cs +++ b/agents/Aevatar.GAgents.ChannelRuntime/UserAgentCatalogQueryPort.cs @@ -63,5 +63,7 @@ internal static UserAgentCatalogEntry ToEntry(UserAgentCatalogDocument document, Tombstoned = document.Tombstoned, LarkReceiveId = document.LarkReceiveId ?? string.Empty, LarkReceiveIdType = document.LarkReceiveIdType ?? string.Empty, + LarkReceiveIdFallback = document.LarkReceiveIdFallback ?? string.Empty, + LarkReceiveIdTypeFallback = document.LarkReceiveIdTypeFallback ?? string.Empty, }; } diff --git a/agents/Aevatar.GAgents.ChannelRuntime/WorkflowAgentGAgent.cs b/agents/Aevatar.GAgents.ChannelRuntime/WorkflowAgentGAgent.cs index 02130fac3..671cf012b 100644 --- a/agents/Aevatar.GAgents.ChannelRuntime/WorkflowAgentGAgent.cs +++ b/agents/Aevatar.GAgents.ChannelRuntime/WorkflowAgentGAgent.cs @@ -74,6 +74,8 @@ await PersistDomainEventAsync(new WorkflowAgentInitializedEvent Platform = command.Platform?.Trim() ?? string.Empty, LarkReceiveId = command.LarkReceiveId?.Trim() ?? string.Empty, LarkReceiveIdType = command.LarkReceiveIdType?.Trim() ?? string.Empty, + LarkReceiveIdFallback = command.LarkReceiveIdFallback?.Trim() ?? string.Empty, + LarkReceiveIdTypeFallback = command.LarkReceiveIdTypeFallback?.Trim() ?? string.Empty, }); await Scheduler.ScheduleNextRunAsync(DateTimeOffset.UtcNow, CancellationToken.None); @@ -233,6 +235,8 @@ private async Task UpsertRegistryAsync(string status, CancellationToken ct) Status = status, LarkReceiveId = State.LarkReceiveId ?? string.Empty, LarkReceiveIdType = State.LarkReceiveIdType ?? string.Empty, + LarkReceiveIdFallback = State.LarkReceiveIdFallback ?? string.Empty, + LarkReceiveIdTypeFallback = State.LarkReceiveIdTypeFallback ?? string.Empty, }; await actor.HandleEventAsync(BuildDirectEnvelope(actor.Id, command), ct); @@ -285,6 +289,8 @@ private static WorkflowAgentState ApplyInitialized(WorkflowAgentState current, W next.Platform = evt.Platform ?? string.Empty; next.LarkReceiveId = evt.LarkReceiveId ?? string.Empty; next.LarkReceiveIdType = evt.LarkReceiveIdType ?? string.Empty; + next.LarkReceiveIdFallback = evt.LarkReceiveIdFallback ?? string.Empty; + next.LarkReceiveIdTypeFallback = evt.LarkReceiveIdTypeFallback ?? string.Empty; return next; } diff --git a/agents/Aevatar.GAgents.ChannelRuntime/channel_runtime_messages.proto b/agents/Aevatar.GAgents.ChannelRuntime/channel_runtime_messages.proto index d3074cc18..6504947fa 100644 --- a/agents/Aevatar.GAgents.ChannelRuntime/channel_runtime_messages.proto +++ b/agents/Aevatar.GAgents.ChannelRuntime/channel_runtime_messages.proto @@ -187,10 +187,21 @@ message UserAgentCatalogEntry { int64 tombstone_state_version = 21; // Authoritative Lark outbound delivery target captured at create time. When // both fields are present, outbound senders use them verbatim instead of - // inferring receive_id_type from the conversation_id prefix. For p2p the - // creator stores the user open_id (`ou_*`) here, not the DM thread chat_id. + // inferring receive_id_type from the conversation_id prefix. The creator + // pins the chat-scoped `chat_id` (`oc_*`) here when available because it is + // the only identifier that survives both cross-app and cross-tenant + // boundaries when the same Lark app is on both ends of the relay. string lark_receive_id = 22; string lark_receive_id_type = 23; + // Secondary outbound delivery target. The runtime attempts the primary + // (`lark_receive_id`/`lark_receive_id_type`) first and falls back to this + // pair on a Lark `230002 bot not in chat` rejection — the failure mode + // when the outbound app is a different Lark app than the one that received + // the inbound DM (cross-app same-tenant deployment). The creator pins + // `union_id` (`on_*`) here so the fallback survives a chat_id rejection + // without needing a fresh ingress event. + string lark_receive_id_fallback = 24; + string lark_receive_id_type_fallback = 25; } message UserAgentCatalogState { @@ -215,6 +226,9 @@ message UserAgentCatalogUpsertCommand { // preserve any existing entry value via the merge-non-empty upsert policy. string lark_receive_id = 14; string lark_receive_id_type = 15; + // See UserAgentCatalogEntry.lark_receive_id_fallback for semantics. + string lark_receive_id_fallback = 16; + string lark_receive_id_type_fallback = 17; } message UserAgentCatalogTombstoneCommand { @@ -287,6 +301,11 @@ message UserAgentCatalogDocument { // through the projection rather than re-deriving from conversation_id. string lark_receive_id = 24; string lark_receive_id_type = 25; + // Mirrors UserAgentCatalogEntry.lark_receive_id_fallback*. Carried through + // the projection so catalog-backed senders see the same primary+fallback + // pair as actor-state senders. + string lark_receive_id_fallback = 26; + string lark_receive_id_type_fallback = 27; } // Runtime-only Nyx credential read model for delivery-target execution paths. @@ -315,6 +334,13 @@ message SkillRunnerOutboundConfig { // verbatim; conversation_id stays for LLM metadata propagation only. string lark_receive_id = 7; string lark_receive_id_type = 8; + // Secondary outbound delivery target. Used by SkillRunnerGAgent.SendOutputAsync + // when the primary chat_id-typed send is rejected with Lark `230002 bot not in + // chat` — the failure mode for cross-app same-tenant deployments where the + // outbound app is not a member of the inbound DM. See + // UserAgentCatalogEntry.lark_receive_id_fallback for the recommended pinning. + string lark_receive_id_fallback = 9; + string lark_receive_id_type_fallback = 10; } message SkillRunnerState { @@ -442,6 +468,10 @@ message WorkflowAgentState { // (e.g. FeishuCardHumanInteractionPort) read the typed target. string lark_receive_id = 19; string lark_receive_id_type = 20; + // Secondary outbound delivery target. See UserAgentCatalogEntry + // .lark_receive_id_fallback for runtime fallback semantics. + string lark_receive_id_fallback = 21; + string lark_receive_id_type_fallback = 22; } message InitializeWorkflowAgentCommand { @@ -462,6 +492,10 @@ message InitializeWorkflowAgentCommand { string platform = 14; string lark_receive_id = 15; string lark_receive_id_type = 16; + // Secondary outbound delivery target. See UserAgentCatalogEntry + // .lark_receive_id_fallback for runtime fallback semantics. + string lark_receive_id_fallback = 17; + string lark_receive_id_type_fallback = 18; } message WorkflowAgentInitializedEvent { @@ -482,6 +516,10 @@ message WorkflowAgentInitializedEvent { string platform = 14; string lark_receive_id = 15; string lark_receive_id_type = 16; + // Secondary outbound delivery target. See UserAgentCatalogEntry + // .lark_receive_id_fallback for runtime fallback semantics. + string lark_receive_id_fallback = 17; + string lark_receive_id_type_fallback = 18; } message TriggerWorkflowAgentExecutionCommand { diff --git a/test/Aevatar.GAgents.ChannelRuntime.Tests/AgentBuilderToolTests.cs b/test/Aevatar.GAgents.ChannelRuntime.Tests/AgentBuilderToolTests.cs index a71cae4bf..3d19f9883 100644 --- a/test/Aevatar.GAgents.ChannelRuntime.Tests/AgentBuilderToolTests.cs +++ b/test/Aevatar.GAgents.ChannelRuntime.Tests/AgentBuilderToolTests.cs @@ -231,14 +231,14 @@ await skillRunnerActor.Received(1).HandleEventAsync( } [Fact] - public async Task ExecuteAsync_CreateAgent_DailyReport_PinsLarkUnionId_When_RelayPropagatesIt() + public async Task ExecuteAsync_CreateAgent_DailyReport_PinsLarkChatId_When_RelayPropagatesIt() { - // Cross-app outbound delivery (`code:99992361 open_id cross app`) requires the - // tenant-stable `union_id`. When the relay surfaces it via - // ChannelMetadataKeys.LarkUnionId the typed delivery target on - // InitializeSkillRunnerCommand must pin (union_id, "union_id") instead of falling back - // to the relay-app-scoped open_id. Integration counterpart of - // LarkConversationTargetsTests.BuildFromInbound_ShouldPreferLarkUnionId_*. + // The new outbound priority pins (chat_id, "chat_id") whenever the relay surfaces + // ChannelMetadataKeys.LarkChatId — chat_id is the literal DM thread, no user-id + // translation is needed. This is the integration counterpart of + // LarkConversationTargetsTests.BuildFromInbound_ShouldPreferLarkChatId_ForP2pDirectMessages + // and is what survives both `99992361 open_id cross app` (PR #403/409) and + // `99992364 user id cross tenant` (PR after #409) failure modes in production. var queryPort = Substitute.For(); queryPort.GetStateVersionAsync("skill-runner-union-1", Arg.Any()) .Returns(Task.FromResult(null), Task.FromResult(1)); @@ -330,8 +330,8 @@ await skillRunnerActor.Received(1).HandleEventAsync( Arg.Is(e => e.Payload != null && e.Payload.Is(InitializeSkillRunnerCommand.Descriptor) && - e.Payload.Unpack().OutboundConfig.LarkReceiveId == "on_user_1" && - e.Payload.Unpack().OutboundConfig.LarkReceiveIdType == "union_id"), + e.Payload.Unpack().OutboundConfig.LarkReceiveId == "oc_dm_chat_1" && + e.Payload.Unpack().OutboundConfig.LarkReceiveIdType == "chat_id"), Arg.Any()); } finally @@ -340,6 +340,120 @@ await skillRunnerActor.Received(1).HandleEventAsync( } } + [Fact] + public async Task ExecuteAsync_CreateAgent_DailyReport_FailsClosed_When_GithubProxyDeniedForNewKey() + { + // Issue aevatarAI/aevatar#411: a daily_report agent is created with the new agent API + // key flagged `allowed_service_ids=api-github`, but if NyxID's binding for the user's + // GitHub OAuth credential is missing/expired, every scheduled run hits 401/403 from + // the proxy and the user only sees an empty / degraded report. Preflight + // `proxy/s/api-github/rate_limit` with the freshly minted key and fail-fast with an + // actionable error so the agent is not persisted in a "always-fails-at-runtime" state. + var queryPort = Substitute.For(); + queryPort.GetStateVersionAsync(Arg.Any(), Arg.Any()) + .Returns(Task.FromResult(null)); + + var skillRunnerActor = Substitute.For(); + skillRunnerActor.Id.Returns("skill-runner-github-403"); + var actorRuntime = Substitute.For(); + actorRuntime.GetAsync("skill-runner-github-403").Returns(Task.FromResult(null)); + actorRuntime.CreateAsync("skill-runner-github-403", Arg.Any()) + .Returns(Task.FromResult(skillRunnerActor)); + + var handler = new RoutingJsonHandler(); + handler.Add(HttpMethod.Get, "/api/v1/users/me", """{"user":{"id":"user-1"}}"""); + handler.Add(HttpMethod.Get, "/api/v1/providers/my-tokens", """ + { + "tokens": [ + { + "provider_id":"provider-github", + "provider_name":"GitHub", + "provider_slug":"github", + "provider_type":"oauth2", + "status":"active", + "connected_at":"2026-04-15T00:00:00Z" + } + ] + } + """); + handler.Add(HttpMethod.Get, "/api/v1/proxy/services?per_page=100", """ + { + "services": [{"id":"svc-github","slug":"api-github"}], + "custom_services": [{"id":"svc-lark","slug":"api-lark-bot"}], + "total": 2, + "page": 1, + "per_page": 100 + } + """); + handler.Add(HttpMethod.Post, "/api/v1/api-keys", """{"id":"key-403","full_key":"full-key-403"}"""); + // The preflight: `NyxIdApiClient.SendAsync` wraps any HTTP non-2xx as + // `{"error": true, "status": , "body": ""}` (NyxIdApiClient.cs:680). + // Reviewer (PR #412 r3141699476) caught that the previous handler shape used `"code"` + // but real production uses `"status"` — mirror the actual envelope so the parser is + // exercised against what runtime delivers, not a synthetic shape. + handler.Add(HttpMethod.Get, "/api/v1/proxy/s/api-github/rate_limit", + """{"error": true, "status": 403, "body": "{\"message\":\"Bad credentials\",\"documentation_url\":\"https://docs.github.com/rest\"}"}"""); + // Reviewer (PR #412 r3141699756): on the fail-fast path we must also revoke the + // freshly-created agent API key, otherwise repeated `/daily` attempts accumulate + // orphan proxy-scoped keys. Wire a DELETE handler so the test can verify it ran. + handler.Add(HttpMethod.Delete, "/api/v1/api-keys/key-403", """{"deleted":true}"""); + + var nyxClient = new NyxIdApiClient( + new NyxIdToolOptions { BaseUrl = "https://nyx.example.com" }, + new HttpClient(handler) { BaseAddress = new Uri("https://nyx.example.com") }); + + var services = new ServiceCollection(); + services.AddSingleton(queryPort); + services.AddSingleton(actorRuntime); + services.AddSingleton(nyxClient); + var tool = new AgentBuilderTool(services.BuildServiceProvider()); + + AgentToolRequestContext.CurrentMetadata = new Dictionary + { + [LLMRequestMetadataKeys.NyxIdAccessToken] = "session-token", + [ChannelMetadataKeys.ChatType] = "p2p", + [ChannelMetadataKeys.ConversationId] = "oc_chat_1", + [ChannelMetadataKeys.SenderId] = "ou_user_1", + ["scope_id"] = "scope-1", + }; + try + { + var result = await tool.ExecuteAsync(""" + { + "action": "create_agent", + "template": "daily_report", + "agent_id": "skill-runner-github-403", + "github_username": "alice", + "schedule_cron": "0 9 * * *", + "schedule_timezone": "UTC" + } + """); + + using var doc = JsonDocument.Parse(result); + doc.RootElement.GetProperty("error").GetString().Should().Be("github_proxy_access_denied"); + doc.RootElement.GetProperty("http_status").GetInt32().Should().Be(403); + doc.RootElement.GetProperty("hint").GetString().Should().Contain("api-keys"); + + // The actor must NOT receive InitializeSkillRunnerCommand — preflight aborts + // BEFORE the actor is invoked so we don't leave a broken agent in the catalog. + await skillRunnerActor.DidNotReceive().HandleEventAsync( + Arg.Any(), + Arg.Any()); + + // Best-effort revoke of the freshly-minted API key so repeated `/daily` attempts + // that hit GitHub preflight don't accumulate orphan proxy-scoped keys (reviewer + // r3141699756). The DELETE call is the only proof from this layer that the orphan + // is being cleaned up. + handler.Requests.Should().Contain(r => + r.Method == HttpMethod.Delete && + r.Path == "/api/v1/api-keys/key-403"); + } + finally + { + AgentToolRequestContext.CurrentMetadata = null; + } + } + [Fact] public async Task ExecuteAsync_CreateAgent_DailyReport_LogsFallbackBreadcrumb_When_LarkUnionIdMissing() { diff --git a/test/Aevatar.GAgents.ChannelRuntime.Tests/ChannelConversationTurnRunnerTests.cs b/test/Aevatar.GAgents.ChannelRuntime.Tests/ChannelConversationTurnRunnerTests.cs index 2e4229f62..2ae93c557 100644 --- a/test/Aevatar.GAgents.ChannelRuntime.Tests/ChannelConversationTurnRunnerTests.cs +++ b/test/Aevatar.GAgents.ChannelRuntime.Tests/ChannelConversationTurnRunnerTests.cs @@ -723,6 +723,108 @@ await interactiveDispatcher.Received(1).DispatchAsync( Arg.Any()); } + [Fact] + public async Task RunLlmReplyAsync_ShouldNotRetryAsText_WhenInteractiveDispatcherFails() + { + // Production regression: NyxID's `channel-relay/reply` is single-use — even when the + // interactive payload returns a transport-level failure (e.g. NyxID 502), the relay + // token is already consumed. The legacy "degrade to text" path in + // TrySendInteractiveRelayReplyAsync re-sent the same token as plain text, which always + // came back as `401 Reply token already used`, escalated as `relay_reply_rejected`, and + // queued an inbound turn retry that re-consumed the (already gone) token forever — bot + // looked silent on every subsequent DM after PR #409 introduced interactive cards. + // + // The single-use semantics demand exactly one attempt per inbound. When the dispatcher + // reports failure, the runner must surface a failure result without making a second + // call to the relay HTTP API. + var registrationQueryPort = BuildRegistrationQueryPort(); + var adapter = new RecordingPlatformAdapter(); + var interactiveDispatcher = Substitute.For(); + interactiveDispatcher.DispatchAsync( + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any()) + .Returns(Task.FromResult(new InteractiveReplyDispatchResult( + Succeeded: false, + MessageId: null, + PlatformMessageId: null, + Capability: ComposeCapability.Exact, + FellBackToText: false, + Detail: "nyx_status=502 body=error code: 502"))); + var relayHandler = new RecordingJsonHandler("""{"message_id":"reply-1"}"""); + var runner = CreateRunner( + registrationQueryPort, + adapter, + relayHandler: relayHandler, + interactiveReplyDispatcher: interactiveDispatcher); + var activity = BuildInboundActivity( + "hello", + "msg-relay-card-fail-1", + ConversationScope.Group, + "oc_group_chat_1", + new OutboundDeliveryContext + { + ReplyMessageId = "relay-msg-1", + CorrelationId = "corr-relay-card-fail-1", + }, + new TransportExtras + { + NyxPlatform = "lark", + NyxUserAccessToken = "user-token-1", + }); + var outbound = new MessageContent + { + Text = "Choose one", + }; + outbound.Actions.Add(new ActionElement + { + Kind = ActionElementKind.Button, + ActionId = "confirm", + Label = "Confirm", + IsPrimary = true, + }); + + var result = await runner.RunLlmReplyAsync( + new LlmReplyReadyEvent + { + CorrelationId = "corr-relay-card-fail-1", + RegistrationId = "reg-1", + SourceActorId = "llm-worker-1", + Activity = activity, + Outbound = outbound, + TerminalState = LlmReplyTerminalState.Completed, + ReadyAtUnixMs = 42, + }, + RelayRuntimeContext("corr-relay-card-fail-1"), + CancellationToken.None); + + result.Success.Should().BeFalse(); + // Distinct error code routed to PermanentFailure (vs transient `relay_reply_rejected`) + // so `ConversationGAgent.HandleInboundTurnTransientFailureAsync` does NOT queue an + // `InboundTurnRetryScheduledEvent` that would re-run the inbound turn with the same + // already-consumed reply token. Without this routing, the in-turn retry fix would just + // shift the 401 cascade from in-turn replay to grain-level replay. + result.ErrorCode.Should().Be("relay_reply_token_consumed"); + result.FailureKind.Should().Be(FailureKind.PermanentAdapterError); + result.ErrorSummary.Should().Contain("502"); + + // Critical assertion: the runner MUST NOT make a second HTTP call to NyxID's + // channel-relay endpoint. The previous (broken) "degrade to text" path issued one + // additional POST that always failed with 401 and trashed the inbound turn's retry + // budget. Verify the relay handler stays clean. + relayHandler.Requests.Should().BeEmpty(); + await interactiveDispatcher.Received(1).DispatchAsync( + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any()); + } + [Fact] public async Task RunContinueAsync_DirectMessageWithoutPartition_ReturnsPermanentFailure() { diff --git a/test/Aevatar.GAgents.ChannelRuntime.Tests/FeishuCardHumanInteractionPortTests.cs b/test/Aevatar.GAgents.ChannelRuntime.Tests/FeishuCardHumanInteractionPortTests.cs index 1e8a7aa1d..b7a10342d 100644 --- a/test/Aevatar.GAgents.ChannelRuntime.Tests/FeishuCardHumanInteractionPortTests.cs +++ b/test/Aevatar.GAgents.ChannelRuntime.Tests/FeishuCardHumanInteractionPortTests.cs @@ -58,6 +58,58 @@ public async Task DeliverSuspensionAsync_ShouldSendInteractiveCardThroughNyxProx formElements[2].GetProperty("value").GetProperty("actor_id").GetString().Should().Be("workflow-actor-1"); } + [Fact] + public async Task DeliverSuspensionAsync_ShouldRetryWithFallback_When_PrimaryRejectedAsBotNotInChat_ViaHttp400Envelope() + { + // Reviewer (PR #412 second-pass review): the 230002→fallback retry was added to + // `FeishuCardHumanInteractionPort.SendMessageAsync` but coverage for the catalog-backed + // path lives only in `SkillRunnerGAgentTests`. If `UserAgentCatalogProjector.Materialize` + // / `UserAgentCatalogQueryPort.ToEntry` ever drop the new + // `LarkReceiveIdFallback` / `LarkReceiveIdTypeFallback` mirror, the existing port tests + // (which only assert primary success) would still pass while production cards stop + // delivering on cross-app same-tenant DMs. Pin: catalog entry exposes a chat_id primary + // + union_id fallback; primary is rejected with the real wrapped envelope shape that + // `NyxIdApiClient.SendAsync` produces for HTTP-non-2xx responses; the port retries once + // with the fallback typed pair and the second POST carries `receive_id_type=union_id` + // and `receive_id=on_*`. + var registry = Substitute.For(); + registry.GetAsync("agent-fb", Arg.Any()) + .Returns(Task.FromResult(new UserAgentCatalogEntry + { + AgentId = "agent-fb", + Platform = "lark", + ConversationId = "oc_dm_chat_1", + NyxProviderSlug = "api-lark-bot", + NyxApiKey = "nyx-api-key-fb", + TemplateName = "social_media", + LarkReceiveId = "oc_dm_chat_1", + LarkReceiveIdType = "chat_id", + LarkReceiveIdFallback = "on_user_1", + LarkReceiveIdTypeFallback = "union_id", + })); + + var handler = new SequencedRecordingHandler( + """{"error": true, "status": 400, "body": "{\"code\":230002,\"msg\":\"Bot is not in the chat\"}"}""", + """{"data":{"message_id":"om_fb"}}"""); + var nyxClient = new NyxIdApiClient( + new NyxIdToolOptions { BaseUrl = "https://nyx.example.com" }, + new HttpClient(handler)); + var port = new FeishuCardHumanInteractionPort(registry, nyxClient, new LarkMessageComposer(), NullLogger.Instance); + + await port.DeliverSuspensionAsync(BuildApprovalRequest(), "agent-fb", CancellationToken.None); + + handler.Requests.Should().HaveCount(2); + handler.Requests[0].RequestUri!.Query.Should().Contain("receive_id_type=chat_id"); + handler.Requests[1].RequestUri!.Query.Should().Contain("receive_id_type=union_id"); + + using var primaryBody = JsonDocument.Parse(handler.Bodies[0]!); + primaryBody.RootElement.GetProperty("receive_id").GetString().Should().Be("oc_dm_chat_1"); + + using var fallbackBody = JsonDocument.Parse(handler.Bodies[1]!); + fallbackBody.RootElement.GetProperty("receive_id").GetString().Should().Be("on_user_1"); + fallbackBody.RootElement.GetProperty("msg_type").GetString().Should().Be("interactive"); + } + [Fact] public async Task DeliverSuspensionAsync_ShouldThrow_WhenTargetMissing() { @@ -311,4 +363,32 @@ protected override async Task SendAsync(HttpRequestMessage }; } } + + /// + /// Returns a different response per request in the order given so the primary→fallback + /// retry can be exercised. Records every request + body so tests can assert the order + /// of receive_id_type and the fallback receive_id on the second POST. + /// + private sealed class SequencedRecordingHandler : HttpMessageHandler + { + private readonly Queue _responses; + public List Requests { get; } = []; + public List Bodies { get; } = []; + + public SequencedRecordingHandler(params string[] responses) + { + _responses = new Queue(responses); + } + + protected override async Task SendAsync(HttpRequestMessage request, CancellationToken cancellationToken) + { + Requests.Add(request); + Bodies.Add(request.Content == null ? null : await request.Content.ReadAsStringAsync(cancellationToken)); + var body = _responses.Count > 0 ? _responses.Dequeue() : """{"data":{}}"""; + return new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new StringContent(body, Encoding.UTF8, "application/json"), + }; + } + } } diff --git a/test/Aevatar.GAgents.ChannelRuntime.Tests/LarkConversationTargetsTests.cs b/test/Aevatar.GAgents.ChannelRuntime.Tests/LarkConversationTargetsTests.cs index 8b1bac06a..4b367ca3f 100644 --- a/test/Aevatar.GAgents.ChannelRuntime.Tests/LarkConversationTargetsTests.cs +++ b/test/Aevatar.GAgents.ChannelRuntime.Tests/LarkConversationTargetsTests.cs @@ -90,33 +90,51 @@ public void Resolve_ShouldFallBackToChatIdDefault_WhenLegacyConversationIdIsUnre } [Fact] - public void BuildFromInbound_ShouldPreferLarkUnionId_ForP2pDirectMessages() + public void BuildFromInbound_ShouldPreferLarkChatId_ForP2pDirectMessages() { - // union_id is tenant-stable and cross-app safe, so a relay-side ingress app and an - // outbound-side customer app see the SAME `on_*` for the user. This eliminates the - // `code:99992361 open_id cross app` rejection that was breaking SkillRunner DMs when - // only the open_id was available. + // chat_id (oc_*) is the literal DM thread between the user and the bot that received + // the inbound event. When the outbound proxy authenticates as the SAME Lark app, sending + // back via `receive_id_type=chat_id` targets the same chat WITHOUT traversing any user-id + // resolution. This is the only path that survives both cross-app (`99992361`) and + // cross-tenant (`99992364`) deployments where the relay-side ingress and outbound apps + // share at least the inbound chat membership. var target = LarkConversationTargets.BuildFromInbound( chatType: "p2p", conversationId: "oc_dm_underlying_chat", senderId: "ou_user_1", larkUnionId: "on_user_1", - larkChatId: "oc_dm_underlying_chat"); + larkChatId: "oc_dm_chat_1"); + + target.ReceiveId.Should().Be("oc_dm_chat_1"); + target.ReceiveIdType.Should().Be("chat_id"); + target.FellBackToPrefixInference.Should().BeFalse(); + } + + [Fact] + public void BuildFromInbound_ShouldFallBackToUnionId_ForP2pWithoutChatId() + { + // When the relay does not surface a Lark chat_id (older relay revisions, malformed + // raw_platform_data), union_id is the next-best tenant-stable identifier. Lark rejects + // it as `code:99992364 user id cross tenant` when the relay-side ingress and outbound + // apps live in different tenants — flip FellBackToPrefixInference=true so call sites + // LogDebug and operators can correlate the rejection with a missing-chat_id ingress. + var target = LarkConversationTargets.BuildFromInbound( + chatType: "p2p", + conversationId: "oc_dm_underlying_chat", + senderId: "ou_user_1", + larkUnionId: "on_user_1"); target.ReceiveId.Should().Be("on_user_1"); target.ReceiveIdType.Should().Be("union_id"); - target.FellBackToPrefixInference.Should().BeFalse(); + target.FellBackToPrefixInference.Should().BeTrue(); } [Fact] - public void BuildFromInbound_ShouldFallBackToSenderOpenId_ForP2pWithoutUnionId() - { - // When the relay does not surface a union_id (older relay revisions, misconfigured - // Lark app, non-Lark traffic mistyped as p2p), the only DM identifier we have is the - // sender open_id. Lark accepts it inside the originating app and rejects it - // (`code:99992361 open_id cross app`) when sent from a different app — flip - // FellBackToPrefixInference=true so call sites LogDebug and operators can correlate - // a missing-union_id ingress with downstream Lark rejections. + public void BuildFromInbound_ShouldFallBackToSenderOpenId_ForP2pWithoutChatIdOrUnionId() + { + // Last-resort identifier. Surfaces `code:99992361 open_id cross app` when the + // relay-side ingress and outbound apps differ — flip FellBack so call sites LogDebug + // and operators see exactly which identifier they ended up with. var target = LarkConversationTargets.BuildFromInbound( chatType: "p2p", conversationId: "oc_dm_underlying_chat", @@ -190,6 +208,59 @@ public void BuildFromInbound_ShouldReturnEmptyTypedPairWithFellBack_WhenP2pAndSe target.FellBackToPrefixInference.Should().BeTrue(); } + [Fact] + public void BuildFromInboundWithFallback_ShouldPairChatIdPrimary_WithUnionIdFallback_ForP2p() + { + // PR #412 reviewer concern (codex-bot, P1): chat_id-first regresses cross-app same-tenant + // deployments where the outbound app is not a member of the inbound DM. The fallback + // pairs the primary chat_id with a secondary union_id captured at ingress so the + // runtime can retry once on `230002 bot not in chat` without needing a fresh ingress. + var target = LarkConversationTargets.BuildFromInboundWithFallback( + chatType: "p2p", + conversationId: "oc_dm_chat_1", + senderId: "ou_user_1", + larkUnionId: "on_user_1", + larkChatId: "oc_dm_chat_1"); + + target.Primary.ReceiveId.Should().Be("oc_dm_chat_1"); + target.Primary.ReceiveIdType.Should().Be("chat_id"); + target.Fallback.Should().NotBeNull(); + target.Fallback!.Value.ReceiveId.Should().Be("on_user_1"); + target.Fallback.Value.ReceiveIdType.Should().Be("union_id"); + } + + [Fact] + public void BuildFromInboundWithFallback_ShouldNotPairFallback_ForGroupChats() + { + // For groups chat_id is tenant-scoped; either the outbound app is in the chat (chat_id + // works) or it isn't (no user-id-based identifier helps). No fallback to attempt. + var target = LarkConversationTargets.BuildFromInboundWithFallback( + chatType: "group", + conversationId: "oc_group_chat_1", + senderId: "ou_user_1", + larkUnionId: "on_user_1", + larkChatId: "oc_group_chat_1"); + + target.Primary.ReceiveIdType.Should().Be("chat_id"); + target.Fallback.Should().BeNull(); + } + + [Fact] + public void BuildFromInboundWithFallback_ShouldOmitFallback_WhenPrimaryIsNotChatId() + { + // When the primary already degrades to union_id or open_id (chat_id missing at + // ingress), there is no further fallback to capture — the primary IS the safest + // identifier we have. + var target = LarkConversationTargets.BuildFromInboundWithFallback( + chatType: "p2p", + conversationId: "ou_legacy", + senderId: "ou_user_1", + larkUnionId: "on_user_1"); + + target.Primary.ReceiveIdType.Should().Be("union_id"); + target.Fallback.Should().BeNull(); + } + [Fact] public void Resolve_ShouldRecoverOpenIdForP2pConfusedInbound_ViaPrefixInference() { diff --git a/test/Aevatar.GAgents.ChannelRuntime.Tests/SkillRunnerGAgentTests.cs b/test/Aevatar.GAgents.ChannelRuntime.Tests/SkillRunnerGAgentTests.cs index ba1e5824d..1cd05cb5f 100644 --- a/test/Aevatar.GAgents.ChannelRuntime.Tests/SkillRunnerGAgentTests.cs +++ b/test/Aevatar.GAgents.ChannelRuntime.Tests/SkillRunnerGAgentTests.cs @@ -239,6 +239,178 @@ public async Task SendOutputAsync_ShouldIncludeRecreateHint_When_LarkRejectsAsCr assertion.WithMessage("*/daily*"); } + [Fact] + public async Task SendOutputAsync_ShouldRetryWithFallback_When_PrimaryRejectedAsBotNotInChat_ViaHttp400Envelope() + { + // Reviewer (PR #412 r3141700469): production failures arrive through + // `NyxIdApiClient.SendAsync` as an HTTP-400 Nyx envelope: + // `{"error": true, "status": 400, "body": "{\"code\":230002,...}"}`. The previous + // `LarkProxyResponse.TryGetError` returned true for that shape but left + // `larkCode=null` because it didn't parse the nested `body`, so the BotNotInChat + // retry branch never fired in the actual production path. Pin the wrapped envelope + // shape end-to-end. + var initialize = CreateInitializeCommand(); + initialize.OutboundConfig = new SkillRunnerOutboundConfig + { + ConversationId = "oc_dm_chat_1", + NyxProviderSlug = "api-lark-bot", + NyxApiKey = "nyx-api-key", + LarkReceiveId = "oc_dm_chat_1", + LarkReceiveIdType = "chat_id", + LarkReceiveIdFallback = "on_user_1", + LarkReceiveIdTypeFallback = "union_id", + }; + await _agent.HandleInitializeAsync(initialize); + + // First (primary) attempt: NyxIdApiClient.SendAsync HTTP-400 envelope wrapping Lark + // 230002. Second (fallback) attempt: clean success. + var handler = new SequencedHandler( + """{"error": true, "status": 400, "body": "{\"code\":230002,\"msg\":\"Bot is not in the chat\"}"}""", + """{"code":0,"msg":"success"}"""); + AttachNyxIdApiClient(_agent, handler); + + await InvokeSendOutputAsync(_agent, "report"); + + handler.Requests.Should().HaveCount(2); + handler.Requests[0].RequestUri!.Query.Should().Contain("receive_id_type=chat_id"); + handler.Requests[1].RequestUri!.Query.Should().Contain("receive_id_type=union_id"); + handler.Bodies[1].Should().Contain("\"receive_id\":\"on_user_1\""); + } + + [Fact] + public async Task SendOutputAsync_ShouldThrowCrossTenantHint_When_LarkCodeNestedInHttp400Body() + { + // Same envelope shape as the production /daily failure log: NyxID wraps the Lark + // 99992364 as a string body inside an HTTP-400 Nyx envelope. The cross-tenant + // recreate-the-agent hint (PR #412) only fires when the parser surfaces the nested + // Lark code; previously it never did. Pin both the recovery hint and the nested-body + // unwrap together. + var initialize = CreateInitializeCommand(); + initialize.OutboundConfig = new SkillRunnerOutboundConfig + { + ConversationId = "oc_dm_chat_1", + NyxProviderSlug = "api-lark-bot", + NyxApiKey = "nyx-api-key", + LarkReceiveId = "on_relay_tenant_user_1", + LarkReceiveIdType = "union_id", + }; + await _agent.HandleInitializeAsync(initialize); + + var handler = new RecordingHandler( + """{"error": true, "status": 400, "body": "{\"code\":99992364,\"msg\":\"user id cross tenant\"}"}"""); + AttachNyxIdApiClient(_agent, handler); + + Func act = () => InvokeSendOutputAsync(_agent, "report"); + + var assertion = await act.Should().ThrowAsync(); + assertion.WithMessage("*99992364*"); + assertion.WithMessage("*different tenant*"); + assertion.WithMessage("*/agents*"); + assertion.WithMessage("*Delete*"); + assertion.WithMessage("*/daily*"); + } + + [Fact] + public async Task SendOutputAsync_ShouldRetryWithFallback_When_PrimaryRejectedAsBotNotInChat() + { + // Reviewer concern (codex-bot, P1, PR #412): chat_id-first regresses cross-app + // same-tenant deployments where the outbound app is not a member of the inbound DM + // chat — Lark returns `230002 bot not in chat` for chat_id-typed sends. Captured the + // union_id at create time as a fallback; assert the runtime retries once with the + // fallback typed pair when the primary attempt fails with 230002, and that the retry + // body uses the fallback `receive_id` / `receive_id_type`. + var initialize = CreateInitializeCommand(); + initialize.OutboundConfig = new SkillRunnerOutboundConfig + { + ConversationId = "oc_dm_chat_1", + NyxProviderSlug = "api-lark-bot", + NyxApiKey = "nyx-api-key", + LarkReceiveId = "oc_dm_chat_1", + LarkReceiveIdType = "chat_id", + LarkReceiveIdFallback = "on_user_1", + LarkReceiveIdTypeFallback = "union_id", + }; + await _agent.HandleInitializeAsync(initialize); + + var handler = new SequencedHandler( + """{"code":230002,"msg":"Bot is not in the chat"}""", + """{"code":0,"msg":"success"}"""); + AttachNyxIdApiClient(_agent, handler); + + await InvokeSendOutputAsync(_agent, "report"); + + handler.Requests.Should().HaveCount(2); + handler.Requests[0].RequestUri!.Query.Should().Contain("receive_id_type=chat_id"); + handler.Bodies[0].Should().Contain("\"receive_id\":\"oc_dm_chat_1\""); + handler.Requests[1].RequestUri!.Query.Should().Contain("receive_id_type=union_id"); + handler.Bodies[1].Should().Contain("\"receive_id\":\"on_user_1\""); + } + + [Fact] + public async Task SendOutputAsync_ShouldNotRetry_When_PrimaryRejectedWithDifferentLarkCode() + { + // Only `230002 bot not in chat` triggers the fallback retry. Other Lark codes (e.g. + // 99992364 cross_tenant) propagate immediately so the user sees the actionable + // recovery hint for the actual failure mode rather than a misleading retry. + var initialize = CreateInitializeCommand(); + initialize.OutboundConfig = new SkillRunnerOutboundConfig + { + ConversationId = "oc_dm_chat_1", + NyxProviderSlug = "api-lark-bot", + NyxApiKey = "nyx-api-key", + LarkReceiveId = "oc_dm_chat_1", + LarkReceiveIdType = "chat_id", + LarkReceiveIdFallback = "on_user_1", + LarkReceiveIdTypeFallback = "union_id", + }; + await _agent.HandleInitializeAsync(initialize); + + var handler = new SequencedHandler( + """{"code":99992364,"msg":"user id cross tenant"}"""); + AttachNyxIdApiClient(_agent, handler); + + Func act = () => InvokeSendOutputAsync(_agent, "report"); + + await act.Should().ThrowAsync() + .WithMessage("*99992364*"); + handler.Requests.Should().ContainSingle("only 230002 should trigger the fallback retry"); + } + + [Fact] + public async Task SendOutputAsync_ShouldIncludeRecreateHint_When_LarkRejectsAsCrossTenantUserId() + { + // Production failure mode after PR #409 switched p2p to union_id: NyxID's relay-side + // ingress and `s/api-lark-bot` proxy turned out to be in different Lark tenants, so even + // union_id is rejected. This PR pivots to chat_id-first; the cross_tenant error code is + // surfaced with the same recreate guidance so legacy agents (still pinned to union_id) + // give users a way to recover without reading source. + var initialize = CreateInitializeCommand(); + initialize.OutboundConfig = new SkillRunnerOutboundConfig + { + ConversationId = "oc_chat_1", + NyxProviderSlug = "api-lark-bot", + NyxApiKey = "nyx-api-key", + LarkReceiveId = "on_relay_tenant_user_1", + LarkReceiveIdType = "union_id", + }; + await _agent.HandleInitializeAsync(initialize); + + var handler = new RecordingHandler( + """{"code":99992364,"msg":"user id cross tenant","error":{"log_id":"L1"}}"""); + AttachNyxIdApiClient(_agent, handler); + + Func act = () => InvokeSendOutputAsync(_agent, "report"); + + var assertion = await act.Should().ThrowAsync(); + assertion.WithMessage("*code=99992364*"); + assertion.WithMessage("*user id cross tenant*"); + assertion.WithMessage("*different tenant*"); + assertion.WithMessage("*chat_id-preferred*"); + assertion.WithMessage("*/agents*"); + assertion.WithMessage("*Delete*"); + assertion.WithMessage("*/daily*"); + } + private static void AttachNyxIdApiClient(SkillRunnerGAgent agent, HttpMessageHandler handler) { var client = new NyxIdApiClient( @@ -278,6 +450,34 @@ protected override async Task SendAsync(HttpRequestMessage } } + /// + /// Returns a different response per request in the order given. Used to simulate the + /// `bot not in chat` rejection on the primary attempt followed by a successful fallback + /// retry. + /// + private sealed class SequencedHandler : HttpMessageHandler + { + private readonly Queue _responses; + public List Requests { get; } = new(); + public List Bodies { get; } = new(); + + public SequencedHandler(params string[] responses) + { + _responses = new Queue(responses); + } + + protected override async Task SendAsync(HttpRequestMessage request, CancellationToken cancellationToken) + { + Requests.Add(request); + Bodies.Add(request.Content == null ? null : await request.Content.ReadAsStringAsync(cancellationToken)); + var body = _responses.Count > 0 ? _responses.Dequeue() : """{"code":0,"msg":"success"}"""; + return new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new StringContent(body, Encoding.UTF8, "application/json"), + }; + } + } + private SkillRunnerGAgent CreateAgent(string actorId) { var agent = new SkillRunnerGAgent diff --git a/test/Aevatar.GAgents.ChannelRuntime.Tests/UserAgentCatalogProjectorTests.cs b/test/Aevatar.GAgents.ChannelRuntime.Tests/UserAgentCatalogProjectorTests.cs index d410c79a3..c35851e3c 100644 --- a/test/Aevatar.GAgents.ChannelRuntime.Tests/UserAgentCatalogProjectorTests.cs +++ b/test/Aevatar.GAgents.ChannelRuntime.Tests/UserAgentCatalogProjectorTests.cs @@ -54,8 +54,10 @@ public async Task ProjectAsync_WithValidCommittedEvent_UpsertsDocument() ErrorCount = 1, LastError = "last-error", CreatedAt = createdAt, - LarkReceiveId = "ou_user_1", - LarkReceiveIdType = "open_id", + LarkReceiveId = "oc_dm_chat_1", + LarkReceiveIdType = "chat_id", + LarkReceiveIdFallback = "on_user_1", + LarkReceiveIdTypeFallback = "union_id", }, }, }; @@ -87,9 +89,15 @@ public async Task ProjectAsync_WithValidCommittedEvent_UpsertsDocument() document.UpdatedAt.Should().Be(_clock.UtcNow); // Typed Lark target round-trips through the projection so catalog-backed senders // (FeishuCardHumanInteractionPort) read it via UserAgentCatalogQueryPort.ToEntry - // instead of falling back to conversation_id prefix inference. - document.LarkReceiveId.Should().Be("ou_user_1"); - document.LarkReceiveIdType.Should().Be("open_id"); + // instead of falling back to conversation_id prefix inference. The fallback pair + // (PR #412) MUST mirror through the projection too — without it the runtime + // `230002 bot not in chat` retry on FeishuCardHumanInteractionPort / + // SkillRunnerGAgent would never have a fallback typed pair to retry against, even + // though the actor-side state captured one at create time. + document.LarkReceiveId.Should().Be("oc_dm_chat_1"); + document.LarkReceiveIdType.Should().Be("chat_id"); + document.LarkReceiveIdFallback.Should().Be("on_user_1"); + document.LarkReceiveIdTypeFallback.Should().Be("union_id"); } [Fact] @@ -97,20 +105,27 @@ public void ToEntry_ShouldRoundTripTypedLarkReceiveTarget_FromDocumentToEntry() { // FeishuCardHumanInteractionPort consumes UserAgentCatalogEntry via this conversion; // dropping the typed fields would silently regress workflow / social_media DM delivery - // back to the prefix-inference path even after the projection captured them. + // back to the prefix-inference path even after the projection captured them. The + // fallback pair (PR #412) is part of the same contract — the catalog-backed + // `230002 bot not in chat` retry depends on `LarkReceiveIdFallback` / + // `LarkReceiveIdTypeFallback` surviving the document → entry mapping. var document = new UserAgentCatalogDocument { Id = "agent-1", Platform = "lark", - ConversationId = "oc_chat_1", - LarkReceiveId = "ou_user_1", - LarkReceiveIdType = "open_id", + ConversationId = "oc_dm_chat_1", + LarkReceiveId = "oc_dm_chat_1", + LarkReceiveIdType = "chat_id", + LarkReceiveIdFallback = "on_user_1", + LarkReceiveIdTypeFallback = "union_id", }; var entry = UserAgentCatalogQueryPort.ToEntry(document, nyxApiKey: ""); - entry.LarkReceiveId.Should().Be("ou_user_1"); - entry.LarkReceiveIdType.Should().Be("open_id"); + entry.LarkReceiveId.Should().Be("oc_dm_chat_1"); + entry.LarkReceiveIdType.Should().Be("chat_id"); + entry.LarkReceiveIdFallback.Should().Be("on_user_1"); + entry.LarkReceiveIdTypeFallback.Should().Be("union_id"); } [Fact]