Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 28 additions & 3 deletions actions/setup/js/copilot_harness.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,21 @@ function buildCopilotProxyAuthFailureDiagnostic(output, env = process.env, optio
);
}

/**
 * Decide whether an authentication_failed error is worth a single fresh-run retry.
 *
 * The failure qualifies only when all of the following hold:
 *   - the run produced output before failing (`hasOutput` is truthy),
 *   - `output` is recognised as an authentication-failed error, and
 *   - the provider URL parsed from `output` looks like the gh-aw API proxy.
 *
 * @param {string} output - Captured output of the failed attempt.
 * @param {boolean} hasOutput - Whether the attempt produced any output before failing.
 * @returns {boolean} `true` when the failure is a retryable proxy authentication failure.
 */
function isRetryableProxyAuthenticationFailure(output, hasOutput) {
  // Short-circuit on hasOutput first so the error matcher is not run for attempts
  // that failed before doing any work.
  const failedAfterPartialExecution = hasOutput && isAuthenticationFailedError(output);
  if (!failedAfterPartialExecution) {
    return false;
  }

  // No parsable provider details means the failure cannot be attributed to the proxy.
  const parsedFailure = parseProviderAuthFailure(output);
  if (!parsedFailure) {
    return false;
  }

  return Boolean(isLikelyAWFAPIProxyURL(parsedFailure.providerUrl));
}

/**
* Detect known Copilot error patterns for workflow outputs.
* @param {string} output
Expand Down Expand Up @@ -823,6 +838,7 @@ async function main() {
const isAuthErr = isNoAuthInfoError(result.output);
const isAuthenticationFailed = isAuthenticationFailedError(result.output);
const proxyAuthDiagnostic = buildCopilotProxyAuthFailureDiagnostic(result.output, process.env);
const retryableProxyAuthenticationFailure = isRetryableProxyAuthenticationFailure(result.output, result.hasOutput);
const isNullTypeToolCall = isNullTypeToolCallError(result.output);
const isSDKSessionIdleTimeout = isSDKSessionIdleTimeoutError(result.output);
const isMCPGatewayShutdown = isMCPGatewayShutdownError(result.output);
Expand Down Expand Up @@ -882,11 +898,19 @@ async function main() {
break;
}

if (attempt === 0 && isAuthenticationFailed) {
// attempt === 0 makes this a one-time fresh-run recovery path.
if (attempt === 0 && retryableProxyAuthenticationFailure) {
useContinueOnRetry = false;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

continueDisabledPermanently = true here permanently poisons all future retries, not just the next one.

💡 Analysis and suggested fix

continueDisabledPermanently is not scoped to a single retry — it prevents --continue from ever being re-enabled for the entire remaining run. Setting it in the proxy-auth retry branch creates a subtle regression:

  1. attempt=0: proxy auth failure after partial work → fresh-run retry triggered ✔️
  2. attempt=1 (fresh run, clean start): executes for minutes, then hits a transient CAPIError with partial output
  3. attempt=2: useContinueOnRetry is forced to false because continueDisabledPermanently = true, so it starts fresh and discards all the valid work done in attempt=1

This is wrong. continueDisabledPermanently = true belongs in cases where session state is poisoned (null-type tool_call — conversation history causes 400 forever) or where re-enabling --continue would resurrect a broken recovery path (NoAuthInfo on a --continue attempt). A proxy-auth failure at attempt=0 does not corrupt attempt=1’s freshly-created, clean session state.

The one-time retry guarantee is already fully enforced by attempt === 0; continueDisabledPermanently adds nothing to that.

useContinueOnRetry = false is also redundant — it starts false by default and is never set to true at attempt=0.

// Suggested:
if (attempt === 0 && retryableProxyAuthenticationFailure) {
  log(`attempt ${attempt + 1}: provider authentication failed after partial execution — will retry once as fresh run to avoid losing completed agent work`);
  continue;
}

With this fix, if attempt=1 partially succeeds and then fails for a transient reason, attempt=2 will correctly set useContinueOnRetry = !copilotSDKMode (line 1003) and resume from attempt=1’s valid state instead of discarding it.

continueDisabledPermanently = true;
log(`attempt ${attempt + 1}: provider authentication failed after partial execution - will retry once as fresh run to avoid losing completed agent work`);
continue;
}

if (isAuthenticationFailed) {
if (proxyAuthDiagnostic) {
log(`attempt ${attempt + 1}: ${proxyAuthDiagnostic} — not retrying (first-attempt auth failure is non-retryable)`);
log(`attempt ${attempt + 1}: ${proxyAuthDiagnostic} — not retrying`);
} else {
log(`attempt ${attempt + 1}: authentication failed — not retrying (first-attempt auth failure is non-retryable)`);
log(`attempt ${attempt + 1}: authentication failed — not retrying`);
}
break;
}
Expand Down Expand Up @@ -1043,6 +1067,7 @@ if (typeof module !== "undefined" && module.exports) {
detectCopilotErrors,
classifyCopilotFailure,
extractOutputTail,
isRetryableProxyAuthenticationFailure,
hasNumerousPermissionDeniedIssues,
INFERENCE_ACCESS_ERROR_PATTERN,
AGENTIC_ENGINE_TIMEOUT_PATTERN,
Expand Down
59 changes: 54 additions & 5 deletions actions/setup/js/copilot_harness.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ const {
AGENTIC_ENGINE_TIMEOUT_PATTERN,
isDetectionPhase,
isAuthenticationFailedError,
isRetryableProxyAuthenticationFailure,
isMCPGatewayShutdownError,
isModelAvailableInReflectData,
isModelAvailableInReflectFile,
Expand Down Expand Up @@ -1154,6 +1155,23 @@ describe("copilot_harness.cjs", () => {
});
});

const PROXY_AUTH_FAILURE_OUTPUT = "Authentication failed with provider at http://api-proxy:10002 (HTTP 403).";

// Unit tests for isRetryableProxyAuthenticationFailure(output, hasOutput).
// PROXY_AUTH_FAILURE_OUTPUT is an auth-failure message whose provider URL is
// http://api-proxy:10002; these cases expect it to be treated as a proxy failure.
describe("isRetryableProxyAuthenticationFailure", () => {
// Proxy auth failure with hasOutput=true is the only combination expected to be retryable.
it("returns true for gh-aw proxy auth failures after partial execution", () => {
expect(isRetryableProxyAuthenticationFailure(PROXY_AUTH_FAILURE_OUTPUT, true)).toBe(true);
});

// Same message, but hasOutput=false: the result must be false.
it("returns false when the auth failure happened before any output was produced", () => {
expect(isRetryableProxyAuthenticationFailure(PROXY_AUTH_FAILURE_OUTPUT, false)).toBe(false);
});

// Two non-proxy shapes: a message with no provider URL at all, and a message
// whose provider URL is an external endpoint (https://api.openai.com/v1).
it("returns false for non-proxy authentication failures", () => {
expect(isRetryableProxyAuthenticationFailure("Authentication failed (Request ID: ABC123)", true)).toBe(false);
expect(isRetryableProxyAuthenticationFailure("Authentication failed with provider at https://api.openai.com/v1 (HTTP 401).", true)).toBe(false);
});
});

describe("envFlagEnabled", () => {
it.each(["true", "TRUE", "True", "1", "yes", " YES "])("returns true for '%s'", v => {
expect(envFlagEnabled(v)).toBe(true);
Expand All @@ -1168,8 +1186,8 @@ describe("copilot_harness.cjs", () => {
});
});

describe("auth error prevents retry", () => {
// Inline the same retry logic as the driver, including auth error check
describe("provider auth retry policy", () => {
// Inline the same retry logic as the driver for auth-related failures.
const MCP_POLICY_BLOCKED_PATTERN = /MCP servers were blocked by policy:/;
const NO_AUTH_INFO_PATTERN = /No authentication information found/;
const MAX_RETRIES = 3;
Expand All @@ -1184,7 +1202,9 @@ describe("copilot_harness.cjs", () => {
if (result.exitCode === 0) return false;
// MCP policy errors are persistent — never retry
if (MCP_POLICY_BLOCKED_PATTERN.test(result.output)) return false;
if (attempt === 0 && isAuthenticationFailedError(result.output)) return false;
if (isAuthenticationFailedError(result.output)) {
return attempt === 0 && isRetryableProxyAuthenticationFailure(result.output, result.hasOutput);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The stateless shouldRetry simulation cannot validate the continueDisabledPermanently side effect that the real retry loop depends on.

💡 Detail

The real harness mutates continueDisabledPermanently (and useContinueOnRetry) across loop iterations. The shouldRetry helper here takes useContinueOnRetry as a parameter but has no memory between calls, so the test for "retries the first proxy auth failure only once" only proves that attempt === 0 gates the retry — it does not verify that the state flag is correctly set (or not set, per the concern at line 903 of the harness).

This is a pre-existing test-design limitation, but it becomes more significant as the retry logic grows: the current tests provide no coverage for multi-attempt state interactions such as:

  • What --continue mode is used on attempt=2 after a proxy-auth fresh-run retry at attempt=0 followed by a partial CAPIError at attempt=1
  • Whether continueDisabledPermanently is correctly set/unset in each path

Consider adding at least one integration-level test that drives the real main() retry loop through mock subprocesses to validate cross-attempt state transitions.

}
// Auth error on --continue: fall back to fresh run once; on fresh run: bail
if (NO_AUTH_INFO_PATTERN.test(result.output)) {
return useContinueOnRetry && attempt < MAX_RETRIES;
Expand All @@ -1197,12 +1217,41 @@ describe("copilot_harness.cjs", () => {
expect(shouldRetry(result, 0, false)).toBe(false);
});

it("does not retry when first attempt reports authentication failed", () => {
it("retries once when the first attempt hits a proxy auth failure after partial execution", () => {
const result = {
exitCode: 1,
hasOutput: true,
output: PROXY_AUTH_FAILURE_OUTPUT,
};
expect(shouldRetry(result, 0, false)).toBe(true);
});

it("does not retry when proxy auth fails before any output was produced", () => {
const result = {
exitCode: 1,
hasOutput: false,
output: PROXY_AUTH_FAILURE_OUTPUT,
};
expect(shouldRetry(result, 0, false)).toBe(false);
});

it("does not retry generic authentication_failed errors that do not come from the gh-aw proxy", () => {
const result = { exitCode: 1, hasOutput: true, output: "Authentication failed (Request ID: ABC123)" };
expect(shouldRetry(result, 0, false)).toBe(false);
});

it("retries as fresh run when auth fails on a --continue attempt", () => {
it("retries the first proxy auth failure only once", () => {
const result = {
exitCode: 1,
hasOutput: true,
output: PROXY_AUTH_FAILURE_OUTPUT,
};
expect(shouldRetry(result, 0, false)).toBe(true);
expect(shouldRetry(result, 1, false)).toBe(false);
expect(shouldRetry(result, 2, false)).toBe(false);
});

it("retries as fresh run when no-auth failure happens on a --continue attempt", () => {
// This replicates the fix: attempt 1 ran for 3+ min then failed mid-stream,
// attempt 2 (--continue) fails with auth error — driver retries once as fresh run.
const continueResult = { exitCode: 1, hasOutput: true, output: "Error: No authentication information found." };
Expand Down
Loading