Skip to content

Commit 3ae8b79

Browse files
authored
Add exponential backoff retry logic to OIDC provider initialization (#928)
Implements retry logic using [github.com/cenkalti/backoff/v5](https://github.com/cenkalti/backoff/v5) to handle connectivity issues with slow or unreliable OIDC providers. The retry mechanism uses exponential backoff (1s → 2s → 4s → 8s → 16s → 30s max) with randomization to avoid thundering herd issues. - Add retry wrapper around `oidc.NewProvider()` calls - Add test coverage for retry scenarios - Improve logging with attempt counts and issuer information --------- Signed-off-by: Maxime Gréau <[email protected]>
1 parent ee28b4f commit 3ae8b79

File tree

2 files changed

+250
-2
lines changed

2 files changed

+250
-2
lines changed

pkg/provider/provider.go

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,11 @@ import (
77
"context"
88
"fmt"
99
"net/http"
10+
"strings"
1011
"testing"
12+
"time"
1113

14+
"github.com/cenkalti/backoff/v5"
1215
"github.com/chainguard-dev/clog"
1316
"github.com/chainguard-dev/terraform-infra-common/pkg/httpmetrics"
1417
"github.com/coreos/go-oidc/v3/oidc"
@@ -45,7 +48,7 @@ func Get(ctx context.Context, issuer string) (provider VerifierProvider, err err
4548
})
4649

4750
// Verify the token before we trust anything about it.
48-
provider, err = oidc.NewProvider(ctx, issuer)
51+
provider, err = newProviderWithRetry(ctx, issuer)
4952
if err != nil {
5053
return nil, fmt.Errorf("constructing %q provider: %w", issuer, err)
5154
}
@@ -57,6 +60,59 @@ func Get(ctx context.Context, issuer string) (provider VerifierProvider, err err
5760
return provider, nil
5861
}
5962

63+
// newProviderWithRetry creates a new OIDC provider with exponential backoff retry logic
64+
func newProviderWithRetry(ctx context.Context, issuer string) (VerifierProvider, error) {
65+
attempt := 0
66+
67+
operation := func() (VerifierProvider, error) {
68+
attempt++
69+
p, err := oidc.NewProvider(ctx, issuer)
70+
if err != nil {
71+
clog.WarnContext(ctx, "provider creation failed", "attempt", attempt, "issuer", issuer, "error", err)
72+
// Check for permanent errors that shouldn't be retried
73+
if isPermanentError(err) {
74+
return nil, backoff.Permanent(err)
75+
}
76+
return nil, err
77+
}
78+
if attempt > 1 {
79+
clog.InfoContext(ctx, "provider creation succeeded after retry", "attempts", attempt, "issuer", issuer)
80+
}
81+
return p, nil
82+
}
83+
84+
// Configure exponential backoff: 1s → 2s → 4s → 8s → 16s → 30s (max)
85+
// with ±10% jitter to prevent thundering herd issues
86+
expBackoff := backoff.NewExponentialBackOff()
87+
expBackoff.InitialInterval = 1 * time.Second
88+
expBackoff.MaxInterval = 30 * time.Second
89+
expBackoff.Multiplier = 2.0
90+
expBackoff.RandomizationFactor = 0.1
91+
92+
return backoff.Retry(ctx, operation, backoff.WithBackOff(expBackoff))
93+
}
94+
95+
// isPermanentError checks if an error should not be retried based on HTTP status codes
96+
func isPermanentError(err error) bool {
97+
// String matching for HTTP status codes embedded in error messages
98+
// This matches go-oidc's pattern: fmt.Errorf("%s: %s", resp.Status, body)
99+
errMsg := err.Error()
100+
if strings.Contains(errMsg, "400 Bad Request") ||
101+
strings.Contains(errMsg, "401 Unauthorized") ||
102+
strings.Contains(errMsg, "403 Forbidden") ||
103+
strings.Contains(errMsg, "404 Not Found") ||
104+
strings.Contains(errMsg, "405 Method Not Allowed") ||
105+
strings.Contains(errMsg, "406 Not Acceptable") ||
106+
strings.Contains(errMsg, "410 Gone") ||
107+
strings.Contains(errMsg, "415 Unsupported Media Type") ||
108+
strings.Contains(errMsg, "422 Unprocessable Entity") ||
109+
strings.Contains(errMsg, "501 Not Implemented") {
110+
return true // Don't retry these permanent client/server errors
111+
}
112+
113+
return false // Retry all other errors
114+
}
115+
60116
type keysetProvider struct {
61117
issuer string
62118
keySet oidc.KeySet
@@ -68,7 +124,7 @@ func (s *keysetProvider) Verifier(config *oidc.Config) *oidc.IDTokenVerifier {
68124

69125
// AddTestKeySetVerifier adds a test key set verifier to the provider cachef or the issuer.
70126
// This is primarily intended for testing - the static key set is not verified against the upstream issuer.
71-
func AddTestKeySetVerifier(t *testing.T, issuer string, keySet oidc.KeySet) {
127+
func AddTestKeySetVerifier(_ *testing.T, issuer string, keySet oidc.KeySet) {
72128
providers.Add(issuer, &keysetProvider{
73129
issuer: issuer,
74130
keySet: keySet,

pkg/provider/provider_test.go

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
// Copyright 2025 Chainguard, Inc.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package provider
5+
6+
import (
7+
"context"
8+
"errors"
9+
"net/http"
10+
"net/http/httptest"
11+
"sync/atomic"
12+
"testing"
13+
"time"
14+
15+
"github.com/coreos/go-oidc/v3/oidc"
16+
)
17+
18+
func TestNewProviderWithRetry_Success(t *testing.T) {
19+
// Create a test server that responds successfully
20+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
21+
w.Header().Set("Content-Type", "application/json")
22+
w.WriteHeader(http.StatusOK)
23+
issuerURL := "http://" + r.Host
24+
w.Write([]byte(`{"issuer":"` + issuerURL + `","authorization_endpoint":"` + issuerURL + `/auth","token_endpoint":"` + issuerURL + `/token","jwks_uri":"` + issuerURL + `/jwks"}`))
25+
}))
26+
defer server.Close()
27+
28+
ctx := context.Background()
29+
provider, err := newProviderWithRetry(ctx, server.URL)
30+
if err != nil {
31+
t.Fatalf("Expected success, got error: %v", err)
32+
}
33+
if provider == nil {
34+
t.Fatal("Expected provider, got nil")
35+
}
36+
}
37+
38+
func TestNewProviderWithRetry_EventualSuccess(t *testing.T) {
39+
var attempts int32
40+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
41+
attempt := atomic.AddInt32(&attempts, 1)
42+
if attempt < 3 {
43+
// Fail the first 2 attempts
44+
w.WriteHeader(http.StatusInternalServerError)
45+
return
46+
}
47+
// Succeed on the 3rd attempt
48+
w.Header().Set("Content-Type", "application/json")
49+
w.WriteHeader(http.StatusOK)
50+
issuerURL := "http://" + r.Host
51+
w.Write([]byte(`{"issuer":"` + issuerURL + `","authorization_endpoint":"` + issuerURL + `/auth","token_endpoint":"` + issuerURL + `/token","jwks_uri":"` + issuerURL + `/jwks"}`))
52+
}))
53+
defer server.Close()
54+
55+
ctx := context.Background()
56+
start := time.Now()
57+
provider, err := newProviderWithRetry(ctx, server.URL)
58+
duration := time.Since(start)
59+
60+
if err != nil {
61+
t.Fatalf("Expected eventual success, got error: %v", err)
62+
}
63+
if provider == nil {
64+
t.Fatal("Expected provider, got nil")
65+
}
66+
// The test server fails twice then succeeds on the third attempt
67+
if atomic.LoadInt32(&attempts) != 3 {
68+
t.Fatalf("Expected 3 attempts, got %d", attempts)
69+
}
70+
// Should have taken at least 1 second due to backoff after first failure
71+
if duration < 1*time.Second {
72+
t.Fatalf("Expected retry backoff, but completed too quickly: %v", duration)
73+
}
74+
}
75+
76+
func TestNewProviderWithRetry_AllAttemptsFail(t *testing.T) {
77+
var attempts int32
78+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
79+
atomic.AddInt32(&attempts, 1)
80+
w.WriteHeader(http.StatusInternalServerError)
81+
}))
82+
defer server.Close()
83+
84+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
85+
defer cancel()
86+
87+
provider, err := newProviderWithRetry(ctx, server.URL)
88+
89+
if err == nil {
90+
t.Fatal("Expected error after all retries failed")
91+
}
92+
if provider != nil {
93+
t.Fatal("Expected nil provider after all retries failed")
94+
}
95+
// With backoff library, we expect multiple attempts but don't need to check exact count
96+
if atomic.LoadInt32(&attempts) < 3 {
97+
t.Fatalf("Expected at least 3 attempts, got %d", attempts)
98+
}
99+
}
100+
101+
func TestNewProviderWithRetry_ContextCancellation(t *testing.T) {
102+
var attempts int32
103+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
104+
atomic.AddInt32(&attempts, 1)
105+
// Always fail to trigger retries
106+
w.WriteHeader(http.StatusInternalServerError)
107+
}))
108+
defer server.Close()
109+
110+
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
111+
defer cancel()
112+
113+
start := time.Now()
114+
provider, err := newProviderWithRetry(ctx, server.URL)
115+
duration := time.Since(start)
116+
117+
if !errors.Is(err, context.DeadlineExceeded) {
118+
t.Fatalf("Expected context deadline exceeded, got: %v", err)
119+
}
120+
if provider != nil {
121+
t.Fatal("Expected nil provider after context cancellation")
122+
}
123+
// Should have attempted at least once but been canceled before completing all retries
124+
totalAttempts := atomic.LoadInt32(&attempts)
125+
if totalAttempts == 0 {
126+
t.Fatal("Expected at least one attempt before context cancellation")
127+
}
128+
// With backoff library and timeout, we expect some attempts but not too many
129+
if totalAttempts > 10 {
130+
t.Fatalf("Expected reasonable number of attempts due to context cancellation, got %d", totalAttempts)
131+
}
132+
// Should have been canceled around the timeout duration
133+
if duration > 3*time.Second {
134+
t.Fatalf("Expected cancellation around 2s, but took %v", duration)
135+
}
136+
}
137+
138+
func TestIsPermanentError_GoOIDCErrorPatterns(t *testing.T) {
139+
tests := []struct {
140+
statusCode int
141+
body string
142+
permanent bool
143+
name string
144+
}{
145+
// Permanent errors - using actual HTTP status codes that will generate real go-oidc errors
146+
{400, `{"error":"invalid_request"}`, true, "400 Bad Request should be permanent"},
147+
{401, `{"error":"access_denied"}`, true, "401 Unauthorized should be permanent"},
148+
{403, `{"error":"insufficient_scope"}`, true, "403 Forbidden should be permanent"},
149+
{404, `{"error":"not_found"}`, true, "404 Not Found should be permanent"},
150+
{405, `{"error":"method_not_allowed"}`, true, "405 Method Not Allowed should be permanent"},
151+
{406, `{"error":"not_acceptable"}`, true, "406 Not Acceptable should be permanent"},
152+
{410, `{"error":"gone"}`, true, "410 Gone should be permanent"},
153+
{415, `{"error":"unsupported_media_type"}`, true, "415 Unsupported Media Type should be permanent"},
154+
{422, `{"error":"unprocessable_entity"}`, true, "422 Unprocessable Entity should be permanent"},
155+
{501, `{"error":"not_implemented"}`, true, "501 Not Implemented should be permanent"},
156+
157+
// Temporary errors - should be retryable
158+
{429, `{"error":"rate_limited"}`, false, "429 Too Many Requests should be retryable"},
159+
{500, `{"error":"internal_server_error"}`, false, "500 Internal Server Error should be retryable"},
160+
{502, `{"error":"bad_gateway"}`, false, "502 Bad Gateway should be retryable"},
161+
{503, `{"error":"service_unavailable"}`, false, "503 Service Unavailable should be retryable"},
162+
{504, `{"error":"gateway_timeout"}`, false, "504 Gateway Timeout should be retryable"},
163+
}
164+
165+
for _, tc := range tests {
166+
t.Run(tc.name, func(t *testing.T) {
167+
// Create a test server that returns the specific HTTP status code
168+
// This will generate actual go-oidc errors that we can test against
169+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
170+
w.Header().Set("Content-Type", "application/json")
171+
w.WriteHeader(tc.statusCode)
172+
w.Write([]byte(tc.body))
173+
}))
174+
defer server.Close()
175+
176+
// Use go-oidc to generate the actual error
177+
ctx := context.Background()
178+
_, err := oidc.NewProvider(ctx, server.URL)
179+
180+
// go-oidc should return an error for non-200 responses
181+
if err == nil {
182+
t.Fatalf("Expected go-oidc to return an error for status %d, but got nil", tc.statusCode)
183+
}
184+
185+
// Test our error classification function on the real go-oidc error
186+
result := isPermanentError(err)
187+
if result != tc.permanent {
188+
t.Errorf("isPermanentError() for real go-oidc error %q = %v, want %v", err.Error(), result, tc.permanent)
189+
}
190+
})
191+
}
192+
}

0 commit comments

Comments
 (0)