-
Notifications
You must be signed in to change notification settings - Fork 37
sidecar sync: retry SSH on boot, simplify transient error detection, resolve base from sidecar #315
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ffdf4b1
bacb751
e313176
47421ad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,9 +4,13 @@ import ( | |
| "context" | ||
| "errors" | ||
| "fmt" | ||
| "net" | ||
| "os" | ||
| "path/filepath" | ||
| "strings" | ||
| "time" | ||
|
|
||
| "github.com/cenkalti/backoff/v4" | ||
|
|
||
| "github.com/CircleCI-Public/chunk-cli/internal/circleci" | ||
| "github.com/CircleCI-Public/chunk-cli/internal/gitremote" | ||
|
|
@@ -52,7 +56,7 @@ func persistWorkspace(ctx context.Context, workspace string) error { | |
| func Sync(ctx context.Context, | ||
| client *circleci.Client, sidecarID, identityFile, authSock, workdir string, status iostream.StatusFunc) error { | ||
|
|
||
| session, err := OpenSession(ctx, client, sidecarID, identityFile, authSock) | ||
| session, err := openSessionWithRetry(ctx, client, sidecarID, identityFile, authSock, status) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
@@ -155,10 +159,20 @@ func syncWorkspace(ctx context.Context, status iostream.StatusFunc, org, repo, r | |
|
|
||
| status(iostream.LevelInfo, fmt.Sprintf("Synchronising local %s/%s to remote: %s...", org, repo, repoPath)) | ||
|
|
||
| base, err := gitutil.MergeBase() | ||
| // Prefer origin/HEAD (set when the remote advertises a default branch), fall | ||
| // back to HEAD for repos where origin/HEAD is not configured. | ||
| baseCmd := fmt.Sprintf( | ||
| "git -C %[1]s rev-parse origin/HEAD 2>/dev/null || git -C %[1]s rev-parse HEAD", | ||
| ShellEscape(repoPath), | ||
| ) | ||
| baseResult, err := ExecOverSSH(ctx, session, baseCmd, nil, nil) | ||
| if err != nil { | ||
| return &RemoteBaseError{Err: err} | ||
| return fmt.Errorf("sync: resolve remote base: %w", err) | ||
| } | ||
| if baseResult.ExitCode != 0 { | ||
| return &RemoteBaseError{Err: fmt.Errorf("resolve base commit: %s", baseResult.Stderr)} | ||
| } | ||
| base := strings.TrimSpace(baseResult.Stdout) | ||
|
|
||
| patch, err := gitutil.GeneratePatch(base) | ||
| if err != nil { | ||
|
|
@@ -201,3 +215,44 @@ func syncWorkspace(ctx context.Context, status iostream.StatusFunc, org, repo, r | |
| } | ||
| return nil | ||
| } | ||
|
|
||
| // openSessionWithRetry calls OpenSession, retrying on transient errors to give | ||
| // a newly-created sidecar time to finish booting before its SSH service is ready. | ||
| func openSessionWithRetry(ctx context.Context, client *circleci.Client, sidecarID, identityFile, authSock string, status iostream.StatusFunc) (*Session, error) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @danmux or @pete-woods do we have a standard pattern or lib for retries? this feels just like something we would have solved elsewhere...
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is using the standard lib for retries now! Thanks @danmux for the link |
||
| b := backoff.NewExponentialBackOff() | ||
| b.InitialInterval = 2 * time.Second | ||
| b.MaxInterval = 15 * time.Second | ||
| b.MaxElapsedTime = 90 * time.Second | ||
|
|
||
| var session *Session | ||
| notified := false | ||
| err := backoff.RetryNotify( | ||
| func() error { | ||
| var e error | ||
| session, e = OpenSession(ctx, client, sidecarID, identityFile, authSock) | ||
| if e != nil && !isTransientSSHError(e) { | ||
| return backoff.Permanent(e) | ||
| } | ||
| return e | ||
| }, | ||
| backoff.WithContext(b, ctx), | ||
| func(_ error, _ time.Duration) { | ||
| if !notified { | ||
| status(iostream.LevelInfo, "Waiting for sidecar SSH to become available...") | ||
| notified = true | ||
| } | ||
| }, | ||
| ) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| return session, nil | ||
| } | ||
|
|
||
| // isTransientSSHError returns true for network-level errors that are worth | ||
| // retrying when opening a session — connection failures and timeouts that | ||
| // indicate the sidecar's SSH service is not yet ready. | ||
| func isTransientSSHError(err error) bool { | ||
| var netErr net.Error | ||
| return errors.As(err, &netErr) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,61 @@ | ||
| package sidecar | ||
|
|
||
| import ( | ||
| "fmt" | ||
| "net" | ||
| "testing" | ||
|
|
||
| "gotest.tools/v3/assert" | ||
|
|
||
| "github.com/CircleCI-Public/chunk-cli/internal/circleci" | ||
| ) | ||
|
|
||
| func TestIsTransientSSHError(t *testing.T) { | ||
| t.Run("timeout is transient", func(t *testing.T) { | ||
| err := &net.OpError{Op: "dial", Err: &timeoutError{}} | ||
| assert.Equal(t, isTransientSSHError(err), true) | ||
| }) | ||
|
|
||
| t.Run("connection refused is transient", func(t *testing.T) { | ||
| err := &net.OpError{Op: "dial", Net: "tcp", Err: fmt.Errorf("connection refused")} | ||
| assert.Equal(t, isTransientSSHError(err), true) | ||
| }) | ||
|
|
||
| t.Run("net error wrapped with fmt.Errorf is transient", func(t *testing.T) { | ||
| inner := &net.OpError{Op: "dial", Err: &timeoutError{}} | ||
| err := fmt.Errorf("register SSH key: %w", inner) | ||
| assert.Equal(t, isTransientSSHError(err), true) | ||
| }) | ||
|
|
||
| t.Run("ErrNotAuthorized is not transient", func(t *testing.T) { | ||
| err := fmt.Errorf("add ssh key: %w", circleci.ErrNotAuthorized) | ||
| assert.Equal(t, isTransientSSHError(err), false) | ||
| }) | ||
|
|
||
| t.Run("StatusError is not transient", func(t *testing.T) { | ||
| err := &circleci.StatusError{Op: "add ssh key", StatusCode: 503} | ||
| assert.Equal(t, isTransientSSHError(err), false) | ||
| }) | ||
|
|
||
| t.Run("KeyNotFoundError is not transient", func(t *testing.T) { | ||
| err := &KeyNotFoundError{Path: "/home/user/.ssh/chunk_ai"} | ||
| assert.Equal(t, isTransientSSHError(err), false) | ||
| }) | ||
|
|
||
| t.Run("PublicKeyNotFoundError is not transient", func(t *testing.T) { | ||
| err := &PublicKeyNotFoundError{KeyPath: "/home/user/.ssh/chunk_ai.pub"} | ||
| assert.Equal(t, isTransientSSHError(err), false) | ||
| }) | ||
|
|
||
| t.Run("generic error is not transient", func(t *testing.T) { | ||
| err := fmt.Errorf("resolve home directory: permission denied") | ||
| assert.Equal(t, isTransientSSHError(err), false) | ||
| }) | ||
| } | ||
|
|
||
| // timeoutError is a net.Error that reports Timeout() == true. | ||
| type timeoutError struct{} | ||
|
|
||
| func (timeoutError) Error() string { return "i/o timeout" } | ||
| func (timeoutError) Timeout() bool { return true } | ||
| func (timeoutError) Temporary() bool { return true } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Where was the git fetch happening? We were only calling
rev-parseandmerge-basewhich shouldn't be doing a fetch?