Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions cmd/compute-domain-kubelet-plugin/device_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ package main

import (
"context"
"encoding/json"
"errors"
"fmt"
"slices"
"sync"
Expand All @@ -27,6 +29,7 @@ import (
"k8s.io/dynamic-resource-allocation/kubeletplugin"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
cperrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
cdiapi "tags.cncf.io/container-device-interface/pkg/cdi"

configapi "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
Expand Down Expand Up @@ -181,6 +184,20 @@ func (s *DeviceState) Unprepare(ctx context.Context, claimRef kubeletplugin.Name
// Rely on local checkpoint state for ability to clean up.
checkpoint := newCheckpoint()
if err := s.checkpointManager.GetCheckpoint(DriverPluginCheckpointFileBasename, checkpoint); err != nil {
// Some errors returned by `GetCheckpoint` are permanent. Some of them
// may be retryable. For now, handle an explicit set of errors as
// permanent, and treat every other error as retryable.

if errors.Is(err, &json.UnmarshalTypeError{}) {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: there is no overarching error type representing any error during Unmarshal().

According to https://pkg.go.dev/encoding/json#UnmarshalTypeError, Unmarshal() can actually return three different types of error:

Unmarshal returns an InvalidUnmarshalError.

Unmarshal returns a SyntaxError.

Unmarshal returns an UnmarshalTypeError.

SyntaxError means "bad JSON"; we also want to treat that as permanent.

What would be a concise way to test "is this error type any of these...?"

// May for example happen when a different version of this program
// wrote the JSON doc (using a different JSON schema).
return permanentError{fmt.Errorf("get checkpoint: unexpected schema (treat permanent): %w", err)}
}

if errors.Is(err, &cperrors.CorruptCheckpointError{}) {
return permanentError{fmt.Errorf("get checkpoint: bad checksum (treat permanent): %w", err)}
}

return fmt.Errorf("unable to get checkpoint: %w", err)
}

Expand Down
Loading