
Commit 2f1fe78

CD plugin: handle stale partially prepared claims
Add a fundamentally required state reconciliation: periodically, perform a self-initiated Unprepare() of previously partially prepared claims.

Perform periodically:
- Read the checkpoint.
- Iterate through RCs in PrepareStarted state.
- For each: is the RC still known to the API server? If not:
  1) Initiate an Unprepare.
  2) Remove the claim from the checkpoint file if the unprepare was successful.

Relevance: unpreparing any partially performed claim preparation might revert a state mutation that would otherwise be permanently inconsistent with API server state (e.g., this could remove a node label).

Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>
1 parent 7b5e2cd commit 2f1fe78

File tree: 7 files changed, +342 / -34 lines changed

cmd/compute-domain-controller/cleanup.go

Lines changed: 1 addition & 1 deletion

@@ -155,7 +155,7 @@ func (m *CleanupManager[T]) periodicCleanup(ctx context.Context) {
 			return
 		case <-ticker.C:
 			if m.EnqueueCleanup() {
-				klog.V(6).Infof("Periodoc cleanup requested for %T objects", *new(T))
+				klog.V(6).Infof("Periodic cleanup requested for %T objects", *new(T))
 			}
 		}
 	}

cmd/compute-domain-kubelet-plugin/cdi.go

Lines changed: 1 addition & 1 deletion

@@ -259,7 +259,7 @@ func (cdi *CDIHandler) CreateClaimSpecFile(claimUID string, preparedDevices Prep
 	return cdi.cache.WriteSpec(spec.Raw(), specName)
 }
 
-func (cdi *CDIHandler) DeleteClaimSpecFile(claimUID string) error {
+func (cdi *CDIHandler) DeleteClaimSpecFileIfExists(claimUID string) error {
 	specName := cdiapi.GenerateTransientSpecName(cdi.vendor, cdi.claimClass, claimUID)
 	return cdi.cache.RemoveSpec(specName)
 }
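
The rename matters for the new call sites introduced by this commit: during a self-initiated unprepare of a stale, partially prepared claim, the transient CDI spec file may never have been written. The "IfExists" suffix reads as a promise that deleting an absent spec counts as success. A hedged caller sketch (illustrative only, not this driver's actual unprepare code; claimUID is a placeholder):

	// Illustrative call site: during cleanup, a spec file that was never
	// created for this partially prepared claim is not treated as an error
	// (an assumption based on the function's new name).
	if err := cdi.DeleteClaimSpecFileIfExists(claimUID); err != nil {
		return fmt.Errorf("error deleting CDI spec file for claim %s: %w", claimUID, err)
	}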

cmd/compute-domain-kubelet-plugin/checkpointv.go

Lines changed: 2 additions & 0 deletions

@@ -32,6 +32,8 @@ type PreparedClaimV2 struct {
 	CheckpointState ClaimCheckpointState            `json:"checkpointState"`
 	Status          resourceapi.ResourceClaimStatus `json:"status,omitempty"`
 	PreparedDevices PreparedDevices                 `json:"preparedDevices,omitempty"`
+	Name            string                          `json:"name,omitempty"`
+	Namespace       string                          `json:"namespace,omitempty"`
 }
 
 // V1 types
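
With name and namespace persisted per claim, the cleanup pass added below can perform a cheap Get() by name and compare UIDs, instead of an expensive List() filtered by metadata.uid. A minimal, runnable sketch of the checkpoint JSON shape these fields produce, using a simplified stand-in struct and made-up values (the UID, claim name, and namespace are all hypothetical):

package main

import (
	"encoding/json"
	"fmt"
)

// Simplified stand-in for PreparedClaimV2: only the fields relevant to the
// stale-claim lookup are modeled here.
type preparedClaimSketch struct {
	CheckpointState string `json:"checkpointState"`
	Name            string `json:"name,omitempty"`
	Namespace       string `json:"namespace,omitempty"`
}

func main() {
	// Keyed by claim UID, mirroring the checkpoint's PreparedClaims map.
	claims := map[string]preparedClaimSketch{
		"11111111-2222-3333-4444-555555555555": {
			CheckpointState: "PrepareStarted",
			Name:            "my-claim",
			Namespace:       "default",
		},
	}
	out, _ := json.MarshalIndent(claims, "", "  ")
	fmt.Println(string(out))
}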
Lines changed: 258 additions & 0 deletions (new file)

/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package main

import (
	"context"
	"fmt"
	"sync"
	"time"

	resourcev1 "k8s.io/api/resource/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	draclient "k8s.io/dynamic-resource-allocation/client"
	"k8s.io/dynamic-resource-allocation/kubeletplugin"
	"k8s.io/klog/v2"
)

const (
	cleanupIntervalRC = 10 * time.Minute
)

type CheckpointCleanupManager struct {
	waitGroup     sync.WaitGroup
	cancelContext context.CancelFunc
	queue         chan struct{}
	s             *DeviceState
	draclient     *draclient.Client
}

func NewCheckpointCleanupManager(s *DeviceState, client *draclient.Client) *CheckpointCleanupManager {
	// `queue`: buffered channel implementing a pragmatic fixed-size queue,
	// ensuring that at most one cleanup task is enqueued at any time.
	return &CheckpointCleanupManager{
		s:         s,
		draclient: client,
		queue:     make(chan struct{}, 1),
	}
}

// Log relevant error detail, but do not propagate errors anywhere.
func (m *CheckpointCleanupManager) cleanup(ctx context.Context) {
	cp, err := m.s.getCheckpoint()
	if err != nil {
		klog.Errorf("Checkpointed RC cleanup: unable to get checkpoint: %s", err)
		return
	}

	// Get all checkpointed claims in PrepareStarted state.
	filtered := make(PreparedClaimsByUIDV2)
	for uid, claim := range cp.V2.PreparedClaims {
		if claim.CheckpointState == ClaimCheckpointStatePrepareStarted {
			filtered[uid] = claim
		}
	}

	klog.V(4).Infof("Checkpointed ResourceClaims in PrepareStarted state: %d found (of total: %d)", len(filtered), len(cp.V2.PreparedClaims))

	for cpuid, cpclaim := range filtered {
		// Drop the claim from the checkpoint if it is no longer present
		// according to the API server. There are two options for looking up
		// a claim with a specific UID from the API server:
		//
		// 1) List(), with `FieldSelector: "metadata.uid=your-uid"`. Especially
		// across all namespaces (but also within one namespace) this can be
		// considered an irresponsibly expensive lookup.
		//
		// 2) Get(), using a specific name/namespace + subsequent UID
		// comparison. This is a cheap lookup for the API server.

		// For (2), name and namespace must be stored in the checkpoint. That
		// is not true for legacy deployments with checkpoint data created by
		// version 25.3.x of this driver. Detect that situation by looking for
		// an empty `Name`.
		if cpclaim.Name == "" {
			klog.V(4).Infof("Checkpointed RC cleanup: skip checkpointed claim '%s': RC name not in checkpoint", cpuid)
			// Consider fallback: expensive lookup by UID across all namespaces.
			continue
		}

		claim, err := m.getClaimByName(ctx, cpclaim.Name, cpclaim.Namespace)
		if err != nil {
			if errors.IsNotFound(err) {
				klog.V(4).Infof(
					"Checkpointed RC cleanup: partially prepared claim '%s/%s:%s' is stale: not found in API server",
					cpclaim.Namespace,
					cpclaim.Name,
					cpuid)
				m.unprepare(ctx, cpuid, cpclaim)
			} else {
				// A transient error during API server lookup. No explicit
				// retry required: the next periodic cleanup invocation will
				// implicitly retry.
				klog.Infof("Checkpointed RC cleanup: skip for checkpointed claim %s: getClaimByName failed (retry later): %s", cpuid, err)
			}
			continue
		}

		if string(claim.UID) != cpuid {
			// There cannot be two ResourceClaim objects with the same name in
			// the same namespace at the same time. It is possible for a
			// ResourceClaim with the same name to have a different UID if the
			// original object was deleted and a new one with the same name was
			// created. Hence, this checkpointed claim is stale.
			klog.V(4).Infof("Checkpointed RC cleanup: partially prepared claim '%s/%s' is stale: UID changed (checkpoint: %s, API server: %s)", cpclaim.Namespace, cpclaim.Name, cpuid, claim.UID)
			m.unprepare(ctx, cpuid, cpclaim)
			continue
		}

		klog.V(4).Infof("Checkpointed RC cleanup: partially prepared claim not stale: %s", RCToString(claim))
	}
}

// Do not propagate errors back (but log them).
func (m *CheckpointCleanupManager) unprepare(ctx context.Context, uid string, claim PreparedClaim) {
	claimRef := kubeletplugin.NamespacedObject{
		UID: types.UID(uid),
		NamespacedName: types.NamespacedName{
			Name:      claim.Name,
			Namespace: claim.Namespace,
		},
	}

	// Perform one Unprepare attempt. Implicit retrying across periodic
	// cleanup invocations is sufficient. Rely on Unprepare() to delete the
	// claim from the checkpoint (upon success). TODO: review `Unprepare()`
	// for code paths that allow for this claim never to be dropped from the
	// checkpoint (resulting in infinite periodic cleanup attempts for this
	// claim).
	err := m.s.Unprepare(ctx, claimRef)
	if err != nil {
		klog.Warningf("Error during self-initiated unprepare for %s (retried later): %s", claimRef.String(), err)
		return
	}

	klog.Infof("Checkpointed RC cleanup: unprepared stale claim: %s", claimRef.String())
}

func (m *CheckpointCleanupManager) getClaimByName(ctx context.Context, name string, ns string) (*resourcev1.ResourceClaim, error) {
	// The API call below should be responded to with low latency. Choose a
	// timeout constant here that reflects a pathological state if met; in
	// that case, give up.
	childctx, cancel := context.WithTimeout(ctx, 20*time.Second)
	defer cancel()

	// Works across DRA API versions -- but how can we not hardcode the return
	// type version? Or does this always automatically convert to a v1 type?
	claim, err := m.draclient.ResourceClaims(ns).Get(childctx, name, metav1.GetOptions{})
	if err != nil {
		return nil, fmt.Errorf("error getting resource claim %s/%s: %w", ns, name, err)
	}

	return claim, nil
}

func (m *CheckpointCleanupManager) Start(ctx context.Context) error {
	ctx, cancel := context.WithCancel(ctx)
	m.cancelContext = cancel

	m.waitGroup.Add(1)
	go func() {
		defer m.waitGroup.Done()
		// Start producer: periodically submit cleanup tasks.
		m.triggerPeriodically(ctx)
	}()

	m.waitGroup.Add(1)
	go func() {
		defer m.waitGroup.Done()
		// Start consumer.
		m.worker(ctx)
	}()

	klog.V(6).Infof("CheckpointCleanupManager started")
	return nil
}

func (m *CheckpointCleanupManager) Stop() error {
	if m.cancelContext != nil {
		m.cancelContext()
	}
	m.waitGroup.Wait()
	return nil
}

// enqueueCleanup() submits a cleanup task if the queue is currently empty.
// Returns a Boolean indicating whether the task was submitted or not.
func (m *CheckpointCleanupManager) enqueueCleanup() bool {
	select {
	case m.queue <- struct{}{}:
		return true
	default:
		// Channel full: one task already lined up, did not submit more.
		return false
	}
}

// Run forever until the context is canceled.
func (m *CheckpointCleanupManager) worker(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return
		case <-m.queue:
			// Do we want to timeout-control this cleanup run? What may take
			// unexpectedly long: lock acquisition (if we do any, e.g. around
			// checkpoint file mutation), API server interaction.
			m.cleanup(ctx)
		}
	}
}

// Immediately run one cleanup; then periodically submit cleanup tasks
// forever.
func (m *CheckpointCleanupManager) triggerPeriodically(ctx context.Context) {
	// Maybe add jitter, or delay the first cleanup by a somewhat random
	// amount. After all, this periodic cleanup runs in N kubelet plugins and
	// upon driver upgrade they might restart at roughly the same time -- it
	// makes sense to smear the API server load out over time.
	ticker := time.NewTicker(cleanupIntervalRC)
	defer ticker.Stop()

	m.cleanup(ctx)
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			if m.enqueueCleanup() {
				klog.V(6).Infof("Cleanup for checkpointed ResourceClaims in PrepareStarted state: task submitted")
			} else {
				// A previous cleanup is taking long; that may not be normal.
				klog.Warningf("Cleanup for checkpointed ResourceClaims in PrepareStarted state: ongoing, skipped")
			}
		}
	}
}

func RCToString(rc *resourcev1.ResourceClaim) string {
	return fmt.Sprintf("%s/%s:%s", rc.Namespace, rc.Name, rc.UID)
}
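
A hypothetical wiring sketch for the plugin's startup path, written as a helper in the same package (the helper name and the shutdown handling are assumptions, not part of this commit):

// startCheckpointCleanup shows one way the plugin could wire up the manager
// at startup. DeviceState and the DRA client are created elsewhere by the
// plugin; the returned stop function is meant to be called during shutdown.
func startCheckpointCleanup(ctx context.Context, state *DeviceState, client *draclient.Client) (func() error, error) {
	mgr := NewCheckpointCleanupManager(state, client)
	if err := mgr.Start(ctx); err != nil {
		return nil, err
	}
	// Stop() cancels the manager's internal context and waits for both the
	// periodic producer and the worker goroutine to exit.
	return mgr.Stop, nil
}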
