@@ -425,12 +425,6 @@ func (s *DeviceState) applyComputeDomainChannelConfig(ctx context.Context, confi
425425 return nil , fmt .Errorf ("applyComputeDomainChannelConfig: unexpected results %v" , results )
426426 }
427427
428- // For now, we treat each request as a request for channel zero, even if
429- // AllocationModeAll.
430- if err := s .allocateImexChannel (0 ); err != nil {
431- return nil , fmt .Errorf ("allocation failed: %w" , err )
432- }
433-
434428 // If explicitly requested, inject all channels instead of just one.
435429 chancount := 1
436430 if config .AllocationMode == configapi .ComputeDomainChannelAllocationModeAll {
@@ -443,6 +437,12 @@ func (s *DeviceState) applyComputeDomainChannelConfig(ctx context.Context, confi
443437 ComputeDomain : config .DomainID ,
444438 }
445439
440+ // Treat each request as a request for channel zero, even if
441+ // AllocationModeAll.
442+ if err := s .assertImexChannelNotAllocated (0 ); err != nil {
443+ return nil , fmt .Errorf ("allocation failed: %w" , err )
444+ }
445+
446446 // Create any necessary ComputeDomain channels and gather their CDI container edits.
447447 if err := s .computeDomainManager .AssertComputeDomainNamespace (ctx , claim .Namespace , config .DomainID ); err != nil {
448448 return nil , permanentError {fmt .Errorf ("error asserting ComputeDomain's namespace: %w" , err )}
@@ -578,26 +578,30 @@ func (s *DeviceState) getConfigResultsMap(rcs *resourceapi.ResourceClaimStatus,
578578 return configResultsMap , nil
579579}
580580
581- // allocateImexChannel () consults the ( absolute, node-local) source of truth,
582- // which currently is the checkpoint data. For now, It fails with an error when
583- // the channel with the given `id` is already allocated for/ by another resource
584- // claim (soon, this implementation may become more involved when the same IMEX
585- // channel may be shared across pods on the same node). Note that generally, we
586- // must expect prepare () and unprepare () calls acting on the same resource to
587- // arrive out-of- order (cf.
581+ // assertImexChannelNotAllocated () consults the absolute, node-local source of
582+ // truth ( the checkpoint data) and fails when the channel with ID `id` is
583+ // already in use by another resource claim.
584+ //
585+ // Must be performed in the Prepare() path for any claim asking for a channel to
586+ // force processing Prepare () and Unprepare () calls acting on the same resource
587+ // in the correct order (to prevent unprepare-after-prepare, cf.
588588// https://github.com/NVIDIA/k8s-dra-driver-gpu/issues/641).
589- func (s * DeviceState ) allocateImexChannel (id int ) error {
589+ //
590+ // This implementation may become more involved when the same IMEX channel may
591+ // be shared across pods on the same node.
592+ func (s * DeviceState ) assertImexChannelNotAllocated (id int ) error {
590593 cp , err := s .getCheckpoint ()
591594 if err != nil {
592595 return fmt .Errorf ("unable to get checkpoint: %w" , err )
593596 }
594597
595598 for claimUID , claim := range cp .V2 .PreparedClaims {
596- // Ignore non-completed preparations: only one instance of this program
597- // is running, and we only run one Prepare() at any given time. Is that
598- // true during upgrades though? If this is not true, then we must fail
599- // allocation also on PrepareStarted -- which leads to the question of
600- // how we clean up long-term stale PrepareStarted entries.
599+ // Ignore non-completed preparations: file-based locking guarantees that
600+ // only one Prepare() runs at any given time. If a claim is in the
601+ // `PrepareStarted` state then it is not actually being prepared right
602+ // now; it will either be retried soon (in which case we are faster and
603+ // win over it) or never be retried (in which case we can also safely
604+ // allocate).
601605 if claim .CheckpointState != "PrepareCompleted" {
602606 continue
603607 }
0 commit comments