@@ -437,6 +437,12 @@ func (s *DeviceState) applyComputeDomainChannelConfig(ctx context.Context, confi
437437 ComputeDomain : config .DomainID ,
438438 }
439439
440+ // Treat each request as a request for channel zero, even if
441+ // AllocationModeAll.
442+ if err := s .assertImexChannelNotAllocated (0 ); err != nil {
443+ return nil , fmt .Errorf ("allocation failed: %w" , err )
444+ }
445+
440446 // Create any necessary ComputeDomain channels and gather their CDI container edits.
441447 if err := s .computeDomainManager .AssertComputeDomainNamespace (ctx , claim .Namespace , config .DomainID ); err != nil {
442448 return nil , permanentError {fmt .Errorf ("error asserting ComputeDomain's namespace: %w" , err )}
@@ -572,6 +578,47 @@ func (s *DeviceState) getConfigResultsMap(rcs *resourceapi.ResourceClaimStatus,
572578 return configResultsMap , nil
573579}
574580
581+ // assertImexChannelNotAllocated() consults the absolute, node-local source of
582+ // truth (the checkpoint data). It fails when the IMEX channel with ID `id` is
583+ // already in use by another resource claim.
584+ //
585+ // Must be performed in the Prepare() path for any claim asking for a channel.
586+ // This makes sure that Prepare() and Unprepare() calls acting on the same
587+ // resource are processed in the correct order (this prevents for example
588+ // unprepare-after-prepare, cf. issue 641).
589+ //
590+ // The implementation may become more involved when the same IMEX channel may be
591+ // shared across pods on the same node).
592+ func (s * DeviceState ) assertImexChannelNotAllocated (id int ) error {
593+ cp , err := s .getCheckpoint ()
594+ if err != nil {
595+ return fmt .Errorf ("unable to get checkpoint: %w" , err )
596+ }
597+
598+ for claimUID , claim := range cp .V2 .PreparedClaims {
599+ // Ignore non-completed preparations: file-based locking guarantees that
600+ // only one Prepare() runs at any given time. If a claim is in the
601+ // `PrepareStarted` state then it is not actually currently in progress
602+ // of being prepared, but either retried soon (in which case we are
603+ // faster and win over it) or never retried (in which case we can also
604+ // safely allocate).
605+ if claim .CheckpointState != "PrepareCompleted" {
606+ continue
607+ }
608+
609+ for _ , preparedDevice := range claim .PreparedDevices {
610+ for _ , device := range preparedDevice .Devices {
611+ if device .Channel != nil && device .Channel .Info .ID == id {
612+ // Maybe log something based on `claim.Status.ReservedFor`
613+ // to facilitate debugging.
614+ return fmt .Errorf ("channel %d already allocated by claim %s (according to checkpoint)" , id , claimUID )
615+ }
616+ }
617+ }
618+ }
619+ return nil
620+ }
621+
575622// validateDriverVersionForIMEXDaemonsWithDNSNames validates that the driver version
576623// meets the minimum requirement for the IMEXDaemonsWithDNSNames feature gate.
577624func validateDriverVersionForIMEXDaemonsWithDNSNames (flags * Flags , nvdevlib * deviceLib ) error {
0 commit comments