@@ -425,6 +425,12 @@ func (s *DeviceState) applyComputeDomainChannelConfig(ctx context.Context, confi
425425 return nil , fmt .Errorf ("applyComputeDomainChannelConfig: unexpected results %v" , results )
426426 }
427427
428+ // For now, we treat each request as a request for channel zero, even if
429+ // AllocationModeAll.
430+ if err := s .allocateImexChannel (0 ); err != nil {
431+ return nil , fmt .Errorf ("allocation failed: %w" , err )
432+ }
433+
428434 // If explicitly requested, inject all channels instead of just one.
429435 chancount := 1
430436 if config .AllocationMode == configapi .ComputeDomainChannelAllocationModeAll {
@@ -572,6 +578,43 @@ func (s *DeviceState) getConfigResultsMap(rcs *resourceapi.ResourceClaimStatus,
572578 return configResultsMap , nil
573579}
574580
581+ // allocateImexChannel() consults the (absolute, node-local) source of truth,
582+ // which currently is the checkpoint data. For now, It fails with an error when
583+ // the channel with the given `id` is already allocated for/by another resource
584+ // claim (soon, this implementation may become more involved when the same IMEX
585+ // channel may be shared across pods on the same node). Note that generally, we
586+ // must expect prepare() and unprepare() calls acting on the same resource to
587+ // arrive out-of-order (cf.
588+ // https://github.com/NVIDIA/k8s-dra-driver-gpu/issues/641).
589+ func (s * DeviceState ) allocateImexChannel (id int ) error {
590+ cp , err := s .getCheckpoint ()
591+ if err != nil {
592+ return fmt .Errorf ("unable to get checkpoint: %w" , err )
593+ }
594+
595+ for claimUID , claim := range cp .V2 .PreparedClaims {
596+ // Ignore non-completed preparations: only one instance of this program
597+ // is running, and we only run one Prepare() at any given time. Is that
598+ // true during upgrades though? If this is not true, then we must fail
599+ // allocation also on PrepareStarted -- which leads to the question of
600+ // how we clean up long-term stale PrepareStarted entries.
601+ if claim .CheckpointState != "PrepareCompleted" {
602+ continue
603+ }
604+
605+ for _ , preparedDevice := range claim .PreparedDevices {
606+ for _ , device := range preparedDevice .Devices {
607+ if device .Channel != nil && device .Channel .Info .ID == id {
608+ // Maybe log something based on `claim.Status.ReservedFor`
609+ // to facilitate debugging.
610+ return fmt .Errorf ("channel %d already allocated by claim %s (according to checkpoint)" , id , claimUID )
611+ }
612+ }
613+ }
614+ }
615+ return nil
616+ }
617+
575618// validateDriverVersionForIMEXDaemonsWithDNSNames validates that the driver version
576619// meets the minimum requirement for the IMEXDaemonsWithDNSNames feature gate.
577620func validateDriverVersionForIMEXDaemonsWithDNSNames (flags * Flags , nvdevlib * deviceLib ) error {
0 commit comments