Skip to content

Commit 8e568dc

Browse files
committed
CD plugin: channel prepare: fail if allocated as of checkpoint
Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>
1 parent c10a367 commit 8e568dc

File tree

1 file changed

+47
-0
lines changed

1 file changed

+47
-0
lines changed

cmd/compute-domain-kubelet-plugin/device_state.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,12 @@ func (s *DeviceState) applyComputeDomainChannelConfig(ctx context.Context, confi
437437
ComputeDomain: config.DomainID,
438438
}
439439

440+
// Treat each request as a request for channel zero, even if
441+
// AllocationModeAll.
442+
if err := s.assertImexChannelNotAllocated(0); err != nil {
443+
return nil, fmt.Errorf("allocation failed: %w", err)
444+
}
445+
440446
// Create any necessary ComputeDomain channels and gather their CDI container edits.
441447
if err := s.computeDomainManager.AssertComputeDomainNamespace(ctx, claim.Namespace, config.DomainID); err != nil {
442448
return nil, permanentError{fmt.Errorf("error asserting ComputeDomain's namespace: %w", err)}
@@ -572,6 +578,47 @@ func (s *DeviceState) getConfigResultsMap(rcs *resourceapi.ResourceClaimStatus,
572578
return configResultsMap, nil
573579
}
574580

581+
// assertImexChannelNotAllocated() consults the absolute, node-local source of
582+
// truth (the checkpoint data). It fails when the IMEX channel with ID `id` is
583+
// already in use by another resource claim.
584+
//
585+
// Must be performed in the Prepare() path for any claim asking for a channel.
586+
// This makes sure that Prepare() and Unprepare() calls acting on the same
587+
// resource are processed in the correct order (this prevents for example
588+
// unprepare-after-prepare, cf. issue 641).
589+
//
590+
// The implementation may become more involved when the same IMEX channel may be
591+
// shared across pods on the same node).
592+
func (s *DeviceState) assertImexChannelNotAllocated(id int) error {
593+
cp, err := s.getCheckpoint()
594+
if err != nil {
595+
return fmt.Errorf("unable to get checkpoint: %w", err)
596+
}
597+
598+
for claimUID, claim := range cp.V2.PreparedClaims {
599+
// Ignore non-completed preparations: file-based locking guarantees that
600+
// only one Prepare() runs at any given time. If a claim is in the
601+
// `PrepareStarted` state then it is not actually currently in progress
602+
// of being prepared, but either retried soon (in which case we are
603+
// faster and win over it) or never retried (in which case we can also
604+
// safely allocate).
605+
if claim.CheckpointState != "PrepareCompleted" {
606+
continue
607+
}
608+
609+
for _, preparedDevice := range claim.PreparedDevices {
610+
for _, device := range preparedDevice.Devices {
611+
if device.Channel != nil && device.Channel.Info.ID == id {
612+
// Maybe log something based on `claim.Status.ReservedFor`
613+
// to facilitate debugging.
614+
return fmt.Errorf("channel %d already allocated by claim %s (according to checkpoint)", id, claimUID)
615+
}
616+
}
617+
}
618+
}
619+
return nil
620+
}
621+
575622
// validateDriverVersionForIMEXDaemonsWithDNSNames validates that the driver version
576623
// meets the minimum requirement for the IMEXDaemonsWithDNSNames feature gate.
577624
func validateDriverVersionForIMEXDaemonsWithDNSNames(flags *Flags, nvdevlib *deviceLib) error {

0 commit comments

Comments
 (0)