Merge pull request #630 from jgehrcke/jp/clique-id-readout-logging

jgehrcke · web-flow · commit 40834bbc7d82 · 2025-10-02T11:46:02.000+02:00
CD plugin: log detail around fabric cliqueID readout
diff --git a/cmd/compute-domain-kubelet-plugin/device_state.go b/cmd/compute-domain-kubelet-plugin/device_state.go
@@ -94,6 +94,8 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
 		return nil, fmt.Errorf("unable to create CDI handler: %w", err)
 	}
 
+	// TODO: explore calling this not only during plugin startup because this
+	// information may change during runtime.
 	cliqueID, err := nvdevlib.getCliqueID()
 	if err != nil {
 		return nil, fmt.Errorf("error getting cliqueID: %w", err)
diff --git a/cmd/compute-domain-kubelet-plugin/nvlib.go b/cmd/compute-domain-kubelet-plugin/nvlib.go
@@ -206,28 +206,41 @@ func (l deviceLib) getCliqueID() (string, error) {
 	uniqueCliqueIDs := make(map[string]struct{})
 
 	err := l.VisitDevices(func(i int, d nvdev.Device) error {
+		duid, ret := d.GetUUID()
+		if ret != nvml.SUCCESS {
+			return fmt.Errorf("failed to read device uuid (%d): %w", i, ret)
+		}
+
 		isFabricAttached, err := d.IsFabricAttached()
 		if err != nil {
-			return fmt.Errorf("error checking if device is fabric attached: %w", err)
+			return fmt.Errorf("error checking if fabric is attached (device %d/%s): %w", i, duid, err)
 		}
+
 		if !isFabricAttached {
+			klog.Infof("no-clique fallback: fabric not attached (device %d/%s)", i, duid)
 			return nil
 		}
 
+		// TODO: explore using GetGpuFabricInfoV() which can return
+		// nvmlGpuFabricInfo_v3_t which contains `state`, `status`, and
+		// `healthSummary`. The latter we may at least want to log (may be
+		// "unhealthy"). See
+		// https://docs.nvidia.com/deploy/nvml-api/group__nvmlFabricDefs.html
 		info, ret := d.GetGpuFabricInfo()
 		if ret != nvml.SUCCESS {
-			return fmt.Errorf("failed to get GPU fabric info: %w", ret)
+			return fmt.Errorf("failed to get GPU fabric info (device %d/%s): %w", i, duid, ret)
 		}
 
 		clusterUUID, err := uuid.FromBytes(info.ClusterUuid[:])
 		if err != nil {
-			return fmt.Errorf("invalid cluster UUID: %w", err)
+			return fmt.Errorf("invalid cluster UUID (device %d/%s): %w", i, duid, err)
 		}
 
 		cliqueID := fmt.Sprintf("%d", info.CliqueId)
 
 		uniqueClusterUUIDs[clusterUUID.String()] = struct{}{}
 		uniqueCliqueIDs[cliqueID] = struct{}{}
+		klog.Infof("identified fabric clique UUID/ID (device %d/%s): %s/%s", i, duid, clusterUUID.String(), cliqueID)
 
 		return nil
 	})

Original file line number	Diff line number	Diff line change
`@@ -94,6 +94,8 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {`
`94`	`94`	`return nil, fmt.Errorf("unable to create CDI handler: %w", err)`
`95`	`95`	`}`
`96`	`96`
	`97`	`+ // TODO: explore calling this not only during plugin startup because this`
	`98`	`+ // information may change during runtime.`
`97`	`99`	`cliqueID, err := nvdevlib.getCliqueID()`
`98`	`100`	`if err != nil {`
`99`	`101`	`return nil, fmt.Errorf("error getting cliqueID: %w", err)`