Skip to content

Commit 40834bb

Browse files
authored
Merge pull request #630 from jgehrcke/jp/clique-id-readout-logging
CD plugin: log detail around fabric cliqueID readout
2 parents 416c2c6 + cc3f6f6 commit 40834bb

File tree

2 files changed: +18 -3 lines changed

2 files changed: +18 -3 lines changed

cmd/compute-domain-kubelet-plugin/device_state.go

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,8 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
94 94
return nil, fmt.Errorf("unable to create CDI handler: %w", err)
95 95
}
96 96

97+
// TODO: explore calling this not only during plugin startup because this
98+
// information may change during runtime.
97 99
cliqueID, err := nvdevlib.getCliqueID()
98 100
if err != nil {
99 101
return nil, fmt.Errorf("error getting cliqueID: %w", err)

cmd/compute-domain-kubelet-plugin/nvlib.go

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -206,28 +206,41 @@ func (l deviceLib) getCliqueID() (string, error) {
206 206
uniqueCliqueIDs := make(map[string]struct{})
207 207

208 208
err := l.VisitDevices(func(i int, d nvdev.Device) error {
209+
duid, ret := d.GetUUID()
210+
if ret != nvml.SUCCESS {
211+
return fmt.Errorf("failed to read device uuid (%d): %w", i, ret)
212+
}
213+
209 214
isFabricAttached, err := d.IsFabricAttached()
210 215
if err != nil {
211-
return fmt.Errorf("error checking if device is fabric attached: %w", err)
216+
return fmt.Errorf("error checking if fabric is attached (device %d/%s): %w", i, duid, err)
212 217
}
218+
213 219
if !isFabricAttached {
220+
klog.Infof("no-clique fallback: fabric not attached (device %d/%s)", i, duid)
214 221
return nil
215 222
}
216 223

224+
// TODO: explore using GetGpuFabricInfoV() which can return
225+
// nvmlGpuFabricInfo_v3_t which contains `state`, `status`, and
226+
// `healthSummary`. The latter we may at least want to log (may be
227+
// "unhealthy"). See
228+
// https://docs.nvidia.com/deploy/nvml-api/group__nvmlFabricDefs.html
217 229
info, ret := d.GetGpuFabricInfo()
218 230
if ret != nvml.SUCCESS {
219-
return fmt.Errorf("failed to get GPU fabric info: %w", ret)
231+
return fmt.Errorf("failed to get GPU fabric info (device %d/%s): %w", i, duid, ret)
220 232
}
221 233

222 234
clusterUUID, err := uuid.FromBytes(info.ClusterUuid[:])
223 235
if err != nil {
224-
return fmt.Errorf("invalid cluster UUID: %w", err)
236+
return fmt.Errorf("invalid cluster UUID (device %d/%s): %w", i, duid, err)
225 237
}
226 238

227 239
cliqueID := fmt.Sprintf("%d", info.CliqueId)
228 240

229 241
uniqueClusterUUIDs[clusterUUID.String()] = struct{}{}
230 242
uniqueCliqueIDs[cliqueID] = struct{}{}
243+
klog.Infof("identified fabric clique UUID/ID (device %d/%s): %s/%s", i, duid, clusterUUID.String(), cliqueID)
231 244

232 245
return nil
233 246
})

0 commit comments

Comments
 (0)