@@ -206,28 +206,41 @@ func (l deviceLib) getCliqueID() (string, error) {
206206 uniqueCliqueIDs := make (map [string ]struct {})
207207
208208 err := l .VisitDevices (func (i int , d nvdev.Device ) error {
209+ duid , ret := d .GetUUID ()
210+ if ret != nvml .SUCCESS {
211+ return fmt .Errorf ("failed to read device uuid (%d): %w" , i , ret )
212+ }
213+
209214 isFabricAttached , err := d .IsFabricAttached ()
210215 if err != nil {
211- return fmt .Errorf ("error checking if device is fabric attached: %w" , err )
216+ return fmt .Errorf ("error checking if fabric is attached (device %d/%s) : %w" , i , duid , err )
212217 }
218+
213219 if ! isFabricAttached {
220+ klog .Infof ("no-clique fallback: fabric not attached (device %d/%s)" , i , duid )
214221 return nil
215222 }
216223
224+ // TODO: explore using GetGpuFabricInfoV() which can return
225+ // nvmlGpuFabricInfo_v3_t which contains `state`, `status`, and
226+ // `healthSummary`. The latter we may at least want to log (may be
227+ // "unhealthy"). See
228+ // https://docs.nvidia.com/deploy/nvml-api/group__nvmlFabricDefs.html
217229 info , ret := d .GetGpuFabricInfo ()
218230 if ret != nvml .SUCCESS {
219- return fmt .Errorf ("failed to get GPU fabric info: %w" , ret )
231+ return fmt .Errorf ("failed to get GPU fabric info (device %d/%s) : %w" , i , duid , ret )
220232 }
221233
222234 clusterUUID , err := uuid .FromBytes (info .ClusterUuid [:])
223235 if err != nil {
224- return fmt .Errorf ("invalid cluster UUID: %w" , err )
236+ return fmt .Errorf ("invalid cluster UUID (device %d/%s) : %w" , i , duid , err )
225237 }
226238
227239 cliqueID := fmt .Sprintf ("%d" , info .CliqueId )
228240
229241 uniqueClusterUUIDs [clusterUUID .String ()] = struct {}{}
230242 uniqueCliqueIDs [cliqueID ] = struct {}{}
243+ klog .Infof ("identified fabric clique UUID/ID (device %d/%s): %s/%s" , i , duid , clusterUUID .String (), cliqueID )
231244
232245 return nil
233246 })
0 commit comments