diff --git a/go.mod b/go.mod index 7c9fd5d1..c4c8c1a9 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.22.0 toolchain go1.22.5 require ( - github.com/NVIDIA/go-nvlib v0.6.1 + github.com/NVIDIA/go-nvlib v0.7.0 github.com/NVIDIA/nvidia-container-toolkit v1.16.0 github.com/opencontainers/image-spec v1.1.0 github.com/pelletier/go-toml v1.9.5 diff --git a/go.sum b/go.sum index 19fd20b5..d1d78dae 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,7 @@ github.com/BurntSushi/toml v1.3.2 h1:o7IhLm0Msx3BaB+n3Ag7L8EVlByGnpq14C4YWiu/gL8= github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= -github.com/NVIDIA/go-nvlib v0.6.1 h1:0/5FvaKvDJoJeJ+LFlh+NDQMxMlVw9wOXrOVrGXttfE= -github.com/NVIDIA/go-nvlib v0.6.1/go.mod h1:9UrsLGx/q1OrENygXjOuM5Ey5KCtiZhbvBlbUIxtGWY= +github.com/NVIDIA/go-nvlib v0.7.0 h1:Z/J7skMdLbTiHvomKVsGYsttfQMZj5FwNYIFXhZ4i/c= +github.com/NVIDIA/go-nvlib v0.7.0/go.mod h1:9UrsLGx/q1OrENygXjOuM5Ey5KCtiZhbvBlbUIxtGWY= github.com/NVIDIA/go-nvml v0.12.4-0 h1:4tkbB3pT1O77JGr0gQ6uD8FrsUPqP1A/EOEm2wI1TUg= github.com/NVIDIA/go-nvml v0.12.4-0/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= github.com/NVIDIA/nvidia-container-toolkit v1.16.0 h1:NZyKfW0s8nfghoBSJJUth7OZB5ZzRGYbn3RaiTDYdHM= diff --git a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go index 5b21fc13..5fac8c11 100644 --- a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go +++ b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go @@ -32,6 +32,7 @@ type Device interface { GetMigDevices() ([]MigDevice, error) GetMigProfiles() ([]MigProfile, error) GetPCIBusID() (string, error) + IsFabricAttached() (bool, error) IsMigCapable() (bool, error) IsMigEnabled() (bool, error) VisitMigDevices(func(j int, m MigDevice) error) error @@ -208,6 +209,47 @@ func (d *device) IsMigEnabled() (bool, error) { return (mode == nvml.DEVICE_MIG_ENABLE), nil } +// IsFabricAttached checks if a device is attached to a GPU fabric. +func (d *device) IsFabricAttached() (bool, error) { + if d.lib.hasSymbol("nvmlDeviceGetGpuFabricInfo") { + info, ret := d.GetGpuFabricInfo() + if ret == nvml.ERROR_NOT_SUPPORTED { + return false, nil + } + if ret != nvml.SUCCESS { + return false, fmt.Errorf("error getting GPU Fabric Info: %v", ret) + } + if info.State != nvml.GPU_FABRIC_STATE_COMPLETED { + return false, nil + } + if nvml.Return(info.Status) != nvml.SUCCESS { + return false, nil + } + + return true, nil + } + + if d.lib.hasSymbol("nvmlDeviceGetGpuFabricInfoV") { + info, ret := d.GetGpuFabricInfoV().V2() + if ret == nvml.ERROR_NOT_SUPPORTED { + return false, nil + } + if ret != nvml.SUCCESS { + return false, fmt.Errorf("error getting GPU Fabric Info: %v", ret) + } + if info.State != nvml.GPU_FABRIC_STATE_COMPLETED { + return false, nil + } + if nvml.Return(info.Status) != nvml.SUCCESS { + return false, nil + } + + return true, nil + } + + return false, nil +} + // VisitMigDevices walks a top-level device and invokes a callback function for each MIG device configured on it. func (d *device) VisitMigDevices(visit func(int, MigDevice) error) error { capable, err := d.IsMigCapable() diff --git a/vendor/modules.txt b/vendor/modules.txt index e992c171..038b168f 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -2,7 +2,7 @@ ## explicit; go 1.16 github.com/BurntSushi/toml github.com/BurntSushi/toml/internal -# github.com/NVIDIA/go-nvlib v0.6.1 +# github.com/NVIDIA/go-nvlib v0.7.0 ## explicit; go 1.20 github.com/NVIDIA/go-nvlib/pkg/nvlib/device github.com/NVIDIA/go-nvlib/pkg/nvlib/info