From 2d6f6f629680bf335c074e9e81a9550444d9b1c9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 21 Aug 2025 08:16:26 +0000 Subject: [PATCH] Bump github.com/NVIDIA/go-nvml from 0.12.9-0 to 0.13.0-0 Bumps [github.com/NVIDIA/go-nvml](https://github.com/NVIDIA/go-nvml) from 0.12.9-0 to 0.13.0-0. - [Release notes](https://github.com/NVIDIA/go-nvml/releases) - [Commits](https://github.com/NVIDIA/go-nvml/compare/v0.12.9-0...v0.13.0-0) --- updated-dependencies: - dependency-name: github.com/NVIDIA/go-nvml dependency-version: 0.13.0-0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 +- .../NVIDIA/go-nvml/pkg/nvml/const.go | 67 +- .../NVIDIA/go-nvml/pkg/nvml/device.go | 204 +++- .../NVIDIA/go-nvml/pkg/nvml/nvml.go | 100 +- .../github.com/NVIDIA/go-nvml/pkg/nvml/nvml.h | 958 ++++++++++++------ .../NVIDIA/go-nvml/pkg/nvml/types_gen.go | 227 +++-- .../NVIDIA/go-nvml/pkg/nvml/vgpu.go | 2 + .../go-nvml/pkg/nvml/zz_generated.api.go | 24 + vendor/modules.txt | 2 +- 10 files changed, 1168 insertions(+), 422 deletions(-) diff --git a/go.mod b/go.mod index 07a09f174..2aaa7f9f6 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ toolchain go1.24.1 require ( github.com/NVIDIA/go-gpuallocator v0.6.0 github.com/NVIDIA/go-nvlib v0.7.4 - github.com/NVIDIA/go-nvml v0.12.9-0 + github.com/NVIDIA/go-nvml v0.13.0-0 github.com/NVIDIA/nvidia-container-toolkit v1.17.8 github.com/fsnotify/fsnotify v1.9.0 github.com/google/renameio v1.0.1 diff --git a/go.sum b/go.sum index b4fec66ea..06c4aa015 100644 --- a/go.sum +++ b/go.sum @@ -2,8 +2,8 @@ github.com/NVIDIA/go-gpuallocator v0.6.0 h1:2PA2swx59gJYREPkZNTGtyCP6Pnz3WEgnYsX github.com/NVIDIA/go-gpuallocator v0.6.0/go.mod h1:c+Yspg+/QxWOmoSQeuI48Z/7nS+mMPtxyj1NYUTwewY= github.com/NVIDIA/go-nvlib v0.7.4 h1:qnXK8qhm45YfxalhZ76XwKdAMmxz1GIgzE0e/Hhhshs= github.com/NVIDIA/go-nvlib v0.7.4/go.mod h1:i95Je7GinMy/+BDs++DAdbPmT2TubjNP8i8joC7DD7I= -github.com/NVIDIA/go-nvml v0.12.9-0 h1:e344UK8ZkeMeeLkdQtRhmXRxNf+u532LDZPGMtkdus0= -github.com/NVIDIA/go-nvml v0.12.9-0/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= +github.com/NVIDIA/go-nvml v0.13.0-0 h1:ZbPDoheBQGF9WHNb3qIXvhgNGqc/p7OsUPvRzuI9qGE= +github.com/NVIDIA/go-nvml v0.13.0-0/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= github.com/NVIDIA/nvidia-container-toolkit v1.17.8 h1:ndE23TKvQBicsZT88mzZudygn6JNOe6+UsIgqk6gGvw= github.com/NVIDIA/nvidia-container-toolkit v1.17.8/go.mod h1:khOgMW80+g8eX/1zPlO4demLShHht9I0YEm8ngcPgwk= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go index 09e82fc6b..8a6a93c2a 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go @@ -31,9 +31,9 @@ const ( // NO_UNVERSIONED_FUNC_DEFS as defined in go-nvml/:24 NO_UNVERSIONED_FUNC_DEFS = 1 // API_VERSION as defined in nvml/nvml.h - API_VERSION = 12 + API_VERSION = 13 // API_VERSION_STR as defined in nvml/nvml.h - API_VERSION_STR = "12" + API_VERSION_STR = "13" // VALUE_NOT_AVAILABLE as defined in nvml/nvml.h VALUE_NOT_AVAILABLE = -1 // DEVICE_PCI_BUS_ID_BUFFER_SIZE as defined in nvml/nvml.h @@ -68,6 +68,14 @@ const ( MAX_GPU_PERF_PSTATES = 16 // PERF_MODES_BUFFER_SIZE as defined in nvml/nvml.h PERF_MODES_BUFFER_SIZE = 2048 + // POWER_MIZER_MODE_ADAPTIVE as defined in nvml/nvml.h + POWER_MIZER_MODE_ADAPTIVE = 0 + // POWER_MIZER_MODE_PREFER_MAXIMUM_PERFORMANCE as defined in nvml/nvml.h + POWER_MIZER_MODE_PREFER_MAXIMUM_PERFORMANCE = 1 + // POWER_MIZER_MODE_AUTO as defined in nvml/nvml.h + POWER_MIZER_MODE_AUTO = 2 + // POWER_MIZER_MODE_PREFER_CONSISTENT_PERFORMANCE as defined in nvml/nvml.h + POWER_MIZER_MODE_PREFER_CONSISTENT_PERFORMANCE = 3 // GSP_FIRMWARE_VERSION_BUF_SIZE as defined in nvml/nvml.h GSP_FIRMWARE_VERSION_BUF_SIZE = 64 // DEVICE_ARCH_KEPLER as defined in nvml/nvml.h @@ -88,8 +96,6 @@ const ( DEVICE_ARCH_HOPPER = 9 // DEVICE_ARCH_BLACKWELL as defined in nvml/nvml.h DEVICE_ARCH_BLACKWELL = 10 - // DEVICE_ARCH_T23X as defined in nvml/nvml.h - DEVICE_ARCH_T23X = 11 // DEVICE_ARCH_UNKNOWN as defined in nvml/nvml.h DEVICE_ARCH_UNKNOWN = 4294967295 // BUS_TYPE_UNKNOWN as defined in nvml/nvml.h @@ -900,6 +906,8 @@ const ( CC_SYSTEM_MULTIGPU_NONE = 0 // CC_SYSTEM_MULTIGPU_PROTECTED_PCIE as defined in nvml/nvml.h CC_SYSTEM_MULTIGPU_PROTECTED_PCIE = 1 + // CC_SYSTEM_MULTIGPU_NVLE as defined in nvml/nvml.h + CC_SYSTEM_MULTIGPU_NVLE = 2 // CC_ACCEPTING_CLIENT_REQUESTS_FALSE as defined in nvml/nvml.h CC_ACCEPTING_CLIENT_REQUESTS_FALSE = 0 // CC_ACCEPTING_CLIENT_REQUESTS_TRUE as defined in nvml/nvml.h @@ -972,6 +980,30 @@ const ( GPU_FABRIC_HEALTH_MASK_SHIFT_ACCESS_TIMEOUT_RECOVERY = 6 // GPU_FABRIC_HEALTH_MASK_WIDTH_ACCESS_TIMEOUT_RECOVERY as defined in nvml/nvml.h GPU_FABRIC_HEALTH_MASK_WIDTH_ACCESS_TIMEOUT_RECOVERY = 3 + // GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_NOT_SUPPORTED as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_NOT_SUPPORTED = 0 + // GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_NONE as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_NONE = 1 + // GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_INCORRECT_SYSGUID as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_INCORRECT_SYSGUID = 2 + // GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_INCORRECT_CHASSIS_SN as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_INCORRECT_CHASSIS_SN = 3 + // GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_NO_PARTITION as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_NO_PARTITION = 4 + // GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_INSUFFICIENT_NVLINKS as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_INSUFFICIENT_NVLINKS = 5 + // GPU_FABRIC_HEALTH_MASK_SHIFT_INCORRECT_CONFIGURATION as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_MASK_SHIFT_INCORRECT_CONFIGURATION = 8 + // GPU_FABRIC_HEALTH_MASK_WIDTH_INCORRECT_CONFIGURATION as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_MASK_WIDTH_INCORRECT_CONFIGURATION = 15 + // GPU_FABRIC_HEALTH_SUMMARY_NOT_SUPPORTED as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_SUMMARY_NOT_SUPPORTED = 0 + // GPU_FABRIC_HEALTH_SUMMARY_HEALTHY as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_SUMMARY_HEALTHY = 1 + // GPU_FABRIC_HEALTH_SUMMARY_UNHEALTHY as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_SUMMARY_UNHEALTHY = 2 + // GPU_FABRIC_HEALTH_SUMMARY_LIMITED_CAPACITY as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_SUMMARY_LIMITED_CAPACITY = 3 // INIT_FLAG_NO_GPUS as defined in nvml/nvml.h INIT_FLAG_NO_GPUS = 1 // INIT_FLAG_NO_ATTACH as defined in nvml/nvml.h @@ -1016,6 +1048,20 @@ const ( NVLINK_STATE_SLEEP = 2 // NVLINK_TOTAL_SUPPORTED_BW_MODES as defined in nvml/nvml.h NVLINK_TOTAL_SUPPORTED_BW_MODES = 23 + // NVLINK_FIRMWARE_UCODE_TYPE_MSE as defined in nvml/nvml.h + NVLINK_FIRMWARE_UCODE_TYPE_MSE = 1 + // NVLINK_FIRMWARE_UCODE_TYPE_NETIR as defined in nvml/nvml.h + NVLINK_FIRMWARE_UCODE_TYPE_NETIR = 2 + // NVLINK_FIRMWARE_UCODE_TYPE_NETIR_UPHY as defined in nvml/nvml.h + NVLINK_FIRMWARE_UCODE_TYPE_NETIR_UPHY = 3 + // NVLINK_FIRMWARE_UCODE_TYPE_NETIR_CLN as defined in nvml/nvml.h + NVLINK_FIRMWARE_UCODE_TYPE_NETIR_CLN = 4 + // NVLINK_FIRMWARE_UCODE_TYPE_NETIR_DLN as defined in nvml/nvml.h + NVLINK_FIRMWARE_UCODE_TYPE_NETIR_DLN = 5 + // NVLINK_FIRMWARE_VERSION_LENGTH as defined in nvml/nvml.h + NVLINK_FIRMWARE_VERSION_LENGTH = 100 + // PRM_DATA_MAX_SIZE as defined in nvml/nvml.h + PRM_DATA_MAX_SIZE = 496 // DEVICE_MIG_DISABLE as defined in nvml/nvml.h DEVICE_MIG_DISABLE = 0 // DEVICE_MIG_ENABLE as defined in nvml/nvml.h @@ -1341,7 +1387,7 @@ const ( BRAND_NVIDIA BrandType = 14 BRAND_GEFORCE_RTX BrandType = 15 BRAND_TITAN_RTX BrandType = 16 - BRAND_COUNT BrandType = 17 + BRAND_COUNT BrandType = 18 ) // TemperatureThresholds as declared in nvml/nvml.h @@ -1531,6 +1577,7 @@ const ( ERROR_NOT_READY Return = 27 ERROR_GPU_NOT_FOUND Return = 28 ERROR_INVALID_STATE Return = 29 + ERROR_RESET_TYPE_NOT_SUPPORTED Return = 30 ERROR_UNKNOWN Return = 999 ) @@ -1769,6 +1816,16 @@ const ( VGPU_COMPATIBILITY_LIMIT_OTHER VgpuPgpuCompatibilityLimitCode = -2147483648 ) +// DeviceAddressingModeType as declared in nvml/nvml.h +type DeviceAddressingModeType int32 + +// DeviceAddressingModeType enumeration from nvml/nvml.h +const ( + DEVICE_ADDRESSING_MODE_NONE DeviceAddressingModeType = iota + DEVICE_ADDRESSING_MODE_HMM DeviceAddressingModeType = 1 + DEVICE_ADDRESSING_MODE_ATS DeviceAddressingModeType = 2 +) + // ThermalTarget as declared in nvml/nvml.h type ThermalTarget int32 diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go index 4784cd433..671776c4b 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go @@ -113,6 +113,8 @@ func (l *library) DeviceGetHandleByIndex(index int) (Device, Return) { } // nvml.DeviceGetHandleBySerial() +// +// Deprecated: Use DeviceGetHandleByUUID instead. func (l *library) DeviceGetHandleBySerial(serial string) (Device, Return) { var device nvmlDevice ret := nvmlDeviceGetHandleBySerial(serial+string(rune(0)), &device) @@ -490,10 +492,13 @@ func (device nvmlDevice) GetMaxClockInfo(clockType ClockType) (uint32, Return) { } // nvml.DeviceGetApplicationsClock() +// +// Deprecated: Applications clocks are deprecated and will be removed in CUDA 14.0. func (l *library) DeviceGetApplicationsClock(device Device, clockType ClockType) (uint32, Return) { return device.GetApplicationsClock(clockType) } +// Deprecated: Applications clocks are deprecated and will be removed in CUDA 14.0. func (device nvmlDevice) GetApplicationsClock(clockType ClockType) (uint32, Return) { var clockMHz uint32 ret := nvmlDeviceGetApplicationsClock(device, clockType, &clockMHz) @@ -501,10 +506,13 @@ func (device nvmlDevice) GetApplicationsClock(clockType ClockType) (uint32, Retu } // nvml.DeviceGetDefaultApplicationsClock() +// +// Deprecated: Applications clocks are deprecated and will be removed in CUDA 14.0. func (l *library) DeviceGetDefaultApplicationsClock(device Device, clockType ClockType) (uint32, Return) { return device.GetDefaultApplicationsClock(clockType) } +// Deprecated: Applications clocks are deprecated and will be removed in CUDA 14.0. func (device nvmlDevice) GetDefaultApplicationsClock(clockType ClockType) (uint32, Return) { var clockMHz uint32 ret := nvmlDeviceGetDefaultApplicationsClock(device, clockType, &clockMHz) @@ -512,10 +520,13 @@ func (device nvmlDevice) GetDefaultApplicationsClock(clockType ClockType) (uint3 } // nvml.DeviceResetApplicationsClocks() +// +// Deprecated: Use DeviceResetMemoryLockedClocks for Memory Clocks and DeviceResetGpuLockedClocks for Graphics Clocks instead func (l *library) DeviceResetApplicationsClocks(device Device) Return { return device.ResetApplicationsClocks() } +// Deprecated: Use DeviceResetMemoryLockedClocks for Memory Clocks and DeviceResetGpuLockedClocks for Graphics Clocks instead func (device nvmlDevice) ResetApplicationsClocks() Return { return nvmlDeviceResetApplicationsClocks(device) } @@ -627,10 +638,13 @@ func (device nvmlDevice) GetNumFans() (int, Return) { } // nvml.DeviceGetTemperature() +// +// Deprecated: Use DeviceGetTemperatureV instead. func (l *library) DeviceGetTemperature(device Device, sensorType TemperatureSensors) (uint32, Return) { return device.GetTemperature(sensorType) } +// Deprecated: Use DeviceGetTemperatureV instead. func (device nvmlDevice) GetTemperature(sensorType TemperatureSensors) (uint32, Return) { var temp uint32 ret := nvmlDeviceGetTemperature(device, sensorType, &temp) @@ -671,10 +685,13 @@ func (device nvmlDevice) GetPerformanceState() (Pstates, Return) { } // nvml.DeviceGetCurrentClocksThrottleReasons() +// +// Deprecated: Use DeviceGetCurrentClocksEventReasons instead func (l *library) DeviceGetCurrentClocksThrottleReasons(device Device) (uint64, Return) { return device.GetCurrentClocksThrottleReasons() } +// Deprecated: Use DeviceGetCurrentClocksEventReasons instead func (device nvmlDevice) GetCurrentClocksThrottleReasons() (uint64, Return) { var clocksThrottleReasons uint64 ret := nvmlDeviceGetCurrentClocksThrottleReasons(device, &clocksThrottleReasons) @@ -682,10 +699,13 @@ func (device nvmlDevice) GetCurrentClocksThrottleReasons() (uint64, Return) { } // nvml.DeviceGetSupportedClocksThrottleReasons() +// +// Deprecated: Use DeviceGetSupportedClocksEventReasons instead func (l *library) DeviceGetSupportedClocksThrottleReasons(device Device) (uint64, Return) { return device.GetSupportedClocksThrottleReasons() } +// Deprecated: Use DeviceGetSupportedClocksEventReasons instead func (device nvmlDevice) GetSupportedClocksThrottleReasons() (uint64, Return) { var supportedClocksThrottleReasons uint64 ret := nvmlDeviceGetSupportedClocksThrottleReasons(device, &supportedClocksThrottleReasons) @@ -693,10 +713,13 @@ func (device nvmlDevice) GetSupportedClocksThrottleReasons() (uint64, Return) { } // nvml.DeviceGetPowerState() +// +// Deprecated: Use DeviceGetPerformanceState instead func (l *library) DeviceGetPowerState(device Device) (Pstates, Return) { return device.GetPowerState() } +// Deprecated: Use DeviceGetPerformanceState instead func (device nvmlDevice) GetPowerState() (Pstates, Return) { var pState Pstates ret := nvmlDeviceGetPowerState(device, &pState) @@ -704,10 +727,13 @@ func (device nvmlDevice) GetPowerState() (Pstates, Return) { } // nvml.DeviceGetPowerManagementMode() +// +// Deprecated: This will be removed in a future version func (l *library) DeviceGetPowerManagementMode(device Device) (EnableState, Return) { return device.GetPowerManagementMode() } +// Deprecated: This will be removed in a future version func (device nvmlDevice) GetPowerManagementMode() (EnableState, Return) { var mode EnableState ret := nvmlDeviceGetPowerManagementMode(device, &mode) @@ -758,6 +784,16 @@ func (device nvmlDevice) GetPowerUsage() (uint32, Return) { return power, ret } +func (l *library) DeviceGetPowerMizerMode_v1(device Device) (DevicePowerMizerModes_v1, Return) { + return device.GetPowerMizerMode_v1() +} + +func (device nvmlDevice) GetPowerMizerMode_v1() (DevicePowerMizerModes_v1, Return) { + var devicePowerMizerModes DevicePowerMizerModes_v1 + ret := nvmlDeviceGetPowerMizerMode_v1(device, &devicePowerMizerModes) + return devicePowerMizerModes, ret +} + // nvml.DeviceGetTotalEnergyConsumption() func (l *library) DeviceGetTotalEnergyConsumption(device Device) (uint64, Return) { return device.GetTotalEnergyConsumption() @@ -881,10 +917,13 @@ func (device nvmlDevice) GetTotalEccErrors(errorType MemoryErrorType, counterTyp } // nvml.DeviceGetDetailedEccErrors() +// +// Deprecated: See DeviceGetMemoryErrorCounter func (l *library) DeviceGetDetailedEccErrors(device Device, errorType MemoryErrorType, counterType EccCounterType) (EccErrorCounts, Return) { return device.GetDetailedEccErrors(errorType, counterType) } +// Deprecated: See DeviceGetMemoryErrorCounter func (device nvmlDevice) GetDetailedEccErrors(errorType MemoryErrorType, counterType EccCounterType) (EccErrorCounts, Return) { var eccCounts EccErrorCounts ret := nvmlDeviceGetDetailedEccErrors(device, errorType, counterType, &eccCounts) @@ -1257,10 +1296,13 @@ func (device nvmlDevice) GetBAR1MemoryInfo() (BAR1Memory, Return) { } // nvml.DeviceGetViolationStatus() +// +// Deprecated: Use DeviceGetFieldValues instead. func (l *library) DeviceGetViolationStatus(device Device, perfPolicyType PerfPolicyType) (ViolationTime, Return) { return device.GetViolationStatus(perfPolicyType) } +// Deprecated: Use DeviceGetFieldValues instead. func (device nvmlDevice) GetViolationStatus(perfPolicyType PerfPolicyType) (ViolationTime, Return) { var violTime ViolationTime ret := nvmlDeviceGetViolationStatus(device, perfPolicyType, &violTime) @@ -1344,6 +1386,17 @@ func (device nvmlDevice) GetAccountingMode() (EnableState, Return) { return mode, ret } +func (l *library) DeviceGetPdi(device Device) (Pdi, Return) { + return device.GetPdi() +} + +func (device nvmlDevice) GetPdi() (Pdi, Return) { + var pdi Pdi + pdi.Version = STRUCT_VERSION(pdi, 1) + ret := nvmlDeviceGetPdi(device, &pdi) + return pdi, ret +} + // nvml.DeviceGetAccountingStats() func (l *library) DeviceGetAccountingStats(device Device, pid uint32) (AccountingStats, Return) { return device.GetAccountingStats(pid) @@ -1531,10 +1584,13 @@ func (device nvmlDevice) GetClkMonStatus() (ClkMonStatus, Return) { } // nvml.DeviceSetApplicationsClocks() +// +// Deprecated: Use DeviceSetMemoryLockedClocks for Memory Clocks and DeviceSetGpuLockedClocks for Graphics Clocks instead func (l *library) DeviceSetApplicationsClocks(device Device, memClockMHz uint32, graphicsClockMHz uint32) Return { return device.SetApplicationsClocks(memClockMHz, graphicsClockMHz) } +// Deprecated: Use DeviceSetMemoryLockedClocks for Memory Clocks and DeviceSetGpuLockedClocks for Graphics Clocks instead func (device nvmlDevice) SetApplicationsClocks(memClockMHz uint32, graphicsClockMHz uint32) Return { return nvmlDeviceSetApplicationsClocks(device, memClockMHz, graphicsClockMHz) } @@ -1548,6 +1604,15 @@ func (device nvmlDevice) SetPowerManagementLimit(limit uint32) Return { return nvmlDeviceSetPowerManagementLimit(device, limit) } +// nvml.DeviceSetPowerManagementLimit_v2() +func (l *library) DeviceSetPowerManagementLimit_v2(device Device, powerValue *PowerValue_v2) Return { + return device.SetPowerManagementLimit_v2(powerValue) +} + +func (device nvmlDevice) SetPowerManagementLimit_v2(powerValue *PowerValue_v2) Return { + return nvmlDeviceSetPowerManagementLimit_v2(device, powerValue) +} + // nvml.DeviceSetGpuOperationMode() func (l *library) DeviceSetGpuOperationMode(device Device, mode GpuOperationMode) Return { return device.SetGpuOperationMode(mode) @@ -1649,10 +1714,13 @@ func (device nvmlDevice) ResetNvLinkErrorCounters(link int) Return { } // nvml.DeviceSetNvLinkUtilizationControl() +// +// Deprecated: Setting utilization counter control is no longer supported. func (l *library) DeviceSetNvLinkUtilizationControl(device Device, link int, counter int, control *NvLinkUtilizationControl, reset bool) Return { return device.SetNvLinkUtilizationControl(link, counter, control, reset) } +// Deprecated: Setting utilization counter control is no longer supported. func (device nvmlDevice) SetNvLinkUtilizationControl(link int, counter int, control *NvLinkUtilizationControl, reset bool) Return { resetValue := uint32(0) if reset { @@ -1662,10 +1730,13 @@ func (device nvmlDevice) SetNvLinkUtilizationControl(link int, counter int, cont } // nvml.DeviceGetNvLinkUtilizationControl() +// +// Deprecated: Getting utilization counter control is no longer supported. func (l *library) DeviceGetNvLinkUtilizationControl(device Device, link int, counter int) (NvLinkUtilizationControl, Return) { return device.GetNvLinkUtilizationControl(link, counter) } +// Deprecated: Getting utilization counter control is no longer supported. func (device nvmlDevice) GetNvLinkUtilizationControl(link int, counter int) (NvLinkUtilizationControl, Return) { var control NvLinkUtilizationControl ret := nvmlDeviceGetNvLinkUtilizationControl(device, uint32(link), uint32(counter), &control) @@ -1673,10 +1744,13 @@ func (device nvmlDevice) GetNvLinkUtilizationControl(link int, counter int) (NvL } // nvml.DeviceGetNvLinkUtilizationCounter() +// +// Deprecated: Use DeviceGetFieldValues with NVML_FI_DEV_NVLINK_THROUGHPUT_* as field values instead. func (l *library) DeviceGetNvLinkUtilizationCounter(device Device, link int, counter int) (uint64, uint64, Return) { return device.GetNvLinkUtilizationCounter(link, counter) } +// Deprecated: Use DeviceGetFieldValues with NVML_FI_DEV_NVLINK_THROUGHPUT_* as field values instead. func (device nvmlDevice) GetNvLinkUtilizationCounter(link int, counter int) (uint64, uint64, Return) { var rxCounter, txCounter uint64 ret := nvmlDeviceGetNvLinkUtilizationCounter(device, uint32(link), uint32(counter), &rxCounter, &txCounter) @@ -1684,19 +1758,25 @@ func (device nvmlDevice) GetNvLinkUtilizationCounter(link int, counter int) (uin } // nvml.DeviceFreezeNvLinkUtilizationCounter() +// +// Deprecated: Freezing NVLINK utilization counters is no longer supported. func (l *library) DeviceFreezeNvLinkUtilizationCounter(device Device, link int, counter int, freeze EnableState) Return { return device.FreezeNvLinkUtilizationCounter(link, counter, freeze) } +// Deprecated: Freezing NVLINK utilization counters is no longer supported. func (device nvmlDevice) FreezeNvLinkUtilizationCounter(link int, counter int, freeze EnableState) Return { return nvmlDeviceFreezeNvLinkUtilizationCounter(device, uint32(link), uint32(counter), freeze) } // nvml.DeviceResetNvLinkUtilizationCounter() +// +// Deprecated: Resetting NVLINK utilization counters is no longer supported. func (l *library) DeviceResetNvLinkUtilizationCounter(device Device, link int, counter int) Return { return device.ResetNvLinkUtilizationCounter(link, counter) } +// Deprecated: Resetting NVLINK utilization counters is no longer supported. func (device nvmlDevice) ResetNvLinkUtilizationCounter(link int, counter int) Return { return nvmlDeviceResetNvLinkUtilizationCounter(device, uint32(link), uint32(counter)) } @@ -2037,6 +2117,14 @@ func (l *library) GetExcludedDeviceInfoByIndex(index int) (ExcludedDeviceInfo, R return info, ret } +func (l *library) DeviceReadWritePRM_v1(device Device, buffer *PRMTLV_v1) Return { + return device.ReadWritePRM_v1(buffer) +} + +func (device nvmlDevice) ReadWritePRM_v1(buffer *PRMTLV_v1) Return { + return nvmlDeviceReadWritePRM_v1(device, buffer) +} + // nvml.DeviceSetMigMode() func (l *library) DeviceSetMigMode(device Device, mode int) (Return, Return) { return device.SetMigMode(mode) @@ -2102,6 +2190,33 @@ func (device nvmlDevice) GetGpuInstanceProfileInfoV(profile int) GpuInstanceProf return GpuInstanceProfileInfoHandler{device, profile} } +type GpuInstanceProfileInfoByIdHandler struct { + device nvmlDevice + profileId int +} + +func (handler GpuInstanceProfileInfoByIdHandler) V2() (GpuInstanceProfileInfo_v2, Return) { + var info GpuInstanceProfileInfo_v2 + info.Version = STRUCT_VERSION(info, 2) + ret := nvmlDeviceGetGpuInstanceProfileInfoByIdV(handler.device, uint32(handler.profileId), &info) + return info, ret +} + +func (handler GpuInstanceProfileInfoByIdHandler) V3() (GpuInstanceProfileInfo_v3, Return) { + var info GpuInstanceProfileInfo_v3 + info.Version = STRUCT_VERSION(info, 3) + ret := nvmlDeviceGetGpuInstanceProfileInfoV(handler.device, uint32(handler.profileId), (*GpuInstanceProfileInfo_v2)(unsafe.Pointer(&info))) + return info, ret +} + +func (l *library) DeviceGetGpuInstanceProfileInfoByIdV(device Device, profileId int) GpuInstanceProfileInfoByIdHandler { + return device.GetGpuInstanceProfileInfoByIdV(profileId) +} + +func (device nvmlDevice) GetGpuInstanceProfileInfoByIdV(profileId int) GpuInstanceProfileInfoByIdHandler { + return GpuInstanceProfileInfoByIdHandler{device, profileId} +} + // nvml.DeviceGetGpuInstancePossiblePlacements() func (l *library) DeviceGetGpuInstancePossiblePlacements(device Device, info *GpuInstanceProfileInfo) ([]GpuInstancePlacement, Return) { return device.GetGpuInstancePossiblePlacements(info) @@ -2514,10 +2629,13 @@ func (device nvmlDevice) GetGpcClkVfOffset() (int, Return) { } // nvml.DeviceSetGpcClkVfOffset() +// +// Deprecated: Use DeviceSetClockOffsets instead. func (l *library) DeviceSetGpcClkVfOffset(device Device, offset int) Return { return device.SetGpcClkVfOffset(offset) } +// Deprecated: Use DeviceSetClockOffsets instead. func (device nvmlDevice) SetGpcClkVfOffset(offset int) Return { return nvmlDeviceSetGpcClkVfOffset(device, int32(offset)) } @@ -2572,10 +2690,13 @@ func (device nvmlDevice) GetMemClkVfOffset() (int, Return) { } // nvml.DeviceSetMemClkVfOffset() +// +// Deprecated: Use DeviceSetMemClkVfOffset instead func (l *library) DeviceSetMemClkVfOffset(device Device, offset int) Return { return device.SetMemClkVfOffset(offset) } +// Deprecated: Use DeviceSetMemClkVfOffset instead func (device nvmlDevice) SetMemClkVfOffset(offset int) Return { return nvmlDeviceSetMemClkVfOffset(device, int32(offset)) } @@ -2727,10 +2848,13 @@ func (gpuInstance nvmlGpuInstance) CreateComputeInstanceWithPlacement(info *Comp } // nvml.DeviceGetGpuFabricInfo() +// +// Deprecated: Use DeviceGetGpuFabricInfoV instead func (l *library) DeviceGetGpuFabricInfo(device Device) (GpuFabricInfo, Return) { return device.GetGpuFabricInfo() } +// Deprecated: Use DeviceGetGpuFabricInfoV instead func (device nvmlDevice) GetGpuFabricInfo() (GpuFabricInfo, Return) { var gpuFabricInfo GpuFabricInfo ret := nvmlDeviceGetGpuFabricInfo(device, &gpuFabricInfo) @@ -2866,15 +2990,6 @@ func (device nvmlDevice) SetConfComputeUnprotectedMemSize(sizeKiB uint64) Return return nvmlDeviceSetConfComputeUnprotectedMemSize(device, sizeKiB) } -// nvml.DeviceSetPowerManagementLimit_v2() -func (l *library) DeviceSetPowerManagementLimit_v2(device Device, powerValue *PowerValue_v2) Return { - return device.SetPowerManagementLimit_v2(powerValue) -} - -func (device nvmlDevice) SetPowerManagementLimit_v2(powerValue *PowerValue_v2) Return { - return nvmlDeviceSetPowerManagementLimit_v2(device, powerValue) -} - // nvml.DeviceGetC2cModeInfoV() type C2cModeInfoHandler struct { device nvmlDevice @@ -2917,6 +3032,28 @@ func (device nvmlDevice) GetNumaNodeId() (int, Return) { return int(node), ret } +func (l *library) DeviceGetAddressingMode(device Device) (DeviceAddressingMode, Return) { + return device.GetAddressingMode() +} + +func (device nvmlDevice) GetAddressingMode() (DeviceAddressingMode, Return) { + var deviceAddressingMode DeviceAddressingMode + deviceAddressingMode.Version = STRUCT_VERSION(deviceAddressingMode, 1) + ret := nvmlDeviceGetAddressingMode(device, &deviceAddressingMode) + return deviceAddressingMode, ret +} + +func (l *library) DeviceGetRepairStatus(device Device) (RepairStatus, Return) { + return device.GetRepairStatus() +} + +func (device nvmlDevice) GetRepairStatus() (RepairStatus, Return) { + var repairStatus RepairStatus + repairStatus.Version = STRUCT_VERSION(repairStatus, 1) + ret := nvmlDeviceGetRepairStatus(device, &repairStatus) + return repairStatus, ret +} + // nvml.DeviceGetPciInfoExt() func (l *library) DeviceGetPciInfoExt(device Device) (PciInfoExt, Return) { return device.GetPciInfoExt() @@ -2939,10 +3076,17 @@ func (handler GpuFabricInfoHandler) V1() (GpuFabricInfo, Return) { } func (handler GpuFabricInfoHandler) V2() (GpuFabricInfo_v2, Return) { - var info GpuFabricInfoV + var info GpuFabricInfo_v2 info.Version = STRUCT_VERSION(info, 2) - ret := nvmlDeviceGetGpuFabricInfoV(handler.device, &info) - return GpuFabricInfo_v2(info), ret + ret := nvmlDeviceGetGpuFabricInfoV(handler.device, (*GpuFabricInfoV)(unsafe.Pointer(&info))) + return info, ret +} + +func (handler GpuFabricInfoHandler) V3() (GpuFabricInfo_v3, Return) { + var info GpuFabricInfo_v3 + info.Version = STRUCT_VERSION(info, 3) + ret := nvmlDeviceGetGpuFabricInfoV(handler.device, (*GpuFabricInfoV)(unsafe.Pointer(&info))) + return info, ret } func (l *library) DeviceGetGpuFabricInfoV(device Device) GpuFabricInfoHandler { @@ -3254,6 +3398,34 @@ func (device nvmlDevice) SetNvlinkBwMode(setBwMode *NvlinkSetBwMode) Return { return nvmlDeviceSetNvlinkBwMode(device, setBwMode) } +func (l *library) DeviceGetNvLinkInfo(device Device) NvLinkInfoHandler { + return device.GetNvLinkInfo() +} + +func (device nvmlDevice) GetNvLinkInfo() NvLinkInfoHandler { + return NvLinkInfoHandler{device} +} + +type NvLinkInfoHandler struct { + device nvmlDevice +} + +func (handler NvLinkInfoHandler) V1() (NvLinkInfo_v1, Return) { + var info NvLinkInfo_v1 + info.Version = STRUCT_VERSION(info, 1) + ret := nvmlDeviceGetNvLinkInfo(handler.device, (*NvLinkInfo)(unsafe.Pointer(&info))) + + return info, ret +} + +func (handler NvLinkInfoHandler) V2() (NvLinkInfo_v2, Return) { + var info NvLinkInfo_v2 + info.Version = STRUCT_VERSION(info, 2) + ret := nvmlDeviceGetNvLinkInfo(handler.device, (*NvLinkInfo)(unsafe.Pointer(&info))) + + return info, ret +} + // nvml.DeviceWorkloadPowerProfileGetProfilesInfo() func (l *library) DeviceWorkloadPowerProfileGetProfilesInfo(device Device) (WorkloadPowerProfileProfilesInfo, Return) { return device.WorkloadPowerProfileGetProfilesInfo() @@ -3323,6 +3495,14 @@ func (device nvmlDevice) PowerSmoothingSetState(state *PowerSmoothingState) Retu return nvmlDevicePowerSmoothingSetState(device, state) } +func (l *library) DeviceGetSramUniqueUncorrectedEccErrorCounts(device Device, errorCounts *EccSramUniqueUncorrectedErrorCounts) Return { + return device.GetSramUniqueUncorrectedEccErrorCounts(errorCounts) +} + +func (device nvmlDevice) GetSramUniqueUncorrectedEccErrorCounts(errorCounts *EccSramUniqueUncorrectedErrorCounts) Return { + return nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts(device, errorCounts) +} + // nvml.GpuInstanceGetCreatableVgpus() func (l *library) GpuInstanceGetCreatableVgpus(gpuInstance GpuInstance) (VgpuTypeIdInfo, Return) { return gpuInstance.GetCreatableVgpus() diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go index 95d67d6da..38123a964 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go @@ -378,6 +378,24 @@ func nvmlDeviceGetNumaNodeId(nvmlDevice nvmlDevice, Node *uint32) Return { return __v } +// nvmlDeviceGetAddressingMode function as declared in nvml/nvml.h +func nvmlDeviceGetAddressingMode(nvmlDevice nvmlDevice, Mode *DeviceAddressingMode) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cMode, _ := (*C.nvmlDeviceAddressingMode_t)(unsafe.Pointer(Mode)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetAddressingMode(cnvmlDevice, cMode) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetRepairStatus function as declared in nvml/nvml.h +func nvmlDeviceGetRepairStatus(nvmlDevice nvmlDevice, RepairStatus *RepairStatus) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cRepairStatus, _ := (*C.nvmlRepairStatus_t)(unsafe.Pointer(RepairStatus)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetRepairStatus(cnvmlDevice, cRepairStatus) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetTopologyCommonAncestor function as declared in nvml/nvml.h func nvmlDeviceGetTopologyCommonAncestor(Device1 nvmlDevice, Device2 nvmlDevice, PathInfo *GpuTopologyLevel) Return { cDevice1, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device1)), cgoAllocsUnknown @@ -1017,6 +1035,24 @@ func nvmlDeviceGetPowerUsage(nvmlDevice nvmlDevice, Power *uint32) Return { return __v } +// nvmlDeviceGetPowerMizerMode_v1 function as declared in nvml/nvml.h +func nvmlDeviceGetPowerMizerMode_v1(nvmlDevice nvmlDevice, PowerMizerMode *DevicePowerMizerModes_v1) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cPowerMizerMode, _ := (*C.nvmlDevicePowerMizerModes_v1_t)(unsafe.Pointer(PowerMizerMode)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetPowerMizerMode_v1(cnvmlDevice, cPowerMizerMode) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetPowerMizerMode_v1 function as declared in nvml/nvml.h +func nvmlDeviceSetPowerMizerMode_v1(nvmlDevice nvmlDevice, PowerMizerMode *DevicePowerMizerModes_v1) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cPowerMizerMode, _ := (*C.nvmlDevicePowerMizerModes_v1_t)(unsafe.Pointer(PowerMizerMode)), cgoAllocsUnknown + __ret := C.nvmlDeviceSetPowerMizerMode_v1(cnvmlDevice, cPowerMizerMode) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetTotalEnergyConsumption function as declared in nvml/nvml.h func nvmlDeviceGetTotalEnergyConsumption(nvmlDevice nvmlDevice, Energy *uint64) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -1610,6 +1646,15 @@ func nvmlDeviceGetSramEccErrorStatus(nvmlDevice nvmlDevice, Status *EccSramError return __v } +// nvmlDeviceSetPowerManagementLimit_v2 function as declared in nvml/nvml.h +func nvmlDeviceSetPowerManagementLimit_v2(nvmlDevice nvmlDevice, PowerValue *PowerValue_v2) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cPowerValue, _ := (*C.nvmlPowerValue_v2_t)(unsafe.Pointer(PowerValue)), cgoAllocsUnknown + __ret := C.nvmlDeviceSetPowerManagementLimit_v2(cnvmlDevice, cPowerValue) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetAccountingMode function as declared in nvml/nvml.h func nvmlDeviceGetAccountingMode(nvmlDevice nvmlDevice, Mode *EnableState) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -1748,6 +1793,15 @@ func nvmlDeviceGetPlatformInfo(nvmlDevice nvmlDevice, PlatformInfo *PlatformInfo return __v } +// nvmlDeviceGetPdi function as declared in nvml/nvml.h +func nvmlDeviceGetPdi(nvmlDevice nvmlDevice, Pdi *Pdi) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cPdi, _ := (*C.nvmlPdi_t)(unsafe.Pointer(Pdi)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetPdi(cnvmlDevice, cPdi) + __v := (Return)(__ret) + return __v +} + // nvmlUnitSetLedState function as declared in nvml/nvml.h func nvmlUnitSetLedState(nvmlUnit nvmlUnit, Color LedColor) Return { cnvmlUnit, _ := *(*C.nvmlUnit_t)(unsafe.Pointer(&nvmlUnit)), cgoAllocsUnknown @@ -1978,15 +2032,6 @@ func nvmlDeviceClearAccountingPids(nvmlDevice nvmlDevice) Return { return __v } -// nvmlDeviceSetPowerManagementLimit_v2 function as declared in nvml/nvml.h -func nvmlDeviceSetPowerManagementLimit_v2(nvmlDevice nvmlDevice, PowerValue *PowerValue_v2) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cPowerValue, _ := (*C.nvmlPowerValue_v2_t)(unsafe.Pointer(PowerValue)), cgoAllocsUnknown - __ret := C.nvmlDeviceSetPowerManagementLimit_v2(cnvmlDevice, cPowerValue) - __v := (Return)(__ret) - return __v -} - // nvmlDeviceGetNvLinkState function as declared in nvml/nvml.h func nvmlDeviceGetNvLinkState(nvmlDevice nvmlDevice, Link uint32, IsActive *EnableState) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -2166,6 +2211,15 @@ func nvmlDeviceSetNvlinkBwMode(nvmlDevice nvmlDevice, SetBwMode *NvlinkSetBwMode return __v } +// nvmlDeviceGetNvLinkInfo function as declared in nvml/nvml.h +func nvmlDeviceGetNvLinkInfo(nvmlDevice nvmlDevice, Info *NvLinkInfo) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cInfo, _ := (*C.nvmlNvLinkInfo_t)(unsafe.Pointer(Info)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetNvLinkInfo(cnvmlDevice, cInfo) + __v := (Return)(__ret) + return __v +} + // nvmlEventSetCreate function as declared in nvml/nvml.h func nvmlEventSetCreate(Set *nvmlEventSet) Return { cSet, _ := (*C.nvmlEventSet_t)(unsafe.Pointer(Set)), cgoAllocsUnknown @@ -3033,6 +3087,15 @@ func nvmlGetExcludedDeviceInfoByIndex(Index uint32, Info *ExcludedDeviceInfo) Re return __v } +// nvmlDeviceReadWritePRM_v1 function as declared in nvml/nvml.h +func nvmlDeviceReadWritePRM_v1(nvmlDevice nvmlDevice, Buffer *PRMTLV_v1) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cBuffer, _ := (*C.nvmlPRMTLV_v1_t)(unsafe.Pointer(Buffer)), cgoAllocsUnknown + __ret := C.nvmlDeviceReadWritePRM_v1(cnvmlDevice, cBuffer) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceSetMigMode function as declared in nvml/nvml.h func nvmlDeviceSetMigMode(nvmlDevice nvmlDevice, Mode uint32, ActivationStatus *Return) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -3073,6 +3136,16 @@ func nvmlDeviceGetGpuInstanceProfileInfoV(nvmlDevice nvmlDevice, Profile uint32, return __v } +// nvmlDeviceGetGpuInstanceProfileInfoByIdV function as declared in nvml/nvml.h +func nvmlDeviceGetGpuInstanceProfileInfoByIdV(nvmlDevice nvmlDevice, ProfileId uint32, Info *GpuInstanceProfileInfo_v2) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cProfileId, _ := (C.uint)(ProfileId), cgoAllocsUnknown + cInfo, _ := (*C.nvmlGpuInstanceProfileInfo_v2_t)(unsafe.Pointer(Info)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetGpuInstanceProfileInfoByIdV(cnvmlDevice, cProfileId, cInfo) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetGpuInstancePossiblePlacements_v2 function as declared in nvml/nvml.h func nvmlDeviceGetGpuInstancePossiblePlacements_v2(nvmlDevice nvmlDevice, ProfileId uint32, Placements *GpuInstancePlacement, Count *uint32) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -3452,6 +3525,15 @@ func nvmlDevicePowerSmoothingSetState(nvmlDevice nvmlDevice, State *PowerSmoothi return __v } +// nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts function as declared in nvml/nvml.h +func nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts(nvmlDevice nvmlDevice, ErrorCounts *EccSramUniqueUncorrectedErrorCounts) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cErrorCounts, _ := (*C.nvmlEccSramUniqueUncorrectedErrorCounts_t)(unsafe.Pointer(ErrorCounts)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts(cnvmlDevice, cErrorCounts) + __v := (Return)(__ret) + return __v +} + // nvmlInit_v1 function as declared in nvml/nvml.h func nvmlInit_v1() Return { __ret := C.nvmlInit() diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.h b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.h index 28a654756..917a8c93e 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.h +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.h @@ -1,5 +1,5 @@ -/*** NVML VERSION: 12.9.40 ***/ -/*** From https://gitlab.com/nvidia/headers/cuda-individual/nvml_dev/-/raw/v12.9.40/nvml.h ***/ +/*** NVML VERSION: 13.0.39 ***/ +/*** From https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvml_dev/linux-x86_64/cuda_nvml_dev-linux-x86_64-13.0.39-archive.tar.xz ***/ /* * Copyright 1993-2025 NVIDIA Corporation. All rights reserved. * @@ -94,13 +94,23 @@ extern "C" { #define DECLDIR #endif +/* + * Deprecation definition. Starting CUDA 13.1 this will change to: + * #if defined _WINDOWS + * #define DEPRECATED(ver) __declspec(deprecated) + * #else + * #define DEPRECATED(ver) __attribute__((deprecated)) + * #endif + */ +#define DEPRECATED(ver) /* nop in CUDA 13.0, enabled in CUDA 13.1 */ + #define NVML_MCDM_SUPPORT /** * NVML API versioning support */ -#define NVML_API_VERSION 12 -#define NVML_API_VERSION_STR "12" +#define NVML_API_VERSION 13 +#define NVML_API_VERSION_STR "13" /** * Defining NVML_NO_UNVERSIONED_FUNC_DEFS will disable "auto upgrading" of APIs. * e.g. the user will have to call nvmlInit_v2 instead of nvmlInit. Enable this @@ -162,7 +172,7 @@ typedef struct #define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 32 /** - * Buffer size guaranteed to be large enough for pci bus id for ::busIdLegacy + * Buffer size guaranteed to be large enough for pci bus id for \p busIdLegacy */ #define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE 16 @@ -205,12 +215,12 @@ typedef struct nvmlPciInfo_st } nvmlPciInfo_t; /** - * PCI format string for ::busIdLegacy + * PCI format string for \p busIdLegacy */ #define NVML_DEVICE_PCI_BUS_ID_LEGACY_FMT "%04X:%02X:%02X.0" /** - * PCI format string for ::busId + * PCI format string for \p busId */ #define NVML_DEVICE_PCI_BUS_ID_FMT "%08X:%02X:%02X.0" @@ -367,6 +377,41 @@ typedef struct #define nvmlC2cModeInfo_v1 NVML_STRUCT_VERSION(C2cModeInfo, 1) +/** + * Enum to represent device addressing mode values + */ +typedef enum +{ + NVML_DEVICE_ADDRESSING_MODE_NONE = 0, //!< No active mode + NVML_DEVICE_ADDRESSING_MODE_HMM = 1, //!< Heterogeneous Memory Management mode + NVML_DEVICE_ADDRESSING_MODE_ATS = 2, //!< Address Translation Services mode +} nvmlDeviceAddressingModeType_t; + +/** + * Struct to represent device addressing mode information + */ +typedef struct +{ + unsigned int version; //!< API version + unsigned int value; //!< One of \ref nvmlDeviceAddressingModeType_t +} nvmlDeviceAddressingMode_v1_t; +typedef nvmlDeviceAddressingMode_v1_t nvmlDeviceAddressingMode_t; + +#define nvmlDeviceAddressingMode_v1 NVML_STRUCT_VERSION(DeviceAddressingMode, 1) + +/** + * Struct to represent the NVML repair status + */ +typedef struct +{ + unsigned int version; //!< API version number + unsigned int bChannelRepairPending; //!< Reference to \a unsigned int + unsigned int bTpcRepairPending; //!< Reference to \a unsigned int +} nvmlRepairStatus_v1_t; +typedef nvmlRepairStatus_v1_t nvmlRepairStatus_t; + +#define nvmlRepairStatus_v1 NVML_STRUCT_VERSION(RepairStatus, 1) + /** * Possible values that classify the remap availability for each bank. The max * field will contain the number of banks that have maximum remap availability @@ -698,21 +743,20 @@ typedef enum NVML_THERMAL_CONTROLLER_UNKNOWN = -1, } nvmlThermalController_t; -typedef struct { - nvmlThermalController_t controller; - int defaultMinTemp; - int defaultMaxTemp; - int currentTemp; - nvmlThermalTarget_t target; -} nvmlGpuThermalSettingsSensor_t; - /** * Struct to hold the thermal sensor settings */ typedef struct { unsigned int count; - nvmlGpuThermalSettingsSensor_t sensor[NVML_MAX_THERMAL_SENSORS_PER_GPU]; + struct + { + nvmlThermalController_t controller; + int defaultMinTemp; + int defaultMaxTemp; + int currentTemp; + nvmlThermalTarget_t target; + } sensor[NVML_MAX_THERMAL_SENSORS_PER_GPU]; } nvmlGpuThermalSettings_t; @@ -794,6 +838,18 @@ typedef nvmlUUID_v1_t nvmlUUID_t; #define nvmlUUID_v1 NVML_STRUCT_VERSION(UUID, 1) +/** + * Struct to represent the NVML PDI information + */ +typedef struct +{ + unsigned int version; //!< API version number + unsigned long long value; //!< 64-bit PDI value +} nvmlPdi_v1_t; +typedef nvmlPdi_v1_t nvmlPdi_t; + +#define nvmlPdi_v1 NVML_STRUCT_VERSION(Pdi, 1) + /** @} */ /***************************************************************************************************/ @@ -851,9 +907,8 @@ typedef enum nvmlBrandType_enum NVML_BRAND_NVIDIA = 14, NVML_BRAND_GEFORCE_RTX = 15, // Unused NVML_BRAND_TITAN_RTX = 16, // Unused - // Keep this last - NVML_BRAND_COUNT + NVML_BRAND_COUNT = 18, } nvmlBrandType_t; /** @@ -1064,8 +1119,10 @@ typedef enum nvmlClockType_enum typedef enum nvmlClockId_enum { NVML_CLOCK_ID_CURRENT = 0, //!< Current actual clock value - NVML_CLOCK_ID_APP_CLOCK_TARGET = 1, //!< Target application clock + NVML_CLOCK_ID_APP_CLOCK_TARGET = 1, //!< Target application clock. + //!< Deprecated, do not use. NVML_CLOCK_ID_APP_CLOCK_DEFAULT = 2, //!< Default application clock target + //!< Deprecated, do not use. NVML_CLOCK_ID_CUSTOMER_BOOST_MAX = 3, //!< OEM-defined maximum clock rate //Keep this last @@ -1167,6 +1224,22 @@ typedef nvmlDeviceCurrentClockFreqs_v1_t nvmlDeviceCurrentClockFreqs_t; #define nvmlDeviceCurrentClockFreqs_v1 NVML_STRUCT_VERSION(DeviceCurrentClockFreqs, 1) +/** + * Device powerMizer modes + */ +#define NVML_POWER_MIZER_MODE_ADAPTIVE 0 //!< adjust GPU clocks based on GPU utilization +#define NVML_POWER_MIZER_MODE_PREFER_MAXIMUM_PERFORMANCE 1 //!< raise GPU clocks to favor maximum performance, + //!< to the extent that thermal and other constraints allow +#define NVML_POWER_MIZER_MODE_AUTO 2 //!< PowerMizer mode is driver controlled. +#define NVML_POWER_MIZER_MODE_PREFER_CONSISTENT_PERFORMANCE 3 //!< lock to GPU base clocks + +typedef struct +{ + unsigned int currentMode; //!< OUT: the current powermizer mode + unsigned int mode; //!< IN: the powermizer mode to set + unsigned int supportedPowerMizerModes; //!< OUT: Bitmask of supported powermizer modes +} nvmlDevicePowerMizerModes_v1_t; + /** * GPU Operation Mode * @@ -1234,6 +1307,7 @@ typedef enum nvmlReturn_enum NVML_ERROR_NOT_READY = 27, //!< The system is not ready for the request NVML_ERROR_GPU_NOT_FOUND = 28, //!< No GPUs were found NVML_ERROR_INVALID_STATE = 29, //!< Resource not in correct state to perform requested operation + NVML_ERROR_RESET_TYPE_NOT_SUPPORTED = 30, //!< Reset not supported for given device/parameters NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred } nvmlReturn_t; @@ -1273,7 +1347,8 @@ typedef enum nvmlPageRetirementCause_enum typedef enum nvmlRestrictedAPI_enum { NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS = 0, //!< APIs that change application clocks, see nvmlDeviceSetApplicationsClocks - //!< and see nvmlDeviceResetApplicationsClocks + //!< and see nvmlDeviceResetApplicationsClocks. + //!< Deprecated, keeping definition for backward compatibility. NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1, //!< APIs that enable/disable Auto Boosted clocks //!< see nvmlDeviceSetAutoBoostedClocksEnabled // Keep this last @@ -1381,6 +1456,27 @@ typedef struct typedef nvmlPlatformInfo_v2_t nvmlPlatformInfo_t; #define nvmlPlatformInfo_v2 NVML_STRUCT_VERSION(PlatformInfo, 2) +typedef struct +{ + unsigned int unit; //!< the SRAM unit index + unsigned int location; //!< the error location within the SRAM unit + unsigned int sublocation; //!< the error sublocation within the SRAM unit + unsigned int extlocation; //!< the error extlocation within the SRAM unit + unsigned int address; //!< the error address within the SRAM unit + unsigned int isParity; //!< if the SRAM error is parity or not + unsigned int count; //!< the error count at the same SRAM address +} nvmlEccSramUniqueUncorrectedErrorEntry_v1_t; + +typedef struct +{ + unsigned int version; //!< the API version number + unsigned int entryCount; //!< the number of error count entries + nvmlEccSramUniqueUncorrectedErrorEntry_v1_t *entries; //!< pointer to caller-supplied buffer to return the SRAM unique uncorrected ECC error count entries +} nvmlEccSramUniqueUncorrectedErrorCounts_v1_t; + +typedef nvmlEccSramUniqueUncorrectedErrorCounts_v1_t nvmlEccSramUniqueUncorrectedErrorCounts_t; +#define nvmlEccSramUniqueUncorrectedErrorCounts_v1 NVML_STRUCT_VERSION(EccSramUniqueUncorrectedErrorCounts, 1) + /** * GSP firmware */ @@ -1400,8 +1496,6 @@ typedef nvmlPlatformInfo_v2_t nvmlPlatformInfo_t; #define NVML_DEVICE_ARCH_BLACKWELL 10 // Devices based on the NVIDIA Blackwell architecture -#define NVML_DEVICE_ARCH_T23X 11 // Devices based on NVIDIA Orin architecture - #define NVML_DEVICE_ARCH_UNKNOWN 0xffffffff // Anything else, presumably something newer typedef unsigned int nvmlDeviceArchitecture_t; @@ -1468,17 +1562,16 @@ typedef enum nvmlGpuUtilizationDomainId_t NVML_GPU_UTILIZATION_DOMAIN_BUS = 3, //!< Bus interface domain } nvmlGpuUtilizationDomainId_t; -typedef struct { - unsigned int bIsPresent; - unsigned int percentage; - unsigned int incThreshold; - unsigned int decThreshold; -} nvmlGpuDynamicPstatesInfoUtilization_t; - typedef struct nvmlGpuDynamicPstatesInfo_st { unsigned int flags; //!< Reserved for future use - nvmlGpuDynamicPstatesInfoUtilization_t utilization[NVML_MAX_GPU_UTILIZATIONS]; + struct + { + unsigned int bIsPresent; //!< Set if this utilization domain is present on this GPU + unsigned int percentage; //!< Percentage of time where the domain is considered busy in the last 1-second interval + unsigned int incThreshold; //!< Utilization threshold that can trigger a perf-increasing P-State change when crossed + unsigned int decThreshold; //!< Utilization threshold that can trigger a perf-decreasing P-State change when crossed + } utilization[NVML_MAX_GPU_UTILIZATIONS]; } nvmlGpuDynamicPstatesInfo_t; /* @@ -1864,23 +1957,21 @@ typedef nvmlVgpuRuntimeState_v1_t nvmlVgpuRuntimeState_t; */ #define NVML_VGPU_SCHEDULER_ENGINE_TYPE_GRAPHICS 1 -typedef struct { - unsigned int avgFactor; - unsigned int timeslice; -} nvmlVgpuSchedulerParamsVgpuSchedDataWithARR_t; - -typedef struct { - unsigned int timeslice; -} nvmlVgpuSchedulerParamsVgpuSchedData_t; - /** * Union to represent the vGPU Scheduler Parameters */ typedef union { - nvmlVgpuSchedulerParamsVgpuSchedDataWithARR_t vgpuSchedDataWithARR; + struct + { + unsigned int avgFactor; //!< Average factor in compensating the timeslice for Adaptive Round Robin mode + unsigned int timeslice; //!< The timeslice in ns for each software run list as configured, or the default value otherwise + } vgpuSchedDataWithARR; - nvmlVgpuSchedulerParamsVgpuSchedData_t vgpuSchedData; + struct + { + unsigned int timeslice; //!< The timeslice in ns for each software run list as configured, or the default value otherwise + } vgpuSchedData; } nvmlVgpuSchedulerParams_t; @@ -1920,23 +2011,21 @@ typedef struct nvmlVgpuSchedulerGetState_st nvmlVgpuSchedulerParams_t schedulerParams; } nvmlVgpuSchedulerGetState_t; -typedef struct { - unsigned int avgFactor; - unsigned int frequency; -} nvmlVgpuSchedulerSetParamsVgpuSchedDataWithARR_t; - -typedef struct { - unsigned int timeslice; -} nvmlVgpuSchedulerSetParamsVgpuSchedData_t; - /** * Union to represent the vGPU Scheduler set Parameters */ typedef union { - nvmlVgpuSchedulerSetParamsVgpuSchedDataWithARR_t vgpuSchedDataWithARR; + struct + { + unsigned int avgFactor; //!< Average factor in compensating the timeslice for Adaptive Round Robin mode + unsigned int frequency; //!< Frequency for Adaptive Round Robin mode + } vgpuSchedDataWithARR; - nvmlVgpuSchedulerSetParamsVgpuSchedData_t vgpuSchedData; + struct + { + unsigned int timeslice; //!< The timeslice in ns(Nanoseconds) for each software run list as configured, or the default value otherwise + } vgpuSchedData; } nvmlVgpuSchedulerSetParams_t; @@ -2923,8 +3012,9 @@ typedef struct nvmlEventData_st // 0xFFFFFFFF otherwise. } nvmlEventData_t; -/** @} */ - +/** + * System Event Set + */ typedef struct { struct nvmlSystemEventSet_st* handle; @@ -3010,10 +3100,8 @@ typedef nvmlSystemEventSetWaitRequest_v1_t nvmlSystemEventSetWaitRequest_t; */ #define nvmlClocksEventReasonGpuIdle 0x0000000000000001LL -/** GPU clocks are limited by current setting of applications clocks - * - * @see nvmlDeviceSetApplicationsClocks - * @see nvmlDeviceGetApplicationsClock +/* + * @deprecated No longer used */ #define nvmlClocksEventReasonApplicationsClocksSetting 0x0000000000000002LL @@ -3120,7 +3208,7 @@ typedef nvmlSystemEventSetWaitRequest_v1_t nvmlSystemEventSetWaitRequest_t; */ #define nvmlClocksThrottleReasonGpuIdle nvmlClocksEventReasonGpuIdle /** - * @deprecated Use \ref nvmlClocksEventReasonApplicationsClocksSetting instead + * @deprecated */ #define nvmlClocksThrottleReasonApplicationsClocksSetting nvmlClocksEventReasonApplicationsClocksSetting /** @@ -3359,8 +3447,9 @@ typedef struct nvmlConfComputeSystemState_st { /** * Confidential Compute Multigpu mode values */ -#define NVML_CC_SYSTEM_MULTIGPU_NONE 0 +#define NVML_CC_SYSTEM_MULTIGPU_NONE 0 #define NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE 1 +#define NVML_CC_SYSTEM_MULTIGPU_NVLE 2 /** * Confidential Compute System settings @@ -3451,64 +3540,92 @@ typedef nvmlConfComputeGetKeyRotationThresholdInfo_v1_t nvmlConfComputeGetKeyRot */ /***************************************************************************************************/ -#define NVML_GPU_FABRIC_UUID_LEN 16 +#define NVML_GPU_FABRIC_UUID_LEN 16 //!< Length of Fabric UUID -#define NVML_GPU_FABRIC_STATE_NOT_SUPPORTED 0 -#define NVML_GPU_FABRIC_STATE_NOT_STARTED 1 -#define NVML_GPU_FABRIC_STATE_IN_PROGRESS 2 -#define NVML_GPU_FABRIC_STATE_COMPLETED 3 +/** + * Fabric Probe States + */ +#define NVML_GPU_FABRIC_STATE_NOT_SUPPORTED 0 //!< Fabric Probe State not supported +#define NVML_GPU_FABRIC_STATE_NOT_STARTED 1 //!< Fabric Probe has not started +#define NVML_GPU_FABRIC_STATE_IN_PROGRESS 2 //!< Fabric Probe in progress +#define NVML_GPU_FABRIC_STATE_COMPLETED 3 //!< Fabric Probe State completed +/** + * Probe State of GPU registration process + */ typedef unsigned char nvmlGpuFabricState_t; /** * Contains the device fabric information */ -typedef struct { +typedef struct +{ unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete". unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs - nvmlGpuFabricState_t state; //!< Current state of GPU registration process + nvmlGpuFabricState_t state; //!< Current state of GPU registration process. See NVML_GPU_FABRIC_STATE_* } nvmlGpuFabricInfo_t; -/* +/** * Fabric Degraded BW */ -#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_NOT_SUPPORTED 0 -#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_TRUE 1 -#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_FALSE 2 +#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_NOT_SUPPORTED 0 //!< Fabric Health Mask: Degraded Bandwidth not supported +#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_TRUE 1 //!< Fabric Health Mask: Bandwidth degraded +#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_FALSE 2 //!< Fabric Health Mask: Bandwidth not degraded -#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_DEGRADED_BW 0 -#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW 0x3 +#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_DEGRADED_BW 0 //!< Fabric Health Mask Bit Shift for Degraded Bandwidth +#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW 0x3 //!< Fabric Health Mask Width for Degraded Bandwidth -/* +/** * Fabric Route Recovery */ -#define NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_RECOVERY_NOT_SUPPORTED 0 -#define NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_RECOVERY_TRUE 1 -#define NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_RECOVERY_FALSE 2 +#define NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_RECOVERY_NOT_SUPPORTED 0 //!< Fabric Health Mask: Route Recovery not supported +#define NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_RECOVERY_TRUE 1 //!< Fabric Health Mask: Route Recovery in progress +#define NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_RECOVERY_FALSE 2 //!< Fabric Health Mask: Route Recovery not in progress -#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_ROUTE_RECOVERY 2 -#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_ROUTE_RECOVERY 0x3 +#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_ROUTE_RECOVERY 2 //!< Fabric Health Mask Bit Shift for Route Recovery +#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_ROUTE_RECOVERY 0x3 //!< Fabric Health Mask Width for Route Recovery -/* +/** * Nvlink Fabric Route Unhealthy */ -#define NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_UNHEALTHY_NOT_SUPPORTED 0 -#define NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_UNHEALTHY_TRUE 1 -#define NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_UNHEALTHY_FALSE 2 +#define NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_UNHEALTHY_NOT_SUPPORTED 0 //!< Fabric Health Mask: Route Unhealthy not supported +#define NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_UNHEALTHY_TRUE 1 //!< Fabric Health Mask: Route is unhealthy +#define NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_UNHEALTHY_FALSE 2 //!< Fabric Health Mask: Route is healthy -#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_ROUTE_UNHEALTHY 4 -#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_ROUTE_UNHEALTHY 0x3 +#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_ROUTE_UNHEALTHY 4 //!< Fabric Health Mask Bit Shift for Route Unhealthy +#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_ROUTE_UNHEALTHY 0x3 //!< Fabric Health Mask Width for Route Unhealthy -/* +/** * Fabric Access Timeout Recovery */ -#define NVML_GPU_FABRIC_HEALTH_MASK_ACCESS_TIMEOUT_RECOVERY_NOT_SUPPORTED 0 -#define NVML_GPU_FABRIC_HEALTH_MASK_ACCESS_TIMEOUT_RECOVERY_TRUE 1 -#define NVML_GPU_FABRIC_HEALTH_MASK_ACCESS_TIMEOUT_RECOVERY_FALSE 2 +#define NVML_GPU_FABRIC_HEALTH_MASK_ACCESS_TIMEOUT_RECOVERY_NOT_SUPPORTED 0 //!< Fabric Health Mask: Access Timeout Recovery not supported +#define NVML_GPU_FABRIC_HEALTH_MASK_ACCESS_TIMEOUT_RECOVERY_TRUE 1 //!< Fabric Health Mask: Access Timeout Recovery in progress +#define NVML_GPU_FABRIC_HEALTH_MASK_ACCESS_TIMEOUT_RECOVERY_FALSE 2 //!< Fabric Health Mask: Access Timeout Recovery not in progress + +#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_ACCESS_TIMEOUT_RECOVERY 6 //!< Fabric Health Mask Bit Shift for Access Timeout Recovery +#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_ACCESS_TIMEOUT_RECOVERY 0x3 //!< Fabric Health Mask Width for Access Timeout Recovery + +/** + * Fabric Incorrect Configuration + */ +#define NVML_GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_NOT_SUPPORTED 0 //!< Fabric Health Mask: Incorrect Configuration not supported +#define NVML_GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_NONE 1 //!< Fabric Health Mask: Correct Configuration +#define NVML_GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_INCORRECT_SYSGUID 2 //!< Fabric Health Mask: Incorrect Configuration - SysGUID +#define NVML_GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_INCORRECT_CHASSIS_SN 3 //!< Fabric Health Mask: Incorrect Configuration - Chassis Serial Number +#define NVML_GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_NO_PARTITION 4 //!< Fabric Health Mask: Incorrect Configuration - No Partition +#define NVML_GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_INSUFFICIENT_NVLINKS 5 //!< Fabric Health Mask: Incorrect Configuration - Insufficient Nvlinks + +#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_INCORRECT_CONFIGURATION 8 //!< Fabric Health Mask Bit Shift for Incorrect Configuration +#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_INCORRECT_CONFIGURATION 0xf //!< Fabric Health Mask Width for Incorrect Configuration -#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_ACCESS_TIMEOUT_RECOVERY 6 -#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_ACCESS_TIMEOUT_RECOVERY 0x3 +/** + * Fabric Health + */ +#define NVML_GPU_FABRIC_HEALTH_SUMMARY_NOT_SUPPORTED 0 //!< Fabric Health Summary: Not supported +#define NVML_GPU_FABRIC_HEALTH_SUMMARY_HEALTHY 1 //!< Fabric Health Summary: Healthy +#define NVML_GPU_FABRIC_HEALTH_SUMMARY_UNHEALTHY 2 //!< Fabric Health Summary: Unhealthy +#define NVML_GPU_FABRIC_HEALTH_SUMMARY_LIMITED_CAPACITY 3 //!< Fabric Health Summary: Limited Capacity /** * GPU Fabric Health Status Mask for various fields can be obtained @@ -3531,27 +3648,50 @@ typedef struct { /** * GPU Fabric information (v2). * +* @deprecated nvmlGpuFabricInfo_v2_t is deprecated and will be removed in a future release. +* Use nvmlGpuFabricInfo_v3_t instead +* * Version 2 adds the \ref nvmlGpuFabricInfo_v2_t.version field * to the start of the structure, and the \ref nvmlGpuFabricInfo_v2_t.healthMask * field to the end. This structure is not backwards-compatible with * \ref nvmlGpuFabricInfo_t. */ -typedef struct { +typedef struct +{ unsigned int version; //!< Structure version identifier (set to nvmlGpuFabricInfo_v2) unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs - nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete". + nvmlReturn_t status; //!< Probe Error status, if any. Must be checked only if Probe state returns "complete". unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs - nvmlGpuFabricState_t state; //!< Current state of GPU registration process - unsigned int healthMask; //!< GPU Fabric health Status Mask + nvmlGpuFabricState_t state; //!< Current Probe State of GPU registration process. See NVML_GPU_FABRIC_STATE_* + unsigned int healthMask; //!< GPU Fabric health Status Mask. See NVML_GPU_FABRIC_HEALTH_MASK_* } nvmlGpuFabricInfo_v2_t; -typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t; - /** * Version identifier value for \ref nvmlGpuFabricInfo_v2_t.version. */ #define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2) +/** +* GPU Fabric information (v3). +*/ +typedef struct +{ + unsigned int version; //!< Structure version identifier (set to nvmlGpuFabricInfo_v2) + unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs + nvmlReturn_t status; //!< Probe Error status, if any. Must be checked only if Probe state returns "complete". + unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs + nvmlGpuFabricState_t state; //!< Current Probe State of GPU registration process. See NVML_GPU_FABRIC_STATE_* + unsigned int healthMask; //!< GPU Fabric health Status Mask. See NVML_GPU_FABRIC_HEALTH_MASK_* + unsigned char healthSummary; //!< GPU Fabric health summary. See NVML_GPU_FABRIC_HEALTH_SUMMARY_* +} nvmlGpuFabricInfo_v3_t; + +typedef nvmlGpuFabricInfo_v3_t nvmlGpuFabricInfoV_t; + +/** +* Version identifier value for \ref nvmlGpuFabricInfo_v3_t.version. +*/ +#define nvmlGpuFabricInfo_v3 NVML_STRUCT_VERSION(GpuFabricInfo, 3) + /** @} */ /***************************************************************************************************/ @@ -4185,7 +4325,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevic * @see nvmlDeviceGetSerial * @see nvmlDeviceGetHandleByUUID */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device); /** * Acquire the handle for a particular device, based on its globally unique immutable UUID (in ASCII format) associated with each device. @@ -4556,6 +4696,48 @@ nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device); * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device \a node is invalid */ nvmlReturn_t DECLDIR nvmlDeviceGetNumaNodeId(nvmlDevice_t device, unsigned int *node); + +/** + * Get the addressing mode for a given GPU. Addressing modes can be one of: + * 1. HMM: System allocated memory (malloc, mmap) is addressable from the device (GPU), + * via software-based mirroring of the CPU's page tables, on the GPU. + * 2. ATS: System allocated memory (malloc, mmap) is addressable from the device (GPU), + * via Address Translation Services. This means that there is (effectively) + * a single set of page tables, and the CPU and GPU both use them. + * 3. None: Neither HMM nor ATS is active. + * + * %TURING_OR_NEWER% + * Supported on Linux only. + * + * @param[in] device The device handle + * @param[out] mode Pointer to addressing mode of the device + * + * @returns + * - \ref NVML_SUCCESS if \a mode is retrieved successfully + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported + * - \ref NVML_ERROR_NOT_SUPPORTED if request is not supported on the current platform + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device \a node is invalid + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAddressingMode(nvmlDevice_t device, nvmlDeviceAddressingMode_t *mode); + +/** + * Get the repair status for TPC/Channel repair + * + * For Ampere &tm; or newer fully supported devices. + * + * @param[in] device The identifier of the target device + * @param[out] repairStatus Reference to \a nvmlRepairStatus_t + * + * @return + * - \ref NVML_SUCCESS if the query was successful + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRepairStatus(nvmlDevice_t device, nvmlRepairStatus_t *repairStatus); + /** * Retrieve the common ancestor for two devices * For all products. @@ -5103,46 +5285,14 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockTyp nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkVfOffset(nvmlDevice_t device, int *offset); /** - * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs. - * Can be changed using \ref nvmlDeviceSetApplicationsClocks. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * @deprecated Applications clocks are deprecated and will be removed in CUDA 14.0. */ -nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); /** - * Retrieves the default applications clock that GPU boots with or - * defaults to after \ref nvmlDeviceResetApplicationsClocks call. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockMHz Reference in which to return the default clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * \see nvmlDeviceGetApplicationsClock + * @deprecated Applications clocks are deprecated and will be removed in CUDA 14.0. */ -nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); /** * Retrieves the clock speed for the clock specified by the clock type and clock ID. @@ -5184,7 +5334,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clo nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); /** - * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. + * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetMemoryLockedClocks. * * For Kepler &tm; or newer fully supported devices. * @@ -5203,13 +5353,12 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvm * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error * - * @see nvmlDeviceSetApplicationsClocks - * @see nvmlDeviceGetSupportedGraphicsClocks + * @see nvmlDeviceSetMemoryLockedClocks */ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz); /** - * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. + * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetGpuLockedClocks. * * For Kepler &tm; or newer fully supported devices. * @@ -5229,8 +5378,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, uns * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error * - * @see nvmlDeviceSetApplicationsClocks - * @see nvmlDeviceGetSupportedMemoryClocks + * @see nvmlDeviceSetGpuLockedClocks */ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz); @@ -5286,7 +5434,6 @@ nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, */ nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed); - /** * Retrieves the intended operating speed of the device's specified fan. * @@ -5426,7 +5573,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetNumFans(nvmlDevice_t device, unsigned int *num /** * @deprecated Use \ref nvmlDeviceGetTemperatureV instead */ -nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp); /** * Retrieves the cooler's information. @@ -5594,7 +5741,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksEventReasons(nvmlDevice_t device, /** * @deprecated Use \ref nvmlDeviceGetCurrentClocksEventReasons instead */ -nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons); /** * Retrieves bitmask of supported clocks event reasons that can be returned by @@ -5623,10 +5770,10 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksEventReasons(nvmlDevice_t devic /** * @deprecated Use \ref nvmlDeviceGetSupportedClocksEventReasons instead */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons); /** - * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization. + * @deprecated Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization. * * Retrieve the current performance state for the device. * @@ -5645,7 +5792,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t de * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState); /** * Retrieve performance monitor samples from the associated subdevice. @@ -5908,7 +6055,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceModes(nvmlDevice_t device, nvmlDevi nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClockFreqs(nvmlDevice_t device, nvmlDeviceCurrentClockFreqs_t *currentClockFreqs); /** - * This API has been deprecated. + * @deprecated This API has been deprecated. * * Retrieves the power management mode associated with this device. * @@ -5935,7 +6082,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClockFreqs(nvmlDevice_t device, nvmlDev * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode); /** * Retrieves the power management limit associated with this device. @@ -6028,6 +6175,56 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t devic */ nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); +/** + * Retrieves current power mizer mode on this device. + * + * PowerMizerMode provides a hint to the driver as to how to manage the performance of the GPU. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param powerMizerMode Reference in which to return the power mizer mode + * @param supportedPowerMizerModes Reference in which to return the bitmask of supported power mizer modes on this device. + * The supported modes can be combined using the bitwise OR operator '|'. + * For example, if a device supports all PowerMizer modes, the bitmask would be: + * supportedPowerMizerModes = ((1 << NVML_POWER_MIZER_MODE_ADAPTIVE) | + * (1 << NVML_POWER_MIZER_MODE_PREFER_MAXIMUM_PERFORMANCE) | + * (1 << NVML_POWER_MIZER_MODE_AUTO) | + * (1 << NVML_POWER_MIZER_MODE_PREFER_CONSISTENT_PERFORMANCE)); + * This bitmask can be used to check which power mizer modes are available on the device by performing + * a bitwise AND operation with the specific mode you want to check. + * + * @return + * - \ref NVML_SUCCESS if \a powerMizerMode has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a powerMizerMode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support powerMizerMode readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + +nvmlReturn_t DECLDIR nvmlDeviceGetPowerMizerMode_v1(nvmlDevice_t device, nvmlDevicePowerMizerModes_v1_t *powerMizerMode); + +/** + * Sets the new power mizer mode. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param powerMizerMode Reference in which to set the power mizer mode. + * + * @return + * - \ref NVML_SUCCESS if \a powerMizerMode has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a powerMizerMode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support powerMizerMode readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + +nvmlReturn_t DECLDIR nvmlDeviceSetPowerMizerMode_v1(nvmlDevice_t device, nvmlDevicePowerMizerModes_v1_t *powerMizerMode); + + /** * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded * @@ -6407,7 +6604,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemory * * @see nvmlDeviceClearEccErrorCounts() */ -nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts); /** * Retrieves the requested memory error counter for the device. @@ -7043,30 +7240,22 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_ nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory); /** - * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power - * or thermal constraints. + * @deprecated Use \ref nvmlDeviceGetFieldValues to query this data. + * This API will be removed in CUDA 14.0. * - * The method is important to users who are tying to understand if their GPUs throttle at any point during their applications. The - * difference in violation times at two different reference times gives the indication of GPU throttling event. - * - * Violation for thermal capping is not supported at this time. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param perfPolicyType Represents Performance policy which can trigger GPU throttling - * @param violTime Reference to which violation time related information is returned - * - * - * @return - * - \ref NVML_SUCCESS if violation time is successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * Translations are as follows: + * + * NVML_PERF_POLICY_POWER -> NVML_FI_DEV_CLOCKS_EVENT_REASON_SW_POWER_CAP + * NVML_PERF_POLICY_THERMAL -> NVML_FI_DEV_CLOCKS_EVENT_REASON_SW_THERM_SLOWDOWN + * NVML_PERF_POLICY_SYNC_BOOST -> NVML_FI_DEV_CLOCKS_EVENT_REASON_SYNC_BOOST + * NVML_PERF_POLICY_BOARD_LIMIT -> NVML_FI_DEV_PERF_POLICY_BOARD_LIMIT + * NVML_PERF_POLICY_LOW_UTILIZATION -> NVML_FI_DEV_PERF_POLICY_LOW_UTILIZATION + * NVML_PERF_POLICY_RELIABILITY -> NVML_FI_DEV_PERF_POLICY_RELIABILITY + * NVML_PERF_POLICY_TOTAL_APP_CLOCKS -> DEPRECATED, Do not use + * NVML_PERF_POLICY_TOTAL_BASE_CLOCKS -> NVML_FI_DEV_PERF_POLICY_TOTAL_BASE_CLOCKS */ -nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime); /** * Gets the device's interrupt number @@ -7087,6 +7276,9 @@ nvmlReturn_t DECLDIR nvmlDeviceGetIrqNum(nvmlDevice_t device, unsigned int *irqN /** * Gets the device's core count * + * @note On MIG-enabled GPUs, querying the device's core count is currently not supported using this API. + * Please use \ref nvmlDeviceGetGpuInstanceProfileInfo to fetch the MIG device's core count. + * * @param device The identifier of the target device * @param numCores The number of cores for the specified device * @@ -7094,7 +7286,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetIrqNum(nvmlDevice_t device, unsigned int *irqN * - \ref NVML_SUCCESS if GPU core count is successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a numCores is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device or a mig device. * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * */ @@ -7197,7 +7389,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetBusType(nvmlDevice_t device, nvmlBusType_t *ty /** - * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceGetGpuFabricInfoV instead + * @deprecated Will be deprecated in a future release. Use \ref nvmlDeviceGetGpuFabricInfoV instead * * Get fabric information associated with the device. * @@ -7217,7 +7409,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetBusType(nvmlDevice_t device, nvmlBusType_t *ty * - \ref NVML_SUCCESS Upon success * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support gpu fabric */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t *gpuFabricInfo); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t *gpuFabricInfo); /** * Versioned wrapper around \ref nvmlDeviceGetGpuFabricInfo that accepts a versioned @@ -7519,7 +7711,41 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSramEccErrorStatus(nvmlDevice_t device, nvmlEccSramErrorStatus_t *status); /** - * @} + * Set new power limit of this device. + * + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. + * + * See \ref nvmlPowerValue_v2_t for more information on the struct. + * + * \note Limit is not persistent across reboots or driver unloads. + * Enable persistent mode to prevent driver from unloading when no application is using the device. + * + * This API replaces nvmlDeviceSetPowerManagementLimit. It can be used as a drop-in replacement for the older version. + * + * @param device The identifier of the target device + * @param powerValue Power management limit in milliwatts to set + * + * @return + * - \ref NVML_SUCCESS if \a limit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a powerValue is NULL or contains invalid values + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see NVML_FI_DEV_POWER_AVERAGE + * @see NVML_FI_DEV_POWER_INSTANT + * @see NVML_FI_DEV_POWER_MIN_LIMIT + * @see NVML_FI_DEV_POWER_MAX_LIMIT + * @see NVML_FI_DEV_POWER_CURRENT_LIMIT + */ +nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit_v2(nvmlDevice_t device, nvmlPowerValue_v2_t *powerValue); + +/** + * @} // @defgroup nvmlDeviceQueries Device Queries */ /** @addtogroup nvmlAccountingStats @@ -7681,7 +7907,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageReti * that this does not match the virtual address used in CUDA, but will match the address information in Xid 63 * * \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps parameter to return the time of each page's - * retirement. + * retirement. This is supported for Pascal and newer architecture. * * For Kepler &tm; or newer fully supported devices. * @@ -7913,6 +8139,27 @@ nvmlReturn_t DECLDIR nvmlDeviceGetProcessesUtilizationInfo(nvmlDevice_t device, */ nvmlReturn_t DECLDIR nvmlDeviceGetPlatformInfo(nvmlDevice_t device, nvmlPlatformInfo_t *platformInfo); +/** + * Retrieves the Per Device Identifier (PDI) associated with this device. + * + * For Pascal &tm; or newer fully supported devices. + * + * See \ref nvmlPdi_v1_t for more information on the struct. + * + * @param[in] device The identifier of the target device + * @param[out] pdi Reference to the caller-provided structure to return the GPU PDI + * + * @return + * - \ref NVML_SUCCESS if \a pdi has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a pdi is NULL + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version is invalid/unsupported + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPdi(nvmlDevice_t device, nvmlPdi_t *pdi); + /** @} */ /***************************************************************************************************/ @@ -8150,8 +8397,6 @@ typedef enum nvmlClockLimitId_enum { * Set clocks that device will lock to. * * Sets the clocks that the device will be running at to the value in the range of minGpuClockMHz to maxGpuClockMHz. - * Setting this will supersede application clock values and take effect regardless if a cuda app is running. - * See /ref nvmlDeviceSetApplicationsClocks * * Can be used as a setting to request constant performance. * @@ -8170,7 +8415,7 @@ typedef enum nvmlClockLimitId_enum { * * Requires root/admin permissions. * - * After system reboot or driver reload applications clocks go back to their default value. + * After system reboot or driver reload GPU clocks go back to their default value. * See \ref nvmlDeviceResetGpuLockedClocks. * * For Volta &tm; or newer fully supported devices. @@ -8195,7 +8440,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned * Resets the gpu clock to the default value * * This is the gpu clock that will be used after system reboot or driver reload. - * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. + * Default values are idle clocks. * * @see nvmlDeviceSetGpuLockedClocks * @@ -8217,14 +8462,12 @@ nvmlReturn_t DECLDIR nvmlDeviceResetGpuLockedClocks(nvmlDevice_t device); * Set memory clocks that device will lock to. * * Sets the device's memory clocks to the value in the range of minMemClockMHz to maxMemClockMHz. - * Setting this will supersede application clock values and take effect regardless of whether a cuda app is running. - * See /ref nvmlDeviceSetApplicationsClocks * * Can be used as a setting to request constant performance. * * Requires root/admin permissions. * - * After system reboot or driver reload applications clocks go back to their default value. + * After system reboot or driver reload memory clocks go back to their default value. * See \ref nvmlDeviceResetMemoryLockedClocks. * * For Ampere &tm; or newer fully supported devices. @@ -8249,7 +8492,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetMemoryLockedClocks(nvmlDevice_t device, unsign * Resets the memory clock to the default value * * This is the memory clock that will be used after system reboot or driver reload. - * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. + * Default values are idle clocks. * * @see nvmlDeviceSetMemoryLockedClocks * @@ -8268,72 +8511,20 @@ nvmlReturn_t DECLDIR nvmlDeviceSetMemoryLockedClocks(nvmlDevice_t device, unsign nvmlReturn_t DECLDIR nvmlDeviceResetMemoryLockedClocks(nvmlDevice_t device); /** - * Set clocks that applications will lock to. - * - * Sets the clocks that compute and graphics applications will be running at. - * e.g. CUDA driver requests these clocks during context creation which means this property - * defines clocks at which CUDA applications will be running unless some overspec event - * occurs (e.g. over power, over thermal or external HW brake). - * - * Can be used as a setting to request constant performance. - * - * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks. - * - * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call - * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting - * above the clock value being set. + * @deprecated Applications clocks are deprecated and will be removed in CUDA 14.0. * - * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * Requires root/admin permissions. - * - * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks - * for details on how to list available clocks combinations. - * - * After system reboot or driver reload applications clocks go back to their default value. - * See \ref nvmlDeviceResetApplicationsClocks. - * - * @param device The identifier of the target device - * @param memClockMHz Requested memory clock in MHz - * @param graphicsClockMHz Requested graphics clock in MHz - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz - * is not a valid clock combination - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * Please use \ref nvmlDeviceSetMemoryLockedClocks for Memory Clocks and + * \ref nvmlDeviceSetGpuLockedClocks for Graphics Clocks. */ -nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz); /** - * Resets the application clock to the default value - * - * This is the applications clock that will be used after system reboot or driver reload. - * Default value is constant, but the current value an be changed using \ref nvmlDeviceSetApplicationsClocks. - * - * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks, - * this call will unlock clocks. This returns clocks their default behavior ofautomatically boosting above - * base clocks as thermal limits allow. + * @deprecated Applications clocks are deprecated and will be removed in CUDA 14.0. * - * @see nvmlDeviceGetApplicationsClock - * @see nvmlDeviceSetApplicationsClocks - * - * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * Please use \ref nvmlDeviceResetMemoryLockedClocks for Memory Clocks and + * \ref nvmlDeviceResetGpuLockedClocks for Graphics Clocks. */ -nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device); /** * Try to set the current state of Auto Boosted clocks on a device. @@ -8569,7 +8760,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestri nvmlReturn_t DECLDIR nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int speed); /** - * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceSetClockOffsets instead. It works + * @deprecated Will be deprecated in a future release. Use \ref nvmlDeviceSetClockOffsets instead. It works * on Maxwell onwards GPU architectures. * * Set the GPCCLK VF offset value @@ -8584,10 +8775,10 @@ nvmlReturn_t DECLDIR nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset); /** - * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceSetClockOffsets instead. It works + * @deprecated Will be deprecated in a future release. Use \ref nvmlDeviceSetClockOffsets instead. It works * on Maxwell onwards GPU architectures. * * Set the MemClk (Memory Clock) VF offset value. It requires elevated privileges. @@ -8602,7 +8793,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset); /** * @} @@ -8666,42 +8857,11 @@ nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnable */ nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device); -/** - * Set new power limit of this device. - * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. - * - * See \ref nvmlPowerValue_v2_t for more information on the struct. - * - * \note Limit is not persistent across reboots or driver unloads. - * Enable persistent mode to prevent driver from unloading when no application is using the device. - * - * This API replaces nvmlDeviceSetPowerManagementLimit. It can be used as a drop-in replacement for the older version. - * - * @param device The identifier of the target device - * @param powerValue Power management limit in milliwatts to set - * - * @return - * - \ref NVML_SUCCESS if \a limit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a powerValue is NULL or contains invalid values - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see NVML_FI_DEV_POWER_AVERAGE - * @see NVML_FI_DEV_POWER_INSTANT - * @see NVML_FI_DEV_POWER_MIN_LIMIT - * @see NVML_FI_DEV_POWER_MAX_LIMIT - * @see NVML_FI_DEV_POWER_CURRENT_LIMIT - */ -nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit_v2(nvmlDevice_t device, nvmlPowerValue_v2_t *powerValue); +/** @} */ // @addtogroup nvmlAccountingStats /***************************************************************************************************/ -/** @defgroup NVML NVLink +/** @defgroup NvLink NvLink Methods + * This chapter describes methods that NVML can perform on NVLINK enabled devices. * @{ */ /***************************************************************************************************/ @@ -8756,17 +8916,54 @@ typedef struct typedef nvmlNvlinkSetBwMode_v1_t nvmlNvlinkSetBwMode_t; #define nvmlNvlinkSetBwMode_v1 NVML_STRUCT_VERSION(NvlinkSetBwMode, 1) -/** @} */ // @defgroup NVML NVLink +/** + * Struct to represent per device NVLINK information v1 + */ +typedef struct +{ + unsigned int version; //!< IN - the API version number + unsigned int isNvleEnabled; //!< OUT - NVLINK encryption enablement +} nvmlNvLinkInfo_v1_t; +#define nvmlNvLinkInfo_v1 NVML_STRUCT_VERSION(NvLinkInfo, 1) +#define NVML_NVLINK_FIRMWARE_UCODE_TYPE_MSE 0x1 +#define NVML_NVLINK_FIRMWARE_UCODE_TYPE_NETIR 0x2 +#define NVML_NVLINK_FIRMWARE_UCODE_TYPE_NETIR_UPHY 0x3 +#define NVML_NVLINK_FIRMWARE_UCODE_TYPE_NETIR_CLN 0x4 +#define NVML_NVLINK_FIRMWARE_UCODE_TYPE_NETIR_DLN 0x5 +#define NVML_NVLINK_FIRMWARE_VERSION_LENGTH 100 -/** @} */ +/** + * Struct to represent NVLINK firmware Semantic versioning and ucode type + */ +typedef struct +{ + unsigned char ucodeType; + unsigned int major; + unsigned int minor; + unsigned int subMinor; +} nvmlNvlinkFirmwareVersion_t; -/***************************************************************************************************/ -/** @defgroup NvLink NvLink Methods - * This chapter describes methods that NVML can perform on NVLINK enabled devices. - * @{ +/** + * Struct to represent NVLINK firmware information */ -/***************************************************************************************************/ +typedef struct +{ + nvmlNvlinkFirmwareVersion_t firmwareVersion[NVML_NVLINK_FIRMWARE_VERSION_LENGTH]; //!< OUT - NVLINK firmware version + unsigned int numValidEntries; //!< OUT - Number of valid firmware entries +} nvmlNvlinkFirmwareInfo_t; + +/** + * Struct to represent per device NVLINK information v2 + */ +typedef struct +{ + unsigned int version; //!< IN - the API version number + unsigned int isNvleEnabled; //!< OUT - NVLINK encryption enablement + nvmlNvlinkFirmwareInfo_t firmwareInfo; //!< OUT - NVLINK Firmware info +} nvmlNvLinkInfo_v2_t; +typedef nvmlNvLinkInfo_v2_t nvmlNvLinkInfo_t; +#define nvmlNvLinkInfo_v2 NVML_STRUCT_VERSION(NvLinkInfo, 2) /** * Retrieves the state of the device's NvLink for the link specified @@ -8887,7 +9084,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsign nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link); /** - * Deprecated: Setting utilization counter control is no longer supported. + * @deprecated Setting utilization counter control is no longer supported. * * Set the NVLINK utilization counter control information for the specified counter, 0 or 1. * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset @@ -8908,11 +9105,11 @@ nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, uns * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlNvLinkUtilizationControl_t *control, unsigned int reset); /** - * Deprecated: Getting utilization counter control is no longer supported. + * @deprecated Getting utilization counter control is no longer supported. * * Get the NVLINK utilization counter control information for the specified counter, 0 or 1. * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition @@ -8931,12 +9128,12 @@ nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlNvLinkUtilizationControl_t *control); /** - * Deprecated: Use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_NVLINK_THROUGHPUT_* as field values instead. + * @deprecated Use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_NVLINK_THROUGHPUT_* as field values instead. * * Retrieve the NVLINK utilization counter based on the current control for a specified counter. * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl @@ -8957,11 +9154,11 @@ nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, unsigned long long *rxcounter, unsigned long long *txcounter); /** - * Deprecated: Freezing NVLINK utilization counters is no longer supported. + * @deprecated Freezing NVLINK utilization counters is no longer supported. * * Freeze the NVLINK utilization counters * Both the receive and transmit counters are operated on by this function @@ -8981,11 +9178,11 @@ nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlEnableState_t freeze); /** - * Deprecated: Resetting NVLINK utilization counters is no longer supported. + * @deprecated Resetting NVLINK utilization counters is no longer supported. * * Reset the NVLINK utilization counters * Both the receive and transmit counters are operated on by this function @@ -9003,7 +9200,7 @@ nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t devi * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter); /** * Get the NVLink device type of the remote device connected over the given link. @@ -9120,7 +9317,24 @@ nvmlReturn_t DECLDIR nvmlDeviceGetNvlinkBwMode(nvmlDevice_t device, nvmlReturn_t DECLDIR nvmlDeviceSetNvlinkBwMode(nvmlDevice_t device, nvmlNvlinkSetBwMode_t *setBwMode); -/** @} */ +/** + * Query NVLINK information associated with this device. + * + * @param[in] device The identifier of the target device + * @param[out] info Reference to \a nvmlNvLinkInfo_t + * + * @return + * - \ref NVML_SUCCESS if query is success + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a info is NULL + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version is invalid/unsupported + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkInfo(nvmlDevice_t device, nvmlNvLinkInfo_t *info); + +/** @} */ // @defgroup NvLink NvLink Methods /***************************************************************************************************/ /** @defgroup nvmlEvents Event Handling Methods @@ -9260,7 +9474,7 @@ nvmlReturn_t DECLDIR nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t * d */ nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); -/* +/** * Create an empty set of system events. * Event set should be freed by \ref nvmlSystemEventSetFree * @@ -9283,7 +9497,7 @@ nvmlReturn_t DECLDIR nvmlSystemEventSetCreate(nvmlSystemEventSetCreateRequest_t * * For Fermi &tm; or newer fully supported devices. * - * @param set Reference to nvmlSystemEventSetFreeRequest_t + * @param request Reference to nvmlSystemEventSetFreeRequest_t * * @return * - \ref NVML_SUCCESS if the event has been set @@ -10249,7 +10463,7 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed); +DEPRECATED(13.0) nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed); /** * Retrieve the vGPU type of a vGPU instance. @@ -10735,8 +10949,6 @@ nvmlReturn_t DECLDIR nvmlGpuInstanceGetVgpuHeterogeneousMode(nvmlGpuInstance_t g * \a pHeterogeneousMode->version is the version number of the structure nvmlVgpuHeterogeneousMode_t, the caller should * set the correct version number to set the vGPU heterogeneous mode. * - * %GB20X_OR_NEWER% - * * @param gpuInstance The GPU instance handle * @param pHeterogeneousMode Pointer to the caller-provided structure of nvmlVgpuHeterogeneousMode_t * @@ -10756,7 +10968,7 @@ nvmlReturn_t DECLDIR nvmlGpuInstanceSetVgpuHeterogeneousMode(nvmlGpuInstance_t g /** @} */ /***************************************************************************************************/ -/** @defgroup nvml vGPU Migration +/** @defgroup nvmlVgpuMigration vGPU Migration * This chapter describes operations that are associated with vGPU Migration. * @{ */ @@ -11075,7 +11287,7 @@ nvmlReturn_t DECLDIR nvmlGetVgpuVersion(nvmlVgpuVersion_t *supported, nvmlVgpuVe */ nvmlReturn_t DECLDIR nvmlSetVgpuVersion(nvmlVgpuVersion_t *vgpuVersion); -/** @} */ +/** @} */ // @defgroup nvmlVgpuMigration vGPU Migration /***************************************************************************************************/ /** @defgroup nvmlUtil vGPU Utilization and Accounting @@ -11447,6 +11659,55 @@ nvmlReturn_t DECLDIR nvmlGetExcludedDeviceInfoByIndex(unsigned int index, nvmlEx /** @} */ +/***************************************************************************************************/ +/** @defgroup nvmlGPUPRMAccess PRM Access + * This chapter describes NVML operations that are associated with PRM register reads + * @{ + */ +/***************************************************************************************************/ + +#define NVML_PRM_DATA_MAX_SIZE 496 +/** + * Main PRM input structure + */ +typedef struct +{ + /* I/O parameters */ + unsigned dataSize; //!< Size of the input TLV data. + unsigned status; //!< OUT: status of the PRM command + union { + /* Input data in TLV format */ + unsigned char inData[NVML_PRM_DATA_MAX_SIZE]; //!< IN: Input data in TLV format + /* Output data in TLV format */ + unsigned char outData[NVML_PRM_DATA_MAX_SIZE]; //!< OUT: Output PRM data in TLV format + }; +} nvmlPRMTLV_v1_t; + +/** + * Read or write a GPU PRM register. The input is assumed to be in TLV format in + * network byte order. + * + * %BLACKWELL_OR_NEWER% + * + * Supported on Linux only. + * + * @param device Identifer of target GPU device + * @param buffer Structure holding the input data in TLV format as well as + * the PRM register contents in TLV format (in the case of a successful + * read operation). + * Note: the input data and any returned data shall be in network byte order. + * + * @return + * - \ref NVML_SUCCESS on success + * - \ref NVML_ERROR_INVALID_ARGUMENT if \p device or \p buffer are invalid + * - \ref NVML_ERROR_NO_PERMISSION if user does not have permission to perform this operation + * - \ref NVML_ERROR_NOT_SUPPORTED if this feature is not supported by the device + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version specified in \p buffer is not supported + */ +nvmlReturn_t DECLDIR nvmlDeviceReadWritePRM_v1(nvmlDevice_t device, nvmlPRMTLV_v1_t *buffer); + +/** @} */ + /***************************************************************************************************/ /** @defgroup nvmlMultiInstanceGPU Multi Instance GPU Management * This chapter describes NVML operations that are associated with Multi Instance GPU management. @@ -11775,6 +12036,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetMigMode(nvmlDevice_t device, unsigned int mode * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a currentMode or \a pendingMode are invalid * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ nvmlReturn_t DECLDIR nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int *currentMode, unsigned int *pendingMode); @@ -11834,6 +12096,37 @@ nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice_t device, un nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfoV(nvmlDevice_t device, unsigned int profile, nvmlGpuInstanceProfileInfo_v2_t *info); +/** + * GPU instance profile query function that accepts profile ID, instead of profile name. + * It accepts a versioned \ref nvmlGpuInstanceProfileInfo_v2_t or later output structure. + * + * @note The caller must set the \ref nvmlGpuInstanceProfileInfo_v2_t.version field to the + * appropriate version prior to calling this function. For example: + * \code + * nvmlGpuInstanceProfileInfo_v2_t profileInfo = + * { .version = nvmlGpuInstanceProfileInfo_v2 }; + * nvmlReturn_t result = nvmlDeviceGetGpuInstanceProfileInfoV(device, + * profile, + * &profileInfo); + * \endcode + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device The identifier of the target device + * @param profileId One of the profile IDs. + * @param info Returns detailed profile information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId, \a info, or \a info->version are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profile isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfoByIdV(nvmlDevice_t device, unsigned int profileId, + nvmlGpuInstanceProfileInfo_v2_t *info); + /** * Get GPU instance placements. * @@ -12622,12 +12915,6 @@ typedef struct struct nvmlGpmSample_st* handle; } nvmlGpmSample_t; -typedef struct { - char *shortName; - char *longName; - char *unit; -} nvmlGpmMetricMetricInfo_t; - /** * GPM metric information. */ @@ -12636,7 +12923,12 @@ typedef struct unsigned int metricId; //!< IN: NVML_GPM_METRIC_? define of which metric to retrieve nvmlReturn_t nvmlReturn; //!< OUT: Status of this metric. If this is nonzero, then value is not valid double value; //!< OUT: Value of this metric. Is only valid if nvmlReturn is 0 (NVML_SUCCESS) - nvmlGpmMetricMetricInfo_t metricInfo; //!< OUT: Metric name and unit. Those can be NULL if not defined + struct + { + char *shortName; + char *longName; + char *unit; + } metricInfo; //!< OUT: Metric name and unit. Those can be NULL if not defined } nvmlGpmMetric_t; /** @@ -12850,6 +13142,7 @@ typedef nvmlDeviceCapabilities_v1_t nvmlDeviceCapabilities_t; nvmlReturn_t DECLDIR nvmlDeviceGetCapabilities(nvmlDevice_t device, nvmlDeviceCapabilities_t *caps); + /* * Generic bitmask to hold 255 bits, represented by 8 elements of 32 bits */ @@ -13160,6 +13453,45 @@ nvmlReturn_t DECLDIR nvmlDevicePowerSmoothingSetState(nvmlDevice_t device, nvmlPowerSmoothingState_t *state); /** @} */ // @defgroup +/** + * Retrieves the counts of SRAM unique uncorrected ECC errors + * + * %BLACKWELL_OR_NEWER% + * + * Reads SRAM unique uncorrected ECC error counts. The total number of unique errors is returned by + * \a errorCounts->entryCount. Error counts are returned as an array of in the caller-supplied buffer pointed at by + * \a errorCounts->entries. Each error count entry holds the location/address of the unique error, the error count and + * whether the error is parity or not. + * + * To read SRAM unique uncorrected ECC error counts, first determine the size of buffer required to hold the error + * counts by invoking the function with \a errorCounts->entries set to NULL. The required array size is returned in + * \a errorCounts->entryCount. The caller should allocate a buffer of size "errorCounts->entryCount * + * sizeof(nvmlEccSramUniqueUncorrectedErrorCounts_t)". Invoke the function again with the allocated buffer passed in + * \a errorCounts->entries. This time \a errorCounts->entryCount will be taken as the entry array size that caller + * allocates for \a errorCounts->entries. + * + * On successful return of the second query, the function updates \a errorCounts->entries with all unique errors. This + * may fail if \a errorCounts->entryCount is smaller than the actual number of unique errors. This can happen in cases + * like new errors occur since the previous query of \a errorCounts->entryCount. No matter the query succeeds or not, + * the latest number of unique errors will be returned in \a errorCounts->entryCount. + * + * @note The query is only supported when ECC mode is enabled. + * + * @param device The identifier of the target device + * @param errorCounts Pointer to caller-supplied array which returns the unique error count entries + * + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a errorCounts->entryCount is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature or ECC mods is not enabled + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if the allocated error entry array is not big enough + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts(nvmlDevice_t device, + nvmlEccSramUniqueUncorrectedErrorCounts_t *errorCounts); + /** * NVML API versioning support */ diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/types_gen.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/types_gen.go index 3be17966c..efa586372 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/types_gen.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/types_gen.go @@ -22,7 +22,7 @@ type PciInfoExt_v1 struct { PciSubSystemId uint32 BaseClass uint32 SubClass uint32 - BusId [32]int8 + BusId [32]uint8 } type PciInfoExt struct { @@ -34,17 +34,17 @@ type PciInfoExt struct { PciSubSystemId uint32 BaseClass uint32 SubClass uint32 - BusId [32]int8 + BusId [32]uint8 } type PciInfo struct { - BusIdLegacy [16]int8 + BusIdLegacy [16]uint8 Domain uint32 Bus uint32 Device uint32 PciDeviceId uint32 PciSubSystemId uint32 - BusId [32]int8 + BusId [32]uint8 } type EccErrorCounts struct { @@ -136,6 +136,28 @@ type C2cModeInfo_v1 struct { IsC2cEnabled uint32 } +type DeviceAddressingMode_v1 struct { + Version uint32 + Value uint32 +} + +type DeviceAddressingMode struct { + Version uint32 + Value uint32 +} + +type RepairStatus_v1 struct { + Version uint32 + BChannelRepairPending uint32 + BTpcRepairPending uint32 +} + +type RepairStatus struct { + Version uint32 + BChannelRepairPending uint32 + BTpcRepairPending uint32 +} + type RowRemapperHistogramValues struct { Max uint32 High uint32 @@ -173,17 +195,9 @@ type ViolationTime struct { ViolationTime uint64 } -type GpuThermalSettingsSensor struct { - Controller int32 - DefaultMinTemp int32 - DefaultMaxTemp int32 - CurrentTemp int32 - Target int32 -} - type GpuThermalSettings struct { Count uint32 - Sensor [3]GpuThermalSettingsSensor + Sensor [3]_Ctype_struct___28 } type CoolerInfo_v1 struct { @@ -218,6 +232,16 @@ type UUID struct { Pad_cgo_0 [3]byte } +type Pdi_v1 struct { + Version uint32 + Value uint64 +} + +type Pdi struct { + Version uint32 + Value uint64 +} + type DramEncryptionInfo_v1 struct { Version uint32 EncryptionState uint32 @@ -281,22 +305,28 @@ type FanSpeedInfo struct { type DevicePerfModes_v1 struct { Version uint32 - Str [2048]int8 + Str [2048]uint8 } type DevicePerfModes struct { Version uint32 - Str [2048]int8 + Str [2048]uint8 } type DeviceCurrentClockFreqs_v1 struct { Version uint32 - Str [2048]int8 + Str [2048]uint8 } type DeviceCurrentClockFreqs struct { Version uint32 - Str [2048]int8 + Str [2048]uint8 +} + +type DevicePowerMizerModes_v1 struct { + CurrentMode uint32 + Mode uint32 + SupportedPowerMizerModes uint32 } type ProcessUtilizationSample struct { @@ -404,6 +434,28 @@ type PlatformInfo struct { Pad_cgo_0 [3]byte } +type EccSramUniqueUncorrectedErrorEntry_v1 struct { + Unit uint32 + Location uint32 + Sublocation uint32 + Extlocation uint32 + Address uint32 + IsParity uint32 + Count uint32 +} + +type EccSramUniqueUncorrectedErrorCounts_v1 struct { + Version uint32 + EntryCount uint32 + Entries *EccSramUniqueUncorrectedErrorEntry_v1 +} + +type EccSramUniqueUncorrectedErrorCounts struct { + Version uint32 + EntryCount uint32 + Entries *EccSramUniqueUncorrectedErrorEntry_v1 +} + type DeviceArchitecture uint32 type BusType uint32 @@ -412,16 +464,9 @@ type FanControlPolicy uint32 type PowerSource uint32 -type GpuDynamicPstatesInfoUtilization struct { - BIsPresent uint32 - Percentage uint32 - IncThreshold uint32 - DecThreshold uint32 -} - type GpuDynamicPstatesInfo struct { Flags uint32 - Utilization [8]GpuDynamicPstatesInfoUtilization + Utilization [8]_Ctype_struct___23 } type PowerScopeType byte @@ -531,7 +576,7 @@ type VgpuInstancesUtilizationInfo struct { type VgpuProcessUtilizationSample struct { VgpuInstance uint32 Pid uint32 - ProcessName [64]int8 + ProcessName [64]uint8 TimeStamp uint64 SmUtil uint32 MemUtil uint32 @@ -540,7 +585,7 @@ type VgpuProcessUtilizationSample struct { } type VgpuProcessUtilizationInfo_v1 struct { - ProcessName [64]int8 + ProcessName [64]uint8 TimeStamp uint64 VgpuInstance uint32 Pid uint32 @@ -576,15 +621,6 @@ type VgpuRuntimeState struct { Size uint64 } -type VgpuSchedulerParamsVgpuSchedDataWithARR struct { - AvgFactor uint32 - Timeslice uint32 -} - -type VgpuSchedulerParamsVgpuSchedData struct { - Timeslice uint32 -} - const sizeofVgpuSchedulerParams = unsafe.Sizeof([8]byte{}) type VgpuSchedulerParams [sizeofVgpuSchedulerParams]byte @@ -613,15 +649,6 @@ type VgpuSchedulerGetState struct { SchedulerParams [8]byte } -type VgpuSchedulerSetParamsVgpuSchedDataWithARR struct { - AvgFactor uint32 - Frequency uint32 -} - -type VgpuSchedulerSetParamsVgpuSchedData struct { - Timeslice uint32 -} - const sizeofVgpuSchedulerSetParams = unsafe.Sizeof([8]byte{}) type VgpuSchedulerSetParams [sizeofVgpuSchedulerSetParams]byte @@ -674,8 +701,8 @@ type GridLicenseExpiry struct { type GridLicensableFeature struct { FeatureCode uint32 FeatureState uint32 - LicenseInfo [128]int8 - ProductName [128]int8 + LicenseInfo [128]uint8 + ProductName [128]uint8 FeatureEnabled uint32 LicenseExpiry GridLicenseExpiry } @@ -812,23 +839,23 @@ type nvmlUnit struct { type HwbcEntry struct { HwbcId uint32 - FirmwareVersion [32]int8 + FirmwareVersion [32]uint8 } type LedState struct { - Cause [256]int8 + Cause [256]uint8 Color uint32 } type UnitInfo struct { - Name [96]int8 - Id [96]int8 - Serial [96]int8 - FirmwareVersion [96]int8 + Name [96]uint8 + Id [96]uint8 + Serial [96]uint8 + FirmwareVersion [96]uint8 } type PSUInfo struct { - State [256]int8 + State [256]uint8 Current uint32 Voltage uint32 Power uint32 @@ -1045,23 +1072,36 @@ type GpuFabricInfo_v2 struct { HealthMask uint32 } +type GpuFabricInfo_v3 struct { + Version uint32 + ClusterUuid [16]uint8 + Status uint32 + CliqueId uint32 + State uint8 + HealthMask uint32 + HealthSummary uint8 + Pad_cgo_0 [3]byte +} + type GpuFabricInfoV struct { - Version uint32 - ClusterUuid [16]uint8 - Status uint32 - CliqueId uint32 - State uint8 - HealthMask uint32 + Version uint32 + ClusterUuid [16]uint8 + Status uint32 + CliqueId uint32 + State uint8 + HealthMask uint32 + HealthSummary uint8 + Pad_cgo_0 [3]byte } type SystemDriverBranchInfo_v1 struct { Version uint32 - Branch [80]int8 + Branch [80]uint8 } type SystemDriverBranchInfo struct { Version uint32 - Branch [80]int8 + Branch [80]uint8 } type AffinityScope uint32 @@ -1118,6 +1158,35 @@ type NvlinkSetBwMode struct { Pad_cgo_0 [3]byte } +type NvLinkInfo_v1 struct { + Version uint32 + IsNvleEnabled uint32 +} + +type NvlinkFirmwareVersion struct { + UcodeType uint8 + Major uint32 + Minor uint32 + SubMinor uint32 +} + +type NvlinkFirmwareInfo struct { + FirmwareVersion [100]NvlinkFirmwareVersion + NumValidEntries uint32 +} + +type NvLinkInfo_v2 struct { + Version uint32 + IsNvleEnabled uint32 + FirmwareInfo NvlinkFirmwareInfo +} + +type NvLinkInfo struct { + Version uint32 + IsNvleEnabled uint32 + FirmwareInfo NvlinkFirmwareInfo +} + type VgpuVersion struct { MinVersion uint32 MaxVersion uint32 @@ -1127,24 +1196,24 @@ type nvmlVgpuMetadata struct { Version uint32 Revision uint32 GuestInfoState uint32 - GuestDriverVersion [80]int8 - HostDriverVersion [80]int8 + GuestDriverVersion [80]uint8 + HostDriverVersion [80]uint8 Reserved [6]uint32 VgpuVirtualizationCaps uint32 GuestVgpuVersion uint32 OpaqueDataSize uint32 - OpaqueData [4]int8 + OpaqueData [4]uint8 } type nvmlVgpuPgpuMetadata struct { Version uint32 Revision uint32 - HostDriverVersion [80]int8 + HostDriverVersion [80]uint8 PgpuVirtualizationCaps uint32 Reserved [5]uint32 HostSupportedVgpuRange VgpuVersion OpaqueDataSize uint32 - OpaqueData [4]int8 + OpaqueData [4]uint8 } type VgpuPgpuCompatibility struct { @@ -1154,7 +1223,13 @@ type VgpuPgpuCompatibility struct { type ExcludedDeviceInfo struct { PciInfo PciInfo - Uuid [80]int8 + Uuid [80]uint8 +} + +type PRMTLV_v1 struct { + DataSize uint32 + Status uint32 + InData [496]uint8 } type GpuInstancePlacement struct { @@ -1189,7 +1264,7 @@ type GpuInstanceProfileInfo_v2 struct { JpegCount uint32 OfaCount uint32 MemorySizeMB uint64 - Name [96]int8 + Name [96]uint8 } type GpuInstanceProfileInfo_v3 struct { @@ -1204,7 +1279,7 @@ type GpuInstanceProfileInfo_v3 struct { JpegCount uint32 OfaCount uint32 MemorySizeMB uint64 - Name [96]int8 + Name [96]uint8 Capabilities uint32 Pad_cgo_0 [4]byte } @@ -1244,7 +1319,7 @@ type ComputeInstanceProfileInfo_v2 struct { SharedEncoderCount uint32 SharedJpegCount uint32 SharedOfaCount uint32 - Name [96]int8 + Name [96]uint8 } type ComputeInstanceProfileInfo_v3 struct { @@ -1258,7 +1333,7 @@ type ComputeInstanceProfileInfo_v3 struct { SharedEncoderCount uint32 SharedJpegCount uint32 SharedOfaCount uint32 - Name [96]int8 + Name [96]uint8 Capabilities uint32 } @@ -1278,17 +1353,11 @@ type nvmlGpmSample struct { Handle *_Ctype_struct_nvmlGpmSample_st } -type GpmMetricMetricInfo struct { - ShortName *int8 - LongName *int8 - Unit *int8 -} - type GpmMetric struct { MetricId uint32 NvmlReturn uint32 Value float64 - MetricInfo GpmMetricMetricInfo + MetricInfo _Ctype_struct___19 } type nvmlGpmMetricsGetType struct { diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/vgpu.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/vgpu.go index b1e0fa7c6..9ab649a42 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/vgpu.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/vgpu.go @@ -214,6 +214,8 @@ func (vgpuInstance nvmlVgpuInstance) GetLicenseInfo() (VgpuLicenseInfo, Return) } // nvml.VgpuInstanceGetLicenseStatus() +// +// Deprecated: Use VgpuInstanceGetLicenseInfo instead. func (l *library) VgpuInstanceGetLicenseStatus(vgpuInstance VgpuInstance) (int, Return) { return vgpuInstance.GetLicenseStatus() } diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/zz_generated.api.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/zz_generated.api.go index bfe4d0790..2f7a3220a 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/zz_generated.api.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/zz_generated.api.go @@ -37,6 +37,7 @@ var ( DeviceGetAccountingStats = libnvml.DeviceGetAccountingStats DeviceGetActiveVgpus = libnvml.DeviceGetActiveVgpus DeviceGetAdaptiveClockInfoStatus = libnvml.DeviceGetAdaptiveClockInfoStatus + DeviceGetAddressingMode = libnvml.DeviceGetAddressingMode DeviceGetApplicationsClock = libnvml.DeviceGetApplicationsClock DeviceGetArchitecture = libnvml.DeviceGetArchitecture DeviceGetAttributes = libnvml.DeviceGetAttributes @@ -103,6 +104,7 @@ var ( DeviceGetGpuInstanceId = libnvml.DeviceGetGpuInstanceId DeviceGetGpuInstancePossiblePlacements = libnvml.DeviceGetGpuInstancePossiblePlacements DeviceGetGpuInstanceProfileInfo = libnvml.DeviceGetGpuInstanceProfileInfo + DeviceGetGpuInstanceProfileInfoByIdV = libnvml.DeviceGetGpuInstanceProfileInfoByIdV DeviceGetGpuInstanceProfileInfoV = libnvml.DeviceGetGpuInstanceProfileInfoV DeviceGetGpuInstanceRemainingCapacity = libnvml.DeviceGetGpuInstanceRemainingCapacity DeviceGetGpuInstances = libnvml.DeviceGetGpuInstances @@ -152,6 +154,7 @@ var ( DeviceGetNumaNodeId = libnvml.DeviceGetNumaNodeId DeviceGetNvLinkCapability = libnvml.DeviceGetNvLinkCapability DeviceGetNvLinkErrorCounter = libnvml.DeviceGetNvLinkErrorCounter + DeviceGetNvLinkInfo = libnvml.DeviceGetNvLinkInfo DeviceGetNvLinkRemoteDeviceType = libnvml.DeviceGetNvLinkRemoteDeviceType DeviceGetNvLinkRemotePciInfo = libnvml.DeviceGetNvLinkRemotePciInfo DeviceGetNvLinkState = libnvml.DeviceGetNvLinkState @@ -168,6 +171,7 @@ var ( DeviceGetPcieReplayCounter = libnvml.DeviceGetPcieReplayCounter DeviceGetPcieSpeed = libnvml.DeviceGetPcieSpeed DeviceGetPcieThroughput = libnvml.DeviceGetPcieThroughput + DeviceGetPdi = libnvml.DeviceGetPdi DeviceGetPerformanceModes = libnvml.DeviceGetPerformanceModes DeviceGetPerformanceState = libnvml.DeviceGetPerformanceState DeviceGetPersistenceMode = libnvml.DeviceGetPersistenceMode @@ -177,12 +181,14 @@ var ( DeviceGetPowerManagementLimit = libnvml.DeviceGetPowerManagementLimit DeviceGetPowerManagementLimitConstraints = libnvml.DeviceGetPowerManagementLimitConstraints DeviceGetPowerManagementMode = libnvml.DeviceGetPowerManagementMode + DeviceGetPowerMizerMode_v1 = libnvml.DeviceGetPowerMizerMode_v1 DeviceGetPowerSource = libnvml.DeviceGetPowerSource DeviceGetPowerState = libnvml.DeviceGetPowerState DeviceGetPowerUsage = libnvml.DeviceGetPowerUsage DeviceGetProcessUtilization = libnvml.DeviceGetProcessUtilization DeviceGetProcessesUtilizationInfo = libnvml.DeviceGetProcessesUtilizationInfo DeviceGetRemappedRows = libnvml.DeviceGetRemappedRows + DeviceGetRepairStatus = libnvml.DeviceGetRepairStatus DeviceGetRetiredPages = libnvml.DeviceGetRetiredPages DeviceGetRetiredPagesPendingStatus = libnvml.DeviceGetRetiredPagesPendingStatus DeviceGetRetiredPages_v2 = libnvml.DeviceGetRetiredPages_v2 @@ -191,6 +197,7 @@ var ( DeviceGetSamples = libnvml.DeviceGetSamples DeviceGetSerial = libnvml.DeviceGetSerial DeviceGetSramEccErrorStatus = libnvml.DeviceGetSramEccErrorStatus + DeviceGetSramUniqueUncorrectedEccErrorCounts = libnvml.DeviceGetSramUniqueUncorrectedEccErrorCounts DeviceGetSupportedClocksEventReasons = libnvml.DeviceGetSupportedClocksEventReasons DeviceGetSupportedClocksThrottleReasons = libnvml.DeviceGetSupportedClocksThrottleReasons DeviceGetSupportedEventTypes = libnvml.DeviceGetSupportedEventTypes @@ -231,6 +238,7 @@ var ( DevicePowerSmoothingSetState = libnvml.DevicePowerSmoothingSetState DevicePowerSmoothingUpdatePresetProfileParam = libnvml.DevicePowerSmoothingUpdatePresetProfileParam DeviceQueryDrainState = libnvml.DeviceQueryDrainState + DeviceReadWritePRM_v1 = libnvml.DeviceReadWritePRM_v1 DeviceRegisterEvents = libnvml.DeviceRegisterEvents DeviceRemoveGpu = libnvml.DeviceRemoveGpu DeviceRemoveGpu_v2 = libnvml.DeviceRemoveGpu_v2 @@ -409,6 +417,7 @@ type Interface interface { DeviceGetAccountingStats(Device, uint32) (AccountingStats, Return) DeviceGetActiveVgpus(Device) ([]VgpuInstance, Return) DeviceGetAdaptiveClockInfoStatus(Device) (uint32, Return) + DeviceGetAddressingMode(Device) (DeviceAddressingMode, Return) DeviceGetApplicationsClock(Device, ClockType) (uint32, Return) DeviceGetArchitecture(Device) (DeviceArchitecture, Return) DeviceGetAttributes(Device) (DeviceAttributes, Return) @@ -475,6 +484,7 @@ type Interface interface { DeviceGetGpuInstanceId(Device) (int, Return) DeviceGetGpuInstancePossiblePlacements(Device, *GpuInstanceProfileInfo) ([]GpuInstancePlacement, Return) DeviceGetGpuInstanceProfileInfo(Device, int) (GpuInstanceProfileInfo, Return) + DeviceGetGpuInstanceProfileInfoByIdV(Device, int) GpuInstanceProfileInfoByIdHandler DeviceGetGpuInstanceProfileInfoV(Device, int) GpuInstanceProfileInfoHandler DeviceGetGpuInstanceRemainingCapacity(Device, *GpuInstanceProfileInfo) (int, Return) DeviceGetGpuInstances(Device, *GpuInstanceProfileInfo) ([]GpuInstance, Return) @@ -524,6 +534,7 @@ type Interface interface { DeviceGetNumaNodeId(Device) (int, Return) DeviceGetNvLinkCapability(Device, int, NvLinkCapability) (uint32, Return) DeviceGetNvLinkErrorCounter(Device, int, NvLinkErrorCounter) (uint64, Return) + DeviceGetNvLinkInfo(Device) NvLinkInfoHandler DeviceGetNvLinkRemoteDeviceType(Device, int) (IntNvLinkDeviceType, Return) DeviceGetNvLinkRemotePciInfo(Device, int) (PciInfo, Return) DeviceGetNvLinkState(Device, int) (EnableState, Return) @@ -540,6 +551,7 @@ type Interface interface { DeviceGetPcieReplayCounter(Device) (int, Return) DeviceGetPcieSpeed(Device) (int, Return) DeviceGetPcieThroughput(Device, PcieUtilCounter) (uint32, Return) + DeviceGetPdi(Device) (Pdi, Return) DeviceGetPerformanceModes(Device) (DevicePerfModes, Return) DeviceGetPerformanceState(Device) (Pstates, Return) DeviceGetPersistenceMode(Device) (EnableState, Return) @@ -549,12 +561,14 @@ type Interface interface { DeviceGetPowerManagementLimit(Device) (uint32, Return) DeviceGetPowerManagementLimitConstraints(Device) (uint32, uint32, Return) DeviceGetPowerManagementMode(Device) (EnableState, Return) + DeviceGetPowerMizerMode_v1(Device) (DevicePowerMizerModes_v1, Return) DeviceGetPowerSource(Device) (PowerSource, Return) DeviceGetPowerState(Device) (Pstates, Return) DeviceGetPowerUsage(Device) (uint32, Return) DeviceGetProcessUtilization(Device, uint64) ([]ProcessUtilizationSample, Return) DeviceGetProcessesUtilizationInfo(Device) (ProcessesUtilizationInfo, Return) DeviceGetRemappedRows(Device) (int, int, bool, bool, Return) + DeviceGetRepairStatus(Device) (RepairStatus, Return) DeviceGetRetiredPages(Device, PageRetirementCause) ([]uint64, Return) DeviceGetRetiredPagesPendingStatus(Device) (EnableState, Return) DeviceGetRetiredPages_v2(Device, PageRetirementCause) ([]uint64, []uint64, Return) @@ -563,6 +577,7 @@ type Interface interface { DeviceGetSamples(Device, SamplingType, uint64) (ValueType, []Sample, Return) DeviceGetSerial(Device) (string, Return) DeviceGetSramEccErrorStatus(Device) (EccSramErrorStatus, Return) + DeviceGetSramUniqueUncorrectedEccErrorCounts(Device, *EccSramUniqueUncorrectedErrorCounts) Return DeviceGetSupportedClocksEventReasons(Device) (uint64, Return) DeviceGetSupportedClocksThrottleReasons(Device) (uint64, Return) DeviceGetSupportedEventTypes(Device) (uint64, Return) @@ -603,6 +618,7 @@ type Interface interface { DevicePowerSmoothingSetState(Device, *PowerSmoothingState) Return DevicePowerSmoothingUpdatePresetProfileParam(Device, *PowerSmoothingProfile) Return DeviceQueryDrainState(*PciInfo) (EnableState, Return) + DeviceReadWritePRM_v1(Device, *PRMTLV_v1) Return DeviceRegisterEvents(Device, uint64, EventSet) Return DeviceRemoveGpu(*PciInfo) Return DeviceRemoveGpu_v2(*PciInfo, DetachGpuState, PcieLinkState) Return @@ -778,6 +794,7 @@ type Device interface { GetAccountingStats(uint32) (AccountingStats, Return) GetActiveVgpus() ([]VgpuInstance, Return) GetAdaptiveClockInfoStatus() (uint32, Return) + GetAddressingMode() (DeviceAddressingMode, Return) GetApplicationsClock(ClockType) (uint32, Return) GetArchitecture() (DeviceArchitecture, Return) GetAttributes() (DeviceAttributes, Return) @@ -843,6 +860,7 @@ type Device interface { GetGpuInstanceId() (int, Return) GetGpuInstancePossiblePlacements(*GpuInstanceProfileInfo) ([]GpuInstancePlacement, Return) GetGpuInstanceProfileInfo(int) (GpuInstanceProfileInfo, Return) + GetGpuInstanceProfileInfoByIdV(int) GpuInstanceProfileInfoByIdHandler GetGpuInstanceProfileInfoV(int) GpuInstanceProfileInfoHandler GetGpuInstanceRemainingCapacity(*GpuInstanceProfileInfo) (int, Return) GetGpuInstances(*GpuInstanceProfileInfo) ([]GpuInstance, Return) @@ -887,6 +905,7 @@ type Device interface { GetNumaNodeId() (int, Return) GetNvLinkCapability(int, NvLinkCapability) (uint32, Return) GetNvLinkErrorCounter(int, NvLinkErrorCounter) (uint64, Return) + GetNvLinkInfo() NvLinkInfoHandler GetNvLinkRemoteDeviceType(int) (IntNvLinkDeviceType, Return) GetNvLinkRemotePciInfo(int) (PciInfo, Return) GetNvLinkState(int) (EnableState, Return) @@ -903,6 +922,7 @@ type Device interface { GetPcieReplayCounter() (int, Return) GetPcieSpeed() (int, Return) GetPcieThroughput(PcieUtilCounter) (uint32, Return) + GetPdi() (Pdi, Return) GetPerformanceModes() (DevicePerfModes, Return) GetPerformanceState() (Pstates, Return) GetPersistenceMode() (EnableState, Return) @@ -912,12 +932,14 @@ type Device interface { GetPowerManagementLimit() (uint32, Return) GetPowerManagementLimitConstraints() (uint32, uint32, Return) GetPowerManagementMode() (EnableState, Return) + GetPowerMizerMode_v1() (DevicePowerMizerModes_v1, Return) GetPowerSource() (PowerSource, Return) GetPowerState() (Pstates, Return) GetPowerUsage() (uint32, Return) GetProcessUtilization(uint64) ([]ProcessUtilizationSample, Return) GetProcessesUtilizationInfo() (ProcessesUtilizationInfo, Return) GetRemappedRows() (int, int, bool, bool, Return) + GetRepairStatus() (RepairStatus, Return) GetRetiredPages(PageRetirementCause) ([]uint64, Return) GetRetiredPagesPendingStatus() (EnableState, Return) GetRetiredPages_v2(PageRetirementCause) ([]uint64, []uint64, Return) @@ -926,6 +948,7 @@ type Device interface { GetSamples(SamplingType, uint64) (ValueType, []Sample, Return) GetSerial() (string, Return) GetSramEccErrorStatus() (EccSramErrorStatus, Return) + GetSramUniqueUncorrectedEccErrorCounts(*EccSramUniqueUncorrectedErrorCounts) Return GetSupportedClocksEventReasons() (uint64, Return) GetSupportedClocksThrottleReasons() (uint64, Return) GetSupportedEventTypes() (uint64, Return) @@ -970,6 +993,7 @@ type Device interface { PowerSmoothingActivatePresetProfile(*PowerSmoothingProfile) Return PowerSmoothingSetState(*PowerSmoothingState) Return PowerSmoothingUpdatePresetProfileParam(*PowerSmoothingProfile) Return + ReadWritePRM_v1(*PRMTLV_v1) Return RegisterEvents(uint64, EventSet) Return ResetApplicationsClocks() Return ResetGpuLockedClocks() Return diff --git a/vendor/modules.txt b/vendor/modules.txt index 5a94b1fe8..cbd53adcb 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -10,7 +10,7 @@ github.com/NVIDIA/go-nvlib/pkg/nvpci github.com/NVIDIA/go-nvlib/pkg/nvpci/bytes github.com/NVIDIA/go-nvlib/pkg/nvpci/mmio github.com/NVIDIA/go-nvlib/pkg/pciids -# github.com/NVIDIA/go-nvml v0.12.9-0 +# github.com/NVIDIA/go-nvml v0.13.0-0 ## explicit; go 1.20 github.com/NVIDIA/go-nvml/pkg/dl github.com/NVIDIA/go-nvml/pkg/nvml