Skip to content

Commit 63e6e44

Browse files
committed
Replace DCGM_FI_PROF_GR_ENGINE_ACTIVE with DCGM_FI_DEV_GPU_UTIL
* Profiling metrics DCGM_FI_PROF_* aren't available on pre-Volta GPUs
* Consolidate the default metrics into the configmap so that installing the console plugin doesn't break other tools
1 parent eb3e36f commit 63e6e44

File tree

6 files changed

+58
-19
lines changed

6 files changed

+58
-19
lines changed

deployment/console-plugin-nvidia-gpu/templates/configmap.yaml

Lines changed: 48 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,54 @@ metadata:
66
{{- include "console-plugin-nvidia-gpu.labels" . | nindent 4 }}
77
data:
88
dcgm-metrics.csv: |
9-
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, gpu utilization.
10-
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, mem utilization.
11-
DCGM_FI_DEV_ENC_UTIL, gauge, enc utilization.
12-
DCGM_FI_DEV_DEC_UTIL, gauge, dec utilization.
13-
DCGM_FI_DEV_POWER_USAGE, gauge, power usage.
9+
# === Added by the console plugin ===
1410
DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX, gauge, power mgmt limit.
15-
DCGM_FI_DEV_GPU_TEMP, gauge, gpu temp.
16-
DCGM_FI_DEV_SM_CLOCK, gauge, sm clock.
1711
DCGM_FI_DEV_MAX_SM_CLOCK, gauge, max sm clock.
18-
DCGM_FI_DEV_MEM_CLOCK, gauge, mem clock.
1912
DCGM_FI_DEV_MAX_MEM_CLOCK, gauge, max mem clock.
13+
14+
# === Available by default ===
15+
# Clocks
16+
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
17+
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
18+
19+
# Temperature
20+
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
21+
DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).
22+
23+
# Power
24+
DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
25+
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
26+
27+
# PCIE
28+
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
29+
30+
# Utilization (the sample period varies depending on the product)
31+
DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
32+
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
33+
DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
34+
DCGM_FI_DEV_DEC_UTIL, gauge, Decoder utilization (in %).
35+
36+
# Errors and violations
37+
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
38+
39+
# Memory usage
40+
DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
41+
DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
42+
43+
# NVLink
44+
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes.
45+
46+
# VGPU License status
47+
DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
48+
49+
# Remapped rows
50+
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
51+
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
52+
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed
53+
54+
# DCP metrics
55+
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active.
56+
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active.
57+
DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data.
58+
DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload.
59+
DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload.

src/components/GPUDashboard/Cards/GPUDashboardGraphs.tsx

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import {
77
humanizeDegrees,
88
humanizeHertz,
99
humanizePercentage,
10-
humanizeRatio,
1110
humanizeWatts,
1211
} from '../../../utils/units';
1312
import { useTranslation } from '../../../i18n';
@@ -17,7 +16,7 @@ import { useTranslation } from '../../../i18n';
1716
//
1817

1918
/*
20-
these are ok:
19+
these are ok:
2120
DCGM_FI_DEV_GPU_UTIL, gauge, gpu utilization.
2221
DCGM_FI_DEV_POWER_USAGE, gauge, power usage.
2322
DCGM_FI_DEV_GPU_TEMP, gauge, gpu temp.
@@ -32,7 +31,7 @@ import { useTranslation } from '../../../i18n';
3231
*/
3332

3433
/* Used metrics
35-
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, gpu utilization.
34+
DCGM_FI_DEV_GPU_UTIL, gauge, gpu utilization.
3635
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, mem utilization.
3736
DCGM_FI_DEV_ENC_UTIL, gauge, enc utilization.
3837
DCGM_FI_DEV_DEC_UTIL, gauge, dec utilization.
@@ -57,9 +56,9 @@ export const GPUDashboardGraphs: React.FC = () => {
5756
ariaTitle={t('Donut GPU utilization')}
5857
ariaRangeTitle={t('GPU utilization over time')}
5958
ariaDesc={t('Sparkline GPU utilization')}
60-
query={`sum(DCGM_FI_PROF_GR_ENGINE_ACTIVE{UUID="${selectedGPU?.uuid}"})`}
61-
maxDomain={1}
62-
humanize={humanizeRatio}
59+
query={`sum(DCGM_FI_DEV_GPU_UTIL{UUID="${selectedGPU?.uuid}"})`}
60+
maxDomain={100}
61+
humanize={humanizePercentage}
6362
/>
6463
</GridItem>
6564
<GridItem span={6} lg={3}>

src/components/GPUDashboard/Cards/WorkloadsCard.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ const WorkloadsCard: React.FC = () => {
127127
const [gpuMemoryMetrics, gpuMetricsLoaded, gpuMetricsError] = usePrometheusPoll({
128128
endpoint: PrometheusEndpoint.QUERY_RANGE,
129129
query:
130-
'sum (DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod=~".+"}) by (exported_namespace, exported_pod, UUID)',
130+
'sum (DCGM_FI_DEV_GPU_UTIL{exported_pod=~".+"}) by (exported_namespace, exported_pod, UUID)',
131131
timespan: ONE_DAY,
132132
});
133133

src/hooks/use-gpus-info.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ export type GPUInfo = {
2020
export const useGPUsInfo = (): [GPUInfo[], /* loaded */ boolean, /* error */ unknown] => {
2121
const [result, loaded, error] = usePrometheusPoll({
2222
endpoint: PrometheusEndpoint.QUERY,
23-
query: 'DCGM_FI_PROF_GR_ENGINE_ACTIVE',
23+
query: 'DCGM_FI_DEV_GPU_UTIL',
2424
});
2525

2626
const gpus = useDeepCompareMemoize(

src/utils/cluster-overview.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import { GetQuery } from '@openshift-console/dynamic-plugin-sdk';
22

33
export const getGPUUtilizationQuery: GetQuery = () =>
4-
'count(count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod=~".+"})) or vector(0)';
4+
'count(count by (UUID,GPU_I_ID) (DCGM_FI_DEV_GPU_UTIL{exported_pod=~".+"})) or vector(0)';
55
export const getGPUTotalUtilizationQuery: GetQuery = () =>
6-
'count(count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE)) or vector(0)';
6+
'count(count by (UUID,GPU_I_ID) (DCGM_FI_DEV_GPU_UTIL)) or vector(0)';
77

88
export const getPowerUsageUtilizationQuery: GetQuery = () =>
99
'sum(max by (UUID) (DCGM_FI_DEV_POWER_USAGE))';

src/utils/project-overview.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
export const getGPUPodsQuery = (project: string) =>
2-
`count((kube_pod_status_phase > 0) * on(pod) group_left(gpu,device,instance,modelName) label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod=~".+", exported_namespace="${project}"}, "pod", "$1", "exported_pod", "(.*)"))`;
2+
`count((kube_pod_status_phase > 0) * on(pod) group_left(gpu,device,instance,modelName) label_replace(DCGM_FI_DEV_GPU_UTIL{exported_pod=~".+", exported_namespace="${project}"}, "pod", "$1", "exported_pod", "(.*)"))`;

0 commit comments

Comments
 (0)