Skip to content

Commit f8d41f7

Browse files
authored
feat: Integrate GPU health monitor with metadata collector (#290)
Signed-off-by: Ajay Mishra <[email protected]>
1 parent 79b0584 commit f8d41f7

File tree

11 files changed

+629
-18
lines changed

11 files changed

+629
-18
lines changed

distros/kubernetes/nvsentinel/charts/gpu-health-monitor/templates/daemonset-dcgm-3.x.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ spec:
5656
- "/var/run/statefile"
5757
- --dcgm-k8s-service-enabled
5858
- {{ .Values.dcgm.dcgmK8sServiceEnabled | quote }}
59+
- --metadata-path
60+
- {{ .Values.global.metadataPath | quote }}
5961
securityContext:
6062
runAsUser: 0
6163
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default ((.Values.global).image).tag | default .Chart.AppVersion }}-dcgm-3.x"
@@ -89,6 +91,9 @@ spec:
8991
mountPath: /etc/dcgmhealth
9092
- name: var-run-vol
9193
mountPath: /var/run/
94+
- name: gpu-metadata
95+
mountPath: /var/lib/nvsentinel
96+
readOnly: true
9297
env:
9398
- name: NODE_NAME
9499
valueFrom:
@@ -104,6 +109,10 @@ spec:
104109
hostPath:
105110
path: /var/run/nvsentinel
106111
type: Directory
112+
- name: gpu-metadata
113+
hostPath:
114+
path: /var/lib/nvsentinel
115+
type: DirectoryOrCreate
107116
{{- if .Values.additionalHostVolumes }}
108117
{{ toYaml .Values.additionalHostVolumes | nindent 8 }}
109118
{{- end }}

distros/kubernetes/nvsentinel/charts/gpu-health-monitor/templates/daemonset-dcgm-4.x.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ spec:
5656
- "/var/run/statefile"
5757
- --dcgm-k8s-service-enabled
5858
- {{ .Values.dcgm.dcgmK8sServiceEnabled | quote }}
59+
- --metadata-path
60+
- {{ .Values.global.metadataPath | quote }}
5961
securityContext:
6062
runAsUser: 0
6163
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default ((.Values.global).image).tag | default .Chart.AppVersion }}-dcgm-4.x"
@@ -89,6 +91,9 @@ spec:
8991
mountPath: /etc/dcgmhealth
9092
- name: var-run-vol
9193
mountPath: /var/run/
94+
- name: gpu-metadata
95+
mountPath: /var/lib/nvsentinel
96+
readOnly: true
9297
env:
9398
- name: NODE_NAME
9499
valueFrom:
@@ -104,6 +109,10 @@ spec:
104109
hostPath:
105110
path: /var/run/nvsentinel
106111
type: Directory
112+
- name: gpu-metadata
113+
hostPath:
114+
path: /var/lib/nvsentinel
115+
type: DirectoryOrCreate
107116
{{- if .Values.additionalHostVolumes }}
108117
{{ toYaml .Values.additionalHostVolumes | nindent 8 }}
109118
{{- end }}

health-monitors/gpu-health-monitor/gpu_health_monitor/cli.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ def _init_event_processor(
3131
dcgm_errors_info_dict: dict[str, str],
3232
state_file_path: str,
3333
dcgm_health_conditions_categorization_mapping_config: dict[str, str],
34+
metadata_path: str,
3435
):
3536
platform_connector_config = config["eventprocessors.platformconnector"]
3637
match event_processor_name:
@@ -42,6 +43,7 @@ def _init_event_processor(
4243
dcgm_errors_info_dict=dcgm_errors_info_dict,
4344
state_file_path=state_file_path,
4445
dcgm_health_conditions_categorization_mapping_config=dcgm_health_conditions_categorization_mapping_config,
46+
metadata_path=metadata_path,
4547
)
4648
case _:
4749
log.fatal(f"Unknown event processor {event_processor_name}")
@@ -58,6 +60,13 @@ def _init_event_processor(
5860
@click.option("--verbose", type=bool, default=False, help="Enable debug logging", required=False)
5961
@click.option("--state-file", type=click.Path(), help="gpu health monitor state file path", required=True)
6062
@click.option("--dcgm-k8s-service-enabled", type=bool, help="Is DCGM K8s service Enabled", required=True)
63+
@click.option(
64+
"--metadata-path",
65+
type=click.Path(),
66+
default="/var/lib/nvsentinel/gpu_metadata.json",
67+
help="Path to GPU metadata JSON file",
68+
required=False,
69+
)
6170
def cli(
6271
dcgm_addr,
6372
dcgm_error_mapping_config_file,
@@ -66,6 +75,7 @@ def cli(
6675
verbose,
6776
state_file,
6877
dcgm_k8s_service_enabled,
78+
metadata_path,
6979
):
7080
exit = Event()
7181
config = configparser.ConfigParser()
@@ -112,6 +122,7 @@ def cli(
112122
dcgm_errors_info_dict,
113123
state_file_path,
114124
dcgm_health_conditions_categorization_mapping_config,
125+
metadata_path,
115126
)
116127
)
117128

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from .reader import MetadataReader
16+
17+
__all__ = ["MetadataReader"]
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import json
16+
import logging as log
17+
import threading
18+
from typing import Optional
19+
20+
21+
class MetadataReader:
    """Lazy-loading, thread-safe GPU metadata reader.

    Reads GPU metadata from a JSON file and exposes per-GPU UUID and PCI
    address lookups plus the chassis serial. The file is read at most once,
    on first access; a missing or malformed file is logged and treated as
    "no metadata", so every getter returns ``None`` instead of raising.

    The expected document is a JSON object with an optional ``"gpus"`` list
    (objects carrying ``"gpu_id"``, ``"uuid"``, ``"pci_address"``) and an
    optional ``"chassis_serial"`` value.
    """

    def __init__(self, metadata_path: str):
        """Initialize the metadata reader.

        Args:
            metadata_path: Path to the GPU metadata JSON file. The file is
                not opened here; it is read on the first getter call.
        """
        self._path = metadata_path
        self._metadata = None
        self._lock = threading.RLock()
        self._loaded = False

    def _ensure_loaded(self):
        """Load metadata on first use.

        The ``_loaded`` flag is checked once without the lock (fast path) and
        again while holding it, so only one thread reads the file. The parsed
        document is validated into a local variable and published to
        ``self._metadata`` *before* ``_loaded`` is set, so a thread taking the
        fast path never observes partially validated data.
        """
        if self._loaded:
            return

        with self._lock:
            if self._loaded:
                return

            # Fall back to an empty mapping on any failure so the getters
            # degrade to "no enrichment" rather than raising.
            metadata = {}
            try:
                with open(self._path, "r", encoding="utf-8") as f:
                    parsed = json.load(f)
                if not isinstance(parsed, dict):
                    raise ValueError(f"expected a JSON object at top level, got {type(parsed).__name__}")
                gpu_count = len(parsed.get("gpus", []))
                chassis = parsed.get("chassis_serial")
                metadata = parsed
                log.info(
                    f"GPU metadata loaded from {self._path}: "
                    f"{gpu_count} GPUs, chassis_serial={'present' if chassis else 'absent'}"
                )
            except FileNotFoundError:
                log.warning(f"Metadata file not found: {self._path}, continuing without metadata enrichment")
            except Exception as e:
                # Handles JSON decode errors, permission errors, wrong shape, etc.
                log.error(f"Error loading metadata from {self._path}: {e}")

            self._metadata = metadata
            self._loaded = True

    def _find_gpu(self, gpu_id: int) -> Optional[dict]:
        """Return the metadata entry whose ``gpu_id`` equals *gpu_id*, or None.

        Entries in ``"gpus"`` that are not JSON objects are skipped instead of
        raising, so one malformed entry does not break lookups for the rest.
        """
        self._ensure_loaded()

        if not self._metadata:
            return None

        for gpu in self._metadata.get("gpus", []):
            if isinstance(gpu, dict) and gpu.get("gpu_id") == gpu_id:
                return gpu

        log.debug(f"GPU {gpu_id} not found in metadata")
        return None

    def get_gpu_uuid(self, gpu_id: int) -> Optional[str]:
        """Get GPU UUID by DCGM GPU ID.

        Args:
            gpu_id: The DCGM GPU ID (0, 1, 2, ...).

        Returns:
            The GPU UUID string if found, None otherwise.
        """
        gpu = self._find_gpu(gpu_id)
        if gpu is None:
            return None

        uuid = gpu.get("uuid")
        if uuid:
            log.debug(f"Found GPU UUID for GPU {gpu_id}: {uuid}")
            return uuid

        log.warning(f"GPU {gpu_id} found in metadata but has no UUID")
        return None

    def get_pci_address(self, gpu_id: int) -> Optional[str]:
        """Get PCI address by DCGM GPU ID.

        Args:
            gpu_id: The DCGM GPU ID (0, 1, 2, ...).

        Returns:
            The PCI address string if found, None otherwise.
        """
        gpu = self._find_gpu(gpu_id)
        if gpu is None:
            return None

        pci_address = gpu.get("pci_address")
        if pci_address:
            log.debug(f"Found PCI address for GPU {gpu_id}: {pci_address}")
            return pci_address

        log.warning(f"GPU {gpu_id} found in metadata but has no PCI address")
        return None

    def get_chassis_serial(self) -> Optional[str]:
        """Get chassis serial number.

        Returns:
            The chassis serial number if available, None otherwise.
        """
        self._ensure_loaded()

        if not self._metadata:
            return None

        chassis_serial = self._metadata.get("chassis_serial")
        if chassis_serial:
            log.debug(f"Found chassis serial: {chassis_serial}")
        else:
            log.debug("No chassis serial in metadata")

        return chassis_serial

0 commit comments

Comments
 (0)