Skip to content

Commit 2426e16

Browse files
committed
chore: address review comments
Signed-off-by: Ajay Mishra <[email protected]>
1 parent 78d96b5 commit 2426e16

File tree

4 files changed

+74
-0
lines changed

4 files changed

+74
-0
lines changed

.github/workflows/publish.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,9 @@ jobs:
106106
- component: syslog-health-monitor
107107
make_command: 'make -C health-monitors/syslog-health-monitor docker-publish'
108108
container_name: 'nvsentinel/syslog-health-monitor'
109+
- component: kubernetes-object-monitor
110+
make_command: 'make -C health-monitors/kubernetes-object-monitor docker-publish'
111+
container_name: 'nvsentinel/kubernetes-object-monitor'
109112
- component: metadata-collector
110113
make_command: 'make -C metadata-collector docker-publish'
111114
container_name: 'nvsentinel/metadata-collector'

distros/kubernetes/nvsentinel/charts/kubernetes-object-monitor/templates/clusterrole.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,18 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
{{- /*
Build $resourcesByGroup: a map from API group to the de-duplicated list of
plural resource names referenced by .Values.policies. The map is consumed by
the rule-generation loop further down in this template.

RBAC rules match on the plural resource name, which is not part of a policy
entry, so it is guessed from the lowercased kind:
  - ends in s, x, ch or sh  -> append "es"        (Ingress -> ingresses)
  - ends in consonant + y   -> replace y by "ies" (NetworkPolicy -> networkpolicies)
  - anything else           -> append "s"         (GPUJob -> gpujobs)
This is a heuristic: irregular plurals and kinds that are already plural are
not handled.
*/ -}}
{{- $resourcesByGroup := dict -}}
{{- range .Values.policies -}}
  {{- $group := .resource.group | default "" -}}
  {{- $kind := .resource.kind | lower -}}
  {{- /* Default plural form; overridden below for the "es" / "ies" cases. */ -}}
  {{- $resource := printf "%ss" $kind -}}
  {{- if regexMatch "(s|x|ch|sh)$" $kind -}}
    {{- $resource = printf "%ses" $kind -}}
  {{- else if regexMatch "[^aeiou]y$" $kind -}}
    {{- $resource = printf "%sies" (trimSuffix "y" $kind) -}}
  {{- end -}}
  {{- /* A group seen for the first time has no entry yet; start from an empty list. */ -}}
  {{- $existing := index $resourcesByGroup $group | default list -}}
  {{- if not (has $resource $existing) -}}
    {{- $_ := set $resourcesByGroup $group (append $existing $resource) -}}
  {{- end -}}
{{- end -}}
26+
1527
apiVersion: rbac.authorization.k8s.io/v1
1628
kind: ClusterRole
1729
metadata:
@@ -29,3 +41,18 @@ rules:
2941
- watch
3042
- patch
3143
- update
44+
45+
{{- /* Emit one read-only rule (get/list/watch) for each API group collected in $resourcesByGroup; entries under the empty ("") group are skipped by the check below. */}}
{{- range $group, $resources := $resourcesByGroup }}
46+
{{- if ne $group "" }}
47+
- apiGroups:
48+
- {{ $group | quote }}
49+
resources:
50+
{{- range $resources }}
51+
- {{ . }}
52+
{{- end }}
53+
verbs:
54+
- get
55+
- list
56+
- watch
57+
{{- end }}
58+
{{- end }}

distros/kubernetes/nvsentinel/charts/kubernetes-object-monitor/values.yaml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ podAnnotations: {}
3030
maxConcurrentReconciles: 1
3131
resyncPeriod: 5m
3232

33+
# RBAC permissions are automatically generated based on the resources defined in policies.
34+
# Nodes always get write permissions (patch/update) for annotations.
35+
# All other resources in a named (non-core) API group get read-only permissions (get/list/watch).
3336
policies:
3437
- name: node-not-ready
3538
enabled: true
@@ -48,6 +51,45 @@ policies:
4851
errorCode:
4952
- NODE_NOT_READY
5053

54+
# Example: Monitor a custom resource (e.g., a GPU Job)
55+
# Uncomment and modify to monitor your own custom resources
56+
#
57+
# - name: gpu-job-failed
58+
# enabled: true
59+
# resource:
60+
# group: batch.example.com
61+
# version: v1alpha1
62+
# kind: GPUJob
63+
# predicate:
64+
# # CEL expression to detect unhealthy state
65+
# # Access the resource via 'resource' variable
66+
# expression: |
67+
# has(resource.status.state) && resource.status.state == "Failed"
68+
# nodeAssociation:
69+
# # Optional: CEL expression to map this resource to a specific node
70+
# # This is useful for resources that run on specific nodes
71+
#
72+
# # Example 1: Direct node reference
73+
# expression: resource.spec.nodeName
74+
#
75+
# # Example 2: Use lookup() to find the node from a related Pod
76+
# # Useful when monitoring resources that reference other resources
77+
# # expression: |
78+
# # lookup('v1', 'Pod', resource.metadata.namespace, resource.spec.podName).spec.nodeName
79+
#
80+
# # Example 3: Chain lookup() calls to traverse multiple resources
81+
# # expression: |
82+
# # lookup('v1', 'Node', '',
83+
# # lookup('v1', 'Pod', resource.metadata.namespace, resource.status.podName).spec.nodeName
84+
# # ).metadata.name
85+
# healthEvent:
86+
# componentClass: GPU
87+
# isFatal: false
88+
# message: "GPU job failed on node"
89+
# recommendedAction: CONTACT_SUPPORT
90+
# errorCode:
91+
# - GPU_JOB_FAILED
92+
5193
resources:
5294
requests:
5395
cpu: 100m

distros/kubernetes/nvsentinel/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ global:
5454
cleanupMetricsPort: 9002
5555
mongodbStore:
5656
enabled: false
57+
kubernetesObjectMonitor:
58+
enabled: false
5759

5860
platformConnector:
5961
image:

0 commit comments

Comments
 (0)