Skip to content

Commit 85c726b

Browse files
authored
feat: make fault remediation use generic maintenance resources (#257)
1 parent 342c083 commit 85c726b

File tree

17 files changed

+301
-794
lines changed

17 files changed

+301
-794
lines changed

distros/kubernetes/nvsentinel/charts/fault-remediation/templates/configmap.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ data:
2424
namespace = {{ .Values.maintenance.namespace | quote }}
2525
version = {{ .Values.maintenance.version | quote }}
2626
apiGroup = {{ .Values.maintenance.apiGroup | quote }}
27+
kind = {{ .Values.maintenance.kind | quote }}
28+
completeConditionType = {{ .Values.maintenance.completeConditionType | quote }}
2729
2830
[template]
2931
mountPath = "/etc/config"

distros/kubernetes/nvsentinel/charts/fault-remediation/values.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,14 @@ maintenance:
4848
apiGroup: "janitor.dgxc.nvidia.com"
4949
# API version of the maintenance CRD
5050
version: "v1alpha1"
51+
# Kind of the maintenance CRD
52+
kind: "RebootNode"
53+
# completeConditionType specifies which status condition to check for completion of maintenance.
54+
# If multiple faults occur on the same node in close proximity, the condition is checked to
55+
# prevent creation of duplicate maintenance requests for the same operation.
56+
# If the status is True, the maintenance is considered completed and a new CR should be created.
57+
# If the status is False, the maintenance is considered failed and a new CR can be created.
58+
completeConditionType: "NodeReady"
5159
# Kubernetes namespace where maintenance resources will be created
5260
namespace: "nvsentinel"
5361
# Names of maintenance resource types used by the janitor controller

fault-remediation/pkg/config/config.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,11 @@ package config
1616

1717
// MaintenanceResource holds configuration for the maintenance custom resource
1818
type MaintenanceResource struct {
19-
Namespace string `toml:"namespace"`
20-
Version string `toml:"version"`
21-
ApiGroup string `toml:"apiGroup"`
19+
Namespace string `toml:"namespace"`
20+
Version string `toml:"version"`
21+
ApiGroup string `toml:"apiGroup"`
22+
Kind string `toml:"kind"`
23+
CompleteConditionType string `toml:"completeConditionType"`
2224
}
2325

2426
// Template holds configuration for template files
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package crstatus
16+
17+
import (
18+
"context"
19+
"log/slog"
20+
21+
"github.com/nvidia/nvsentinel/fault-remediation/pkg/config"
22+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
23+
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
24+
"k8s.io/apimachinery/pkg/runtime/schema"
25+
"k8s.io/client-go/dynamic"
26+
"k8s.io/client-go/restmapper"
27+
)
28+
29+
// CRStatusChecker inspects an existing maintenance custom resource and reports
// whether creating another one should be skipped, based on the status condition
// named by config.CompleteConditionType.
type CRStatusChecker struct {
30+
// dynamicClient is used to fetch the maintenance CR as an unstructured object.
dynamicClient dynamic.Interface
31+
// restMapper resolves the configured group/version/kind to a REST resource.
restMapper *restmapper.DeferredDiscoveryRESTMapper
32+
// config supplies the API group, version, kind and completion condition type.
config *config.MaintenanceResource
33+
// dryRun short-circuits ShouldSkipCRCreation so that no API calls are made.
dryRun bool
34+
}
35+
36+
func NewCRStatusChecker(
37+
dynamicClient dynamic.Interface,
38+
restMapper *restmapper.DeferredDiscoveryRESTMapper,
39+
cfg *config.MaintenanceResource,
40+
dryRun bool,
41+
) *CRStatusChecker {
42+
return &CRStatusChecker{
43+
dynamicClient: dynamicClient,
44+
restMapper: restMapper,
45+
config: cfg,
46+
dryRun: dryRun,
47+
}
48+
}
49+
50+
func (c *CRStatusChecker) ShouldSkipCRCreation(ctx context.Context, crName string) bool {
51+
if c.dryRun {
52+
slog.Info("DRY-RUN: CR doesn't exist (dry-run mode)", "crName", crName)
53+
return false
54+
}
55+
56+
gvk := schema.GroupVersionKind{
57+
Group: c.config.ApiGroup,
58+
Version: c.config.Version,
59+
Kind: c.config.Kind,
60+
}
61+
62+
mapping, err := c.restMapper.RESTMapping(gvk.GroupKind(), gvk.Version)
63+
if err != nil {
64+
slog.Error("Failed to get REST mapping", "gvk", gvk.String(), "error", err)
65+
return false
66+
}
67+
68+
resource, err := c.dynamicClient.Resource(mapping.Resource).Get(ctx, crName, metav1.GetOptions{})
69+
if err != nil {
70+
slog.Warn("Failed to get CR, allowing create", "crName", crName, "error", err)
71+
return false
72+
}
73+
74+
return c.checkCondition(resource)
75+
}
76+
77+
func (c *CRStatusChecker) checkCondition(obj *unstructured.Unstructured) bool {
78+
status, found, err := unstructured.NestedMap(obj.Object, "status")
79+
if err != nil || !found {
80+
return true
81+
}
82+
83+
conditions, found, err := unstructured.NestedSlice(status, "conditions")
84+
if err != nil || !found {
85+
return true
86+
}
87+
88+
conditionStatus := c.findConditionStatus(conditions)
89+
90+
return !c.isTerminal(conditionStatus)
91+
}
92+
93+
func (c *CRStatusChecker) findConditionStatus(conditions []any) string {
94+
for _, cond := range conditions {
95+
condition, ok := cond.(map[string]interface{})
96+
if !ok {
97+
continue
98+
}
99+
100+
condType, _ := condition["type"].(string)
101+
if condType == c.config.CompleteConditionType {
102+
condStatus, _ := condition["status"].(string)
103+
return condStatus
104+
}
105+
}
106+
107+
return ""
108+
}
109+
110+
func (c *CRStatusChecker) isTerminal(conditionStatus string) bool {
111+
return conditionStatus == "True" || conditionStatus == "False"
112+
}

0 commit comments

Comments
 (0)