Skip to content

Commit f0524b2

Browse files
authored
[Feature]: Janitor cleanup (retries, backoff, csp pkg, logging) (#247)
Signed-off-by: Mark Chmarny <[email protected]>
1 parent e4a130a commit f0524b2

35 files changed

+2260
-226
lines changed

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,11 @@
1111
- [ ] 🔨 Build/CI
1212

1313
## Component(s) Affected
14-
- [ ] Health Monitors
1514
- [ ] Core Services
16-
- [ ] Fault Management
1715
- [ ] Documentation/CI
16+
- [ ] Fault Management
17+
- [ ] Health Monitors
18+
- [ ] Janitor
1819
- [ ] Other: ____________
1920

2021
## Testing

.github/copilot-instructions.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ NVSentinel is a GPU Node Resilience System for Kubernetes that automatically det
99
## Architecture & Technologies
1010

1111
### Core Technologies
12-
- **Language**: Go 1.25+ (primary), Python 3.12+ (monitoring tools)
12+
- **Language**: Go 1.25+ (primary), Python 3.10+ (monitoring tools)
1313
- **Container Platform**: Kubernetes 1.25+
1414
- **Deployment**: Helm 3.0+, Tilt (development)
1515
- **Storage**: MongoDB (event store with change streams)

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -665,7 +665,7 @@ tilt-ci: ## Run Tilt in CI mode (no UI, waits for all resources)
665665
echo "Error: tilt is not installed. Please install from https://tilt.dev/"; \
666666
exit 1; \
667667
fi
668-
tilt ci -f tilt/Tiltfile
668+
tilt ci -f tilt/Tiltfile --timeout=10m
669669
@echo "Waiting for all deployments to be ready..."
670670
@kubectl get deployments --all-namespaces --no-headers -o custom-columns=":metadata.namespace,:metadata.name" | while read ns name; do \
671671
echo "Waiting for deployment $$name in namespace $$ns..."; \

distros/kubernetes/nvsentinel/charts/janitor/crds/janitor.dgxc.nvidia.com_gpuresets.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
1514
---
1615
apiVersion: apiextensions.k8s.io/v1
1716
kind: CustomResourceDefinition

distros/kubernetes/nvsentinel/charts/janitor/crds/janitor.dgxc.nvidia.com_rebootnodes.yaml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
1514
---
1615
apiVersion: apiextensions.k8s.io/v1
1716
kind: CustomResourceDefinition
@@ -142,6 +141,18 @@ spec:
142141
- type
143142
type: object
144143
type: array
144+
consecutiveFailures:
145+
description: |-
146+
ConsecutiveFailures tracks consecutive CSP operation failures for exponential backoff
147+
Reset to 0 on successful operations
148+
format: int32
149+
type: integer
150+
retryCount:
151+
description: |-
152+
RetryCount tracks the number of reconciliation attempts for this reboot operation
153+
Used to implement maximum retry limits to prevent indefinite reconciliation
154+
format: int32
155+
type: integer
145156
startTime:
146157
description: StartTime is the time when the reboot was initiated
147158
format: date-time

distros/kubernetes/nvsentinel/charts/janitor/crds/janitor.dgxc.nvidia.com_terminatenodes.yaml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
1514
---
1615
apiVersion: apiextensions.k8s.io/v1
1716
kind: CustomResourceDefinition
@@ -142,6 +141,18 @@ spec:
142141
- type
143142
type: object
144143
type: array
144+
consecutiveFailures:
145+
description: |-
146+
ConsecutiveFailures tracks consecutive CSP operation failures for exponential backoff
147+
Reset to 0 on successful operations
148+
format: int32
149+
type: integer
150+
retryCount:
151+
description: |-
152+
RetryCount tracks the number of reconciliation attempts for this terminate operation
153+
Used to implement maximum retry limits to prevent indefinite reconciliation
154+
format: int32
155+
type: integer
145156
startTime:
146157
description: StartTime is the time when the termination was initiated
147158
format: date-time

distros/kubernetes/nvsentinel/charts/labeler/templates/clusterrole.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,5 +34,6 @@ rules:
3434
verbs:
3535
- get
3636
- list
37+
- watch
3738
- patch
3839
- update

janitor/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ generate: ## Generate CRDs and move them to Helm chart directory
6060
@# Install controller-gen if not present
6161
@which controller-gen > /dev/null || (echo "Installing controller-gen..." && go install sigs.k8s.io/controller-tools/cmd/controller-gen@latest)
6262
@# Generate CRDs into api/v1alpha1 directory
63+
@# Note: Generated CRD YAML files do not include license headers (this is expected)
64+
@# YAML files are excluded from license header checks via main Makefile: -ignore '**/*.yaml'
6365
@controller-gen crd paths=./$(API_DIR) output:crd:dir=./$(API_DIR)
6466
@# Move generated CRDs to Helm chart crds directory
6567
@echo "Moving generated CRDs to $(CRD_OUTPUT_DIR)..."

janitor/api/v1alpha1/gpureset_types.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,14 @@ import (
2525
type GPUSelector struct {
2626
// UUIDs is a list of GPU UUIDs.
2727
// +optional
28-
// nolint:lll
28+
//nolint:lll // kubebuilder validation pattern
2929
// +kubebuilder:validation:items:Pattern="^GPU-[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
3030
UUIDs []string `json:"uuids,omitempty"`
3131

3232
// PCIBusIDs is a list of GPU PCI bus IDs.
3333
// Format: "domain:bus:device.function" (e.g., "0000:01:00.0").
3434
// +optional
35-
// nolint:lll
35+
//nolint:lll // kubebuilder validation pattern
3636
// +kubebuilder:validation:items:Pattern="^[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\\.[0-9a-fA-F]{1}$"
3737
PCIBusIDs []string `json:"pciBusIDs,omitempty"`
3838
}
@@ -71,7 +71,7 @@ const (
7171
// +kubebuilder:object:root=true
7272
// +kubebuilder:resource:scope=Cluster
7373
// +kubebuilder:subresource:status
74-
// nolint:lll
74+
//nolint:lll // kubebuilder printcolumn marker
7575
// +kubebuilder:printcolumn:name="Node",type="string",JSONPath=".spec.nodeName",description="The target node for the GPU reset"
7676
// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
7777

janitor/api/v1alpha1/rebootnode_types.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,14 @@ type RebootNodeStatus struct {
4949
// CompletionTime is the time when the reboot was completed
5050
CompletionTime *metav1.Time `json:"completionTime,omitempty"`
5151

52+
// RetryCount tracks the number of reconciliation attempts for this reboot operation.
53+
// It is used to enforce a maximum retry limit and prevent indefinite reconciliation.
54+
RetryCount int32 `json:"retryCount,omitempty"`
55+
56+
// ConsecutiveFailures tracks consecutive CSP operation failures for exponential backoff.
57+
// It is reset to 0 on successful operations.
58+
ConsecutiveFailures int32 `json:"consecutiveFailures,omitempty"`
59+
5260
// Conditions represent the latest available observations of an object's current state.
5361
Conditions []metav1.Condition `json:"conditions,omitempty"`
5462
}
@@ -184,6 +192,33 @@ func (r *RebootNode) SetCompletionTime() {
184192
}
185193
}
186194

195+
// Accessor methods that implement the interface used for generic status update handling.
196+
197+
// GetRetryCount returns the number of reconciliation attempts recorded in the status.
198+
func (s *RebootNodeStatus) GetRetryCount() int32 {
199+
return s.RetryCount
200+
}
201+
202+
// GetConsecutiveFailures returns the number of consecutive failures recorded in the status.
203+
func (s *RebootNodeStatus) GetConsecutiveFailures() int32 {
204+
return s.ConsecutiveFailures
205+
}
206+
207+
// GetStartTime returns the recorded start time; the pointer is nil when no start time is set.
208+
func (s *RebootNodeStatus) GetStartTime() *metav1.Time {
209+
return s.StartTime
210+
}
211+
212+
// GetCompletionTime returns the recorded completion time; the pointer is nil when no completion time is set.
213+
func (s *RebootNodeStatus) GetCompletionTime() *metav1.Time {
214+
return s.CompletionTime
215+
}
216+
217+
// GetConditions returns the conditions recorded in the status.
218+
func (s *RebootNodeStatus) GetConditions() []metav1.Condition {
219+
return s.Conditions
220+
}
221+
187222
func init() {
188223
SchemeBuilder.Register(&RebootNode{}, &RebootNodeList{})
189224
}

0 commit comments

Comments
 (0)