@@ -270,6 +270,42 @@ func (c *FaultRemediationClient) handleCreateCRError(
270270 return false , ""
271271}
272272
273+ // applyNodeAnnotationsToJob applies test overrides from node annotations to the job spec.
274+ func (c * FaultRemediationClient ) applyNodeAnnotationsToJob (ctx context.Context , job * batchv1.Job , nodeName string ) {
275+ node , err := c .kubeClient .CoreV1 ().Nodes ().Get (ctx , nodeName , metav1.GetOptions {})
276+ if err != nil || node .Annotations == nil {
277+ return
278+ }
279+
280+ // Override MOCK_EXIT_CODE from annotation (for test scenarios)
281+ if exitCodeStr , ok := node .Annotations ["nvsentinel.nvidia.com/log-collector-mock-exit-code" ]; ok {
282+ log .Printf ("Overriding log collector mock exit code from node annotation: %s" , exitCodeStr )
283+
284+ for i := range job .Spec .Template .Spec .Containers {
285+ for j := range job .Spec .Template .Spec .Containers [i ].Env {
286+ if job .Spec .Template .Spec .Containers [i ].Env [j ].Name == "MOCK_EXIT_CODE" {
287+ job .Spec .Template .Spec .Containers [i ].Env [j ].Value = exitCodeStr
288+ break
289+ }
290+ }
291+ }
292+ }
293+
294+ // Override MOCK_SLEEP_DURATION from annotation (for test scenarios)
295+ if sleepDurationStr , ok := node .Annotations ["nvsentinel.nvidia.com/log-collector-mock-sleep" ]; ok {
296+ log .Printf ("Overriding log collector mock sleep duration from node annotation: %s" , sleepDurationStr )
297+
298+ for i := range job .Spec .Template .Spec .Containers {
299+ for j := range job .Spec .Template .Spec .Containers [i ].Env {
300+ if job .Spec .Template .Spec .Containers [i ].Env [j ].Name == "MOCK_SLEEP_DURATION" {
301+ job .Spec .Template .Spec .Containers [i ].Env [j ].Value = sleepDurationStr
302+ break
303+ }
304+ }
305+ }
306+ }
307+ }
308+
273309// RunLogCollectorJob creates a log collector Job and waits for completion.
274310// nolint: cyclop // todo
275311func (c * FaultRemediationClient ) RunLogCollectorJob (ctx context.Context , nodeName string ) error {
@@ -297,6 +333,9 @@ func (c *FaultRemediationClient) RunLogCollectorJob(ctx context.Context, nodeNam
297333 return fmt .Errorf ("failed to unmarshal Job manifest: %w" , err )
298334 }
299335
336+ // Apply test overrides from node annotations (for test scenarios)
337+ c .applyNodeAnnotationsToJob (ctx , job , nodeName )
338+
300339 // Set target node
301340 job .Spec .Template .Spec .NodeName = nodeName
302341
@@ -311,7 +350,7 @@ func (c *FaultRemediationClient) RunLogCollectorJob(ctx context.Context, nodeNam
311350 log .Printf ("Waiting for log collector job %s to complete" , created .Name )
312351
313352 // Use a context with timeout for the watch
314- watchCtx , cancel := context .WithTimeout (ctx , 5 * time .Minute )
353+ watchCtx , cancel := context .WithTimeout (ctx , 10 * time .Minute )
315354 defer cancel ()
316355
317356 // Use SharedInformerFactory for efficient job status monitoring with filtering
0 commit comments