Skip to content

Commit df34cb8

Browse files
feat(gfd): make labeler composition fault-tolerant
Make Merge() resilient to individual labeler failures by logging errors as warnings and continuing with remaining labelers. This prevents GFD from crashing when devices go unhealthy (e.g., XID errors) and allows partial label generation. Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent 0635d3e commit df34cb8

File tree

2 files changed

+151
-4
lines changed

2 files changed

+151
-4
lines changed

internal/lm/list.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616

1717
package lm
1818

19-
import "fmt"
19+
import (
20+
"k8s.io/klog/v2"
21+
)
2022

2123
// list represents a list of labelers that itself implements the Labeler interface.
2224
type list []Labeler
@@ -29,13 +31,17 @@ func Merge(labelers ...Labeler) Labeler {
2931
}
3032

3133
// Labels returns the labels from a set of labelers. Labels later in the list
32-
// overwrite earlier labels.
34+
// overwrite earlier labels. If a labeler fails, the error is logged as a
35+
// warning and the labeler is skipped, allowing the pipeline to continue with
36+
// partial results. This provides fault tolerance for unhealthy devices or
37+
// transient errors without crashing the entire labeling process.
3338
func (labelers list) Labels() (Labels, error) {
3439
allLabels := make(Labels)
35-
for _, labeler := range labelers {
40+
for i, labeler := range labelers {
3641
labels, err := labeler.Labels()
3742
if err != nil {
38-
return nil, fmt.Errorf("error generating labels: %v", err)
43+
klog.Warningf("Labeler at index %d failed, skipping: %v", i, err)
44+
continue
3945
}
4046
for k, v := range labels {
4147
allLabels[k] = v

internal/lm/list_test.go

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
/**
2+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
**/
16+
17+
package lm
18+
19+
import (
20+
"fmt"
21+
"testing"
22+
23+
"github.com/stretchr/testify/require"
24+
)
25+
26+
// mockLabeler is a test double for the Labeler interface.
27+
type mockLabeler struct {
28+
labels Labels
29+
err error
30+
}
31+
32+
func (m *mockLabeler) Labels() (Labels, error) {
33+
return m.labels, m.err
34+
}
35+
36+
func TestMerge(t *testing.T) {
37+
testCases := []struct {
38+
description string
39+
labelers []Labeler
40+
expectedLabels Labels
41+
}{
42+
{
43+
description: "empty list returns empty labels",
44+
labelers: []Labeler{},
45+
expectedLabels: Labels{},
46+
},
47+
{
48+
description: "single successful labeler",
49+
labelers: []Labeler{
50+
&mockLabeler{
51+
labels: Labels{"key1": "value1"},
52+
},
53+
},
54+
expectedLabels: Labels{"key1": "value1"},
55+
},
56+
{
57+
description: "multiple successful labelers",
58+
labelers: []Labeler{
59+
&mockLabeler{labels: Labels{"key1": "value1"}},
60+
&mockLabeler{labels: Labels{"key2": "value2"}},
61+
&mockLabeler{labels: Labels{"key3": "value3"}},
62+
},
63+
expectedLabels: Labels{
64+
"key1": "value1",
65+
"key2": "value2",
66+
"key3": "value3",
67+
},
68+
},
69+
{
70+
description: "single failing labeler is skipped",
71+
labelers: []Labeler{
72+
&mockLabeler{labels: Labels{"key1": "value1"}},
73+
&mockLabeler{err: fmt.Errorf("device unhealthy: GPU is lost")},
74+
&mockLabeler{labels: Labels{"key3": "value3"}},
75+
},
76+
expectedLabels: Labels{
77+
"key1": "value1",
78+
"key3": "value3",
79+
},
80+
},
81+
{
82+
description: "multiple failing labelers are skipped",
83+
labelers: []Labeler{
84+
&mockLabeler{labels: Labels{"key1": "value1"}},
85+
&mockLabeler{err: fmt.Errorf("error 1")},
86+
&mockLabeler{err: fmt.Errorf("error 2")},
87+
&mockLabeler{labels: Labels{"key4": "value4"}},
88+
},
89+
expectedLabels: Labels{
90+
"key1": "value1",
91+
"key4": "value4",
92+
},
93+
},
94+
{
95+
description: "all failing labelers returns empty labels",
96+
labelers: []Labeler{
97+
&mockLabeler{err: fmt.Errorf("error 1")},
98+
&mockLabeler{err: fmt.Errorf("error 2")},
99+
},
100+
expectedLabels: Labels{},
101+
},
102+
{
103+
description: "later labeler overwrites earlier labels",
104+
labelers: []Labeler{
105+
&mockLabeler{labels: Labels{"key": "value1"}},
106+
&mockLabeler{labels: Labels{"key": "value2"}},
107+
},
108+
expectedLabels: Labels{"key": "value2"},
109+
},
110+
{
111+
description: "empty labels from labeler are merged",
112+
labelers: []Labeler{
113+
&mockLabeler{labels: Labels{}},
114+
&mockLabeler{labels: Labels{"key": "value"}},
115+
},
116+
expectedLabels: Labels{"key": "value"},
117+
},
118+
{
119+
description: "failing labeler between successful ones",
120+
labelers: []Labeler{
121+
&mockLabeler{labels: Labels{"before": "value"}},
122+
&mockLabeler{err: fmt.Errorf("GPU XID error")},
123+
&mockLabeler{labels: Labels{"after": "value"}},
124+
},
125+
expectedLabels: Labels{
126+
"before": "value",
127+
"after": "value",
128+
},
129+
},
130+
}
131+
132+
for _, tc := range testCases {
133+
t.Run(tc.description, func(t *testing.T) {
134+
merged := Merge(tc.labelers...)
135+
labels, err := merged.Labels()
136+
137+
require.NoError(t, err, "Merge should never return error")
138+
require.EqualValues(t, tc.expectedLabels, labels)
139+
})
140+
}
141+
}

0 commit comments

Comments
 (0)