Skip to content

Commit bff11fe

Browse files
Avoid guid pool exhaustion from guid leaks by syncing it with SM (#92)
If a node running pods that use an IB network is restarted, those pods' GUIDs are deleted from UFM but remain persisted in ib-kubernetes. They become unusable and might exhaust the GUID pool if it is configured to be small enough. The solution is to sync the GUID pool with UFM, because some GUIDs might have become free to use. Signed-off-by: amaslennikov <[email protected]> Signed-off-by: amaslennikov <[email protected]>
1 parent 52b597f commit bff11fe

File tree

7 files changed

+222
-10
lines changed

7 files changed

+222
-10
lines changed

pkg/daemon/daemon.go

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,6 @@ func NewDaemon() (Daemon, error) {
9595
return nil, err
9696
}
9797

98-
guidPool, err := guid.NewPool(&daemonConfig.GUIDPool)
99-
if err != nil {
100-
return nil, err
101-
}
102-
10398
pluginLoader := sm.NewPluginLoader()
10499
getSmClientFunc, err := pluginLoader.LoadPlugin(path.Join(
105100
daemonConfig.PluginPath, daemonConfig.Plugin+".so"), sm.InitializePluginFunc)
@@ -125,6 +120,14 @@ func NewDaemon() (Daemon, error) {
125120
return nil, validateErr
126121
}
127122

123+
guidPool, err := guid.NewPool(&daemonConfig.GUIDPool)
124+
if err != nil {
125+
return nil, err
126+
}
127+
128+
// Reset guid pool with already allocated guids to avoid collisions
129+
err = syncGuidPool(smClient, guidPool)
130+
128131
podWatcher := watcher.NewWatcher(podEventHandler, client)
129132
return &daemon{
130133
config: daemonConfig,
@@ -256,7 +259,16 @@ func (d *daemon) processNetworkGUID(networkID string, spec *utils.IbSriovCniSpec
256259
} else {
257260
guidAddr, err = d.guidPool.GenerateGUID()
258261
if err != nil {
259-
return fmt.Errorf("failed to generate GUID for pod ID %s, with error: %v", pi.pod.UID, err)
262+
switch err {
263+
// If the guid pool is exhausted, need to sync with SM in case there are unsynced changes
264+
case guid.GuidPoolExhaustedError:
265+
err = syncGuidPool(d.smClient, d.guidPool)
266+
if err != nil {
267+
return err
268+
}
269+
default:
270+
return fmt.Errorf("failed to generate GUID for pod ID %s, with error: %v", pi.pod.UID, err)
271+
}
260272
}
261273

262274
allocatedGUID = guidAddr.String()
@@ -284,6 +296,20 @@ func (d *daemon) processNetworkGUID(networkID string, spec *utils.IbSriovCniSpec
284296
return nil
285297
}
286298

299+
func syncGuidPool(smClient plugins.SubnetManagerClient, guidPool guid.Pool) error {
300+
usedGuids, err := smClient.ListGuidsInUse()
301+
if err != nil {
302+
return err
303+
}
304+
305+
// Reset guid pool with already allocated guids to avoid collisions
306+
err = guidPool.Reset(usedGuids)
307+
if err != nil {
308+
return err
309+
}
310+
return nil
311+
}
312+
287313
// Update and set Pod's network annotation.
288314
// If failed to update annotation, pod's GUID added into the list to be removed from Pkey.
289315
func (d *daemon) updatePodNetworkAnnotation(pi *podNetworkInfo, removedList *[]net.HardwareAddr) error {
@@ -537,7 +563,7 @@ func (d *daemon) DeletePeriodicUpdate() {
537563
log.Info().Msg("delete periodic update finished")
538564
}
539565

540-
// initPool check the guids that are already allocated by the running pods
566+
// initPool check the guids that are already allocated by the running pods
541567
func (d *daemon) initPool() error {
542568
log.Info().Msg("Initializing GUID pool.")
543569

pkg/guid/guid_pool.go

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
package guid
22

33
import (
4+
"errors"
45
"fmt"
5-
66
"github.com/rs/zerolog/log"
77

88
"github.com/Mellanox/ib-kubernetes/pkg/config"
@@ -19,8 +19,13 @@ type Pool interface {
1919
// ReleaseGUID release the reservation of the guid.
2020
// It returns error if the guid is not in the range.
2121
ReleaseGUID(string) error
22+
23+
// Reset clears the current pool and resets it with given values (may be empty)
24+
Reset(guids []string) error
2225
}
2326

27+
// GuidPoolExhaustedError is the sentinel error returned by GenerateGUID when
// no free GUID is found in the pool range.
var GuidPoolExhaustedError = errors.New("GUID pool is exhausted")
28+
2429
type guidPool struct {
2530
rangeStart GUID // first guid in range
2631
rangeEnd GUID // last guid in range
@@ -50,6 +55,34 @@ func NewPool(conf *config.GUIDPoolConfig) (Pool, error) {
5055
}, nil
5156
}
5257

58+
// Reset clears the current pool and resets it with given values (may be empty)
59+
func (p *guidPool) Reset(guids []string) error {
60+
log.Debug().Msg("resetting guid pool")
61+
62+
p.guidPoolMap = map[GUID]bool{}
63+
if guids == nil {
64+
return nil
65+
}
66+
67+
for _, guid := range guids {
68+
guidInRange, err := p.isGuidStringInRange(guid)
69+
if err != nil {
70+
log.Debug().Msgf("error validating GUID: %s: %v", guid, err)
71+
return err
72+
}
73+
if !guidInRange {
74+
// Out of range GUID may be expected and shouldn't be allocated in the pool
75+
continue
76+
}
77+
err = p.AllocateGUID(guid)
78+
if err != nil {
79+
log.Debug().Msgf("error resetting the pool with value: %s: %v", guid, err)
80+
return err
81+
}
82+
}
83+
return nil
84+
}
85+
5386
// GenerateGUID generates a guid from the range
5487
func (p *guidPool) GenerateGUID() (GUID, error) {
5588
// this look will ensure that we check all the range
@@ -62,7 +95,7 @@ func (p *guidPool) GenerateGUID() (GUID, error) {
6295
if guid := p.getFreeGUID(p.rangeStart, p.rangeEnd); guid != 0 {
6396
return guid, nil
6497
}
65-
return 0, fmt.Errorf("guid pool range is full")
98+
return 0, GuidPoolExhaustedError
6699
}
67100

68101
// ReleaseGUID release allocated guid
@@ -88,7 +121,7 @@ func (p *guidPool) AllocateGUID(guid string) error {
88121
return err
89122
}
90123

91-
if guidAddr < p.rangeStart || guidAddr > p.rangeEnd {
124+
if !p.isGuidInRange(guidAddr) {
92125
return fmt.Errorf("out of range guid %s, pool range %v - %v", guid, p.rangeStart, p.rangeEnd)
93126
}
94127

@@ -104,6 +137,18 @@ func isValidRange(rangeStart, rangeEnd GUID) bool {
104137
return rangeStart <= rangeEnd && rangeStart != 0 && rangeEnd != 0xFFFFFFFFFFFFFFFF
105138
}
106139

140+
func (p *guidPool) isGuidInRange(guid GUID) bool {
141+
return guid >= p.rangeStart && guid <= p.rangeEnd
142+
}
143+
144+
func (p *guidPool) isGuidStringInRange(guid string) (bool, error) {
145+
guidAddr, err := ParseGUID(guid)
146+
if err != nil {
147+
return false, err
148+
}
149+
return p.isGuidInRange(guidAddr), nil
150+
}
151+
107152
// getFreeGUID return free guid in given range
108153
func (p *guidPool) getFreeGUID(start, end GUID) GUID {
109154
for guid := start; guid <= end; guid++ {

pkg/guid/guid_test.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,57 @@ import (
99

1010
var _ = Describe("GUID Pool", func() {
1111
conf := &config.GUIDPoolConfig{RangeStart: "02:00:00:00:00:00:00:00", RangeEnd: "02:FF:FF:FF:FF:FF:FF:FF"}
12+
Context("ResetPool", func() {
13+
It("Reset pool clears previous values", func() {
14+
pool, err := NewPool(conf)
15+
Expect(err).ToNot(HaveOccurred())
16+
Expect(pool).ToNot(BeNil())
17+
18+
err = pool.AllocateGUID("02:00:00:00:00:00:00:00")
19+
Expect(err).ToNot(HaveOccurred())
20+
err = pool.AllocateGUID("02:00:00:00:FF:00:00:00")
21+
Expect(err).ToNot(HaveOccurred())
22+
23+
pool.Reset(nil)
24+
25+
err = pool.ReleaseGUID("02:00:00:00:00:00:00:00")
26+
Expect(err).To(HaveOccurred())
27+
err = pool.ReleaseGUID("02:00:00:00:FF:00:00:00")
28+
Expect(err).To(HaveOccurred())
29+
})
30+
It("Reset pool stores new values", func() {
31+
pool, err := NewPool(conf)
32+
Expect(err).ToNot(HaveOccurred())
33+
Expect(pool).ToNot(BeNil())
34+
35+
expectedGuids := []string{"02:00:00:00:00:00:00:3e", "02:00:0F:F0:00:FF:00:09", "02:00:00:00:00:00:00:00"}
36+
37+
pool.Reset(expectedGuids)
38+
39+
for _, expectedGuid := range expectedGuids {
40+
err = pool.ReleaseGUID(expectedGuid)
41+
Expect(err).ToNot(HaveOccurred())
42+
}
43+
})
44+
It("Exhausted pool throws error and doesn't after reset", func() {
45+
conf := &config.GUIDPoolConfig{RangeStart: "02:00:00:00:00:00:00:00", RangeEnd: "02:00:00:00:00:00:00:00"}
46+
pool, err := NewPool(conf)
47+
Expect(err).ToNot(HaveOccurred())
48+
Expect(pool).ToNot(BeNil())
49+
guid, err := pool.GenerateGUID()
50+
Expect(err).ToNot(HaveOccurred())
51+
err = pool.AllocateGUID(guid.String())
52+
Expect(err).ToNot(HaveOccurred())
53+
guid, err = pool.GenerateGUID()
54+
Expect(err).To(Equal(GuidPoolExhaustedError))
55+
56+
err = pool.Reset(nil)
57+
Expect(err).ToNot(HaveOccurred())
58+
59+
guid, err = pool.GenerateGUID()
60+
Expect(err).ToNot(HaveOccurred())
61+
})
62+
})
1263
Context("NewPool", func() {
1364
It("Create guid pool with valid parameters", func() {
1465
pool, err := NewPool(conf)

pkg/sm/plugins/noop/noop.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@ func (p *plugin) RemoveGuidsFromPKey(pkey int, guids []net.HardwareAddr) error {
4848
return nil
4949
}
5050

51+
// ListGuidsInUse logs that it was called and returns a nil list with a nil error.
func (p *plugin) ListGuidsInUse() ([]string, error) {
52+
log.Info().Msg("noop Plugin ListGuidsInUse()")
53+
return nil, nil
54+
}
55+
5156
// Initialize applies configs to plugin and return a subnet manager client
5257
func Initialize() (plugins.SubnetManagerClient, error) {
5358
log.Info().Msg("Initializing noop plugin")

pkg/sm/plugins/plugin.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,7 @@ type SubnetManagerClient interface {
1919
// RemoveGuidsFromPKey remove guids for given pkey.
2020
// It return error if failed.
2121
RemoveGuidsFromPKey(pkey int, guids []net.HardwareAddr) error
22+
23+
// ListGuidsInUse returns a list of all GUIDS associated with PKeys
24+
ListGuidsInUse() ([]string, error)
2225
}

pkg/sm/plugins/ufm/ufm.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package main
22

33
import (
4+
"encoding/json"
45
"fmt"
56
"net"
67
"net/http"
@@ -133,6 +134,48 @@ func (u *ufmPlugin) RemoveGuidsFromPKey(pKey int, guids []net.HardwareAddr) erro
133134
return nil
134135
}
135136

137+
// convertToMacAddr inserts a colon after every two characters of guid so that
// a delimiter-free GUID is rendered in MAC-address notation.
// UFM returns GUIDs without any delimiters, so the expected input looks like
// FF00FF00FF00FF00, which becomes FF:00:FF:00:FF:00:FF:00.
// Inputs of two characters or fewer are returned unchanged, and a trailing odd
// character forms its own final group.
func convertToMacAddr(guid string) string {
	if len(guid) <= 2 {
		return guid
	}

	// One separator between each pair of characters: (len-1)/2 colons in total.
	out := make([]byte, 0, len(guid)+(len(guid)-1)/2)
	for i := 0; i < len(guid); i++ {
		if i > 0 && i%2 == 0 {
			out = append(out, ':')
		}
		out = append(out, guid[i])
	}
	return string(out)
}
146+
147+
// Guid is one element of a pkey's "guids" array in the UFM pkeys response;
// only the "guid" field is decoded.
type Guid struct {
148+
GuidValue string `json:"guid"`
149+
}
150+
151+
// PKey is the value decoded for each pkey entry of the UFM pkeys response;
// only its "guids" array is decoded.
type PKey struct {
152+
Guids []Guid `json:"guids"`
153+
}
154+
155+
// ListGuidsInUse returns all guids currently in use by pKeys
156+
func (u *ufmPlugin) ListGuidsInUse() ([]string, error) {
157+
response, err := u.client.Get(u.buildURL("/ufmRest/resources/pkeys/?guids_data=true"), http.StatusOK)
158+
if err != nil {
159+
return nil, fmt.Errorf("failed to get the list of guids: %v", err)
160+
}
161+
162+
var pKeys map[string]PKey
163+
164+
if err := json.Unmarshal(response, &pKeys); err != nil {
165+
return nil, fmt.Errorf("failed to get the list of guids: %v", err)
166+
}
167+
168+
var guids []string
169+
170+
for pkey, _ := range pKeys {
171+
pkeyData := pKeys[pkey]
172+
for _, guidData := range pkeyData.Guids {
173+
guids = append(guids, convertToMacAddr(guidData.GuidValue))
174+
}
175+
}
176+
return guids, nil
177+
}
178+
136179
func (u *ufmPlugin) buildURL(path string) string {
137180
return fmt.Sprintf("%s://%s:%d%s", u.conf.HTTPSchema, u.conf.Address, u.conf.Port, path)
138181
}

pkg/sm/plugins/ufm/ufm_test.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,4 +152,43 @@ var _ = Describe("Ufm Subnet Manager Client plugin", func() {
152152
Expect(&errMsg).To(Equal(&errMessage))
153153
})
154154
})
155+
Context("ListGuidsInUse", func() {
156+
It("Remove guid from valid pkey", func() {
157+
testResponse := `{
158+
"0x7fff": {
159+
"guids": []
160+
},
161+
"0x7aff": {
162+
"test": "val"
163+
},
164+
"0x5": {
165+
"guids": [
166+
{
167+
"guid": "020000000000003e"
168+
},
169+
{
170+
"guid": "02000FF000FF0009"
171+
}
172+
]
173+
},
174+
"0x6": {
175+
"guids": [
176+
{
177+
"guid": "0200000000000000"
178+
}
179+
]
180+
}
181+
}`
182+
183+
client := &mocks.Client{}
184+
client.On("Get", mock.Anything, mock.Anything).Return([]byte(testResponse), nil)
185+
186+
plugin := &ufmPlugin{client: client, conf: UFMConfig{}}
187+
guids, err := plugin.ListGuidsInUse()
188+
Expect(err).ToNot(HaveOccurred())
189+
190+
expectedGuids := []string{"02:00:00:00:00:00:00:3e", "02:00:0F:F0:00:FF:00:09", "02:00:00:00:00:00:00:00"}
191+
Expect(guids).To(ConsistOf(expectedGuids))
192+
})
193+
})
155194
})

0 commit comments

Comments
 (0)