Skip to content

Commit a917bdb

Browse files
Rozziimetal3-io-bot
authored andcommitted
fix for 1.10->1.11 upgrade issue
This commit: - Implements a fallback Metal3Data retrieval process as part of the Metal3DataTemplate controller's Metal3DataClaim and Metal3Data deletion process -- The new process allows the controller to retrieve the Metal3Data based on the Metal3DataClaim's render information that supposed to point directly to the Metal3Data - Adds related unit test - Extends the existing "UpdateDatas" unit test table evaluation process to take into consideration that when a claim is deleted the related data has to be deleted too Original issue: If the templateReference was used before upgrade to 1.11 and Metal3Data was generated based on the `templateReference name + Metal3DataClaim index` instead of the `template name + data claim index` then after the upgrade during the first new KCP or Metal3MachineDeployment rollout CAPM3 was unable to find the old Metal3Data because after upgrade to 1.11 CAPM3 was only looking for the Metal3Data based on `template name + data claim index` only while old Metal3Data was named based on `templateReference + index`. In CAPM3 technically Metal3Data is allowed to get orphaned in case CAPM3 can't find them based on the information in the Metal3DataClaim and the related Metal3DataTemplate so the Metal3Data with names based on the deprecated templateRefference got orphaned and then they were holding onto IPClaims. The presence of the orphaned Metal3Data in environments where IP pre-allocation was used resulted in the old Metal3Data holding onto the pre allocated IP claims thus the during the rollout the newly created Machies got stuck as they were unable to re-claim the IPs. Signed-off-by: Adam Rozman <[email protected]>
1 parent 5e7b7ec commit a917bdb

File tree

3 files changed

+192
-41
lines changed

3 files changed

+192
-41
lines changed

baremetal/metal3datatemplate_manager.go

Lines changed: 82 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,12 @@ type DataTemplateManagerInterface interface {
4545
UpdateDatas(context.Context) (bool, bool, error)
4646
}
4747

48+
// Klogr log levels.
49+
const (
50+
EXTRAINFO = 3
51+
TRACE = 5
52+
)
53+
4854
// DataTemplateManager is responsible for performing machine reconciliation.
4955
type DataTemplateManager struct {
5056
client client.Client
@@ -167,7 +173,7 @@ func (m *DataTemplateManager) UpdateDatas(ctx context.Context) (bool, bool, erro
167173
}
168174

169175
hasClaims := false
170-
// Iterate over the Metal3Data objects to find all indexes and objects
176+
// Iterate over the Metal3DataClaim objects to find all indexes and objects
171177
for _, dataClaim := range dataClaimObjects.Items {
172178
// If DataTemplate does not point to this object, discard
173179
if dataClaim.Spec.Template.Name != m.DataTemplate.Name {
@@ -179,11 +185,12 @@ func (m *DataTemplateManager) UpdateDatas(ctx context.Context) (bool, bool, erro
179185
continue
180186
}
181187
}
182-
188+
m.Log.V(TRACE).Info("Initiate updating data of claim", "Metal3DataClaim", dataClaim.Name, "Metal3DataTemplate", m.DataTemplate.Name)
183189
indexes, err = m.updateData(ctx, &dataClaim, indexes)
184190
if err != nil {
185191
return false, false, err
186192
}
193+
m.Log.V(EXTRAINFO).Info("Success updating data of claim", "Metal3DataClaim", dataClaim.Name, "Metal3DataTemplate", m.DataTemplate.Name)
187194
}
188195
m.updateStatusTimestamp()
189196
return hasData, hasClaims, nil
@@ -212,7 +219,8 @@ func (m *DataTemplateManager) updateData(ctx context.Context,
212219
return indexes, err
213220
}
214221
} else {
215-
indexes, err = m.deleteData(ctx, dataClaim, indexes)
222+
m.Log.V(TRACE).Info("Attempting to delete data of claim and related data if present", "Metal3DataClaim", dataClaim.Name)
223+
indexes, err = m.deleteMetal3DataAndClaim(ctx, dataClaim, indexes)
216224
if err != nil {
217225
return indexes, err
218226
}
@@ -341,40 +349,87 @@ func (m *DataTemplateManager) createData(ctx context.Context,
341349
return indexes, nil
342350
}
343351

344-
// deleteData deletes the Metal3DataClaim and marks the Metal3Data for deletion.
345-
func (m *DataTemplateManager) deleteData(ctx context.Context,
352+
// handleRetrieveDataError handles errors generated by Metal3Data retrieval.
353+
// First return value is true if Metal3Data was retrieved successfully.
354+
// Second return value contains an error message if an unknown error was encountered.
355+
func (m *DataTemplateManager) handleRetrieveDataError(err error, filter string, dataClaimName string, dataName string) (bool, string) {
356+
if err != nil && apierrors.IsNotFound(err) {
357+
m.Log.Error(err, "Metal3Data NOT FOUND", "Metal3Data was not found based on filter:", filter, "Metal3DataClaim", dataClaimName, "Metal3DataTemplate", m.DataTemplate.Name, "Metal3Data", dataName)
358+
} else if err != nil && !apierrors.IsNotFound(err) {
359+
persistentErrMsg := "Failed to get Metal3Data object for reason OTHER THAN not finding it based on " + filter
360+
m.Log.Error(err, persistentErrMsg, "Metal3DataClaim", dataClaimName, "Metal3DataTemplate", m.DataTemplate.Name, "Metal3Data", dataName)
361+
return false, persistentErrMsg
362+
} else if err == nil {
363+
m.Log.V(TRACE).Info("Metal3Data found!", "Metal3DataClaim", dataClaimName, "Metal3DataTemplate", m.DataTemplate.Name, "Metal3Data", dataName)
364+
return true, ""
365+
}
366+
return false, ""
367+
}
368+
369+
// retrieveData is a utility to retrieve Metal3Data based on name and save the data to specified object.
370+
func (m *DataTemplateManager) retrieveData(ctx context.Context, dataName string, tmpM3Data *infrav1.Metal3Data) error {
371+
key := client.ObjectKey{
372+
Name: dataName,
373+
Namespace: m.DataTemplate.Namespace,
374+
}
375+
return m.client.Get(ctx, key, tmpM3Data)
376+
}
377+
378+
// deleteMetal3DataAndClaim deletes the Metal3DataClaim and marks the Metal3Data for deletion.
379+
func (m *DataTemplateManager) deleteMetal3DataAndClaim(ctx context.Context,
346380
dataClaim *infrav1.Metal3DataClaim, indexes map[int]string,
347381
) (map[int]string, error) {
348382
m.Log.Info("Deleting Metal3DataClaim", "Metal3DataClaim", dataClaim.Name)
383+
persistentErrMsg := ""
384+
m3DataFound := false
385+
tmpM3Data := &infrav1.Metal3Data{}
386+
dataName := ""
349387

350388
dataClaimIndex, ok := m.DataTemplate.Status.Indexes[dataClaim.Name]
389+
351390
if ok {
352-
// Try to get the Metal3Data. if it succeeds, delete it
353-
tmpM3Data := &infrav1.Metal3Data{}
354-
key := client.ObjectKey{
355-
Name: m.DataTemplate.Name + "-" + strconv.Itoa(dataClaimIndex),
356-
Namespace: m.DataTemplate.Namespace,
391+
// Try to get the Metal3Data, if it succeeds, delete it
392+
dataName = m.DataTemplate.Name + "-" + strconv.Itoa(dataClaimIndex)
393+
err := m.retrieveData(ctx, dataName, tmpM3Data)
394+
m3DataFound, persistentErrMsg = m.handleRetrieveDataError(err, "template name and claim index", dataClaim.Name, dataName)
395+
} else {
396+
persistentErrMsg = "Index of the claim was not found"
397+
m.Log.Error(errors.New(persistentErrMsg), dataClaim.Name, "Metal3DataTemplate", m.DataTemplate.Name)
398+
}
399+
400+
if !m3DataFound {
401+
m.Log.V(TRACE).Info("Attempting to retrieve Metal3Data based on Metal3DataClaim render information")
402+
if dataClaim != nil && dataClaim.Status.RenderedData != nil {
403+
dataName = dataClaim.Status.RenderedData.Name
404+
err := m.retrieveData(ctx, dataName, tmpM3Data)
405+
errMsg := ""
406+
m3DataFound, errMsg = m.handleRetrieveDataError(err, "render data reference", dataClaim.Name, dataName)
407+
persistentErrMsg += errMsg
357408
}
358-
err := m.client.Get(ctx, key, tmpM3Data)
409+
}
410+
411+
if m3DataFound {
412+
// Remove the finalizer
413+
m.Log.V(TRACE).Info("Attempting to remove finalizer from associated Metal3Data", "Metal3DataClaim", dataClaim.Name, "Metal3DataTemplate", m.DataTemplate.Name, "Metal3Data", dataName)
414+
controllerutil.RemoveFinalizer(tmpM3Data, infrav1.DataClaimFinalizer)
415+
err := updateObject(ctx, m.client, tmpM3Data)
359416
if err != nil && !apierrors.IsNotFound(err) {
360-
dataClaim.Status.ErrorMessage = ptr.To("Failed to get associated Metal3Data object")
417+
m.Log.Error(errors.New("Unable to remove finalizer from Metal3Data"), tmpM3Data.Name)
418+
return indexes, err
419+
}
420+
// Delete the Metal3Data
421+
m.Log.V(EXTRAINFO).Info("Deleting associated Metal3Data", "Metal3DataClaim", dataClaim.Name, "Metal3DataTemplate", m.DataTemplate.Name, "Metal3Data", dataName)
422+
err = deleteObject(ctx, m.client, tmpM3Data)
423+
if err != nil && !apierrors.IsNotFound(err) {
424+
dataClaim.Status.ErrorMessage = ptr.To("Failed to delete associated Metal3Data object")
361425
return indexes, err
362-
} else if err == nil {
363-
// Remove the finalizer
364-
controllerutil.RemoveFinalizer(tmpM3Data, infrav1.DataClaimFinalizer)
365-
err = updateObject(ctx, m.client, tmpM3Data)
366-
if err != nil && !apierrors.IsNotFound(err) {
367-
m.Log.Info("Unable to remove finalizer from Metal3Data", "Metal3Data", tmpM3Data.Name)
368-
return indexes, err
369-
}
370-
// Delete the Metal3Data
371-
err = deleteObject(ctx, m.client, tmpM3Data)
372-
if err != nil && !apierrors.IsNotFound(err) {
373-
dataClaim.Status.ErrorMessage = ptr.To("Failed to delete associated Metal3Data object")
374-
return indexes, err
375-
}
376-
m.Log.Info("Deleted Metal3Data", "Metal3Data", tmpM3Data.Name)
377426
}
427+
m.Log.Info("Deleted Metal3Data", "Metal3Data", tmpM3Data.Name)
428+
} else {
429+
errMsg := "Failed to retrieve Metal3Data object because it was not found or for other unknown reason."
430+
persistentErrMsg += errMsg
431+
dataClaim.Status.ErrorMessage = ptr.To(persistentErrMsg)
432+
m.Log.Error(errors.New(errMsg), "error added to Metal3DataClaim status", "Metal3DataClaim", dataClaim.Name, "Metal3DataTemplate", m.DataTemplate.Name, "Metal3Data", dataName)
378433
}
379434

380435
dataClaim.Status.RenderedData = nil

baremetal/metal3datatemplate_manager_test.go

Lines changed: 107 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -250,9 +250,7 @@ var _ = Describe("Metal3DataTemplate manager", func() {
250250
objects = append(objects, claim)
251251
}
252252
fakeClient := fake.NewClientBuilder().WithScheme(setupSchemeMm()).WithObjects(objects...).WithStatusSubresource(objects...).Build()
253-
templateMgr, err := NewDataTemplateManager(fakeClient, tc.template,
254-
logr.Discard(),
255-
)
253+
templateMgr, err := NewDataTemplateManager(fakeClient, tc.template, logr.Discard())
256254
Expect(err).NotTo(HaveOccurred())
257255

258256
hasData, _, err := templateMgr.UpdateDatas(context.TODO())
@@ -270,19 +268,42 @@ var _ = Describe("Metal3DataTemplate manager", func() {
270268
Expect(tc.template.Status.LastUpdated.IsZero()).To(BeFalse())
271269
Expect(tc.template.Status.Indexes).To(Equal(tc.expectedIndexes))
272270

273-
// get list of Metal3Data objects
274-
dataObjects := infrav1.Metal3DataClaimList{}
271+
// get list of Metal3DataClaim objects
272+
dataClaimObjects := infrav1.Metal3DataClaimList{}
275273
opts := &client.ListOptions{}
276-
err = fakeClient.List(context.TODO(), &dataObjects, opts)
274+
err = fakeClient.List(context.TODO(), &dataClaimObjects, opts)
277275
Expect(err).NotTo(HaveOccurred())
278276

279-
// Iterate over the Metal3Data objects to find all indexes and objects
280-
for _, claim := range dataObjects.Items {
281-
if claim.DeletionTimestamp.IsZero() {
277+
// Iterate over the Metal3DataClaim objects to check
278+
// if the render data points to somewhere on claims that are not
279+
// being deleted
280+
for _, claim := range dataClaimObjects.Items {
281+
if claim.DeletionTimestamp == nil || claim.DeletionTimestamp.IsZero() {
282282
Expect(claim.Status.RenderedData).NotTo(BeNil())
283283
}
284284
}
285285

286+
// get list of Metal3DataClaim objects
287+
dataObjects := infrav1.Metal3DataList{}
288+
opts = &client.ListOptions{}
289+
err = fakeClient.List(context.TODO(), &dataObjects, opts)
290+
Expect(err).NotTo(HaveOccurred())
291+
292+
for _, data := range dataObjects.Items {
293+
if data.DeletionTimestamp == nil || data.DeletionTimestamp.IsZero() {
294+
correctDeletion := false
295+
parentClaim := data.Spec.Claim.Name
296+
Expect(parentClaim).NotTo(BeEmpty())
297+
for _, claim := range dataClaimObjects.Items {
298+
if parentClaim == claim.ObjectMeta.Name {
299+
Expect(claim.DeletionTimestamp.IsZero()).To(BeTrue())
300+
Expect(claim.Status.RenderedData).NotTo(BeNil())
301+
correctDeletion = true
302+
}
303+
}
304+
Expect(correctDeletion).To(BeTrue())
305+
}
306+
}
286307
},
287308
Entry("No Claims", testCaseUpdateDatas{
288309
template: &infrav1.Metal3DataTemplate{
@@ -321,7 +342,7 @@ var _ = Describe("Metal3DataTemplate manager", func() {
321342
},
322343
Status: infrav1.Metal3DataClaimStatus{
323344
RenderedData: &corev1.ObjectReference{
324-
Name: "abc-2",
345+
Name: "abc-2", // Doesn't matter because we are not reconciling the "other template"
325346
Namespace: namespaceName,
326347
},
327348
},
@@ -339,7 +360,7 @@ var _ = Describe("Metal3DataTemplate manager", func() {
339360
},
340361
Status: infrav1.Metal3DataClaimStatus{
341362
RenderedData: &corev1.ObjectReference{
342-
Name: "abc-2",
363+
Name: "abc-1",
343364
Namespace: namespaceName,
344365
},
345366
},
@@ -364,6 +385,26 @@ var _ = Describe("Metal3DataTemplate manager", func() {
364385
},
365386
},
366387
},
388+
{
389+
ObjectMeta: metav1.ObjectMeta{
390+
Name: "deleting-claim-missmatch-name",
391+
Namespace: namespaceName,
392+
DeletionTimestamp: &timeNow,
393+
Finalizers: []string{"ipclaim.ipam.metal3.io"},
394+
},
395+
Spec: infrav1.Metal3DataClaimSpec{
396+
Template: corev1.ObjectReference{
397+
Name: templateMeta.Name,
398+
Namespace: namespaceName,
399+
},
400+
},
401+
Status: infrav1.Metal3DataClaimStatus{
402+
RenderedData: &corev1.ObjectReference{
403+
Name: "cdc-3",
404+
Namespace: namespaceName,
405+
},
406+
},
407+
},
367408
},
368409
datas: []*infrav1.Metal3Data{
369410
{
@@ -402,7 +443,7 @@ var _ = Describe("Metal3DataTemplate manager", func() {
402443
},
403444
{
404445
ObjectMeta: metav1.ObjectMeta{
405-
Name: "data-to-delete",
446+
Name: "abc-3",
406447
Namespace: namespaceName,
407448
},
408449
Spec: infrav1.Metal3DataSpec{
@@ -417,6 +458,23 @@ var _ = Describe("Metal3DataTemplate manager", func() {
417458
Index: 3,
418459
},
419460
},
461+
{
462+
ObjectMeta: metav1.ObjectMeta{
463+
Name: "cdc-3",
464+
Namespace: namespaceName,
465+
},
466+
Spec: infrav1.Metal3DataSpec{
467+
Template: corev1.ObjectReference{
468+
Name: templateMeta.Name,
469+
Namespace: namespaceName,
470+
},
471+
Claim: corev1.ObjectReference{
472+
Name: "deleting-claim-missmatch-name",
473+
Namespace: namespaceName,
474+
},
475+
Index: 40,
476+
},
477+
},
420478
},
421479
expectedIndexes: map[string]int{
422480
"claim-without-status": 0,
@@ -598,7 +656,7 @@ var _ = Describe("Metal3DataTemplate manager", func() {
598656
)
599657
Expect(err).NotTo(HaveOccurred())
600658

601-
allocatedMap, err := templateMgr.deleteData(context.TODO(), tc.dataClaim, tc.indexes)
659+
allocatedMap, err := templateMgr.deleteMetal3DataAndClaim(context.TODO(), tc.dataClaim, tc.indexes)
602660
if tc.expectError {
603661
Expect(err).To(HaveOccurred())
604662
} else {
@@ -681,6 +739,42 @@ var _ = Describe("Metal3DataTemplate manager", func() {
681739
},
682740
},
683741
}),
742+
Entry("Deletion needed but data name doesn't match template", testCaseDeleteDatas{
743+
template: &infrav1.Metal3DataTemplate{
744+
ObjectMeta: testObjectMeta("abc", "", ""),
745+
Spec: infrav1.Metal3DataTemplateSpec{},
746+
Status: infrav1.Metal3DataTemplateStatus{
747+
Indexes: map[string]int{
748+
"TestRef": 0,
749+
},
750+
},
751+
},
752+
dataClaim: &infrav1.Metal3DataClaim{
753+
ObjectMeta: metav1.ObjectMeta{
754+
Name: "TestRef",
755+
Finalizers: []string{
756+
infrav1.DataClaimFinalizer,
757+
},
758+
},
759+
Status: infrav1.Metal3DataClaimStatus{
760+
RenderedData: &corev1.ObjectReference{
761+
Name: "error-42",
762+
},
763+
},
764+
},
765+
indexes: map[int]string{
766+
0: "TestRef",
767+
},
768+
expectedMap: map[int]string{},
769+
expectedIndexes: map[string]int{},
770+
datas: []*infrav1.Metal3Data{
771+
{
772+
ObjectMeta: metav1.ObjectMeta{
773+
Name: "error-42",
774+
},
775+
},
776+
},
777+
}),
684778
)
685779

686780
})

baremetal/metal3machine_manager.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1770,9 +1770,11 @@ func (m *MachineManager) DissociateM3Metadata(ctx context.Context) error {
17701770
if !(errors.As(err, &reconcileError) && reconcileError.IsTransient()) {
17711771
return err
17721772
}
1773+
m.Log.Error(errors.New("related claim can't be retrieved because of unknown error"), "unknown error", "Metal3Machine", m.Metal3Machine.Name)
17731774
return nil
17741775
}
17751776
if metal3DataClaim == nil {
1777+
m.Log.Info("Related Metal3DataClaim is nil!", "Metal3Machine", m.Metal3Machine.Name)
17761778
return nil
17771779
}
17781780

@@ -1782,7 +1784,7 @@ func (m *MachineManager) DissociateM3Metadata(ctx context.Context) error {
17821784
m.Log.Info("Unable to remove finalizers from Metal3DataClaim", "Metal3DataClaim", metal3DataClaim.Name)
17831785
return err
17841786
}
1785-
1787+
m.Log.Info("Finalizers successfully removed. Initiate deletion of related claim.", "Metal3DataClaim", metal3DataClaim.Name)
17861788
return deleteObject(ctx, m.client, metal3DataClaim)
17871789
}
17881790

0 commit comments

Comments
 (0)