Skip to content

Commit 3c9449a

Browse files
Hardware alerts integration
Link alertmanager with resource server to handle hardware monitoring alerts. - Add resource server client for alarms service - Distinguish between CaaS and hardware alert sources - Add internal resource endpoint to resource server API - Update RBAC for resource types and PrometheusRules access - Support alarm source constants (AlarmSourceCaaS, AlarmSourceHardware) Co-Authored-By: Claude <[email protected]>
1 parent a94389b commit 3c9449a

File tree

22 files changed

+6376
-158
lines changed

22 files changed

+6376
-158
lines changed

config/rbac/role.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ rules:
99
- /o2ims-infrastructureCluster/v1/alarmDictionaries
1010
- /o2ims-infrastructureCluster/v1/nodeClusterTypes
1111
- /o2ims-infrastructureCluster/v1/nodeClusters
12+
- /o2ims-infrastructureInventory/v1/resourceTypes
1213
verbs:
1314
- get
1415
- list
@@ -30,6 +31,8 @@ rules:
3031
- /o2ims-infrastructureCluster/v1/alarmDictionaries/*
3132
- /o2ims-infrastructureCluster/v1/nodeClusterTypes/*
3233
- /o2ims-infrastructureCluster/v1/nodeClusters/*
34+
- /o2ims-infrastructureInventory/v1/internal/resources/*
35+
- /o2ims-infrastructureInventory/v1/resourceTypes/*
3336
verbs:
3437
- get
3538
- apiGroups:

internal/controllers/inventory_controller.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ import (
7171
//+kubebuilder:rbac:urls="/o2ims-infrastructureCluster/v1/nodeClusterTypes/*",verbs=get
7272
//+kubebuilder:rbac:urls="/o2ims-infrastructureCluster/v1/nodeClusters/*",verbs=get
7373
//+kubebuilder:rbac:urls="/o2ims-infrastructureCluster/v1/alarmDictionaries/*",verbs=get
74+
//+kubebuilder:rbac:urls="/o2ims-infrastructureInventory/v1/internal/resources/*",verbs=get
75+
//+kubebuilder:rbac:urls="/o2ims-infrastructureInventory/v1/resourceTypes",verbs=get;list
76+
//+kubebuilder:rbac:urls="/o2ims-infrastructureInventory/v1/resourceTypes/*",verbs=get
7477
//+kubebuilder:rbac:urls="/hardware-manager/inventory/*",verbs=get;list
7578
//+kubebuilder:rbac:groups="batch",resources=cronjobs,verbs=get;list;watch;create;update;patch;delete
7679
//+kubebuilder:rbac:groups=route.openshift.io,resources=routes,verbs=get;list;watch
@@ -1380,6 +1383,31 @@ func (t *reconcilerTask) createAlarmServerClusterRole(ctx context.Context) error
13801383
"get",
13811384
},
13821385
},
1386+
{
1387+
NonResourceURLs: []string{
1388+
"/o2ims-infrastructureInventory/v1/resourceTypes",
1389+
},
1390+
Verbs: []string{
1391+
"get",
1392+
"list",
1393+
},
1394+
},
1395+
{
1396+
NonResourceURLs: []string{
1397+
"/o2ims-infrastructureInventory/v1/resourceTypes/*",
1398+
},
1399+
Verbs: []string{
1400+
"get",
1401+
},
1402+
},
1403+
{
1404+
NonResourceURLs: []string{
1405+
"/o2ims-infrastructureInventory/v1/internal/resources/*",
1406+
},
1407+
Verbs: []string{
1408+
"get",
1409+
},
1410+
},
13831411
},
13841412
}
13851413

internal/service/alarms/api/server.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -493,7 +493,7 @@ func (a *AlarmsServer) UpdateAlarmServiceConfiguration(ctx context.Context, requ
493493
func (a *AlarmsServer) AmNotification(ctx context.Context, request api.AmNotificationRequestObject) (api.AmNotificationResponseObject, error) {
494494
// TODO: AM automatically retries if it receives a 5xx error code. That means any error, even a permanent one (e.g. a postgres syntax error), will be processed the same way. Once we have a better retry mechanism for pg, update all 5xx to 4xx as needed.
495495

496-
if err := alertmanager.HandleAlerts(ctx, a.Infrastructure.Clients, a.AlarmsRepository, &request.Body.Alerts, alertmanager.Webhook); err != nil {
496+
if err := alertmanager.HandleAlerts(ctx, a.Infrastructure.ClusterServer, a.Infrastructure.ResourceServer, a.AlarmsRepository, &request.Body.Alerts, alertmanager.Webhook); err != nil {
497497
msg := "failed to handle alerts"
498498
slog.Error(msg, "error", err)
499499
return nil, fmt.Errorf("%s: %w", msg, err)

internal/service/alarms/internal/alertmanager/api.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ func (c *AMClient) SyncAlerts(ctx context.Context) error {
9999
// Convert to Webhook payload to allow us to maintain a single point of entry in the DB
100100
webhookPayload := ConvertAPIAlertsToWebhook(&apiPayload)
101101
if len(webhookPayload) != 0 {
102-
if err := HandleAlerts(ctx, c.infrastructure.Clients, c.alarmsRepository, &webhookPayload, API); err != nil {
102+
if err := HandleAlerts(ctx, c.infrastructure.ClusterServer, c.infrastructure.ResourceServer, c.alarmsRepository, &webhookPayload, API); err != nil {
103103
return fmt.Errorf("failed to handle alerts during full sync: %w", err)
104104
}
105105
}

internal/service/alarms/internal/alertmanager/api_test.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -113,9 +113,8 @@ var _ = Describe("Alertmanager API Client", func() {
113113
Build()
114114

115115
infra = &infrastructure.Infrastructure{
116-
Clients: []infrastructure.Client{
117-
&infrastructure.ClusterServer{},
118-
},
116+
ClusterServer: &infrastructure.ClusterServer{},
117+
ResourceServer: &infrastructure.ResourceServer{},
119118
}
120119

121120
amClient = alertmanager.NewAlertmanagerClient(fakeClient, mockRepo, infra)

internal/service/alarms/internal/alertmanager/converter.go

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ import (
1919
)
2020

2121
// ConvertAmToAlarmEventRecordModels gets alarmEventRecords based on the alertmanager notification and AlarmDefinition
22-
func ConvertAmToAlarmEventRecordModels(ctx context.Context, alerts *[]api.Alert, infrastructureClient infrastructure.Client) []models.AlarmEventRecord {
22+
func ConvertAmToAlarmEventRecordModels(ctx context.Context, alerts *[]api.Alert, clusterServer, resourceServer infrastructure.Client) []models.AlarmEventRecord {
2323
records := make([]models.AlarmEventRecord, 0, len(*alerts))
2424
for _, alert := range *alerts {
2525
record := models.AlarmEventRecord{}
@@ -69,8 +69,21 @@ func ConvertAmToAlarmEventRecordModels(ctx context.Context, alerts *[]api.Alert,
6969
// Update Extensions with things we didn't really process
7070
record.Extensions = getExtensions(alert)
7171

72-
// for caas alerts object is the cluster ID
73-
record.ObjectID = getClusterID(labels)
72+
// Determine alert type (CaaS vs hardware) and select appropriate infrastructure client
73+
// Hardware alerts have both type=hardware AND component=ironic labels
74+
isHardware := isHardwareAlert(labels)
75+
var infrastructureClient infrastructure.Client
76+
if isHardware {
77+
// For hardware alerts, object is the resource ID from instance_uuid
78+
record.ObjectID = getResourceID(labels)
79+
record.AlarmSource = models.AlarmSourceHardware
80+
infrastructureClient = resourceServer
81+
} else {
82+
// For CaaS alerts, object is the cluster ID from managed_cluster
83+
record.ObjectID = getClusterID(labels)
84+
record.AlarmSource = models.AlarmSourceCaaS
85+
infrastructureClient = clusterServer
86+
}
7487

7588
// derive ObjectTypeID from ObjectID
7689
if record.ObjectID != nil {
@@ -118,6 +131,30 @@ func getClusterID(labels map[string]string) *uuid.UUID {
118131
return &id
119132
}
120133

134+
// isHardwareAlert reports whether the given alert labels identify a hardware alert.
// An alert is treated as hardware only when BOTH conditions hold: the "type" label
// equals "hardware" AND the "component" label equals "ironic". If either label is
// missing or has any other value, the function returns false.
135+
func isHardwareAlert(labels map[string]string) bool {
136+
alertType, hasType := labels["type"]
137+
component, hasComponent := labels["component"]
138+
return hasType && alertType == "hardware" && hasComponent && component == "ironic"
139+
}
140+
141+
// getResourceID extracts the resource ID from the "instance_uuid" label for hardware alerts.
// It returns a pointer to the parsed UUID, or nil when the label is absent or its value
// cannot be parsed as a UUID. Both failure cases are logged as warnings (including the
// full label set) rather than returned as errors.
142+
func getResourceID(labels map[string]string) *uuid.UUID {
143+
val, ok := labels["instance_uuid"]
144+
if !ok {
145+
slog.Warn("Could not find instance_uuid for hardware alert", "labels", labels)
146+
return nil
147+
}
148+
149+
id, err := uuid.Parse(val)
150+
if err != nil {
151+
slog.Warn("Could not convert instance_uuid string to uuid", "labels", labels, "err", err.Error())
152+
return nil
153+
}
154+
155+
return &id
156+
}
157+
121158
// getAlertName extracts the name from the alert's alertname label
122159
func getAlertName(labels map[string]string) string {
123160
val, ok := labels["alertname"]

internal/service/alarms/internal/alertmanager/converter_test.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ var _ = Describe("Alertmanager Functions", func() {
7575
},
7676
}
7777

78-
records := alertmanager.ConvertAmToAlarmEventRecordModels(ctx, &alerts, mockInfraClient)
78+
records := alertmanager.ConvertAmToAlarmEventRecordModels(ctx, &alerts, mockInfraClient, mockInfraClient)
7979

8080
// Assert
8181
Expect(records).To(HaveLen(1))
@@ -131,7 +131,7 @@ var _ = Describe("Alertmanager Functions", func() {
131131
GetAlarmDefinitionID(gomock.Any(), objectTypeIDUUID, "TestAlert", "critical").
132132
Return(alarmDefUUID, nil)
133133

134-
records := alertmanager.ConvertAmToAlarmEventRecordModels(ctx, &alerts, mockInfraClient)
134+
records := alertmanager.ConvertAmToAlarmEventRecordModels(ctx, &alerts, mockInfraClient, mockInfraClient)
135135

136136
// Assert
137137
Expect(records).To(HaveLen(1))
@@ -188,9 +188,9 @@ var _ = Describe("Alertmanager Functions", func() {
188188
},
189189
}
190190

191-
Expect(alertmanager.ConvertAmToAlarmEventRecordModels(ctx, &alerts1, mockInfraClient)).To(BeEmpty())
192-
Expect(alertmanager.ConvertAmToAlarmEventRecordModels(ctx, &alerts2, mockInfraClient)).To(BeEmpty())
193-
Expect(alertmanager.ConvertAmToAlarmEventRecordModels(ctx, &alerts3, mockInfraClient)).To(BeEmpty())
191+
Expect(alertmanager.ConvertAmToAlarmEventRecordModels(ctx, &alerts1, mockInfraClient, mockInfraClient)).To(BeEmpty())
192+
Expect(alertmanager.ConvertAmToAlarmEventRecordModels(ctx, &alerts2, mockInfraClient, mockInfraClient)).To(BeEmpty())
193+
Expect(alertmanager.ConvertAmToAlarmEventRecordModels(ctx, &alerts3, mockInfraClient, mockInfraClient)).To(BeEmpty())
194194
})
195195

196196
It("should handle infrastructure client errors gracefully", func() {
@@ -220,7 +220,7 @@ var _ = Describe("Alertmanager Functions", func() {
220220
},
221221
}
222222

223-
records := alertmanager.ConvertAmToAlarmEventRecordModels(ctx, &alerts, mockInfraClient)
223+
records := alertmanager.ConvertAmToAlarmEventRecordModels(ctx, &alerts, mockInfraClient, mockInfraClient)
224224

225225
// Assert - should still create record but without ObjectTypeID
226226
Expect(records).To(HaveLen(1))
@@ -265,7 +265,7 @@ var _ = Describe("Alertmanager Functions", func() {
265265
},
266266
}
267267

268-
records := alertmanager.ConvertAmToAlarmEventRecordModels(ctx, &alerts, mockInfraClient)
268+
records := alertmanager.ConvertAmToAlarmEventRecordModels(ctx, &alerts, mockInfraClient, mockInfraClient)
269269

270270
// Assert
271271
Expect(records).To(HaveLen(1))

internal/service/alarms/internal/alertmanager/handler.go

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ const (
2626

2727
// HandleAlerts can be called when a payload from Webhook or API `/alerts` is received
2828
// Webhook is our primary mechanism; the API serves as our backup and sync mechanism
29-
func HandleAlerts(ctx context.Context, clients []infrastructure.Client, repository repo.AlarmRepositoryInterface, alerts *[]api.Alert, source SourceType) error {
29+
func HandleAlerts(ctx context.Context, clusterServer, resourceServer infrastructure.Client, repository repo.AlarmRepositoryInterface, alerts *[]api.Alert, source SourceType) error {
3030
// Handle nil alerts
3131
if alerts == nil {
3232
return nil
@@ -37,23 +37,8 @@ func HandleAlerts(ctx context.Context, clients []infrastructure.Client, reposito
3737
return nil
3838
}
3939

40-
// Get cached cluster server data
41-
var (
42-
clusterServer infrastructure.Client
43-
found bool
44-
)
45-
for i := range clients {
46-
if clients[i].Name() == infrastructure.Name {
47-
clusterServer = clients[i]
48-
found = true
49-
}
50-
}
51-
if !found {
52-
return fmt.Errorf("no cluster server found with name %q", infrastructure.Name)
53-
}
54-
5540
// Combine possible definitions with events
56-
aerModels := ConvertAmToAlarmEventRecordModels(ctx, alerts, clusterServer)
41+
aerModels := ConvertAmToAlarmEventRecordModels(ctx, alerts, clusterServer, resourceServer)
5742

5843
// Insert and update AlarmEventRecord and optionally resolve stale
5944
if err := repository.WithTransaction(ctx, func(tx pgx.Tx) error {

internal/service/alarms/internal/db/models/alarm_event_record.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,13 @@ import (
1313
"github.com/openshift-kni/oran-o2ims/internal/service/alarms/api/generated"
1414
)
1515

16+
// Alarm source identifiers. These are the string values assigned to an
// AlarmEventRecord's AlarmSource field to record where an alarm originated.
const (
17+
// AlarmSourceCaaS represents alarms from Container-as-a-Service (cluster alerts)
18+
AlarmSourceCaaS = "caas"
19+
// AlarmSourceHardware represents alarms from hardware resources
20+
AlarmSourceHardware = "hardware"
21+
)
22+
1623
// AlarmEventRecord represents a record in the alarm_event_record table.
1724
type AlarmEventRecord struct {
1825
AlarmEventRecordID uuid.UUID `db:"alarm_event_record_id" json:"alarm_event_record_id"`

internal/service/alarms/internal/db/repo/alarms_repository.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ func (ar *AlarmsRepository) UpsertAlarmEventCaaSRecord(ctx context.Context, tx p
141141
"AlarmAcknowledged", "PerceivedSeverity", "Extensions",
142142
"ObjectID", "ObjectTypeID", "AlarmStatus",
143143
"Fingerprint", "AlarmDefinitionID", "ProbableCauseID",
144-
"GenerationID",
144+
"GenerationID", "AlarmSource",
145145
})
146146

147147
// Set values
@@ -152,7 +152,7 @@ func (ar *AlarmsRepository) UpsertAlarmEventCaaSRecord(ctx context.Context, tx p
152152
record.AlarmAcknowledged, record.PerceivedSeverity, record.Extensions,
153153
record.ObjectID, record.ObjectTypeID, record.AlarmStatus,
154154
record.Fingerprint, record.AlarmDefinitionID, record.ProbableCauseID,
155-
generationID,
155+
generationID, record.AlarmSource,
156156
)))
157157
}
158158
query.Apply(values...)
@@ -169,7 +169,7 @@ func (ar *AlarmsRepository) UpsertAlarmEventCaaSRecord(ctx context.Context, tx p
169169
im.SetExcluded(dbTags["AlarmDefinitionID"]),
170170
im.SetExcluded(dbTags["ProbableCauseID"]),
171171
im.SetExcluded(dbTags["GenerationID"]),
172-
im.Where(psql.Quote(m.TableName(), dbTags["AlarmSource"]).EQ(psql.Arg("alertmanager"))),
172+
im.SetExcluded(dbTags["AlarmSource"]),
173173
))
174174

175175
sql, params, err := query.Build(ctx)
@@ -209,12 +209,12 @@ func (ar *AlarmsRepository) ResolveStaleAlarmEventCaaSRecord(ctx context.Context
209209

210210
query := psql.Update(
211211
um.Table(tableName),
212-
um.SetCol(alarmStatus).ToArg(api.Resolved), // Set to resolved
213-
um.SetCol(perceivedSeverity).ToArg(api.CLEARED), // Set corresponding perceivedSeverity
214-
um.Set(psql.Raw(updateClearedTimeCase, TimeNow())), // Set a resolved time if not there already
215-
um.Where(psql.Quote(generationIDCol).LT(psql.Arg(generationID))), // An alert is stale if its GenID is less than current
216-
um.Where(psql.Quote(alarmSource).EQ(psql.Arg("alertmanager"))), // This is only applicable for alertmanager rows
217-
um.Where(psql.Quote(alarmStatus).NE(psql.Arg(api.Resolved))), // If already resolved no need to process that row
212+
um.SetCol(alarmStatus).ToArg(api.Resolved), // Set to resolved
213+
um.SetCol(perceivedSeverity).ToArg(api.CLEARED), // Set corresponding perceivedSeverity
214+
um.Set(psql.Raw(updateClearedTimeCase, TimeNow())), // Set a resolved time if not there already
215+
um.Where(psql.Quote(generationIDCol).LT(psql.Arg(generationID))), // An alert is stale if its GenID is less than current
216+
um.Where(psql.Quote(alarmSource).In(psql.Arg(models.AlarmSourceCaaS, models.AlarmSourceHardware))), // Support both CaaS and hardware alerts
217+
um.Where(psql.Quote(alarmStatus).NE(psql.Arg(api.Resolved))), // If already resolved no need to process that row
218218
um.Returning(psql.Quote(alarmEventRecordID)),
219219
)
220220

0 commit comments

Comments
 (0)