Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions api/controllers/app/scope/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,24 @@ import (
"context"
"time"

"fmt"
"sync"

"github.com/IBM/power-access-cloud/api/apis/app/v1alpha1"
"github.com/IBM/power-access-cloud/api/internal/pkg/pac-go-server/db"
"github.com/IBM/power-access-cloud/api/internal/pkg/pac-go-server/db/mongodb"
"github.com/IBM/power-access-cloud/api/internal/pkg/pac-go-server/models"
"github.com/go-logr/logr"
"github.com/pkg/errors"
"sigs.k8s.io/cluster-api/util/patch"
)

var (
notificationCache = make(map[string]time.Time)
cacheMutex sync.RWMutex
minIntervalMinutes = 30
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the cache is in-memory this can cause multiple emails in a short time if the controller goes into a restart loop, right?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this can happen. In the long run, we should persist this in the DB.

)

type ServiceScopeParams struct {
ControllerScopeParams
Service *v1alpha1.Service
Expand All @@ -30,6 +43,48 @@ func (m *ServiceScope) PatchServiceObject() error {
return m.servicePatchHelper.Patch(context.TODO(), m.Service)
}

// NotifyServiceCreationFailure creates an event to notify about service creation failure
func (s *ServiceScope) NotifyServiceCreationFailure(errorMessage string) error {
if !shouldNotify(s.Service.Status.VM.InstanceID) {
return nil
}
event, err := models.NewEvent(s.Service.Spec.UserID, s.Service.Spec.UserID, models.EventServiceCreateFailed)
if err != nil {
return err
}

// Notify both user and admin
event.SetNotifiyBoth()
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to notify the user as well? I think sending an email to the user might not be needed. @mayuka-c, what do you think?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I think not required for user, only admin should be fine.


// Set the error message
logMessage := fmt.Sprintf("Service '%s' creation failed. Reason: %s", s.Service.Name, errorMessage)
event.SetLog(models.EventLogLevelERROR, logMessage)

dbCon, disconnect, err := connectDB(s.Logger)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will be better if we can maintain a single long living connection instead of recreating the connection every time this method gets called. Hypothetically if a bunch of VMs fail at the same time then this will create a bunch of connections to the DB potentially causing slowdown/crash on the DB as well.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The original change was actually like that:
#75 (comment)

Mayuka added this comment, which contradicts yours. I think maintaining a connection would be a better idea.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, it is better to have a single persistent connection instead.

Copy link
Copy Markdown
Member

@mayuka-c mayuka-c Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@GunaKKIBM We can go with a single DB connection, but please make it a singleton (if the connection doesn't exist or has failed -> create a new one; otherwise reuse the same connection).

if err != nil {
s.Logger.Error(err, "Error connecting to DB")
return err
}
defer disconnect()

err = dbCon.NewEvent(event)
if err != nil {
return err
}
recordNotification(s.Service.Status.VM.InstanceID)
s.Logger.Info("Created failure notification event", "service", s.Service.Name)
return nil
}

// ClearNotificationCache removes the notification-cache entry keyed by the
// service's VM instance ID and logs the removal. It is a no-op when the
// instance ID is empty.
func (s *ServiceScope) ClearNotificationCache() {
	instanceID := s.Service.Status.VM.InstanceID
	if instanceID == "" {
		return
	}

	clearNotification(instanceID)
	s.Logger.Info("Cleared notification cache for VM instance",
		"instanceID", instanceID,
		"service", s.Service.Name)
}

func NewServiceScope(ctx context.Context, params ServiceScopeParams) (*ServiceScope, error) {
scope := &ServiceScope{}

Expand All @@ -55,3 +110,50 @@ func NewServiceScope(ctx context.Context, params ServiceScopeParams) (*ServiceSc

return scope, nil
}

// shouldNotify reports whether a notification may be sent for instanceID.
// An instance with no entry in notificationCache is always eligible; otherwise
// strictly more than minIntervalMinutes minutes must have passed since the
// recorded timestamp.
func shouldNotify(instanceID string) bool {
	// Hold the read lock only for the map lookup.
	cacheMutex.RLock()
	sentAt, seen := notificationCache[instanceID]
	cacheMutex.RUnlock()

	if seen {
		cooldown := time.Duration(minIntervalMinutes) * time.Minute
		return time.Since(sentAt) > cooldown
	}
	return true
}

// recordNotification stores the current time in notificationCache under
// instanceID, overwriting any earlier entry. shouldNotify reads this timestamp
// to decide whether the rate-limit interval has elapsed.
func recordNotification(instanceID string) {
	cacheMutex.Lock()
	defer cacheMutex.Unlock()

	notificationCache[instanceID] = time.Now()
}

// clearNotification deletes the notificationCache entry for instanceID, if any,
// while holding the write lock.
func clearNotification(instanceID string) {
	cacheMutex.Lock()
	defer cacheMutex.Unlock()

	delete(notificationCache, instanceID)
}

// connectDB creates a MongoDB handle with mongodb.New and calls Connect on it.
//
// On success it returns the handle together with a cleanup function that calls
// Disconnect and logs any error through logger; the caller is expected to
// invoke that function (typically via defer). On failure it returns a nil
// handle, a nil cleanup function, and the wrapped connection error.
func connectDB(logger logr.Logger) (db.DB, func(), error) {
	conn := mongodb.New()

	err := conn.Connect()
	if err != nil {
		return nil, nil, fmt.Errorf("failed to connect to MongoDB: %w", err)
	}

	// Disconnect errors are only logged; the cleanup function has no return value.
	closeConn := func() {
		if closeErr := conn.Disconnect(); closeErr != nil {
			logger.Error(closeErr, "Failed disconnecting from MongoDB")
		}
	}

	return conn, closeConn, nil
}
24 changes: 23 additions & 1 deletion api/controllers/app/service/vm.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,31 @@ func updateStatus(scope *scope.ServiceScope, pvmInstance *models.PVMInstance) {
handleActiveStatus(scope)
case "ERROR":
scope.Service.Status.State = appv1alpha1.ServiceStateFailed
serverName := "unknown"
if pvmInstance.ServerName != nil {
serverName = *pvmInstance.ServerName
}
instanceID := "unknown"
if pvmInstance.PvmInstanceID != nil {
instanceID = *pvmInstance.PvmInstanceID
}

errorMsg := fmt.Sprintf(`VM Creation Failed

Service Details:
- Server Name: %s
- Instance ID: %s`, serverName, instanceID)
if pvmInstance.Fault != nil {
scope.Service.Status.Message = fmt.Sprintf("vm creation failed with reason: %s", pvmInstance.Fault.Message)
errorMsg += fmt.Sprintf(`

Error Details:
%s`, pvmInstance.Fault.Message)
scope.Service.Status.Message = fmt.Sprintf("VM creation failed with reason: %s", pvmInstance.Fault.Message)
}
scope.Service.Status.AccessInfo = ""
if err := scope.NotifyServiceCreationFailure(errorMsg); err != nil {
scope.Logger.Error(err, "failed to create failure notification event")
}
default:
scope.Service.Status.State = appv1alpha1.ServiceStateInProgress
scope.Service.Status.Message = "vm creation started, will update the access info once vm is ready"
Expand Down Expand Up @@ -184,6 +205,7 @@ func handleActiveStatus(scope *scope.ServiceScope) {
scope.Service.Status.VM.IPAddress)
scope.Service.Status.Message = ""
scope.Logger.Info("Service marked as CREATED")
scope.ClearNotificationCache()
}

func isIBMiOS(scope *scope.ServiceScope) bool {
Expand Down
2 changes: 1 addition & 1 deletion api/internal/pkg/pac-go-server/db/mongodb/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,4 +78,4 @@ func (db *MongoDB) CollectionExists(name string) (bool, error) {

// collection exists
return true, nil
}
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this got updated because of gofmt.

}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was this a mistake?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I ran gofmt on a few files and this got added; I will remove it.

1 change: 1 addition & 0 deletions api/internal/pkg/pac-go-server/models/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ const (
EventServiceUpdate EventType = "SERVICE_UPDATE"
EventServiceDelete EventType = "SERVICE_DELETE"
EventServiceDeleteFailed EventType = "SERVICE_DELETE_FAILED"
EventServiceCreateFailed EventType = "SERVICE_CREATE_FAILED"

EventLogLevelINFO EventLogLevel = "INFO"
EventLogLevelERROR EventLogLevel = "ERROR"
Expand Down
Loading