Skip to content

Commit 71b8596

Browse files
committed
fix/byosg: remove managed SG on BYOSG scenario for CLB
Fix the "managed" (controller-owned) security group leak when user provided security group is added to an existing Service type-loadBalancer CLB.
1 parent 37381a3 commit 71b8596

File tree

2 files changed

+156
-25
lines changed

2 files changed

+156
-25
lines changed

pkg/providers/v1/aws.go

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2940,19 +2940,9 @@ func (c *Cloud) updateInstanceSecurityGroupsForLoadBalancer(ctx context.Context,
29402940
loadBalancerSecurityGroupID := lbSecurityGroupIDs[0]
29412941

29422942
// Get the actual list of groups that allow ingress from the load-balancer
2943-
actualGroups := make(map[*ec2types.SecurityGroup]bool)
2944-
{
2945-
describeRequest := &ec2.DescribeSecurityGroupsInput{}
2946-
describeRequest.Filters = []ec2types.Filter{
2947-
newEc2Filter("ip-permission.group-id", loadBalancerSecurityGroupID),
2948-
}
2949-
response, err := c.ec2.DescribeSecurityGroups(ctx, describeRequest)
2950-
if err != nil {
2951-
return fmt.Errorf("error querying security groups for ELB: %q", err)
2952-
}
2953-
for _, sg := range response {
2954-
actualGroups[&sg] = c.tagging.hasClusterTag(sg.Tags)
2955-
}
2943+
actualGroups, _, err := c.buildSecurityGroupRuleReferences(ctx, loadBalancerSecurityGroupID)
2944+
if err != nil {
2945+
return fmt.Errorf("error building security group rule references: %w", err)
29562946
}
29572947

29582948
// Open the firewall from the load balancer to the instance

pkg/providers/v1/aws_loadbalancer.go

Lines changed: 153 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"time"
3030

3131
"github.com/aws/aws-sdk-go-v2/aws"
32+
"github.com/aws/aws-sdk-go-v2/service/ec2"
3233
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
3334

3435
elb "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing"
@@ -1251,24 +1252,24 @@ func (c *Cloud) ensureLoadBalancer(ctx context.Context, namespacedName types.Nam
12511252

12521253
{
12531254
// Sync security groups
1254-
expected := sets.New[string](securityGroupIDs...)
1255-
actual := stringSetFromList(loadBalancer.SecurityGroups)
1255+
expected := sets.New(securityGroupIDs...)
1256+
actual := sets.New(loadBalancer.SecurityGroups...)
12561257

12571258
if !expected.Equal(actual) {
12581259
// This call just replaces the security groups, unlike e.g. subnets (!)
1259-
request := &elb.ApplySecurityGroupsToLoadBalancerInput{}
1260-
request.LoadBalancerName = aws.String(loadBalancerName)
1261-
if securityGroupIDs == nil {
1262-
request.SecurityGroups = nil
1263-
} else {
1264-
request.SecurityGroups = securityGroupIDs
1265-
}
1266-
klog.V(2).Info("Applying updated security groups to load balancer")
1267-
_, err := c.elb.ApplySecurityGroupsToLoadBalancer(ctx, request)
1268-
if err != nil {
1260+
klog.V(2).Infof("Applying updated security groups to load balancer %q", loadBalancerName)
1261+
if _, err := c.elb.ApplySecurityGroupsToLoadBalancer(ctx, &elb.ApplySecurityGroupsToLoadBalancerInput{
1262+
LoadBalancerName: aws.String(loadBalancerName),
1263+
SecurityGroups: securityGroupIDs,
1264+
}); err != nil {
12691265
return nil, fmt.Errorf("error applying AWS loadbalancer security groups: %q", err)
12701266
}
12711267
dirty = true
1268+
1269+
// Ensure the replaced security groups are removed from AWS when owned by the controller.
1270+
if errs := c.removeOwnedSecurityGroups(ctx, loadBalancerName, actual.UnsortedList()); len(errs) > 0 {
1271+
return nil, fmt.Errorf("error removing owned security groups: %v", errs)
1272+
}
12721273
}
12731274
}
12741275

@@ -1879,3 +1880,143 @@ func ValidateHealthCheck(s *elbtypes.HealthCheck) error {
18791880

18801881
return nil
18811882
}
1883+
1884+
// isOwnedSecurityGroup checks if the security group is owned by the controller
1885+
// by checking if the security group has the cluster ownership tag
1886+
// (kubernetes.io/cluster/<clusterID>=owned).
1887+
//
1888+
// Parameters:
1889+
// - `ctx`: The context for the operation.
1890+
// - `securityGroupID`: The ID of the security group to check.
1891+
//
1892+
// Returns:
1893+
// - `bool`: True if the security group is owned by the controller, false otherwise.
1894+
// - `error`: An error if the security group cannot be retrieved, is not found,
1895+
// or if multiple security groups are found with the same ID (unexpected).
1896+
func (c *Cloud) isOwnedSecurityGroup(ctx context.Context, securityGroupID string) (bool, error) {
1897+
groups, err := c.ec2.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{
1898+
GroupIds: []string{securityGroupID},
1899+
})
1900+
if err != nil {
1901+
return false, fmt.Errorf("error retrieving security group %q: %w", securityGroupID, err)
1902+
}
1903+
if len(groups) == 0 {
1904+
return false, fmt.Errorf("security group %q not found", securityGroupID)
1905+
}
1906+
if len(groups) != 1 {
1907+
// This should not be possible - ids should be unique
1908+
return false, fmt.Errorf("[BUG] multiple security groups(%d) found with same id %q", len(groups), securityGroupID)
1909+
}
1910+
return c.tagging.hasClusterTagOwned(groups[0].Tags)
1911+
}
1912+
1913+
// buildSecurityGroupRuleReferences finds all security groups that have ingress rules
1914+
// referencing the specified security group ID, and categorizes them based on cluster tagging.
1915+
// This is used to identify dependencies before removing a security group.
1916+
//
1917+
// Parameters:
1918+
// - ctx: The context for the request.
1919+
// - sgID: The ID of the security group to find references for.
1920+
//
1921+
// Returns:
1922+
// - map[*ec2types.SecurityGroup]bool: All security groups with ingress rules referencing sgID, mapped to their cluster tag status (true/false).
1923+
// - map[*ec2types.SecurityGroup]IPPermissionSet: Only cluster-tagged security groups mapped to their ingress rules that reference sgID.
1924+
// - error: An error if the AWS DescribeSecurityGroups API call fails.
1925+
func (c *Cloud) buildSecurityGroupRuleReferences(ctx context.Context, sgID string) (map[*ec2types.SecurityGroup]bool, map[*ec2types.SecurityGroup]IPPermissionSet, error) {
1926+
groupsHasTags := make(map[*ec2types.SecurityGroup]bool)
1927+
groupsLinkedPermissions := make(map[*ec2types.SecurityGroup]IPPermissionSet)
1928+
sgsOut, err := c.ec2.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{
1929+
Filters: []ec2types.Filter{
1930+
newEc2Filter("ip-permission.group-id", sgID),
1931+
},
1932+
})
1933+
if err != nil {
1934+
return groupsHasTags, groupsLinkedPermissions, fmt.Errorf("error querying security groups for ELB: %q", err)
1935+
}
1936+
1937+
for _, sg := range sgsOut {
1938+
groupsHasTags[&sg] = c.tagging.hasClusterTag(sg.Tags)
1939+
1940+
groupsLinkedPermissions[&sg] = NewIPPermissionSet()
1941+
for _, rule := range sg.IpPermissions {
1942+
if rule.UserIdGroupPairs != nil {
1943+
for _, pair := range rule.UserIdGroupPairs {
1944+
if pair.GroupId != nil && aws.ToString(pair.GroupId) == sgID {
1945+
groupsLinkedPermissions[&sg].Insert(rule)
1946+
}
1947+
}
1948+
}
1949+
}
1950+
1951+
}
1952+
return groupsHasTags, groupsLinkedPermissions, nil
1953+
}
1954+
1955+
// removeOwnedSecurityGroups removes the CLB owned/managed security groups from AWS.
1956+
// It revokes ingress rules that reference the security groups to be removed,
1957+
// then deletes the security groups that are owned by the controller.
1958+
// This is used when updating load balancer security groups to clean up orphaned ones.
1959+
//
1960+
// Parameters:
1961+
// - `ctx`: The context for the operation.
1962+
// - `loadBalancerName`: The name of the load balancer (used for logging and deletion operations).
1963+
// - `securityGroups`: The list of security group IDs to process for removal.
1964+
//
1965+
// Returns:
1966+
// - `[]error`: Collection of all errors encountered during the removal process.
1967+
func (c *Cloud) removeOwnedSecurityGroups(ctx context.Context, loadBalancerName string, securityGroups []string) []error {
1968+
allErrs := []error{}
1969+
sgMap := make(map[string]struct{})
1970+
1971+
// Validate each security group references building a reading list to be deleted.
1972+
for _, sg := range securityGroups {
1973+
isOwned, err := c.isOwnedSecurityGroup(ctx, sg)
1974+
if err != nil {
1975+
allErrs = append(allErrs, fmt.Errorf("unable to validate if security group %q is owned by the controller: %w", sg, err))
1976+
continue
1977+
}
1978+
1979+
groupsWithClusterTag, groupsLinkedPermissions, err := c.buildSecurityGroupRuleReferences(ctx, sg)
1980+
if err != nil {
1981+
allErrs = append(allErrs, fmt.Errorf("error building security group rule references for %q: %w", sg, err))
1982+
continue
1983+
}
1984+
1985+
// Revoke ingress rules referencing the security group to be deleted
1986+
// from cluster-tagged security groups, when the referenced security
1987+
// group has no cluster tag, skip the revoke assuming it is user-managed.
1988+
for sgTarget, sgPerms := range groupsLinkedPermissions {
1989+
if !groupsWithClusterTag[sgTarget] {
1990+
klog.Warningf("security group %q has no cluster tag, skipping remove lifecycle after update", sg)
1991+
continue
1992+
}
1993+
1994+
klog.Infof("revoking security group ingress references of %q from %q", sg, aws.ToString(sgTarget.GroupId))
1995+
if _, err := c.ec2.RevokeSecurityGroupIngress(ctx, &ec2.RevokeSecurityGroupIngressInput{
1996+
GroupId: sgTarget.GroupId,
1997+
IpPermissions: sgPerms.List(),
1998+
}); err != nil {
1999+
allErrs = append(allErrs, fmt.Errorf("error revoking security group ingress rules from %q: %w", aws.ToString(sgTarget.GroupId), err))
2000+
continue
2001+
}
2002+
}
2003+
2004+
// Skip security group removal when the security group is not owned by the controller.
2005+
if !isOwned {
2006+
klog.Warningf("security group %q is not owned by the controller, skipping remove lifecycle after update", sg)
2007+
continue
2008+
}
2009+
2010+
klog.Infof("making loadbalancer owned security group %q ready for deletion", sg)
2011+
sgMap[sg] = struct{}{}
2012+
}
2013+
if len(sgMap) == 0 {
2014+
return allErrs
2015+
}
2016+
2017+
if err := c.deleteSecurityGroupsWithBackoff(ctx, loadBalancerName, sgMap); err != nil {
2018+
return append(allErrs, fmt.Errorf("error deleting security groups %v: %v", sgMap, err))
2019+
}
2020+
klog.Infof("loadbalancer owned security groups deleted!")
2021+
return nil
2022+
}

0 commit comments

Comments
 (0)