Skip to content

Commit 167061f

Browse files
author
Jun Peng
committed
Set up GPU nodes for nvidia tests
1 parent dda14bf commit 167061f

File tree

10 files changed

+70
-20
lines changed

10 files changed

+70
-20
lines changed

test/e2e/nodeadm.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ type NodeadmOS interface {
1616
Name() string
1717
AMIName(ctx context.Context, awsConfig aws.Config) (string, error)
1818
BuildUserData(userDataInput UserDataInput) ([]byte, error)
19-
InstanceType(region string, instanceSize InstanceSize) string
19+
InstanceType(region string, instanceSize InstanceSize, computeType ComputeType) string
2020
}
2121

2222
type InstanceSize int
@@ -26,6 +26,13 @@ const (
2626
XLarge
2727
)
2828

29+
type ComputeType int
30+
31+
const (
32+
CPUInstance ComputeType = iota
33+
GPUInstance
34+
)
35+
2936
type UserDataInput struct {
3037
CredsProviderName string
3138
EKSEndpoint string

test/e2e/os/amazonlinux.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ func (a AmazonLinux2023) Name() string {
4141
return "al23-" + a.architecture.String()
4242
}
4343

44-
func (a AmazonLinux2023) InstanceType(region string, instanceSize e2e.InstanceSize) string {
45-
return getInstanceTypeFromRegionAndArch(region, a.architecture, instanceSize)
44+
func (a AmazonLinux2023) InstanceType(region string, instanceSize e2e.InstanceSize, computeType e2e.ComputeType) string {
45+
return getInstanceTypeFromRegionAndArch(region, a.architecture, instanceSize, computeType)
4646
}
4747

4848
func (a AmazonLinux2023) AMIName(ctx context.Context, awsConfig aws.Config) (string, error) {

test/e2e/os/arch.go

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,17 @@ var instanceSizeToType = map[architecture]map[e2e.InstanceSize]string{
3636
},
3737
}
3838

39+
var gpuInstanceSizeToType = map[architecture]map[e2e.InstanceSize]string{
40+
amd64: {
41+
e2e.XLarge: "g4dn.2xlarge",
42+
e2e.Large: "g4dn.xlarge",
43+
},
44+
arm64: {
45+
e2e.XLarge: "g5g.2xlarge",
46+
e2e.Large: "g5g.xlarge",
47+
},
48+
}
49+
3950
func (a architecture) String() string {
4051
return string(a)
4152
}
@@ -94,8 +105,16 @@ func getAmiIDFromSSM(ctx context.Context, client *ssm.Client, amiName string) (*
94105
}
95106

96107
// an unknown size and arch combination (for the requested compute type) is a coding error, so we panic
97-
func getInstanceTypeFromRegionAndArch(_ string, arch architecture, instanceSize e2e.InstanceSize) string {
98-
instanceType, ok := instanceSizeToType[arch][instanceSize]
108+
func getInstanceTypeFromRegionAndArch(_ string, arch architecture, instanceSize e2e.InstanceSize, computeType e2e.ComputeType) string {
109+
var instanceType string
110+
var ok bool
111+
112+
if computeType == e2e.GPUInstance {
113+
instanceType, ok = gpuInstanceSizeToType[arch][instanceSize]
114+
} else {
115+
instanceType, ok = instanceSizeToType[arch][instanceSize]
116+
}
117+
99118
if !ok {
100119
panic(fmt.Errorf("unknown instance size %d for architecture %s", instanceSize, arch))
101120
}

test/e2e/os/rhel.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,8 @@ func (r RedHat8) Name() string {
6767
return "rhel8-" + r.architecture.String()
6868
}
6969

70-
func (r RedHat8) InstanceType(region string, instanceSize e2e.InstanceSize) string {
71-
return getInstanceTypeFromRegionAndArch(region, r.architecture, instanceSize)
70+
func (r RedHat8) InstanceType(region string, instanceSize e2e.InstanceSize, computeType e2e.ComputeType) string {
71+
return getInstanceTypeFromRegionAndArch(region, r.architecture, instanceSize, computeType)
7272
}
7373

7474
func (r RedHat8) AMIName(ctx context.Context, awsConfig aws.Config) (string, error) {
@@ -140,8 +140,8 @@ func (r RedHat9) Name() string {
140140
return name
141141
}
142142

143-
func (r RedHat9) InstanceType(region string, instanceSize e2e.InstanceSize) string {
144-
return getInstanceTypeFromRegionAndArch(region, r.architecture, instanceSize)
143+
func (r RedHat9) InstanceType(region string, instanceSize e2e.InstanceSize, computeType e2e.ComputeType) string {
144+
return getInstanceTypeFromRegionAndArch(region, r.architecture, instanceSize, computeType)
145145
}
146146

147147
func (r RedHat9) AMIName(ctx context.Context, awsConfig aws.Config) (string, error) {

test/e2e/os/ubuntu.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,8 @@ func (u Ubuntu2004) Name() string {
8787
return name
8888
}
8989

90-
func (u Ubuntu2004) InstanceType(region string, instanceSize e2e.InstanceSize) string {
91-
return getInstanceTypeFromRegionAndArch(region, u.architecture, instanceSize)
90+
func (u Ubuntu2004) InstanceType(region string, instanceSize e2e.InstanceSize, computeType e2e.ComputeType) string {
91+
return getInstanceTypeFromRegionAndArch(region, u.architecture, instanceSize, computeType)
9292
}
9393

9494
func (u Ubuntu2004) AMIName(ctx context.Context, awsConfig aws.Config) (string, error) {
@@ -155,8 +155,8 @@ func (u Ubuntu2204) Name() string {
155155
return name
156156
}
157157

158-
func (u Ubuntu2204) InstanceType(region string, instanceSize e2e.InstanceSize) string {
159-
return getInstanceTypeFromRegionAndArch(region, u.architecture, instanceSize)
158+
func (u Ubuntu2204) InstanceType(region string, instanceSize e2e.InstanceSize, computeType e2e.ComputeType) string {
159+
return getInstanceTypeFromRegionAndArch(region, u.architecture, instanceSize, computeType)
160160
}
161161

162162
func (u Ubuntu2204) AMIName(ctx context.Context, awsConfig aws.Config) (string, error) {
@@ -234,8 +234,8 @@ func (u Ubuntu2404) Name() string {
234234
return name
235235
}
236236

237-
func (u Ubuntu2404) InstanceType(region string, instanceSize e2e.InstanceSize) string {
238-
return getInstanceTypeFromRegionAndArch(region, u.architecture, instanceSize)
237+
func (u Ubuntu2404) InstanceType(region string, instanceSize e2e.InstanceSize, computeType e2e.ComputeType) string {
238+
return getInstanceTypeFromRegionAndArch(region, u.architecture, instanceSize, computeType)
239239
}
240240

241241
func (u Ubuntu2404) AMIName(ctx context.Context, awsConfig aws.Config) (string, error) {

test/e2e/peered/node.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ type NodeSpec struct {
5050
InstanceProfileARN string
5151
NodeK8sVersion string
5252
NodeName string
53+
ComputeType e2e.ComputeType
5354
OS e2e.NodeadmOS
5455
Provider e2e.NodeadmCredentialsProvider
5556
}
@@ -137,7 +138,7 @@ func (c NodeCreate) Create(ctx context.Context, spec *NodeSpec) (PeerdNode, erro
137138

138139
instanceType := spec.InstanceType
139140
if instanceType == "" {
140-
instanceType = spec.OS.InstanceType(c.Cluster.Region, spec.InstanceSize)
141+
instanceType = spec.OS.InstanceType(c.Cluster.Region, spec.InstanceSize, spec.ComputeType)
141142
}
142143

143144
ec2Input := ec2.InstanceConfig{

test/e2e/suite/addons/addons_test.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,11 @@ var _ = SynchronizedBeforeSuite(
7979
var _ = Describe("Hybrid Nodes", func() {
8080
When("using peered VPC", func() {
8181
var addonEc2Test *suite.AddonEc2Test
82+
credentialProviders := suite.CredentialProviders()
8283

8384
BeforeEach(func(ctx context.Context) {
8485
addonEc2Test = &suite.AddonEc2Test{PeeredVPCTest: suite.BeforeVPCTest(ctx, suiteConfig)}
86+
credentialProviders = suite.AddClientsToCredentialProviders(credentialProviders, addonEc2Test.PeeredVPCTest)
8587
})
8688

8789
When("using ec2 instance as hybrid nodes", func() {
@@ -196,6 +198,23 @@ var _ = Describe("Hybrid Nodes", func() {
196198
Succeed(), "prometheus node exporter should have been validated successfully",
197199
)
198200
}, Label("prometheus-node-exporter"))
201+
It("runs nvidia device plugin tests", func(ctx context.Context) {
202+
osList := suite.OSProviderList(credentialProviders)
203+
Expect(osList).ToNot(BeEmpty(), "OS list should not be empty")
204+
205+
// randomly pick one os/provider combination to provision GPU nodes
206+
rand.Shuffle(len(osList), func(i, j int) {
207+
osList[i], osList[j] = osList[j], osList[i]
208+
})
209+
210+
os := osList[0].OS
211+
provider := osList[0].Provider
212+
instanceName := addonEc2Test.InstanceName("addon-nvidia-test", os, provider)
213+
nodeName := fmt.Sprintf("addon-nvidia-node-%s-%s", provider.Name(), os.Name())
214+
testNode := addonEc2Test.NewTestNode(ctx, instanceName, nodeName, addonEc2Test.Cluster.KubernetesVersion, os, provider, e2e.Large, e2e.GPUInstance)
215+
Expect(testNode.Start(ctx)).To(Succeed(), "node should start successfully")
216+
Expect(testNode.Verify(ctx)).To(Succeed(), "node should be fully functional")
217+
}, Label("nvidia-device-plugin"))
199218
})
200219
})
201220
})

test/e2e/suite/nodeadm/nodeadm_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ var _ = Describe("Hybrid Nodes", func() {
9898
k8sVersion = test.OverrideNodeK8sVersion
9999
}
100100

101-
testNode := test.NewTestNode(ctx, instanceName, nodeName, k8sVersion, nodeOS, provider, e2e.Large)
101+
testNode := test.NewTestNode(ctx, instanceName, nodeName, k8sVersion, nodeOS, provider, e2e.Large, e2e.CPUInstance)
102102
Expect(testNode.Start(ctx)).To(Succeed(), "node should start successfully")
103103
Expect(testNode.Verify(ctx)).To(Succeed(), "node should be fully functional")
104104

@@ -155,7 +155,7 @@ var _ = Describe("Hybrid Nodes", func() {
155155
nodeKubernetesVersion, err := kubernetes.PreviousVersion(test.Cluster.KubernetesVersion)
156156
Expect(err).NotTo(HaveOccurred(), "expected to get previous k8s version")
157157

158-
testNode := test.NewTestNode(ctx, instanceName, nodeName, nodeKubernetesVersion, nodeOS, provider, e2e.Large)
158+
testNode := test.NewTestNode(ctx, instanceName, nodeName, nodeKubernetesVersion, nodeOS, provider, e2e.Large, e2e.CPUInstance)
159159
Expect(testNode.Start(ctx)).To(Succeed(), "node should start successfully")
160160
Expect(testNode.Verify(ctx)).To(Succeed(), "node should be fully functional")
161161

test/e2e/suite/peered_vpc.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ func WithLogging(loggerControl e2e.PausableLogger, serialOutputWriter io.Writer)
263263
}
264264
}
265265

266-
func (t *PeeredVPCTest) NewTestNode(ctx context.Context, instanceName, nodeName, k8sVersion string, os e2e.NodeadmOS, provider e2e.NodeadmCredentialsProvider, instanceSize e2e.InstanceSize, opts ...TestNodeOption) *testNode {
266+
func (t *PeeredVPCTest) NewTestNode(ctx context.Context, instanceName, nodeName, k8sVersion string, os e2e.NodeadmOS, provider e2e.NodeadmCredentialsProvider, instanceSize e2e.InstanceSize, computeType e2e.ComputeType, opts ...TestNodeOption) *testNode {
267267
node := &testNode{
268268
ArtifactsPath: t.ArtifactsPath,
269269
ClusterName: t.Cluster.Name,
@@ -282,6 +282,7 @@ func (t *PeeredVPCTest) NewTestNode(ctx context.Context, instanceName, nodeName,
282282
OS: os,
283283
Provider: provider,
284284
Region: t.Cluster.Region,
285+
ComputeType: computeType,
285286
}
286287

287288
for _, opt := range opts {
@@ -458,6 +459,7 @@ type NodeCreate struct {
458459
NodeName string
459460
OS e2e.NodeadmOS
460461
Provider e2e.NodeadmCredentialsProvider
462+
ComputeType e2e.ComputeType
461463
}
462464

463465
func CreateNodes(ctx context.Context, test *PeeredVPCTest, nodesToCreate []NodeCreate) {
@@ -480,7 +482,7 @@ func CreateNodes(ctx context.Context, test *PeeredVPCTest, nodesToCreate []NodeC
480482

481483
// Create a new logger that uses our SwitchWriter
482484
controlledLogger := e2e.NewPausableLogger(e2e.WithWriter(outputControl))
483-
testNode := test.NewTestNode(ctx, entry.InstanceName, entry.NodeName, test.Cluster.KubernetesVersion, entry.OS, entry.Provider, entry.InstanceSize,
485+
testNode := test.NewTestNode(ctx, entry.InstanceName, entry.NodeName, test.Cluster.KubernetesVersion, entry.OS, entry.Provider, entry.InstanceSize, entry.ComputeType,
484486
WithLogging(controlledLogger, outputControl))
485487

486488
Expect(testNode.Start(ctx)).To(Succeed(), "node should start successfully")

test/e2e/suite/test_node.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
type testNode struct {
2727
ArtifactsPath string
2828
ClusterName string
29+
ComputeType e2e.ComputeType
2930
EC2Client *ec2v2.Client
3031
EKSEndpoint string
3132
FailHandler func(message string, callerSkip ...int)
@@ -69,6 +70,7 @@ func (n *testNode) Start(ctx context.Context) error {
6970
NodeName: n.NodeName,
7071
OS: n.OS,
7172
Provider: n.Provider,
73+
ComputeType: n.ComputeType,
7274
})
7375
Expect(err).NotTo(HaveOccurred(), "EC2 Instance should have been created successfully")
7476
flakeRun.DeferCleanup(func(ctx context.Context) {

0 commit comments

Comments
 (0)