Skip to content

Commit bb1ddad

Browse files
authored
Merge pull request #3642 from abbas1902/release
Add startup script to set OFI env vars for Cloud RDMA workloads
2 parents 2c46986 + 6f9bc2a commit bb1ddad

File tree

5 files changed

+39
-0
lines changed

5 files changed

+39
-0
lines changed

community/examples/hpc-slurm-h4d.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,11 @@ deployment_groups:
7676
source: modules/scripts/startup-script
7777
settings:
7878
install_cloud_rdma_drivers: true
79+
set_ofi_cloud_rdma_tunables: true
80+
local_ssd_filesystem:
81+
fs_type: ext4
82+
mountpoint: /mnt/lssd
83+
permissions: "1777"
7984

8085
- id: h4d_nodeset
8186
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
@@ -123,8 +128,14 @@ deployment_groups:
123128
machine_type: n2-standard-4
124129
enable_login_public_ips: true
125130

131+
- id: slurm_controller_startup
132+
source: modules/scripts/startup-script
133+
settings:
134+
set_ofi_cloud_rdma_tunables: true
135+
126136
- id: slurm_controller
127137
source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
128138
use: [h4d-slurm-net-0, h4d_partition, slurm_login, homefs, appsfs]
129139
settings:
130140
enable_controller_public_ips: true
141+
controller_startup_script: $(slurm_controller_startup.startup_script)

examples/h4d-vm.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,12 @@ deployment_groups:
7373
configure_ssh_host_patterns:
7474
- $(vars.hostname_prefix)-*
7575
install_cloud_rdma_drivers: true
76+
set_ofi_cloud_rdma_tunables: true
77+
local_ssd_filesystem:
78+
fs_type: ext4
79+
mountpoint: /mnt/lssd
80+
permissions: "1777"
81+
7682

7783
- id: h4d-vms
7884
source: modules/compute/vm-instance

modules/scripts/startup-script/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ No modules.
334334
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created | `string` | n/a | yes |
335335
| <a name="input_region"></a> [region](#input\_region) | The region to deploy to | `string` | n/a | yes |
336336
| <a name="input_runners"></a> [runners](#input\_runners) | List of runners to run on remote VM.<br/> Runners can be of type ansible-local, shell or data.<br/> A runner must specify one of 'source' or 'content'.<br/> All runners must specify 'destination'. If 'destination' does not include a<br/> path, it will be copied in a temporary folder and deleted after running.<br/> Runners may also pass 'args', which will be passed as argument to shell runners only. | `list(map(string))` | `[]` | no |
337+
| <a name="input_set_ofi_cloud_rdma_tunables"></a> [set\_ofi\_cloud\_rdma\_tunables](#input\_set\_ofi\_cloud\_rdma\_tunables) | Controls whether to enable specific OFI environment variables for workloads using Cloud RDMA networking. Should be false for non-RDMA workloads. | `bool` | `false` | no |
337338
338339
## Outputs
339340

modules/scripts/startup-script/main.tf

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,20 @@ locals {
9393
}
9494
]
9595

96+
# Emit the OFI profile.d runner only when the bool flag is true (same guard style as rdma_runner); comparing a bool to "" is never true, so the old guard always installed the script.
ofi_runner = !var.set_ofi_cloud_rdma_tunables ? [] : [
97+
{
98+
type = "data"
99+
destination = "/etc/profile.d/set_ofi_cloud_rdma_tunables.sh"
100+
content = <<-EOT
101+
#!/bin/bash
102+
export FI_PROVIDER="verbs;ofi_rxm"
103+
export FI_OFI_RXM_USE_RNDV_WRITE=1
104+
export FI_VERBS_INLINE_SIZE=39
105+
export I_MPI_FABRICS="shm:rxm"
106+
EOT
107+
},
108+
]
109+
96110
rdma_runner = !var.install_cloud_rdma_drivers ? [] : [
97111
{
98112
type = "shell"
@@ -153,6 +167,7 @@ locals {
153167
local.warnings,
154168
local.hotfix_runner,
155169
local.proxy_runner,
170+
local.ofi_runner,
156171
local.rdma_runner,
157172
local.monitoring_agent_installer,
158173
local.ansible_installer,

modules/scripts/startup-script/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,3 +268,9 @@ variable "install_cloud_rdma_drivers" {
268268
type = bool
269269
default = false
270270
}
271+
272+
variable "set_ofi_cloud_rdma_tunables" {
273+
description = "Controls whether to enable specific OFI environment variables for workloads using Cloud RDMA networking. Should be false for non-RDMA workloads."
274+
type = bool
275+
default = false
276+
}

0 commit comments

Comments
 (0)