2121 deployment_name : eda-hybrid-cloud
2222 region : us-central1
2323 zone : us-central1-a
24- network : shared-vpc-prod
24+ network : default
25+ rdma_net_range : 192.168.128.0/18
2526
2627# Documentation for each of the modules used below can be found at
2728# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md
@@ -37,27 +38,26 @@ deployment_groups:
3738# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3839- group : setup
3940 modules :
40- # Source is an embedded module, denoted by "modules/*" without ./, ../, /
41- # as a prefix. To refer to a local module, prefix with ./, ../ or /
42- - id : network
41+ # Frontend network for GCE, NetApp Volumes and other services. Make sure it has internet access.
42+ - id : frontend-network
4343 source : modules/network/pre-existing-vpc
4444 settings :
4545 project_id : $(vars.project_id)
4646 region : $(vars.region)
4747 network_name : $(vars.network)
4848
49- - id : firewall_rule
49+ - id : firewall-rule-frontend
5050 source : modules/network/firewall-rules
5151 use :
52- - network
52+ - frontend-network
5353 settings :
5454 ingress_rules :
5555 - name : $(vars.deployment_name)-allow-internal-traffic
5656 description : Allow internal traffic
5757 destination_ranges :
58- - $(network.subnetwork_address)
58+ - $(frontend-network.subnetwork_address)
5959 source_ranges :
60- - $(network.subnetwork_address)
60+ - $(frontend-network.subnetwork_address)
6161 allow :
6262 - protocol : tcp
6363 ports :
@@ -69,21 +69,39 @@ deployment_groups:
6969 - name : $(vars.deployment_name)-allow-iap-ssh
7070 description : Allow IAP-tunneled SSH connections
7171 destination_ranges :
72- - $(network.subnetwork_address)
72+ - $(frontend-network.subnetwork_address)
7373 source_ranges :
7474 - 35.235.240.0/20
7575 allow :
7676 - protocol : tcp
7777 ports :
7878 - 22
7979
80+ # Backend RDMA network for GCE instances with RDMA capabilities
81+ - id : backend-network
82+ source : modules/network/vpc
83+ settings :
84+ network_name : $(vars.deployment_name)-rdma-net-0
85+ mtu : 8896
86+ network_profile : https://www.googleapis.com/compute/beta/projects/$(vars.project_id)/global/networkProfiles/$(vars.zone)-vpc-falcon
87+ network_routing_mode : REGIONAL
88+ enable_cloud_router : false
89+ enable_cloud_nat : false
90+ enable_internal_traffic : false
91+ subnetworks :
92+ - subnet_name : $(vars.deployment_name)-rdma-sub-0
93+ subnet_region : $(vars.region)
94+ subnet_ip : $(vars.rdma_net_range)
95+ region : $(vars.region)
96+
8097# Connect existing Google Cloud NetApp Volumes
8198# Replace server_ip, remote_mount, and local_mount values as needed for toolsfs, libraryfs, homefs, scratchfs
99+ # Make sure the root inode of each volume has appropriate permissions for intended users, otherwise SLURM jobs may fail
82100 - id : toolsfs
83101 source : modules/file-system/pre-existing-network-storage
84102 settings :
85- server_ip : 10.165.129.4 # Set IP address of the NFS server here
86- remote_mount : toolsfs # Set exported NFS share on the server here
103+ server_ip : # Set IP address of the NFS server here
104+ remote_mount : # Set exported path of NFS share here
87105 local_mount : /tools
88106 fs_type : nfs
89107 # Mount options are optimized for aggressive caching, assuming rare changes on the volume
@@ -92,8 +110,8 @@ deployment_groups:
92110 - id : libraryfs
93111 source : modules/file-system/pre-existing-network-storage
94112 settings :
95- server_ip : 10.165.129.4 # Set IP address of the NFS server here
96- remote_mount : libraryfs # Set exported NFS share on the server here
113+ server_ip : # Set IP address of the NFS server here
114+ remote_mount : # Set exported path of NFS share here
97115 local_mount : /library
98116 fs_type : nfs
99117 # Mount options are optimized for aggressive caching, assuming rare changes on the volume
@@ -102,17 +120,17 @@ deployment_groups:
102120 - id : homefs
103121 source : modules/file-system/pre-existing-network-storage
104122 settings :
105- server_ip : 10.165.129.4 # Set IP address of the NFS server here
106- remote_mount : homefs # Set exported NFS share on the server here
123+ server_ip : # Set IP address of the NFS server here
124+ remote_mount : # Set exported path of NFS share here
107125 local_mount : /home
108126 fs_type : nfs
109127 mount_options : " hard,rsize=262144,wsize=262144,vers=3,tcp,mountproto=tcp"
110128
111129 - id : scratchfs
112130 source : modules/file-system/pre-existing-network-storage
113131 settings :
114- server_ip : 10.165.129.5 # Set IP address of the NFS server here
115- remote_mount : scratchfs # Set exported NFS share on the server here
132+ server_ip : # Set IP address of the NFS server here
133+ remote_mount : # Set exported path of NFS share here
116134 local_mount : /scratch
117135 fs_type : nfs
118136 mount_options : " hard,rsize=262144,wsize=262144,vers=3,tcp,mountproto=tcp"
@@ -140,76 +158,78 @@ deployment_groups:
140158# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
141159- group : cluster
142160 modules :
143- - id : debug_nodeset
144- source : community/modules/compute/schedmd-slurm-gcp-v6-nodeset
145- use : [network]
161+ - id : h4d_startup
162+ source : modules/scripts/startup-script
146163 settings :
147- node_count_dynamic_max : 4
148- machine_type : n2-standard-2
149- allow_automatic_updates : false
164+ set_ofi_cloud_rdma_tunables : true
165+ local_ssd_filesystem :
166+ fs_type : ext4
167+ mountpoint : /mnt/lssd
168+ permissions : " 1777"
150169
151- - id : debug_partition
152- source : community/modules/compute/schedmd-slurm-gcp-v6-partition
153- use :
154- - debug_nodeset
155- settings :
156- partition_name : debug
157- exclusive : false # allows nodes to stay up after jobs are done
158- is_default : true
159-
160- - id : compute_nodeset
170+ - id : h4d_nodeset
161171 source : community/modules/compute/schedmd-slurm-gcp-v6-nodeset
162- use : [network]
163- settings :
164- node_count_dynamic_max : 20
165- bandwidth_tier : gvnic_enabled
166- allow_automatic_updates : false
167-
168- - id : compute_partition
169- source : community/modules/compute/schedmd-slurm-gcp-v6-partition
170172 use :
171- - compute_nodeset
172- settings :
173- partition_name : compute
174-
175- - id : h3_nodeset
176- source : community/modules/compute/schedmd-slurm-gcp-v6-nodeset
177- use : [network]
173+ - h4d_startup
174+ - frontend-network
175+ - homefs
176+ - toolsfs
177+ - libraryfs
178+ - scratchfs
178179 settings :
179- node_count_dynamic_max : 20
180- # Note that H3 is available in only specific zones. https://cloud.google.com/compute/docs/regions-zones
181- machine_type : h3-standard-88
182- # H3 does not support pd-ssd and pd-standard
183- # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_disks
184- disk_type : pd-balanced
185180 bandwidth_tier : gvnic_enabled
186- allow_automatic_updates : false
187-
188- - id : h3_partition
181+ machine_type : h4d-highmem-192-lssd
182+ node_count_static : 1 # Adjust as needed
183+ node_count_dynamic_max : 0 # Adjust as needed
184+ enable_placement : false
185+ disk_type : hyperdisk-balanced
186+ on_host_maintenance : TERMINATE
187+ additional_networks :
188+ $(concat(
189+ [{
190+ network=null,
191+ subnetwork=backend-network.subnetwork_self_link,
192+ subnetwork_project=vars.project_id,
193+ nic_type="IRDMA",
194+ queue_count=null,
195+ network_ip=null,
196+ stack_type=null,
197+ access_config=null,
198+ ipv6_access_config=[],
199+ alias_ip_range=[]
200+ }]
201+ ))
202+
203+ - id : h4d_partition
189204 source : community/modules/compute/schedmd-slurm-gcp-v6-partition
190205 use :
191- - h3_nodeset
206+ - h4d_nodeset
192207 settings :
193- partition_name : h3
208+ exclusive : false
209+ partition_name : h4d
210+ is_default : true
211+ partition_conf :
212+ ResumeTimeout : 900
213+ SuspendTimeout : 600
194214
195215 - id : slurm_login
196216 source : community/modules/scheduler/schedmd-slurm-gcp-v6-login
197- use : [network]
217+ use : [frontend-network]
198218 settings :
199219 machine_type : n2-standard-4
200220 enable_login_public_ips : true
201221
202222 - id : slurm_controller
203223 source : community/modules/scheduler/schedmd-slurm-gcp-v6-controller
204224 use :
205- - network
206- - debug_partition
207- - compute_partition
208- - h3_partition
225+ - frontend-network
226+ - h4d_partition
227+ - slurm_login
228+ - homefs
209229 - toolsfs
210230 - libraryfs
211- - homefs
212231 - scratchfs
213- - slurm_login
214232 settings :
215233 enable_controller_public_ips : true
234+ cloud_parameters :
235+ slurmd_timeout : 900
0 commit comments