Commit 82c0ff9

updates eda-hybrid-cloud blueprint with H4D compute support
1 parent dd2b0fd commit 82c0ff9


examples/eda/eda-hybrid-cloud.yaml

Lines changed: 86 additions & 66 deletions
@@ -21,7 +21,8 @@ vars:
   deployment_name: eda-hybrid-cloud
   region: us-central1
   zone: us-central1-a
-  network: shared-vpc-prod
+  network: default
+  rdma_net_range: 192.168.128.0/18

 # Documentation for each of the modules used below can be found at
 # https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md

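For orientation, the vars block after this change reads roughly as follows; the project_id line is shown as a placeholder for illustration and is not part of this hunk:

vars:
  project_id:  # set your GCP project ID (placeholder, not from this commit)
  deployment_name: eda-hybrid-cloud
  region: us-central1
  zone: us-central1-a
  network: default
  rdma_net_range: 192.168.128.0/18
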
@@ -37,27 +38,26 @@ deployment_groups:
 # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 - group: setup
   modules:
-  # Source is an embedded module, denoted by "modules/*" without ./, ../, /
-  # as a prefix. To refer to a local module, prefix with ./, ../ or /
-  - id: network
+  # Frontend network for GCE, NetApp Volumes and other services. Make sure it has internet access.
+  - id: frontend-network
     source: modules/network/pre-existing-vpc
     settings:
       project_id: $(vars.project_id)
       region: $(vars.region)
       network_name: $(vars.network)

-  - id: firewall_rule
+  - id: firewall-rule-frontend
     source: modules/network/firewall-rules
     use:
-    - network
+    - frontend-network
     settings:
       ingress_rules:
       - name: $(vars.deployment_name)-allow-internal-traffic
         description: Allow internal traffic
         destination_ranges:
-        - $(network.subnetwork_address)
+        - $(frontend-network.subnetwork_address)
         source_ranges:
-        - $(network.subnetwork_address)
+        - $(frontend-network.subnetwork_address)
         allow:
         - protocol: tcp
           ports:

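This commit points the frontend at the project's default VPC instead of the previous shared-vpc-prod network. A site that keeps its frontend on a Shared VPC could, hypothetically, point the same pre-existing-vpc module at the host project, along these lines (the project and network names below are placeholders, and this assumes the module simply looks up the VPC in whatever project_id it is given):

  - id: frontend-network
    source: modules/network/pre-existing-vpc
    settings:
      project_id: my-vpc-host-project  # hypothetical Shared VPC host project
      region: $(vars.region)
      network_name: shared-vpc-prod    # hypothetical Shared VPC network name
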
@@ -69,21 +69,39 @@ deployment_groups:
       - name: $(vars.deployment_name)-allow-iap-ssh
         description: Allow IAP-tunneled SSH connections
         destination_ranges:
-        - $(network.subnetwork_address)
+        - $(frontend-network.subnetwork_address)
         source_ranges:
         - 35.235.240.0/20
         allow:
         - protocol: tcp
           ports:
           - 22

+  # Backend RDMA network for GCE instances with RDMA capabilities
+  - id: backend-network
+    source: modules/network/vpc
+    settings:
+      network_name: $(vars.deployment_name)-rdma-net-0
+      mtu: 8896
+      network_profile: https://www.googleapis.com/compute/beta/projects/$(vars.project_id)/global/networkProfiles/$(vars.zone)-vpc-falcon
+      network_routing_mode: REGIONAL
+      enable_cloud_router: false
+      enable_cloud_nat: false
+      enable_internal_traffic: false
+      subnetworks:
+      - subnet_name: $(vars.deployment_name)-rdma-sub-0
+        subnet_region: $(vars.region)
+        subnet_ip: $(vars.rdma_net_range)
+      region: $(vars.region)
+
   # Connect existing Google Cloud NetApp Volumes
   # Replace server_ip, remote_mount, and local_mount values as needed for toolsfs, libraryfs, homefs, scratchfs
+  # Make sure the root inode of each volume has appropriate permissions for intended users, otherwise SLURM jobs may fail
   - id: toolsfs
     source: modules/file-system/pre-existing-network-storage
     settings:
-      server_ip: 10.165.129.4 # Set IP address of the NFS server here
-      remote_mount: toolsfs # Set exported NFS share on the server here
+      server_ip: # Set IP address of the NFS server here
+      remote_mount: # Set exported path of NFS share here
       local_mount: /tools
       fs_type: nfs
       # Mount options are optimized for aggressive caching, assuming rare changes on the volume

@@ -92,8 +110,8 @@ deployment_groups:
   - id: libraryfs
     source: modules/file-system/pre-existing-network-storage
     settings:
-      server_ip: 10.165.129.4 # Set IP address of the NFS server here
-      remote_mount: libraryfs # Set exported NFS share on the server here
+      server_ip: # Set IP address of the NFS server here
+      remote_mount: # Set exported path of NFS share here
       local_mount: /library
       fs_type: nfs
       # Mount options are optimized for aggressive caching, assuming rare changes on the volume

@@ -102,17 +120,17 @@ deployment_groups:
   - id: homefs
     source: modules/file-system/pre-existing-network-storage
     settings:
-      server_ip: 10.165.129.4 # Set IP address of the NFS server here
-      remote_mount: homefs # Set exported NFS share on the server here
+      server_ip: # Set IP address of the NFS server here
+      remote_mount: # Set exported path of NFS share here
       local_mount: /home
       fs_type: nfs
       mount_options: "hard,rsize=262144,wsize=262144,vers=3,tcp,mountproto=tcp"

   - id: scratchfs
     source: modules/file-system/pre-existing-network-storage
     settings:
-      server_ip: 10.165.129.5 # Set IP address of the NFS server here
-      remote_mount: scratchfs # Set exported NFS share on the server here
+      server_ip: # Set IP address of the NFS server here
+      remote_mount: # Set exported path of NFS share here
       local_mount: /scratch
       fs_type: nfs
       mount_options: "hard,rsize=262144,wsize=262144,vers=3,tcp,mountproto=tcp"

@@ -140,76 +158,78 @@ deployment_groups:
 # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 - group: cluster
   modules:
-  - id: debug_nodeset
-    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
-    use: [network]
+  - id: h4d_startup
+    source: modules/scripts/startup-script
     settings:
-      node_count_dynamic_max: 4
-      machine_type: n2-standard-2
-      allow_automatic_updates: false
+      set_ofi_cloud_rdma_tunables: true
+      local_ssd_filesystem:
+        fs_type: ext4
+        mountpoint: /mnt/lssd
+        permissions: "1777"

-  - id: debug_partition
-    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
-    use:
-    - debug_nodeset
-    settings:
-      partition_name: debug
-      exclusive: false # allows nodes to stay up after jobs are done
-      is_default: true
-
-  - id: compute_nodeset
+  - id: h4d_nodeset
     source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
-    use: [network]
-    settings:
-      node_count_dynamic_max: 20
-      bandwidth_tier: gvnic_enabled
-      allow_automatic_updates: false
-
-  - id: compute_partition
-    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
     use:
-    - compute_nodeset
-    settings:
-      partition_name: compute
-
-  - id: h3_nodeset
-    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
-    use: [network]
+    - h4d_startup
+    - frontend-network
+    - homefs
+    - toolsfs
+    - libraryfs
+    - scratchfs
     settings:
-      node_count_dynamic_max: 20
-      # Note that H3 is available in only specific zones. https://cloud.google.com/compute/docs/regions-zones
-      machine_type: h3-standard-88
-      # H3 does not support pd-ssd and pd-standard
-      # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_disks
-      disk_type: pd-balanced
       bandwidth_tier: gvnic_enabled
-      allow_automatic_updates: false
-
-  - id: h3_partition
+      machine_type: h4d-highmem-192-lssd
+      node_count_static: 1 # Adjust as needed
+      node_count_dynamic_max: 0 # Adjust as needed
+      enable_placement: false
+      disk_type: hyperdisk-balanced
+      on_host_maintenance: TERMINATE
+      additional_networks:
+        $(concat(
+          [{
+            network=null,
+            subnetwork=backend-network.subnetwork_self_link,
+            subnetwork_project=vars.project_id,
+            nic_type="IRDMA",
+            queue_count=null,
+            network_ip=null,
+            stack_type=null,
+            access_config=null,
+            ipv6_access_config=[],
+            alias_ip_range=[]
+          }]
+        ))
+
+  - id: h4d_partition
     source: community/modules/compute/schedmd-slurm-gcp-v6-partition
     use:
-    - h3_nodeset
+    - h4d_nodeset
     settings:
-      partition_name: h3
+      exclusive: false
+      partition_name: h4d
+      is_default: true
+      partition_conf:
+        ResumeTimeout: 900
+        SuspendTimeout: 600

   - id: slurm_login
     source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
-    use: [network]
+    use: [frontend-network]
     settings:
       machine_type: n2-standard-4
       enable_login_public_ips: true

   - id: slurm_controller
     source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
     use:
-    - network
-    - debug_partition
-    - compute_partition
-    - h3_partition
+    - frontend-network
+    - h4d_partition
+    - slurm_login
+    - homefs
     - toolsfs
     - libraryfs
-    - homefs
     - scratchfs
-    - slurm_login
     settings:
       enable_controller_public_ips: true
+      cloud_parameters:
+        slurmd_timeout: 900
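The nodeset above provisions one static H4D node and disables dynamic scaling (node_count_dynamic_max: 0). A site that prefers autoscaled capacity could invert those counts; a sketch of such a variation, reusing only fields already present in this blueprint (the cap of 8 is an arbitrary example):

  - id: h4d_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use:
    - h4d_startup
    - frontend-network
    - homefs
    - toolsfs
    - libraryfs
    - scratchfs
    settings:
      bandwidth_tier: gvnic_enabled
      machine_type: h4d-highmem-192-lssd
      node_count_static: 0        # no always-on nodes in this variation
      node_count_dynamic_max: 8   # arbitrary example cap for on-demand nodes
      enable_placement: false
      disk_type: hyperdisk-balanced
      on_host_maintenance: TERMINATE
      # additional_networks: unchanged from the commit above (IRDMA NIC on backend-network)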
