Skip to content

Commit 54c4bd9

Browse files
committed
Adding necessary test files and changes to test using spot VMs
1 parent 439e2bf commit 54c4bd9

File tree

6 files changed

+384
-10
lines changed

6 files changed

+384
-10
lines changed

community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
locals {
1616
# This label allows for billing report tracking based on module.
17-
labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v6-nodeset", ghpc_role = "compute" })
17+
labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v6-nodeset", ghpc_role = "compute", spot = var.enable_spot_vm })
1818
}
1919

2020
module "instance_validation" {

tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@
191191
ignore_unreachable: true # ensure always block will run even if SSH fails
192192
vars:
193193
key_type: "munge"
194+
integration_tests_failed: false # Initialize failure flag
194195
tasks:
195196
- name: Slurm Test Block
196197
vars:
@@ -209,7 +210,7 @@
209210
- name: Wait until Slurm key exists
210211
ansible.builtin.wait_for:
211212
path: /etc/slurm/slurm.key
212-
timeout: 600 # Waits for up to 10 minutes for the file to appear
213+
timeout: 600 # Waits for up to 10 minutes for the file to appear
213214
when: key_type == 'slurm'
214215

215216
- name: Count Slurm nodes
@@ -223,17 +224,43 @@
223224
retries: 60
224225
delay: 15
225226

226-
- name: Run Integration tests for Cluster Toolkit
227-
ansible.builtin.include_tasks: "{{ test }}"
228-
vars:
229-
login_node: "{{ login_node }}"
230-
custom_vars: "{{ custom_vars }}"
231-
loop: "{{ post_deploy_tests }}"
232-
loop_control:
233-
loop_var: test
227+
- name: Block for running main integration tests
228+
block:
229+
- name: Run Integration tests for Cluster Toolkit
230+
ansible.builtin.include_tasks: "{{ test }}"
231+
vars:
232+
login_node: "{{ login_node }}"
233+
custom_vars: "{{ custom_vars }}"
234+
loop: "{{ post_deploy_tests }}"
235+
loop_control:
236+
loop_var: test
237+
rescue:
238+
- name: Set flag indicating test failure
239+
ansible.builtin.set_fact:
240+
integration_tests_failed: true
241+
- name: Display test failure message
242+
ansible.builtin.debug:
243+
msg: "A task within the integration tests failed. Preemption check might run."
234244

235245
## Always clean up, even on failure
236246
always:
247+
- name: Check if tests failed
248+
ansible.builtin.debug:
249+
var: integration_tests_failed
250+
251+
- name: Check if spot is enable
252+
ansible.builtin.debug:
253+
var: custom_vars.enable_spot | default(false) | bool
254+
255+
# Conditional Preemption Check
256+
- name: Check for recent preemptions if tests failed and Spot is enabled
257+
ansible.builtin.include_tasks: test-validation/test-a3m-preemption.yml
258+
when:
259+
- integration_tests_failed | bool
260+
- custom_vars.enable_spot | default(false) | bool
261+
vars:
262+
custom_vars: "{{ custom_vars }}"
263+
237264
- name: Ensure all nodes are powered down
238265
vars:
239266
wait_for_compute_nodes_to_go_down: true
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
---
16+
- name: Set up variables for preemption query
17+
ansible.builtin.set_fact:
18+
zone: "{{ custom_vars.zone }}"
19+
slurm_cluster_name: "{{ custom_vars.slurm_cluster_name }}"
20+
node_pattern: "{{ custom_vars.slurm_cluster_name }}-a3mega-*"
21+
gcp_project: "{{ custom_vars.project_id }}"
22+
query_window: "24h" # How far back to look for preemptions
23+
24+
- name: Fail if gcp_project is not set
25+
ansible.builtin.fail:
26+
msg: "gcp_project variable is not defined. Please ensure custom_vars.project_id is passed."
27+
when: gcp_project is not defined or gcp_project == ""
28+
29+
- name: Fail if zone is not set
30+
ansible.builtin.fail:
31+
msg: "zone variable is not defined. Please ensure custom_vars.zone is passed."
32+
when: zone is not defined or zone == ""
33+
34+
- name: Fail if slurm_cluster_name is not set
35+
ansible.builtin.fail:
36+
msg: "slurm_cluster_name variable is not defined. Please ensure custom_vars.slurm_cluster_name is passed."
37+
when: slurm_cluster_name is not defined or slurm_cluster_name == ""
38+
39+
- name: Get current A3 Mega node instance details
40+
ansible.builtin.shell: |
41+
gcloud compute instances list --project={{ gcp_project }} --filter="name ~ '^{{ node_pattern }}' AND zone={{ zone }}" --format="json(name,id)"
42+
register: a3_nodes_raw
43+
changed_when: false
44+
environment:
45+
CLOUDSDK_CORE_PROJECT: "{{ gcp_project }}"
46+
delegate_to: localhost
47+
48+
- name: Parse instance details for query
49+
ansible.builtin.set_fact:
50+
a3_nodes: "{{ a3_nodes_raw.stdout | from_json }}"
51+
52+
- name: Build instance ID filter string
53+
ansible.builtin.set_fact:
54+
instance_id_filter: "{{ a3_nodes | map(attribute='id') | map('regex_replace', '^(.*)$', 'resource.labels.instance_id=\"\\1\"') | join(' OR ') }}"
55+
when: a3_nodes is defined and a3_nodes | length > 0
56+
57+
- name: Debug instance ID filter
58+
ansible.builtin.debug:
59+
var: instance_id_filter
60+
when: instance_id_filter is defined and instance_id_filter != ""
61+
62+
- name: Query Cloud Audit Logs for any preemption events for these nodes
63+
ansible.builtin.shell: |
64+
gcloud logging read '
65+
resource.type="gce_instance"
66+
AND protoPayload.methodName="compute.instances.preempted"
67+
AND log_id("cloudaudit.googleapis.com/system_event")
68+
AND ({{ instance_id_filter }})
69+
' --project={{ gcp_project }} --freshness={{ query_window }} --format="json"
70+
register: audit_logs_raw
71+
changed_when: false
72+
environment:
73+
CLOUDSDK_CORE_PROJECT: "{{ gcp_project }}"
74+
delegate_to: localhost
75+
when: instance_id_filter is defined and instance_id_filter != ""
76+
77+
- name: Parse preemption logs
78+
ansible.builtin.set_fact:
79+
preemption_logs: "{{ audit_logs_raw.stdout | from_json }}"
80+
when: audit_logs_raw.stdout is defined and audit_logs_raw.stdout != "" and audit_logs_raw.stdout != "[]"
81+
82+
- name: Report preempted nodes
83+
ansible.builtin.debug:
84+
msg: "Preemption Events Found in the last {{ query_window }}: {{ preemption_logs }}"
85+
when: preemption_logs is defined and preemption_logs | length > 0
86+
87+
- name: Report no preemptions found
88+
ansible.builtin.debug:
89+
msg: "No preemption events found for nodes matching '{{ node_pattern }}' in zone {{ zone }} in the last {{ query_window }}."
90+
when: preemption_logs is not defined or preemption_logs | length == 0
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
---
16+
tags:
17+
- m.custom-image
18+
- m.startup-script
19+
- slurm6
20+
- m.filestore
21+
- m.schedmd-slurm-gcp-v6-controller
22+
- m.schedmd-slurm-gcp-v6-login
23+
- m.schedmd-slurm-gcp-v6-nodeset
24+
- m.schedmd-slurm-gcp-v6-partition
25+
- m.vpc
26+
- m.cloud-storage-bucket
27+
- m.multivpc
28+
- m.private-service-access
29+
30+
# cloudbuild.yaml
31+
timeout: 14400s # 4hr
32+
steps:
33+
# While using static network names, we are guarding against more than 1 instance running at a time (for multi-group tests)
34+
- id: check_for_running_build
35+
name: gcr.io/cloud-builders/gcloud
36+
script: "/workspace/tools/cloud-build/check_running_build.sh /workspace/tools/cloud-build/daily-tests/builds/ml-a3-mega-slurm-ubuntu.yaml"
37+
38+
- id: ml-a3-megagpu-slurm
39+
name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner
40+
entrypoint: /bin/bash
41+
env:
42+
# General Ansible configuration
43+
- "ANSIBLE_HOST_KEY_CHECKING=false"
44+
- "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg"
45+
46+
# Configuration Variables for the A3-MegaGPU Test (These will change for other tests)
47+
- "MACHINE_TYPE=a3-megagpu-8g"
48+
- "IMAGE_FAMILY=slurm-a3mega"
49+
- "NUM_NODES=3"
50+
- "BLUEPRINT_PATH=/workspace/examples/machine-learning/a3-megagpu-8g/a3mega-slurm-blueprint.yaml"
51+
- "TEST_VARS_FILE=@tools/cloud-build/daily-tests/tests/ml-a3-megagpu-onspot-slurm-ubuntu.yml"
52+
- "INSTANCE_PREFIX=a3" # Use a generic prefix that the script will complete
53+
- "PROJECT_ID=$PROJECT_ID"
54+
- "BUILD_ID=$BUILD_ID"
55+
args:
56+
# Execute the script, which now relies on the exported environment variables.
57+
- -c
58+
- |
59+
set -x
60+
/workspace/tools/cloud-build/instance_provisioning.sh
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
---
16+
17+
# region, zone must be defined in build file with --extra-vars flag!
18+
test_name: a3m-slurm
19+
deployment_name: a3m-slurm-{{ build }}
20+
slurm_cluster_name: "a3m{{ build[0:4] }}"
21+
workspace: /workspace
22+
blueprint_yaml: "{{ workspace }}/examples/machine-learning/a3-megagpu-8g/a3mega-slurm-blueprint.yaml"
23+
login_node: "{{ slurm_cluster_name }}-login-*"
24+
controller_node: "{{ slurm_cluster_name }}-controller"
25+
network: "{{ deployment_name }}-net-0"
26+
sub_network: "{{ deployment_name }}-sub-net-0"
27+
post_deploy_tests:
28+
- test-validation/test-mounts.yml
29+
- test-validation/test-partitions.yml
30+
- test-validation/test-enroot.yml
31+
- test-validation/test-gpus-slurm.yml
32+
post_destroy_tasks:
33+
- post-destroy-tasks/delete-image.yml
34+
custom_vars:
35+
gpu_count: 8
36+
gpu_partition: a3mega
37+
test_persistenced: true
38+
partitions:
39+
- a3mega
40+
- debug
41+
mounts:
42+
- /home
43+
project_id: "{{ project }}"
44+
zone: "{{ zone }}"
45+
slurm_cluster_name: "{{ slurm_cluster_name }}"
46+
enable_spot: true
47+
cli_deployment_vars:
48+
network_name_system: "{{ network }}"
49+
subnetwork_name_system: "{{ sub_network}}"
50+
region: "{{ region }}"
51+
zone: "{{ zone }}"
52+
slurm_cluster_name: "{{ slurm_cluster_name }}"
53+
disk_size_gb: 200
54+
a3mega_cluster_size: 2
55+
enable_ops_agent: "true"
56+
enable_nvidia_dcgm: "true"
57+
enable_nvidia_persistenced: true
58+
final_image_family: "{{ deployment_name}}-u22"
59+
a3mega_enable_spot_vm: true

0 commit comments

Comments
 (0)