diff --git a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-slurm-flex.yaml b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-slurm-flex.yaml deleted file mode 100644 index f0dbcefd7c..0000000000 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-slurm-flex.yaml +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -tags: -- m.custom-image -- m.cloud-storage-bucket -- m.pre-existing-network-storage -- m.filestore -- m.gpu-rdma-vpc -- m.schedmd-slurm-gcp-v6-controller -- m.schedmd-slurm-gcp-v6-login -- m.schedmd-slurm-gcp-v6-nodeset -- m.schedmd-slurm-gcp-v6-partition -- m.startup-script -- m.vpc -- slurm6 - -timeout: 14400s # 4hr -steps: -# While using static network names we are gaurding against more than 1 instance running at a time (for multi-group tests) -- id: check_for_running_build - name: gcr.io/cloud-builders/gcloud - script: "tools/cloud-build/check_running_build.sh tools/cloud-build/daily-tests/builds/ml-a4-highgpu-slurm-flex.yaml" - -- id: ml-a4-highgpu-slurm-flex - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace && make - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - REGION=europe-west1 - ZONE=europe-west1-b - BLUEPRINT="/workspace/examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml" - sed -i -e '/deletion_protection:/{n;s/enabled: true/enabled: false/}' $${BLUEPRINT} - sed -i -e 's/\breservation_name\b/future_reservation/g' $${BLUEPRINT} - sed -i -e '/reason:/d' $${BLUEPRINT} - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="region=$${REGION} zone=$${ZONE}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a4-highgpu-slurm-flex.yml" diff --git a/tools/cloud-build/daily-tests/tests/ml-a4-highgpu-slurm-flex.yml b/tools/cloud-build/daily-tests/tests/ml-a4-highgpu-slurm-flex.yml deleted file mode 100644 index 60e84db373..0000000000 --- a/tools/cloud-build/daily-tests/tests/ml-a4-highgpu-slurm-flex.yml +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -# region, zone must be defined in build file with --extra-vars flag! -test_name: a4h-slurm-flex -deployment_name: a4h-slurm-flex-{{ build }} -slurm_cluster_name: "a4hf{{ build[0:4] }}" -workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml" -login_node: "{{ slurm_cluster_name }}-slurm-login-*" -controller_node: "{{ slurm_cluster_name }}-controller" -region: us-central1 -zone: us-central1-b -network: "{{ test_name }}-net-0" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -- test-validation/test-default-partition.yml -- test-validation/test-enroot.yml -- test-validation/test-gpus-slurm.yml -post_destroy_tasks: -- post-destroy-tasks/delete-image.yml -custom_vars: - gpu_count: 8 - gpu_partition: a4high - test_persistenced: true - partitions: - - a4high - mounts: - - /home -cli_deployment_vars: - region: "{{ region }}" - zone: "{{ zone }}" - slurm_cluster_name: "{{ slurm_cluster_name }}" - disk_size_gb: 200 - a4h_cluster_size: 2 - base_network_name: "{{ test_name }}" - a4h_dws_flex_enabled: true