File tree Expand file tree Collapse file tree 5 files changed +119
-1
lines changed Expand file tree Collapse file tree 5 files changed +119
-1
lines changed Original file line number Diff line number Diff line change @@ -2,8 +2,9 @@ FROM debian:trixie
22
33# GNU parallel: bats may want to use that
44# gettext-base: provides envsubst, used by nickelpie
5+ # bc: used by bash wrappers, for calculations
56RUN apt-get update && apt-get install -y -q --no-install-recommends \
6- parallel git ca-certificates curl make gettext-base jq && \
7+ parallel git ca-certificates curl make gettext-base jq bc yq && \
78 rm -rf /var/lib/apt/lists/*
89
910# Set by BuildKit, of the form amd64/arm64.
@@ -15,6 +16,7 @@ RUN git clone https://github.com/bats-core/bats-core.git && cd bats-core && \
1516 git checkout 658f442f0fcdd6f9e2ea01625999217e8f7bfe7d && ./install.sh /usr/local
1617
1718RUN mkdir -p /bats-libraries
19+ # These are cloned unpinned: upstream has patched them conservatively so far; maybe pin them in the future
1820RUN git clone https://github.com/bats-core/bats-support /bats-libraries/bats-support
1921RUN git clone https://github.com/bats-core/bats-assert /bats-libraries/bats-assert
2022RUN git clone https://github.com/bats-core/bats-file /bats-libraries/bats-file
Original file line number Diff line number Diff line change @@ -110,5 +110,6 @@ tests: image
110110 tests/bats/test_cd_imex_chan_inject.bats \
111111 tests/bats/test_cd_mnnvl_workload.bats \
112112 tests/bats/test_cd_logging.bats \
113+ tests/bats/test_cd_failover.bats \
113114 tests/bats/test_cd_updowngrade.bats \
114115 "
Original file line number Diff line number Diff line change @@ -56,6 +56,8 @@ timeout -v 5 kubectl delete -f demo/specs/imex/channel-injection-all.yaml 2> /de
5656timeout -v 5 kubectl delete jobs nickelpie-test 2> /dev/null
5757timeout -v 5 kubectl delete computedomain nickelpie-test-compute-domain 2> /dev/null
5858timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-1.yaml 2> /dev/null
59+ timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-2.yaml 2> /dev/null
60+ timeout -v 5 kubectl delete -f tests/bats/specs/nvb2.yaml 2> /dev/null
5961timeout -v 5 kubectl delete pods -l env=batssuite 2> /dev/null
6062timeout -v 2 kubectl delete resourceclaim batssuite-rc-bad-opaque-config --force 2> /dev/null
6163
Original file line number Diff line number Diff line change 1+ ---
2+ apiVersion : resource.nvidia.com/v1beta1
3+ kind : ComputeDomain
4+ metadata :
5+ name : test-failover-cd
6+ spec :
7+ numNodes : 2
8+ channel :
9+ resourceClaimTemplate :
10+ name : test-failover-rct-channel
11+ ---
12+ apiVersion : kubeflow.org/v2beta1
13+ kind : MPIJob
14+ metadata :
15+ name : test-failover-job
16+ spec :
17+ slotsPerWorker : 2
18+ launcherCreationPolicy : WaitForWorkersReady
19+ runPolicy :
20+ cleanPodPolicy : Running
21+ sshAuthMountPath : /home/mpiuser/.ssh
22+ mpiReplicaSpecs :
23+ Launcher :
24+ replicas : 1
25+ template :
26+ metadata :
27+ labels :
28+ mpi-memcpy-dra-test-replica : mpi-launcher
29+ spec :
30+ containers :
31+ - image : ghcr.io/nvidia/k8s-samples:nvbandwidth-6dc12f17
32+ name : mpi-launcher
33+ securityContext :
34+ runAsUser : 1000
35+ command :
36+ - mpirun
37+ args :
38+ - --show-progress
39+ - --bind-to
40+ - core
41+ - --map-by
42+ - ppr:2:node
43+ - -np
44+ - " 4"
45+ - --report-bindings
46+ - -q
47+ - nvbandwidth
48+ - --verbose
49+ - --testSamples
50+ - " 20"
51+ - --bufferSize
52+ - " 2048"
53+ - -t
54+ - multinode_device_to_device_memcpy_read_ce
55+ affinity :
56+ nodeAffinity :
57+ requiredDuringSchedulingIgnoredDuringExecution :
58+ nodeSelectorTerms :
59+ - matchExpressions :
60+ - key : node-role.kubernetes.io/control-plane
61+ operator : Exists
62+ Worker :
63+ replicas : 2
64+ template :
65+ metadata :
66+ labels :
67+ mpi-memcpy-dra-test-replica : mpi-worker
68+ spec :
69+ containers :
70+ - image : ghcr.io/nvidia/k8s-samples:nvbandwidth-6dc12f17
71+ name : mpi-worker
72+ securityContext :
73+ runAsUser : 1000
74+ env :
75+ command :
76+ - /usr/sbin/sshd
77+ args :
78+ - -De
79+ - -f
80+ - /home/mpiuser/.sshd_config
81+ resources :
82+ limits :
83+ nvidia.com/gpu : 2
84+ claims :
85+ - name : test-failover-rc-channel
86+ resourceClaims :
87+ - name : test-failover-rc-channel
88+ resourceClaimTemplateName : test-failover-rct-channel
Original file line number Diff line number Diff line change 1+ # shellcheck disable=SC2148
2+ # shellcheck disable=SC2329
3+
4+ setup_file () {
5+ load ' helpers.sh'
6+ _common_setup
7+
8+ local _iargs=(" --set" " logVerbosity=6" )
9+ iupgrade_wait " ${TEST_CHART_REPO} " " ${TEST_CHART_VERSION} " _iargs
10+ }
11+
12+
13+ @test " CD failover nvb2 0" {
14+ bash tests/bats/lib/test_cd_nvb_failover.sh tests/bats/specs/nvb2.yaml 0
15+ }
16+
17+
18+ @test " CD failover nvb2 1" {
19+ bash tests/bats/lib/test_cd_nvb_failover.sh tests/bats/specs/nvb2.yaml 1
20+ }
21+
22+
23+ @test " CD failover nvb2 2" {
24+ bash tests/bats/lib/test_cd_nvb_failover.sh tests/bats/specs/nvb2.yaml 2
25+ }
You can’t perform that action at this time.
0 commit comments