Skip to content

Commit 45e0560

Browse files
authored
Add support for axlearn (#1285)
1 parent 1b6845a commit 45e0560

File tree

13 files changed

+742
-57
lines changed

13 files changed

+742
-57
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
name: Store & Delete GHCR Token
2+
description: Store and Delete the docker credentials for pulling from GHCR
3+
4+
outputs:
5+
token-name:
6+
description: Name of the K8s secret to delete
7+
value: ${{ steps.token.outputs.token-name }}
8+
9+
runs:
10+
using: "composite"
11+
steps:
12+
- name: Generate a UUID token
13+
shell: bash
14+
id: token
15+
run: |
16+
echo "token-name=${RANDOM}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" >> $GITHUB_OUTPUT
17+
- name: Delete GitHub Container Registry token
18+
uses: ./.github/actions/with-post-step
19+
with:
20+
main: |
21+
# Store GitHub Container Registry token as Kubernetes secret
22+
kubectl create secret generic \
23+
${{ steps.token.outputs.token-name }} \
24+
--from-file=.dockerconfigjson=$HOME/.docker/config.json \
25+
--type=kubernetes.io/dockerconfigjson
26+
post: |
27+
kubectl delete secret ${{ steps.token.outputs.token-name }}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
name: Submit & Delete K8s Job
2+
description: Submit and delete a K8s job after its execution
3+
4+
inputs:
5+
job-name:
6+
description: The job name
7+
required: true
8+
job-config-file:
9+
description: Path to the Kubernetes job YAML
10+
required: true
11+
12+
runs:
13+
using: "composite"
14+
steps:
15+
- name: Submit and Delete Kubernetes job
16+
uses: ./.github/actions/with-post-step
17+
with:
18+
main: |
19+
echo "Submit K8s job"
20+
kubectl apply -f "${{ inputs.job-config-file }}"
21+
22+
# Wait for job to be created
23+
kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60s
24+
25+
# Wait for job to be unsuspended
26+
kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ inputs.job-name }} --timeout=7200s
27+
28+
# Wait for pods to be running
29+
kubectl wait --for=condition=Ready \
30+
--selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} \
31+
--timeout=600s pod
32+
33+
# Stream logs
34+
kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }}
35+
36+
post: |
37+
kubectl delete -f "${{ inputs.job-config-file }}"
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# ==================================================================================================================== #
2+
# Authors: #
3+
# Patrick Lehmann #
4+
# Unai Martinez-Corral #
5+
# #
6+
# ==================================================================================================================== #
7+
# Copyright 2020-2024 The pyTooling Authors #
8+
# #
9+
# Licensed under the Apache License, Version 2.0 (the "License"); #
10+
# you may not use this file except in compliance with the License. #
11+
# You may obtain a copy of the License at #
12+
# #
13+
# http://www.apache.org/licenses/LICENSE-2.0 #
14+
# #
15+
# Unless required by applicable law or agreed to in writing, software #
16+
# distributed under the License is distributed on an "AS IS" BASIS, #
17+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
18+
# See the License for the specific language governing permissions and #
19+
# limitations under the License. #
20+
# #
21+
# SPDX-License-Identifier: Apache-2.0 #
22+
# ==================================================================================================================== #
23+
name: With post step
24+
25+
description: 'Generic JS Action to execute a main command and set a command as a post step.'
26+
27+
inputs:
28+
main:
29+
description: 'Main command/script.'
30+
required: true
31+
post:
32+
description: 'Post command/script.'
33+
required: true
34+
key:
35+
description: 'Name of the state variable used to detect the post step.'
36+
required: false
37+
default: POST
38+
39+
runs:
40+
using: 'node20'
41+
main: 'main.js'
42+
post: 'main.js'
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/* ================================================================================================================== *
2+
* Authors: *
3+
* Unai Martinez-Corral *
4+
* *
5+
* ================================================================================================================== *
6+
* Copyright 2021-2022 Unai Martinez-Corral <[email protected]> *
7+
* Copyright 2022 Unai Martinez-Corral <[email protected]> *
8+
* *
9+
* Licensed under the Apache License, Version 2.0 (the "License"); *
10+
* you may not use this file except in compliance with the License. *
11+
* You may obtain a copy of the License at *
12+
* *
13+
* http://www.apache.org/licenses/LICENSE-2.0 *
14+
* *
15+
* Unless required by applicable law or agreed to in writing, software *
16+
* distributed under the License is distributed on an "AS IS" BASIS, *
17+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
18+
* See the License for the specific language governing permissions and *
19+
* limitations under the License. *
20+
* *
21+
* SPDX-License-Identifier: Apache-2.0 *
22+
* ================================================================================================================== *
23+
* *
24+
* Context: *
25+
* * https://github.com/docker/login-action/issues/72 *
26+
* * https://github.com/actions/runner/issues/1478 *
27+
* ================================================================================================================== */
28+
const { spawn } = require("child_process");
29+
const { appendFileSync } = require("fs");
30+
const { EOL } = require("os");
31+
32+
/**
 * Run a shell command and mirror its outcome onto this process's exit code.
 *
 * The command is executed through the platform shell with stdio inherited, so
 * its output goes straight to the action log.
 *
 * @param {string} cmd - Shell command line (or multi-line script) to execute.
 * @returns {import("child_process").ChildProcess} The spawned child process,
 *   returned so callers may attach further listeners (previously nothing was
 *   returned; existing callers that ignore the result are unaffected).
 */
function run(cmd) {
  const subprocess = spawn(cmd, { stdio: "inherit", shell: true });
  subprocess.on("exit", (exitCode, signal) => {
    if (exitCode === null) {
      // The child was terminated by a signal, so there is no numeric exit code.
      // Assigning null to process.exitCode would make this step exit with 0 and
      // report a killed command as a success; force a failure instead.
      console.error(`Command terminated by signal ${signal}`);
      process.exitCode = 1;
      return;
    }
    process.exitCode = exitCode;
  });
  return subprocess;
}
38+
39+
// Name of the state variable (upper-cased `key` input) that marks the main
// step as already executed.
const stateKey = process.env.INPUT_KEY.toUpperCase();

// The same script serves as both `main` and `post` entry point; the presence of
// the STATE_<key> environment variable tells the two invocations apart.
const isPostStep = process.env[`STATE_${stateKey}`] !== undefined;

if (isPostStep) {
  run(process.env.INPUT_POST);
} else {
  // Main step: record the state first so the later invocation is recognised as
  // the post step, then execute the main command.
  appendFileSync(process.env.GITHUB_STATE, `${stateKey}=true${EOL}`);
  run(process.env.INPUT_MAIN);
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# syntax=docker/dockerfile:1-labs
2+
ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax
3+
ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git#main
4+
ARG SRC_PATH_AXLEARN=/opt/axlearn
5+
6+
###############################################################################
7+
## Download source and configure dependencies
8+
###############################################################################
9+
FROM ${BASE_IMAGE} AS mealkit
10+
ARG URLREF_AXLEARN
11+
ARG SRC_PATH_AXLEARN
12+
13+
RUN git-clone.sh "${URLREF_AXLEARN}" "${SRC_PATH_AXLEARN}"
14+
15+
# these packages are needed to run axlearn tests
16+
# https://github.com/apple/axlearn/blob/main/pyproject.toml as reference
17+
RUN <<"EOF" bash -ex
18+
echo "-e ${SRC_PATH_AXLEARN}" > /opt/pip-tools.d/requirements-axlearn.in
19+
cat <<REQUIREMENTS >> /opt/pip-tools.d/requirements-axlearn.in
20+
aqtp==0.8.2
21+
einops==0.8.0
22+
nltk==3.7
23+
portpicker==1.6.0
24+
seqio==0.0.18
25+
protobuf==3.20.3
26+
pytest>=7.4.3
27+
REQUIREMENTS
28+
EOF
29+
30+
31+
###############################################################################
32+
## Add test script to the path
33+
###############################################################################
34+
35+
ADD test-axlearn.sh /usr/local/bin/
36+
37+
###############################################################################
38+
## Install accumulated packages from the base image and the previous stage
39+
###############################################################################
40+
FROM mealkit AS final
41+
42+
RUN pip-finalize.sh
43+
44+
WORKDIR ${SRC_PATH_AXLEARN}

.github/container/pip-finalize.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@ if [[ $(echo -n "$unpinned_vcs_dependencies" | wc -l) -gt 0 ]]; then
4646
exit 1
4747
fi
4848

49+
# Replace any tensorflow==X with tensorflow-cpu==X in requirements.txt only on amd64
50+
if [ "$(uname -m)" = "x86_64" ]; then
51+
sed -i 's/^tensorflow==\([0-9.*]\+\)$/tensorflow-cpu==\1/' requirements.txt
52+
else
53+
echo "Skipping TF on $(uname -m)"
54+
fi
4955
# --no-deps is required since conflicts can still appear during pip-sync
5056
pip-sync --pip-args '--no-deps --src /opt' requirements.txt
5157

.github/container/test-axlearn.sh

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
#!/bin/bash
2+
3+
set -uo pipefail
4+
5+
# Print the command-line help text to stdout and terminate the script with
# exit status 1.
usage() {
    printf '%s\n' \
        "Run tests in axlearn with specified options." \
        "" \
        "Usage: $0 [OPTIONS]" \
        "" \
        " OPTIONS DESCRIPTION" \
        " -d, --directory DIR Directory to run tests in." \
        " Default: 'axlearn/axlearn/common'." \
        " -t, --test-files FILES Pattern for test files to run." \
        " Default: '*_test.py'." \
        " -o, --output DIRECTORY Output directory for logs and summary." \
        " Default: 'test_runs/<timestamp>'." \
        " -h, --help Show this help message and exit."
    exit 1
}
20+
21+
# Default values
22+
DIR='axlearn/axlearn/common'
23+
TEST_FILES=()
24+
OUTPUT_DIRECTORY=''
25+
26+
# Parse args manually
27+
while [[ $# -gt 0 ]]; do
28+
key="$1"
29+
case $key in
30+
-d|--directory)
31+
if [[ -z "$2" ]]; then
32+
echo "Error: --directory requires an argument."
33+
usage
34+
fi
35+
DIR="$2"
36+
shift 2
37+
;;
38+
-t|--test-files)
39+
shift
40+
# Collect all arguments until the next option (starting with '-')
41+
if [[ $# -eq 0 ]]; then
42+
echo "Error: --test-files requires at least one file pattern."
43+
usage
44+
fi
45+
echo "Option -t|--test-files with arguments:"
46+
while [[ $# -gt 0 && ! "$1" =~ ^- ]]; do
47+
echo " $1"
48+
TEST_FILES+=("$1")
49+
shift
50+
done
51+
;;
52+
-o|--output)
53+
if [[ -z "$2" ]]; then
54+
echo "Error: --output requires an argument."
55+
usage
56+
fi
57+
OUTPUT_DIRECTORY="$2"
58+
shift 2
59+
;;
60+
-h|--help)
61+
usage
62+
;;
63+
*)
64+
echo "Unknown option: $1"
65+
usage
66+
;;
67+
esac
68+
done
69+
70+
71+
# Fall back to a timestamped output directory when none was requested.
if [ -z "$OUTPUT_DIRECTORY" ]; then
    timestamp=$(date +%Y%m%d_%H%M%S)
    OUTPUT_DIRECTORY="test_runs/${timestamp}"
fi

# Anchor a relative output directory to the current working directory. The
# script changes into the test directory before any log or summary file is
# written, so a relative path (including the default above) would otherwise
# point at a location that was never created.
case "$OUTPUT_DIRECTORY" in
    /*) ;;
    *) OUTPUT_DIRECTORY="${PWD}/${OUTPUT_DIRECTORY}" ;;
esac
LOG_DIRECTORY="${OUTPUT_DIRECTORY}/logs"

mkdir -p "${LOG_DIRECTORY}"
78+
79+
# Echo the effective configuration so the run can be sanity-checked from the log.
printf '%s\n' "Configuration:"
printf ' Directory: %s\n' "$DIR"
if (( ${#TEST_FILES[@]} > 0 )); then
    printf '%s\n' " Test Files:"
    for requested_pattern in "${TEST_FILES[@]}"; do
        printf ' %s\n' "$requested_pattern"
    done
else
    printf '%s\n' " Test Files Pattern: '*_test.py' (default)"
fi
printf ' Output Directory: %s\n' "$OUTPUT_DIRECTORY"

# Everything below runs from inside the test directory; give up if it is missing.
if ! cd "$DIR"; then
    exit 1
fi

printf '%s\n' "Running tests..."

# Extra packages installed at run time before the tests start
# (CPU-only torch/torchvision wheels from the PyTorch index).
pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
pip install timm transformers scikit-learn
98+
99+
100+
# Fall back to the default pattern when no test files were requested.
if [ "${#TEST_FILES[@]}" -eq 0 ]; then
    TEST_FILES=("*_test.py")
fi

# Expand every requested pattern into concrete file names.
# nullglob makes a glob that matches nothing expand to an empty list; without
# it the pattern is kept as a literal word, so the "no files matched" warning
# and the "no test files" exit below could never trigger. Words without glob
# characters (explicit file names) are kept as-is either way.
expanded_test_files=()
shopt -s nullglob
for pattern in "${TEST_FILES[@]}"; do
    # Intentionally unquoted: the shell has to glob-expand the pattern here.
    files=( $pattern )
    if [ "${#files[@]}" -gt 0 ]; then
        expanded_test_files+=( "${files[@]}" )
    else
        echo "Warning: No files matched pattern '$pattern'"
    fi
done
shopt -u nullglob

if [ "${#expanded_test_files[@]}" -eq 0 ]; then
    echo "No test files found to run."
    exit 1
fi
119+
120+
# Optional exclusion list: one test file name (or path) per line; any test file
# whose basename equals the basename of a listed entry is skipped.
# The script has already changed into the test directory, so the list is looked
# up in the current directory; prefixing a relative "$DIR" again would apply the
# directory twice and the file would never be found.
EXCLUDE_LIST_FILE="${PWD}/exclusion_list.txt"
EXCLUDE_PATTERNS=()

if [ -f "$EXCLUDE_LIST_FILE" ]; then
    echo "Reading exclusion list from '$EXCLUDE_LIST_FILE'"
    mapfile -t EXCLUDE_PATTERNS < "$EXCLUDE_LIST_FILE"
else
    echo "Exclusion list file not found at '$EXCLUDE_LIST_FILE'"
fi

final_test_files=()

for test_file in "${expanded_test_files[@]}"; do
    exclude=false
    # Computed once per test file rather than once per exclusion entry.
    test_basename="$(basename "$test_file")"
    for pattern in "${EXCLUDE_PATTERNS[@]}"; do
        if [[ "$test_basename" == "$(basename "$pattern")" ]]; then
            exclude=true
            break
        fi
    done
    if [ "$exclude" = false ]; then
        final_test_files+=("$test_file")
    fi
done
145+
146+
# Tallies of passed/failed test files and the per-file summary location.
failures=0
passed=0
SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt"

for test_file in "${final_test_files[@]}"; do
    echo "Running: ${test_file}"
    # Flatten the path into one file name: drop ".py", turn every "/" into "__".
    log_stem="${test_file%.py}"
    log_file_name="${log_stem//\//__}.log"
    log_file="${LOG_DIRECTORY}/${log_file_name}"
    # Run the test file, mirroring pytest's output to the console and the log.
    pytest "${test_file}" --capture=tee-sys | tee "${log_file}"
    # PIPESTATUS[0] is the status of pytest itself, not of tee.
    exit_code=${PIPESTATUS[0]}
    echo "${exit_code}"
    # Record the outcome in the summary and update the tallies.
    if (( exit_code == 0 )); then
        echo "${test_file}: PASSED" >> "${SUMMARY_FILE}"
        passed=$((passed + 1))
    else
        echo "${test_file}: FAILED (Exit code: $exit_code)" >> "${SUMMARY_FILE}"
        failures=$((failures + 1))
    fi
    echo ""
done

0 commit comments

Comments
 (0)