NVIDIA
diff --git a/‎.github/actions/store-delete-k8s-ghcr/action.yml‎
Lines changed: 27 additions & 0 deletions b/‎.github/actions/store-delete-k8s-ghcr/action.yml‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎.github/actions/submit-delete-k8s-job/action.yml‎
Lines changed: 37 additions & 0 deletions b/‎.github/actions/submit-delete-k8s-job/action.yml‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎.github/actions/with-post-step/action.yml‎
Lines changed: 42 additions & 0 deletions b/‎.github/actions/with-post-step/action.yml‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎.github/actions/with-post-step/main.js‎
Lines changed: 46 additions & 0 deletions b/‎.github/actions/with-post-step/main.js‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎.github/container/Dockerfile.axlearn‎
Lines changed: 44 additions & 0 deletions b/‎.github/container/Dockerfile.axlearn‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎.github/container/pip-finalize.sh‎
Lines changed: 6 additions & 0 deletions b/‎.github/container/pip-finalize.sh‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎.github/container/test-axlearn.sh‎
Lines changed: 169 additions & 0 deletions b/‎.github/container/test-axlearn.sh‎
Lines changed: 169 additions & 0 deletions
@@ -0,0 +1,27 @@
+name: Store & Delete GHCR Token
+description: Store and Delete the docker credentials for pulling from GHCR
+
+outputs:
+  token-name:
+    description: Name of the K8s secret to delete
+    value: ${{ steps.token.outputs.token-name }}
+
+runs:
+  using: "composite"
+  steps:
+    # Build the secret name from $RANDOM plus the run id and run attempt, and
+    # publish it as the `token-name` step output.
+    - name: Generate a UUID token 
+      shell: bash 
+      id: token
+      run: | 
+        echo "token-name=${RANDOM}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" >> $GITHUB_OUTPUT
+    # The with-post-step action runs `main` as the step body (creating the
+    # secret) and runs `post` in its post step (deleting the same secret).
+    - name: Delete GitHub Container Registry token
+      uses: ./.github/actions/with-post-step
+      with: 
+        main: | 
+          # Store GitHub Container Registry token as Kubernetes secret
+          kubectl create secret generic \
+          ${{ steps.token.outputs.token-name }} \
+          --from-file=.dockerconfigjson=$HOME/.docker/config.json \
+          --type=kubernetes.io/dockerconfigjson
+        post: |
+          kubectl delete secret ${{ steps.token.outputs.token-name }}
@@ -0,0 +1,37 @@
+name: Submit & Delete K8s Job
+description: Submit and delete a K8s job after its execution
+
+inputs:
+  job-name:
+    description: The job name
+    required: true
+  job-config-file:
+    description: Path to the Kubernetes job YAML
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+    - name: Submit and Delete Kubernetes job
+      uses: ./.github/actions/with-post-step 
+      with: 
+        main: |
+          echo "Submit K8s job" 
+          kubectl apply -f "${{ inputs.job-config-file }}"
+          
+          # Wait for job to be created
+          kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60s
+          
+          # Wait for job to be unsuspended
+          kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ inputs.job-name }} --timeout=7200s
+          
+          # Wait for pods to be running
+          kubectl wait --for=condition=Ready \
+            --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} \
+            --timeout=600s pod
+          
+          # Stream logs
+          kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }}
+          
+        post: | 
+          kubectl delete -f "${{ inputs.job-config-file }}"
@@ -0,0 +1,42 @@
+# ==================================================================================================================== #
+# Authors:                                                                                                             #
+#   Patrick Lehmann                                                                                                    #
+#   Unai Martinez-Corral                                                                                               #
+#                                                                                                                      #
+# ==================================================================================================================== #
+# Copyright 2020-2024 The pyTooling Authors                                                                            #
+#                                                                                                                      #
+# Licensed under the Apache License, Version 2.0 (the "License");                                                      #
+# you may not use this file except in compliance with the License.                                                     #
+# You may obtain a copy of the License at                                                                              #
+#                                                                                                                      #
+#   http://www.apache.org/licenses/LICENSE-2.0                                                                         #
+#                                                                                                                      #
+# Unless required by applicable law or agreed to in writing, software                                                  #
+# distributed under the License is distributed on an "AS IS" BASIS,                                                    #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.                                             #
+# See the License for the specific language governing permissions and                                                  #
+# limitations under the License.                                                                                       #
+#                                                                                                                      #
+# SPDX-License-Identifier: Apache-2.0                                                                                  #
+# ==================================================================================================================== #
+name: With post step
+
+description: 'Generic JS Action to execute a main command and set a command as a post step.'
+
+inputs:
+  main:
+    description: 'Main command/script.'
+    required: true
+  post:
+    description: 'Post command/script.'
+    required: true
+  key:
+    description: 'Name of the state variable used to detect the post step.'
+    required: false
+    default: POST
+
+runs:
+  using: 'node20'
+  main: 'main.js'
+  post: 'main.js'
@@ -0,0 +1,46 @@
+/* ================================================================================================================== *
+ * Authors:                                                                                                           *
+ *   Unai Martinez-Corral                                                                                             *
+ *                                                                                                                    *
+ * ================================================================================================================== *
+ * Copyright 2021-2022 Unai Martinez-Corral <[email protected]>                                             *
+ * Copyright 2022 Unai Martinez-Corral <[email protected]>                                                 *
+ *                                                                                                                    *
+ * Licensed under the Apache License, Version 2.0 (the "License");                                                    *
+ * you may not use this file except in compliance with the License.                                                   *
+ * You may obtain a copy of the License at                                                                            *
+ *                                                                                                                    *
+ *     http://www.apache.org/licenses/LICENSE-2.0                                                                     *
+ *                                                                                                                    *
+ * Unless required by applicable law or agreed to in writing, software                                                *
+ * distributed under the License is distributed on an "AS IS" BASIS,                                                  *
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.                                           *
+ * See the License for the specific language governing permissions and                                                *
+ * limitations under the License.                                                                                     *
+ *                                                                                                                    *
+ * SPDX-License-Identifier: Apache-2.0                                                                                *
+ * ================================================================================================================== *
+ *                                                                                                                    *
+ * Context:                                                                                                           *
+ * * https://github.com/docker/login-action/issues/72                                                                 *
+ * * https://github.com/actions/runner/issues/1478                                                                    *
+ * ================================================================================================================== */
+const { spawn } = require("child_process");
+const { appendFileSync } = require("fs");
+const { EOL } = require("os");
+
+function run(cmd) {
+  const subprocess = spawn(cmd, { stdio: "inherit", shell: true });
+  subprocess.on("exit", (exitCode) => {
+    process.exitCode = exitCode;
+  });
+}
+
+const key = process.env.INPUT_KEY.toUpperCase();
+
+if ( process.env[`STATE_${key}`] !== undefined ) { // Are we in the 'post' step?
+  run(process.env.INPUT_POST);
+} else { // Otherwise, this is the main step
+  appendFileSync(process.env.GITHUB_STATE, `${key}=true${EOL}`);
+  run(process.env.INPUT_MAIN);
+}
@@ -0,0 +1,44 @@
+# syntax=docker/dockerfile:1-labs
+ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax
+ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git#main
+ARG SRC_PATH_AXLEARN=/opt/axlearn
+
+###############################################################################
+## Download source and configure dependencies
+###############################################################################
+FROM ${BASE_IMAGE} AS mealkit
+ARG URLREF_AXLEARN
+ARG SRC_PATH_AXLEARN
+
+RUN git-clone.sh "${URLREF_AXLEARN}" "${SRC_PATH_AXLEARN}"
+
+# these packages are needed to run axlearn tests
+# https://github.com/apple/axlearn/blob/main/pyproject.toml as reference
+RUN <<"EOF" bash -ex
+  echo "-e ${SRC_PATH_AXLEARN}" > /opt/pip-tools.d/requirements-axlearn.in
+  cat <<REQUIREMENTS >> /opt/pip-tools.d/requirements-axlearn.in
+aqtp==0.8.2
+einops==0.8.0
+nltk==3.7
+portpicker==1.6.0
+seqio==0.0.18
+protobuf==3.20.3  
+pytest>=7.4.3
+REQUIREMENTS
+EOF
+
+
+###############################################################################
+## Add test script to the path
+###############################################################################
+
+# COPY is used instead of ADD: the script is a plain local file, so none of
+# ADD's extra behaviours (remote URL fetch, archive auto-extraction) are needed.
+COPY test-axlearn.sh /usr/local/bin/
+
+###############################################################################
+## Install accumulated packages from the base image and the previous stage
+###############################################################################
+FROM mealkit AS final
+
+RUN pip-finalize.sh
+
+WORKDIR ${SRC_PATH_AXLEARN}
@@ -46,6 +46,12 @@ if [[ $(echo -n "$unpinned_vcs_dependencies" | wc -l) -gt 0 ]]; then
   exit 1
 fi
 
+# Replace any tensorflow==X with tensorflow-cpu==X in requirements.txt only on amd64
+if [ "$(uname -m)" = "x86_64" ]; then
+  # Only whole lines of the exact form `tensorflow==<version>` are rewritten
+  # (version limited to digits, dots and `*`); the file is edited in place.
+  sed -i 's/^tensorflow==\([0-9.*]\+\)$/tensorflow-cpu==\1/' requirements.txt
+else
+  # Any other architecture: requirements.txt is left untouched, only a log line.
+  echo "Skipping TF on $(uname -m)"
+fi
 # --no-deps is required since conflicts can still appear during pip-sync
 pip-sync --pip-args '--no-deps --src /opt' requirements.txt
 
 
@@ -0,0 +1,169 @@
+#!/bin/bash
+
+set -uo pipefail
+
+# Print the command-line help text to stdout, then terminate the script with
+# exit status 1. Used both for --help and for argument errors.
+usage() {
+    cat <<EOF
+Run tests in axlearn with specified options.
+
+Usage: $0 [OPTIONS]
+
+  OPTIONS                       DESCRIPTION
+  -d, --directory DIR           Directory to run tests in.
+                                Default: 'axlearn/axlearn/common'.
+  -t, --test-files FILES        Pattern for test files to run.
+                                Default: '*_test.py'.
+  -o, --output DIRECTORY        Output directory for logs and summary.
+                                Default: 'test_runs/<timestamp>'.
+  -h, --help                    Show this help message and exit.
+EOF
+    exit 1
+}
+
+# Default values
+DIR='axlearn/axlearn/common'
+TEST_FILES=()
+OUTPUT_DIRECTORY=''
+
+# Parse args manually.
+# Option values are tested as "${2:-}": the script runs with `set -u`, so a
+# bare "$2" would abort with an "unbound variable" error when an option is the
+# last argument, and the friendly error message below would never be shown.
+while [[ $# -gt 0 ]]; do
+    key="$1"
+    case $key in
+        -d|--directory)
+            if [[ -z "${2:-}" ]]; then
+                echo "Error: --directory requires an argument."
+                usage
+            fi
+            DIR="$2"
+            shift 2
+            ;;
+        -t|--test-files)
+            shift
+            # Collect all arguments until the next option (starting with '-')
+            if [[ $# -eq 0 ]]; then
+                echo "Error: --test-files requires at least one file pattern."
+                usage
+            fi
+            echo "Option -t|--test-files with arguments:"
+            while [[ $# -gt 0 && ! "$1" =~ ^- ]]; do
+                echo "  $1"
+                TEST_FILES+=("$1")
+                shift
+            done
+            ;;
+        -o|--output)
+            if [[ -z "${2:-}" ]]; then
+                echo "Error: --output requires an argument."
+                usage
+            fi
+            OUTPUT_DIRECTORY="$2"
+            shift 2
+            ;;
+        -h|--help)
+            usage
+            ;;
+        *)
+            echo "Unknown option: $1"
+            usage
+            ;;
+    esac
+done
+
+
+# Fall back to a timestamped directory when no output directory was given.
+if [ -z "$OUTPUT_DIRECTORY" ]; then
+    timestamp=$(date +%Y%m%d_%H%M%S)
+    OUTPUT_DIRECTORY="test_runs/${timestamp}"
+fi
+
+mkdir -p "${OUTPUT_DIRECTORY}/logs" || exit 1
+
+# Resolve the output directory to an absolute path now: the script changes
+# into the test directory below, after which a relative path would no longer
+# refer to the directory that was just created, and writing the logs and the
+# summary would fail.
+OUTPUT_DIRECTORY="$(cd "${OUTPUT_DIRECTORY}" && pwd)" || exit 1
+LOG_DIRECTORY="${OUTPUT_DIRECTORY}/logs"
+
+# Print out config for sanity check
+echo "Configuration:"
+echo "  Directory: $DIR"
+if [ "${#TEST_FILES[@]}" -gt 0 ]; then
+    echo "  Test Files:"
+    for f in "${TEST_FILES[@]}"; do
+        echo "    $f"
+    done
+else
+    echo "  Test Files Pattern: '*_test.py' (default)"
+fi
+echo "  Output Directory: $OUTPUT_DIRECTORY"
+
+cd "$DIR" || exit 1
+
+echo "Running tests..."
+
+# Extra packages installed at test time, before any test file is run.
+pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
+pip install timm transformers scikit-learn
+
+# Use the default pattern when no -t/--test-files arguments were given.
+if [ "${#TEST_FILES[@]}" -eq 0 ]; then
+    TEST_FILES=("*_test.py")
+fi
+
+expanded_test_files=()
+# With nullglob a pattern that matches nothing expands to zero words. Without
+# it the literal pattern is kept, so the "No files matched" warning and the
+# "No test files found" exit below could never trigger. Words without glob
+# characters (plain file names) are unaffected and are kept as given.
+shopt -s nullglob
+for pattern in "${TEST_FILES[@]}"; do
+    # Intentionally unquoted so the shell expands the pattern into file names.
+    files=( $pattern )
+    if [ "${#files[@]}" -gt 0 ]; then
+        expanded_test_files+=( "${files[@]}" )
+    else
+        echo "Warning: No files matched pattern '$pattern'"
+    fi
+done
+shopt -u nullglob
+
+if [ "${#expanded_test_files[@]}" -eq 0 ]; then
+    echo "No test files found to run."
+    exit 1
+fi
+
+# in case we have the exclusion list file
+# The script has already changed into "$DIR", so the list is looked up in the
+# current directory. Prefixing a relative "$DIR" again would point at a nested
+# path that does not exist, and the exclusion list would silently be ignored.
+EXCLUDE_LIST_FILE="${PWD}/exclusion_list.txt"
+EXCLUDE_PATTERNS=()
+
+if [ -f "$EXCLUDE_LIST_FILE" ]; then
+    echo "Reading exclusion list from '$EXCLUDE_LIST_FILE'"
+    # One entry per line; trailing newlines are stripped by -t.
+    mapfile -t EXCLUDE_PATTERNS < "$EXCLUDE_LIST_FILE"
+else
+    echo "Exclusion list file not found at '$EXCLUDE_LIST_FILE'"
+fi
+
+# Keep every expanded test file whose base name does not equal the base name
+# of any exclusion entry (exact string comparison, no pattern matching).
+final_test_files=()
+
+for test_file in "${expanded_test_files[@]}"; do
+    # Computed once per test file instead of once per exclusion entry.
+    test_name=$(basename "$test_file")
+    exclude=false
+    for pattern in "${EXCLUDE_PATTERNS[@]}"; do
+        if [[ "$test_name" == "$(basename "$pattern")" ]]; then
+            exclude=true
+            break
+        fi
+    done
+    if [ "$exclude" = false ]; then
+        final_test_files+=("$test_file")
+    fi
+done
+
+# Bookkeeping: counts of passing/failing test files and the summary file path.
+passed=0
+failures=0
+SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt"
+
+for test_file in "${final_test_files[@]}"; do
+    echo "Running: ${test_file}"
+    # Log name: the test path without ".py", with every "/" replaced by "__".
+    log_stem="${test_file%.py}"
+    log_file="${LOG_DIRECTORY}/${log_stem//\//__}.log"
+    # Show pytest output live while also saving it to the log file; the status
+    # of pytest itself (not of tee) is taken from PIPESTATUS.
+    pytest "${test_file}" --capture=tee-sys | tee "${log_file}"
+    exit_code=${PIPESTATUS[0]}
+    echo "$exit_code"
+    # Record the per-file verdict in the summary and bump the matching counter.
+    if (( exit_code == 0 )); then
+        echo "${test_file}: PASSED" >> "${SUMMARY_FILE}"
+        passed=$((passed + 1))
+    else
+        echo "${test_file}: FAILED (Exit code: $exit_code)" >> "${SUMMARY_FILE}"
+        failures=$((failures + 1))
+    fi
+    echo ""
+done