diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index 678e22439..45ee34f4c 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -24,7 +24,6 @@ runs: echo "Submit K8s job" kubectl apply -f "${{ inputs.job-config-file }}" kubectl get event | grep ${{ inputs.job-name }} - # Wait for job to be created kubectl wait --for=create job/${{ inputs.job-name }} --timeout=$TIMEOUT_JOB_CREATION diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 276a0bda2..4be031003 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -25,6 +25,8 @@ seqio==0.0.18 protobuf==3.20.3 pytest>=7.4.3 tensorflow==2.18.1 +pytest-xdist +pytest-reportlog REQUIREMENTS EOF diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index d1993cc03..9cefa64ce 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -2,7 +2,9 @@ set -uo pipefail +# HELPER FUNCTIONS usage() { + # Function to handle all the inputs echo "Run tests in axlearn with specified options." 
echo "" echo "Usage: $0 [OPTIONS]" @@ -18,12 +20,30 @@ usage() { exit 1 } -# Default values -DIR='axlearn/axlearn/common' +run_tests() { + # Function to run tests for AXLearn + local env_spec=$1 + local marker=$2 + local suffix=$3 + shift 3 + local -a test_files=("$@") + + local junit="log_${suffix}.xml" + local log="log_${suffix}.log" + + cmd="${env_spec:+${env_spec} }pytest -m \"${marker}\" ${test_files[@]}\ + --capture=tee-sys -v \ + --junit-xml=${LOG_DIRECTORY}/${junit} | tee ${LOG_DIRECTORY}/${log}" + echo "Running command ${cmd}" + eval "${cmd}" +} + +# DEFAULT VALUES +DIR='/opt/axlearn/axlearn/common' TEST_FILES=() OUTPUT_DIRECTORY='' -# Parse args manually +# INPUT PARSING while [[ $# -gt 0 ]]; do key="$1" case $key in @@ -66,19 +86,15 @@ while [[ $# -gt 0 ]]; do ;; esac done - - +cd "$DIR" if [ -z "$OUTPUT_DIRECTORY" ]; then timestamp=$(date +%Y%m%d_%H%M%S) - OUTPUT_DIRECTORY="test_runs/${timestamp}" + OUTPUT_DIRECTORY="output/${timestamp}" fi LOG_DIRECTORY="${OUTPUT_DIRECTORY}/logs" mkdir -p "${LOG_DIRECTORY}" -# Print out config for sanity check -echo "Configuration:" -echo " Directory: $DIR" if [ "${#TEST_FILES[@]}" -gt 0 ]; then echo " Test Files:" for f in "${TEST_FILES[@]}"; do @@ -87,16 +103,18 @@ if [ "${#TEST_FILES[@]}" -gt 0 ]; then else echo " Test Files Pattern: '*_test.py' (default)" fi -echo " Output Directory: $OUTPUT_DIRECTORY" - -cd "$DIR" || exit 1 - -echo "Running tests..." +# DEPENDENCIES pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu -pip install timm transformers scikit-learn - - +pip install timm transformers scikit-learn grain evaluate prefixed wandb +echo "Downloading input data..." 
+mkdir -p /opt/axlearn/axlearn/data/tokenizers/sentencepiece +mkdir -p /opt/axlearn/axlearn/data/tokenizers/bpe +curl --fail -L https://huggingface.co/t5-base/resolve/main/spiece.model -o /opt/axlearn/axlearn/data/tokenizers/sentencepiece/t5-base +curl --fail -L https://huggingface.co/FacebookAI/roberta-base/raw/main/merges.txt -o /opt/axlearn/axlearn/data/tokenizers/bpe/roberta-base-merges.txt +curl --fail -L https://huggingface.co/FacebookAI/roberta-base/raw/main/vocab.json -o /opt/axlearn/axlearn/data/tokenizers/bpe/roberta-base-vocab.json + +# RETRIEVE TEST FILES if [ "${#TEST_FILES[@]}" -eq 0 ]; then TEST_FILES=("*_test.py") fi @@ -117,53 +135,110 @@ if [ "${#expanded_test_files[@]}" -eq 0 ]; then exit 1 fi -# in case we have the exclusion list file -EXCLUDE_LIST_FILE="$DIR/exclusion_list.txt" -EXCLUDE_PATTERNS=() - -if [ -f "$EXCLUDE_LIST_FILE" ]; then - echo "Reading exclusion list from '$EXCLUDE_LIST_FILE'" - mapfile -t EXCLUDE_PATTERNS < "$EXCLUDE_LIST_FILE" -else - echo "Exclusion list file not found at '$EXCLUDE_LIST_FILE'" -fi - +EXCLUDE_PATTERNS=("array_serialization_test.py" + "t5_test.py" # tensorflow bug + "input_t5_test.py" + "layers_test.py" # tensorflow bug + "checkpointer_orbax_test.py" + "checkpointer_orbax_emergency_test.py" + "checkpointer_test.py" + "input_glue_test.py" + "deberta_test.py" + "orbax_checkpointer" + "loss_test.py" # optax bug + "quantizer_test.py" + "test_utils_test.py" + "update_transformation_test.py" + "env_test.py" + "causal_lm_test.py" + "gradient_accumulation_test.py" + "file_system_test.py" + "compiler_options_test.py" # tpu only + "metrics_correlation_test.py" # manual only + "metrics_glue_test.py" + "ssm_test.py" # test on ssm + "summary_test.py" # wandb test + "param_converter_test.py" + "attention_test.py" # assertion errors to fix + # run these as part of the for_8_devices: + "gda_test.py" + "input_base_test.py" + "input_dispatch_test.py" + "trainer_test.py" + "utils_test.py" + ) final_test_files=() -for test_file in 
"${expanded_test_files[@]}"; do - exclude=false - for pattern in "${EXCLUDE_PATTERNS[@]}"; do +for test_file in "${expanded_test_files[@]}"; do + exclude=false + for pattern in "${EXCLUDE_PATTERNS[@]}"; do if [[ "$(basename "$test_file")" == "$(basename "$pattern")" ]]; then - exclude=true - break - fi - done - if [ "$exclude" = false ]; then + exclude=true + break + fi + done + if [ "$exclude" = false ]; then final_test_files+=("$test_file") - fi + fi done -# Initialize counters for test -failures=0 -passed=0 -SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" - - -for test_file in "${final_test_files[@]}"; do - echo "Running: ${test_file}" - log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log - log_file="${LOG_DIRECTORY}/${log_file_name}" - # run the tests and save them as *.log - pytest "${test_file}" --capture=tee-sys | tee "${log_file}" - exit_code=${PIPESTATUS[0]} - echo $exit_code - # write number of tests passed and failed - if [ $exit_code -eq 0 ]; then - echo "${test_file}: PASSED" >> "${SUMMARY_FILE}" - ((passed++)) + +# RUN TESTS +TEST_8_DEVICES_FILES=("gda_test.py" + "input_base_test.py" + "input_dispatch_test.py" + "trainer_test.py" + "utils_test.py" +) +TEST_8_DEVICES_WITH_PATHS=() +for file in "${TEST_8_DEVICES_FILES[@]}"; do + found_files=$(find . 
-name "$file" -type f 2>/dev/null) + if [[ -n "$found_files" ]]; then + while IFS= read -r found_file; do + TEST_8_DEVICES_WITH_PATHS+=("$found_file") + done <<< "$found_files" else - echo "${test_file}: FAILED (Exit code: $exit_code)" >> "${SUMMARY_FILE}" - ((failures++)) + echo "Warning: Test file $file not found in current directory structure" fi - echo "" done + +run_tests "" "for_8_devices" "8_dev" "${TEST_8_DEVICES_WITH_PATHS[@]}" +# All the other tests +runs=( + "|not (gs_login or tpu or high_cpu or fp64 or for_8_devices)|base" + "JAX_ENABLE_X64=1|fp64|fp64" +) +for spec in "${runs[@]}"; do + IFS='|' read -r env_spec marker suffix <<< "${spec}" + echo "Running tests with ${env_spec}, ${marker}, ${suffix}" + run_tests "${env_spec}" "${marker}" "${suffix}" "${final_test_files[@]}" + echo "Test run" +done + +# SUMMARY STATUS +passed=0 +failed=0 +# NOTE: pytest "error" outcomes are counted into "failed" below +skipped=0 +for log in "${LOG_DIRECTORY}"/log_*.log; do + count_pass=$(grep -Eo '[0-9]+ passed' "${log}" | awk '{print $1}' || true) + count_fail=$(grep -Eo '[0-9]+ failed' "${log}" | awk '{print $1}' || true) + count_error=$(grep -Eo '[0-9]+ error' "${log}" | awk '{print $1}' || true) + count_skipped=$(grep -Eo '[0-9]+ skipped' "${log}" | awk '{print $1}' || true) + # in case of None + count_pass=${count_pass:-0} + count_fail=${count_fail:-0} + count_error=${count_error:-0} + count_skipped=${count_skipped:-0} + # count all the tests + (( passed += count_pass )) + (( failed += count_fail )) + (( failed += count_error )) + (( skipped += count_skipped )) +done + +echo "Total number of passed tests ${passed}" +echo "Total number of failed tests ${failed}" +echo "Total number of skipped tests ${skipped}" +# add those to summary.txt and we're using it for extracting values +echo "PASSED: ${passed} FAILED: ${failed} SKIPPED: ${skipped}" >> "${LOG_DIRECTORY}/summary.txt" diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index e027a434e..24e7ca8e7 100644 --- 
a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -21,12 +21,8 @@ spec: LOG_DIR="/output/${RUN_ID}" mkdir -p ${LOG_DIR} - - # Start MPS daemon - nvidia-cuda-mps-control -d - # Run tests - pytest-xdist.sh 1 6 ${LOG_DIR}/axlearn-unittests.jsonl test-axlearn.sh --directory "." --output ${LOG_DIR} --test-files "/opt/axlearn/axlearn/common/*_test.py" | tee -a ${LOG_DIR}/pytest_stdout.log - + # test on JAX, make sure 8 devices are visible + pytest-xdist.sh 8 4 ${LOG_DIR}/axlearn-unittests.jsonl test-axlearn.sh --directory "." --output ${LOG_DIR} --test-files "/opt/axlearn/axlearn/common/*_test.py" env: - name: RUN_ID value: PLACEHOLDER diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 6f9b9f4d9..86517a405 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -26,6 +26,11 @@ on: description: 'A JSON object containing git url+refs for softwares to be built' required: false default: '{}' + MODE: + type: string + description: 'Mode selection for running specific tests only' + required: false + default: full outputs: DOCKER_TAGS: description: 'JSON object containing tags of all docker images built' @@ -203,7 +208,12 @@ jobs: test-jax: needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + if: >- + inputs.ARCHITECTURE == 'amd64' && + ( + inputs.MODE == 'full' || + inputs.MODE == 'jax' + ) uses: ./.github/workflows/_test_unit.yaml with: TEST_NAME: jax @@ -242,7 +252,12 @@ jobs: test-nsys-jax: needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + if: >- + inputs.ARCHITECTURE == 'amd64' && + ( + inputs.MODE == 'full' || + inputs.MODE == 'jax' + ) uses: ./.github/workflows/_test_unit.yaml with: TEST_NAME: nsys-jax @@ -345,7 +360,12 @@ jobs: # not already have nsys-jax installed test-nsys-jax-archive: needs: test-nsys-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + if: >- + inputs.ARCHITECTURE == 'amd64' && + ( + 
inputs.MODE == 'full' || + inputs.MODE == 'jax' + ) strategy: matrix: os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] @@ -373,7 +393,12 @@ jobs: test-nsys-jax-eks: needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + if: >- + inputs.ARCHITECTURE == 'amd64' && + ( + inputs.MODE == 'full' || + inputs.MODE == 'jax' + ) runs-on: eks env: JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} @@ -451,7 +476,12 @@ jobs: test-te-h100: needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' + if: >- + inputs.ARCHITECTURE == 'amd64' && + ( + inputs.MODE == 'full' || + inputs.MODE == 'te' + ) uses: ./.github/workflows/_transformer_engine_eks.yaml with: JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} @@ -463,7 +493,12 @@ jobs: test-te-unit-a100: needs: build-jax secrets: inherit - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + if: >- + inputs.ARCHITECTURE == 'amd64' && + ( + inputs.MODE == 'full' || + inputs.MODE == 'te' + ) uses: ./.github/workflows/_test_unit.yaml with: TEST_NAME: te @@ -511,9 +546,15 @@ jobs: pytest-report-L0-unittest.jsonl pytest-report-L0-distributed-unittest.jsonl pytest-report-L1-distributed-unittest.jsonl + test-rosetta-t5x: needs: build-rosetta-t5x - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + if: >- + inputs.ARCHITECTURE == 'amd64' && + ( + inputs.MODE == 'full' || + inputs.MODE == 't5x' + ) uses: ./.github/workflows/_test_t5x_rosetta.yaml with: T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} @@ -521,7 +562,12 @@ jobs: test-maxtext: needs: build-maxtext - if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + if: >- + inputs.ARCHITECTURE == 'amd64' && + ( + inputs.MODE == 'full' || + inputs.MODE == 'maxtext' + ) uses: ./.github/workflows/_test_maxtext.yaml with: MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} @@ -529,7 +575,12 @@ jobs: test-axlearn-eks: needs: build-axlearn - if: inputs.ARCHITECTURE == 'amd64' + if: >- + inputs.ARCHITECTURE 
== 'amd64' && + ( + inputs.MODE == 'full' || + inputs.MODE == 'axlearn' + ) runs-on: eks env: AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} @@ -565,16 +616,19 @@ jobs: id: log-s3 run: | mkdir -p axlearn-output - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/ - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/ axlearn-output/ --recursive --exclude "*" --include "*.log" - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/axlearn-unittests.jsonl axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/summary.txt axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.log" + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.xml" - passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true) - failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true) - total_tests=$((failed_tests + passed_tests)) + + passed_tests=$(grep -Eo 'PASSED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' | tail -n1 || echo 0) + failed_tests=$(grep -Eo 'FAILED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' | tail -n1 || echo 0) + skipped_tests=$(grep -Eo 'SKIPPED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' | tail -n1 || echo 0) + total_tests=$((failed_tests + passed_tests + skipped_tests)) echo "Passed tests: $passed_tests" echo "Failed tests: $failed_tests" + echo "Skipped tests: $skipped_tests" echo "Total tests: $total_tests" echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT @@ -626,7 +680,12 @@ jobs: test-axlearn-fuji-models-eks: needs: build-axlearn - if: inputs.ARCHITECTURE == 'amd64' + if: >- + inputs.ARCHITECTURE == 'amd64' && + ( + inputs.MODE == 'full' || + inputs.MODE == 'axlearn' + ) runs-on: eks env: 
AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 004cacc59..a2294fc9c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -40,6 +40,18 @@ on: PACKAGE∊{JAX,XLA,Flax,transformer-engine,airio,axlearn,equinox,T5X,maxtext,google-jetstream} (case-insensitive) default: '' required: false + MODE: + type: choice + description: | + This option is to run just specific part in the _ci step. + - full - everything will be run (default) + - jax - run all the tests related to jax system + - te - run all the tests related to TE + - t5x - run build rosetta + - maxtext - run only the tests for maxtext + - axlearn - run only the tests for axlearn + options: [full, jax, te, t5x, maxtext, axlearn] + default: full concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -197,6 +209,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} + MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit arm64: @@ -208,6 +221,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} + MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit # Only merge if everything succeeds