Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/container/Dockerfile.axlearn
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# syntax=docker/dockerfile:1-labs
ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax
ARG URLREF_AXLEARN=https://github.com/Steboss/axlearn.git#main
ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git
ARG SRC_PATH_AXLEARN=/opt/axlearn

###############################################################################
Expand Down
5 changes: 5 additions & 0 deletions .github/container/manifest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -136,3 +136,8 @@ pathwaysutils:
tracking_ref: main
latest_verified_commit: 359776d454940ffaa337c36d1df16308d44a95a9
mode: pip-vcs
axlearn:
url: https://github.com/Steboss/axlearn.git
tracking_ref: sbosisio/tree_util
latest_verified_commit:
mode: git-clone
2 changes: 2 additions & 0 deletions .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ spec:
mkdir -p ${TRAINER_DIR}
OUTPUT_LOG_FILE=${TRAINER_DIR}/output.log

export XLA_FLAGS="--xla_gpu_enable_command_buffer="

python3 /usr/local/bin/fuji-train-perf.py \
--module=text.gpt.c4_trainer \
--config=${CONFIG} \
Expand Down
4 changes: 3 additions & 1 deletion .github/workflows/_ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ jobs:
CONTAINER_NAME: axlearn
DOCKERFILE: .github/container/Dockerfile.axlearn
RUNNER_SIZE: large
EXTRA_BUILD_ARGS: |
URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }}
secrets: inherit

collect-docker-tags:
Expand Down Expand Up @@ -521,7 +523,7 @@ jobs:
# merge the log files
cat \
/log/pytest-report-L0-unittest.jsonl
/log/pytest-report-L0-distributed-unittest.jsonl
/log/pytest-report-L0-distributed-unittest.jsonl
> /log/pytest-report.jsonl
EOF
STATISTICS_SCRIPT: |
Expand Down
Loading