18 changes: 8 additions & 10 deletions .github/workflows/cicd-main.yml
@@ -20,8 +20,8 @@ on:
branches:
- dev
- main
- - "pull-request/[0-9]+"
- - "deploy-release/*"
+ - 'pull-request/[0-9]+'
+ - 'deploy-release/*'
merge_group:
types: [checks_requested]
workflow_dispatch:
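
The quoting changes in this file are stylistic: in YAML, single- and double-quoted scalars are both strings, and double quotes only add escape-sequence processing. Notably, 'true' stays a string either way, which matters for inputs like is_unit_test further down. A quick check with PyYAML (assuming pyyaml is available; it is not part of this PR):

```python
import yaml  # pip install pyyaml

# Single- and double-quoted scalars parse to the same Python string.
assert yaml.safe_load('key: "true"') == {"key": "true"}
assert yaml.safe_load("key: 'true'") == {"key": "true"}

# Left unquoted, PyYAML resolves the scalar to a boolean instead.
assert yaml.safe_load("key: true") == {"key": True}
```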
@@ -112,7 +112,7 @@ jobs:
with:
issue-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
repository: ${{ github.repository }}
- body-includes: "<!--external-contributor-comment-->"
+ body-includes: '<!--external-contributor-comment-->'

- name: Delete comment
uses: actions/github-script@v7
@@ -254,11 +254,9 @@ jobs:

- name: Download test data
shell: bash
- env:
- GH_TOKEN: ${{ secrets.PAT }}
run: |
echo "::group::Download test data"
- pip install --no-cache-dir pygithub click
+ pip install --no-cache-dir click requests
python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets
echo "::endgroup::"
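
This step now installs requests instead of pygithub and drops the GH_TOKEN variable, which suggests the test data is fetched anonymously over plain HTTPS rather than through the authenticated GitHub API. The script itself is not part of this diff, so the following is only a minimal sketch of that pattern; the URL, option name, and file name are illustrative assumptions:

```python
# Hypothetical sketch of a token-free asset download; the real logic lives in
# tests/test_utils/python_scripts/download_unit_tests_dataset.py (not shown here).
import os

import click
import requests


@click.command()
@click.option("--assets-dir", required=True, help="Directory to store downloaded assets.")
def main(assets_dir: str) -> None:
    os.makedirs(assets_dir, exist_ok=True)
    # Public release assets can be downloaded anonymously, so no GH_TOKEN is needed.
    url = "https://github.com/example-org/example-repo/releases/download/v0.0.1/assets.tar.gz"
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(os.path.join(assets_dir, "assets.tar.gz"), "wb") as f:
        f.write(response.content)


if __name__ == "__main__":
    main()
```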

@@ -368,7 +366,7 @@ jobs:
- cicd-container-build
- cicd-parse-unit-tests
runs-on: nvidia-ci-aws-gpu-x8
- name: "${{ matrix.bucket }} - latest"
+ name: '${{ matrix.bucket }} - latest'
environment: nemo-ci
if: |
(
@@ -391,7 +389,7 @@
test_case: ${{ matrix.bucket }}
tag: latest
timeout: ${{ matrix.timeout || 30 }}
- is_unit_test: "true"
+ is_unit_test: 'true'
PAT: ${{ secrets.PAT }}
container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}

@@ -482,7 +480,7 @@ jobs:
- cicd-parse-integration-tests
- cicd-unit-tests-latest
runs-on: nvidia-ci-aws-gpu-x8
- name: "${{ matrix.model }}/${{ matrix.test_case }} - latest"
+ name: '${{ matrix.model }}/${{ matrix.test_case }} - latest'
environment: nemo-ci
env:
PIP_DISABLE_PIP_VERSION_CHECK: 1
@@ -506,7 +504,7 @@
model: ${{ matrix.model }}
tag: latest
timeout: ${{ matrix.timeout || 30 }}
- is_unit_test: "false"
+ is_unit_test: 'false'
PAT: ${{ secrets.PAT }}
container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}

5 changes: 3 additions & 2 deletions .gitlab/scripts/build.sh
@@ -7,9 +7,9 @@ eval "IMAGE=\$$IMAGE"
# Start a named container in detached mode
docker run -d --name download_test_data -w /workdir/ python:3.12-slim bash -c 'sleep infinity'
docker cp tests/. download_test_data:/workdir/tests
- docker exec -e GH_TOKEN=$GH_TOKEN download_test_data bash -c '
+ docker exec download_test_data bash -c '
ls -al /workdir/
- pip install --no-cache-dir pygithub click
+ pip install --no-cache-dir click requests
python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets
'
docker cp download_test_data:/workdir/assets ./
@@ -50,6 +50,7 @@ DOCKER_BUILDKIT=1 docker build \
--builder=container \
--build-arg JET_API_VERSION=$JET_API_VERSION \
--cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID} \
+ --cache-from type=registry,ref=${IMAGE}-buildcache:dev \
--cache-from type=registry,ref=${IMAGE}-buildcache:main \
--build-arg FROM_IMAGE_NAME=$BASE_IMAGE \
--push \
2 changes: 1 addition & 1 deletion megatron/core/dist_checkpointing/exchange_utils.py
@@ -63,7 +63,7 @@ class ShardDistribution(NamedTuple):
def _shard_size(sh_ten: ShardedTensor):
"""Returns size in bytes of a given sharded tensor."""
if sh_ten.flattened_range is None:
- numel = np.product(sh_ten.local_shape)
+ numel = np.prod(sh_ten.local_shape)
else:
numel = sh_ten.flattened_range.stop - sh_ten.flattened_range.start
return numel * torch._utils._element_size(sh_ten.dtype)
2 changes: 1 addition & 1 deletion megatron/core/dist_checkpointing/mapping.py
@@ -216,7 +216,7 @@ def local_coordinates(self) -> Tuple[np.ndarray, ...]:
)

# TODO: np.unravel_index?
- mask = np.zeros(np.product(self.local_shape), dtype=bool)
+ mask = np.zeros(np.prod(self.local_shape), dtype=bool)
mask[self.flattened_range] = True
return np.nonzero(mask.reshape(self.local_shape))

2 changes: 1 addition & 1 deletion megatron/core/dist_checkpointing/validation.py
@@ -519,7 +519,7 @@ def _validate_sharding_for_key_flattened(tensors_by_shard):
all_slices.append((sharding.flattened_range.start, sharding.flattened_range.stop))

starts, stops = map(np.asarray, zip(*sorted(all_slices)))
- expected_size = np.product(local_shape)
+ expected_size = np.prod(local_shape)
if starts[0] != 0 or stops[-1] != expected_size or not np.all(starts[1:] == stops[:-1]):
raise CheckpointingException(
f"Flattened ranges dont cover the whole shard {tensors_by_shard[0]} of size {expected_size}. Ranges: {(starts, stops)}"
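
Background for the three np.prod changes above: np.product was a long-deprecated alias of np.prod and was removed in NumPy 2.0, so this rename is what makes it safe to drop the numpy<2.0.0 pin in pyproject.toml below. The two spellings are otherwise equivalent:

```python
import numpy as np

local_shape = (4, 8, 16)

# np.prod multiplies the elements of the shape tuple: 4 * 8 * 16 == 512.
numel = np.prod(local_shape)
assert numel == 512

# On NumPy >= 2.0 the old alias is gone:
#   np.product(local_shape)  # AttributeError: np.product was removed
```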
16 changes: 8 additions & 8 deletions pyproject.toml
@@ -20,7 +20,7 @@ dynamic = ["version", "readme"]
description = "Megatron Core - a library for efficient and scalable training of transformer based models"
requires-python = ">=3.10"
license = { text = "Apache 2.0" }
- dependencies = ["torch", "numpy<2.0.0", "packaging>=24.2"]
+ dependencies = ["torch", "numpy", "packaging>=24.2"]
authors = [{ name = "NVIDIA", email = "[email protected]" }]
maintainers = [{ name = "NVIDIA", email = "[email protected]" }]
keywords = [
@@ -68,23 +68,23 @@ mlm = ["flask-restful", "sentencepiece", "tiktoken", "wandb", "transformers"]

dev = [
"nvidia-modelopt[torch]; sys_platform != 'darwin'",
- "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
+ "transformer-engine[pytorch,core_cu13]>=2.9.0a0,<2.10.0",
"nvidia-resiliency-ext",
"tqdm",
"einops~=0.8",
"tensorstore~=0.1,!=0.1.46,!=0.1.72",
"nvtx~=0.2",
"multi-storage-client~=0.27",
"opentelemetry-api~=1.33.1",
- "setuptools<80.0.0",
"mamba-ssm~=2.2",
"causal-conv1d~=1.5",
"nv-grouped-gemm~=1.1",
"megatron-energon[av_decode]~=6.0",
- "av<16.0.0", # At the time, av 16.0.0 is not compatible with Python 3.12
+ "av",
"flashinfer-python",
"wget",
"onnxscript",
+ "fastapi~=0.50", # Forcing a little bit more recent version of fastapi to be compatible with pydantic 2.0
]

lts = [
@@ -94,15 +94,15 @@ lts = [
"nvtx~=0.2",
"multi-storage-client~=0.27",
"opentelemetry-api~=1.33.1",
- "setuptools<80.0.0",
"mamba-ssm~=2.2",
"causal-conv1d~=1.5",
"nv-grouped-gemm~=1.1",
"megatron-energon[av_decode]~=6.0",
- "av<16.0.0", # At the time, av 16.0.0 is not compatible with Python 3.12
+ "av",
"flashinfer-python",
"wget",
"onnxscript",
+ "fastapi~=0.50", # Forcing a little bit more recent version of fastapi to be compatible with pydantic 2.0
]

[dependency-groups]
@@ -146,7 +146,7 @@ linting = [
"pylint==3.2.6",
]
ci = ["python-gitlab", "slack-sdk", "pandas"]
- flash_mla = ["flash_mla"]
+ no_pypi_wheels = ["flash_mla", "emerging_optimizers"]

[tool.uv]
default-groups = ["linting", "build", "test"]
@@ -173,7 +173,7 @@ override-dependencies = [
flash_mla = [
{ git = "https://github.com/deepseek-ai/FlashMLA", rev = "9edee0c022cd0938148a18e334203b0aab43aa19" },
]
- transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } # on `release_v2.9`
+ # transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } # on `release_v2.9`
Contributor: Why is this commented out? Where would TE come from now?

Contributor (author): After commenting this out, TE comes from PyPI. This will speed up our build, since we don't need to build the transformer-engine-cu13 wheel ourselves anymore.

nemo-run = { git = "https://github.com/NVIDIA-NeMo/Run.git", rev = "01a9a8ba360f7b2908728ad0516e0ad9d936966d" }
emerging_optimizers = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git", rev = "fb1add873e7851ec34b48581ea1b15761b73d189" }
