18 changes: 8 additions & 10 deletions .github/workflows/cicd-main.yml
@@ -20,8 +20,8 @@ on:
branches:
- dev
- main
- - "pull-request/[0-9]+"
- - "deploy-release/*"
+ - 'pull-request/[0-9]+'
+ - 'deploy-release/*'
merge_group:
types: [checks_requested]
workflow_dispatch:
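
The quoting changes in this file are stylistic: in YAML, single- and double-quoted scalars are both strings, and double quotes only add escape-sequence processing. Notably, 'true' stays a string either way, which matters for inputs like is_unit_test further down. A quick check with PyYAML (assuming pyyaml is available; it is not part of this PR):

```python
import yaml  # pip install pyyaml

# Single- and double-quoted scalars parse to the same Python string.
assert yaml.safe_load('key: "true"') == {"key": "true"}
assert yaml.safe_load("key: 'true'") == {"key": "true"}

# Left unquoted, PyYAML resolves the scalar to a boolean instead.
assert yaml.safe_load("key: true") == {"key": True}
```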
@@ -112,7 +112,7 @@ jobs:
with:
issue-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
repository: ${{ github.repository }}
- body-includes: "<!--external-contributor-comment-->"
+ body-includes: '<!--external-contributor-comment-->'

- name: Delete comment
uses: actions/github-script@v7
@@ -254,11 +254,9 @@ jobs:

- name: Download test data
shell: bash
- env:
- GH_TOKEN: ${{ secrets.PAT }}
run: |
echo "::group::Download test data"
- pip install --no-cache-dir pygithub click
+ pip install --no-cache-dir click requests
python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets
echo "::endgroup::"
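
This step now installs requests instead of pygithub and drops the GH_TOKEN variable, which suggests the test data is fetched anonymously over plain HTTPS rather than through the authenticated GitHub API. The script itself is not part of this diff, so the following is only a minimal sketch of that pattern; the URL, option name, and file name are illustrative assumptions:

```python
# Hypothetical sketch of a token-free asset download; the real logic lives in
# tests/test_utils/python_scripts/download_unit_tests_dataset.py (not shown here).
import os

import click
import requests


@click.command()
@click.option("--assets-dir", required=True, help="Directory to store downloaded assets.")
def main(assets_dir: str) -> None:
    os.makedirs(assets_dir, exist_ok=True)
    # Public release assets can be downloaded anonymously, so no GH_TOKEN is needed.
    url = "https://github.com/example-org/example-repo/releases/download/v0.0.1/assets.tar.gz"
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(os.path.join(assets_dir, "assets.tar.gz"), "wb") as f:
        f.write(response.content)


if __name__ == "__main__":
    main()
```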

@@ -368,7 +366,7 @@ jobs:
- cicd-container-build
- cicd-parse-unit-tests
runs-on: nvidia-ci-aws-gpu-x8
- name: "${{ matrix.bucket }} - latest"
+ name: '${{ matrix.bucket }} - latest'
environment: nemo-ci
if: |
(
@@ -391,7 +389,7 @@
test_case: ${{ matrix.bucket }}
tag: latest
timeout: ${{ matrix.timeout || 30 }}
- is_unit_test: "true"
+ is_unit_test: 'true'
PAT: ${{ secrets.PAT }}
container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}

@@ -482,7 +480,7 @@ jobs:
- cicd-parse-integration-tests
- cicd-unit-tests-latest
runs-on: nvidia-ci-aws-gpu-x8
- name: "${{ matrix.model }}/${{ matrix.test_case }} - latest"
+ name: '${{ matrix.model }}/${{ matrix.test_case }} - latest'
environment: nemo-ci
env:
PIP_DISABLE_PIP_VERSION_CHECK: 1
@@ -506,7 +504,7 @@
model: ${{ matrix.model }}
tag: latest
timeout: ${{ matrix.timeout || 30 }}
- is_unit_test: "false"
+ is_unit_test: 'false'
PAT: ${{ secrets.PAT }}
container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}

5 changes: 3 additions & 2 deletions .gitlab/scripts/build.sh
@@ -7,9 +7,9 @@ eval "IMAGE=\$$IMAGE"
# Start a named container in detached mode
docker run -d --name download_test_data -w /workdir/ python:3.12-slim bash -c 'sleep infinity'
docker cp tests/. download_test_data:/workdir/tests
- docker exec -e GH_TOKEN=$GH_TOKEN download_test_data bash -c '
+ docker exec download_test_data bash -c '
ls -al /workdir/
- pip install --no-cache-dir pygithub click
+ pip install --no-cache-dir click requests
python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets
'
docker cp download_test_data:/workdir/assets ./
@@ -50,6 +50,7 @@ DOCKER_BUILDKIT=1 docker build \
--builder=container \
--build-arg JET_API_VERSION=$JET_API_VERSION \
--cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID} \
+ --cache-from type=registry,ref=${IMAGE}-buildcache:dev \
--cache-from type=registry,ref=${IMAGE}-buildcache:main \
--build-arg FROM_IMAGE_NAME=$BASE_IMAGE \
--push \
2 changes: 1 addition & 1 deletion megatron/core/dist_checkpointing/exchange_utils.py
@@ -63,7 +63,7 @@ class ShardDistribution(NamedTuple):
def _shard_size(sh_ten: ShardedTensor):
"""Returns size in bytes of a given sharded tensor."""
if sh_ten.flattened_range is None:
- numel = np.product(sh_ten.local_shape)
+ numel = np.prod(sh_ten.local_shape)
else:
numel = sh_ten.flattened_range.stop - sh_ten.flattened_range.start
return numel * torch._utils._element_size(sh_ten.dtype)
2 changes: 1 addition & 1 deletion megatron/core/dist_checkpointing/mapping.py
@@ -216,7 +216,7 @@ def local_coordinates(self) -> Tuple[np.ndarray, ...]:
)

# TODO: np.unravel_index?
- mask = np.zeros(np.product(self.local_shape), dtype=bool)
+ mask = np.zeros(np.prod(self.local_shape), dtype=bool)
mask[self.flattened_range] = True
return np.nonzero(mask.reshape(self.local_shape))

2 changes: 1 addition & 1 deletion megatron/core/dist_checkpointing/validation.py
@@ -519,7 +519,7 @@ def _validate_sharding_for_key_flattened(tensors_by_shard):
all_slices.append((sharding.flattened_range.start, sharding.flattened_range.stop))

starts, stops = map(np.asarray, zip(*sorted(all_slices)))
- expected_size = np.product(local_shape)
+ expected_size = np.prod(local_shape)
if starts[0] != 0 or stops[-1] != expected_size or not np.all(starts[1:] == stops[:-1]):
raise CheckpointingException(
f"Flattened ranges dont cover the whole shard {tensors_by_shard[0]} of size {expected_size}. Ranges: {(starts, stops)}"
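
Background for the three np.prod changes above: np.product was a long-deprecated alias of np.prod and was removed in NumPy 2.0, so this rename is what makes it safe to drop the numpy<2.0.0 pin in pyproject.toml below. The two spellings are otherwise equivalent:

```python
import numpy as np

local_shape = (4, 8, 16)

# np.prod multiplies the elements of the shape tuple: 4 * 8 * 16 == 512.
numel = np.prod(local_shape)
assert numel == 512

# On NumPy >= 2.0 the old alias is gone:
#   np.product(local_shape)  # AttributeError: np.product was removed
```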
16 changes: 8 additions & 8 deletions pyproject.toml
@@ -20,7 +20,7 @@ dynamic = ["version", "readme"]
description = "Megatron Core - a library for efficient and scalable training of transformer based models"
requires-python = ">=3.10"
license = { text = "Apache 2.0" }
- dependencies = ["torch", "numpy<2.0.0", "packaging>=24.2"]
+ dependencies = ["torch", "numpy", "packaging>=24.2"]
authors = [{ name = "NVIDIA", email = "[email protected]" }]
maintainers = [{ name = "NVIDIA", email = "[email protected]" }]
keywords = [
@@ -68,23 +68,23 @@ mlm = ["flask-restful", "sentencepiece", "tiktoken", "wandb", "transformers"]

dev = [
"nvidia-modelopt[torch]; sys_platform != 'darwin'",
- "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
+ "transformer-engine[pytorch,core_cu13]>=2.9.0a0,<2.10.0",
"nvidia-resiliency-ext",
"tqdm",
"einops~=0.8",
"tensorstore~=0.1,!=0.1.46,!=0.1.72",
"nvtx~=0.2",
"multi-storage-client~=0.27",
"opentelemetry-api~=1.33.1",
- "setuptools<80.0.0",
"mamba-ssm~=2.2",
"causal-conv1d~=1.5",
"nv-grouped-gemm~=1.1",
"megatron-energon[av_decode]~=6.0",
- "av<16.0.0", # At the time, av 16.0.0 is not compatible with Python 3.12
+ "av",
"flashinfer-python",
"wget",
"onnxscript",
+ "fastapi~=0.50", # Forcing a little bit more recent version of fastapi to be compatible with pydantic 2.0
]

lts = [
@@ -94,15 +94,15 @@ lts = [
"nvtx~=0.2",
"multi-storage-client~=0.27",
"opentelemetry-api~=1.33.1",
- "setuptools<80.0.0",
"mamba-ssm~=2.2",
"causal-conv1d~=1.5",
"nv-grouped-gemm~=1.1",
"megatron-energon[av_decode]~=6.0",
- "av<16.0.0", # At the time, av 16.0.0 is not compatible with Python 3.12
+ "av",
"flashinfer-python",
"wget",
"onnxscript",
+ "fastapi~=0.50", # Forcing a little bit more recent version of fastapi to be compatible with pydantic 2.0
]

[dependency-groups]
@@ -146,7 +146,7 @@ linting = [
"pylint==3.2.6",
]
ci = ["python-gitlab", "slack-sdk", "pandas"]
- flash_mla = ["flash_mla"]
+ no_pypi_wheels = ["flash_mla", "emerging_optimizers"]

[tool.uv]
default-groups = ["linting", "build", "test"]
@@ -173,7 +173,7 @@ override-dependencies = [
flash_mla = [
{ git = "https://github.com/deepseek-ai/FlashMLA", rev = "9edee0c022cd0938148a18e334203b0aab43aa19" },
]
- transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } # on `release_v2.9`
+ # transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } # on `release_v2.9`
Contributor: Why is this commented out? Where would TE come from now?

Contributor (author): After commenting this out, TE comes from PyPI. This will speed up our build, since we don't need to build the transformer-engine-cu13 wheel ourselves anymore.

nemo-run = { git = "https://github.com/NVIDIA-NeMo/Run.git", rev = "01a9a8ba360f7b2908728ad0516e0ad9d936966d" }
emerging_optimizers = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git", rev = "fb1add873e7851ec34b48581ea1b15761b73d189" }
