lmcafee-nvidia · tdene · Oct 28, 2025 · Oct 28, 2025 · Oct 28, 2025 · Oct 28, 2025
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -1,47 +1,50 @@
-megatron/core @NVIDIA/core-adlr @NVIDIA/core-nemo
+# Order matters: when several patterns match a path, the last matching entry wins.
+# Keep narrower paths below the broader directory entries that also match them.
+megatron/core/ @NVIDIA/core-adlr @NVIDIA/core-nemo
 
-megatron/core/models/gpt/ @NVIDIA/gpt
+megatron/core/models/gpt/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/gpt
 
-megatron/core/models/multimodal/ @NVIDIA/multi-modal
+megatron/core/models/multimodal/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/multi-modal
 
-megatron/core/models/mamba/ @NVIDIA/hybrid-mamba
+megatron/core/models/mamba/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba
+megatron/core/ssm/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba
 
-megatron/core/dist_checkpointing/ @NVIDIA/dist-checkpointing
+megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets
 
-megatron/core/optimizer/distrib_optimizer/ @NVIDIA/dist-optimizer
+megatron/core/distributed/fsdp/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp
 
-megatron/core/inference/modelopt_support @NVIDIA/quantization-and-inference
+megatron/core/dist_checkpointing/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-checkpointing
 
-# megatron/core/datasets/ @NVIDIA/datasets
+megatron/core/optimizer/distrib_optimizer/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-optimizer
 
-megatron/core/pipeline_parallel/ @NVIDIA/pipeline-parallelism
+megatron/core/pipeline_parallel/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/pipeline-parallelism
 
 megatron/core/transformer/ @NVIDIA/core-adlr @NVIDIA/core-nemo
+# Listed after megatron/core/transformer/ so @NVIDIA/megatron-fsdp stays an owner of this file.
+megatron/core/transformer/fsdp_dtensor_checkpoint.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp
 
-megatron/core/transformer/moe/ @NVIDIA/core-adlr @NVIDIA/core-devtech
+megatron/core/transformer/moe/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/mixture-of-experts-adlr @NVIDIA/mixture-of-experts-devtech
 
-# megatron/core/inference/ @NVIDIA/inference
+megatron/core/inference/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/inference
+# Listed after megatron/core/inference/ so @NVIDIA/quantization-and-inference stays an owner of this path.
+megatron/core/inference/modelopt_support @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/quantization-and-inference
 
-megatron/core/parallel_state.py @NVIDIA/core-nemo
+megatron/core/parallel_state.py @NVIDIA/core-adlr @NVIDIA/core-nemo
 
-megatron/core/post_training/ @NVIDIA/post-training
-megatron/post_training @NVIDIA/post-training
+megatron/core/post_training/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/post-training
+
+megatron/post_training/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/post-training
 
 .gitlab/ @NVIDIA/ci
 .github/ @NVIDIA/ci
 .gitlab-ci.yml @NVIDIA/ci
 docker/  @NVIDIA/ci
-tests/unit_tests/run_ci_test.sh @NVIDIA/ci
-tests/test_utils/python_scripts/ 
 tests/functional_tests/python_test_utils/ @NVIDIA/ci
 tests/functional_tests/shell_test_utils/ @NVIDIA/ci
-megatron/core/transformer/transformer_block.py @NVIDIA/ci
-megatron/core/transformer/transformer_layer.py @NVIDIA/ci
-tests/functional_tests/test_cases/ @NVIDIA/ci
-tests/functional_tests/recipes/ @NVIDIA/ci
-tests/unit_tests/ @NVIDIA/ci
+tests/test_utils/recipes/ @NVIDIA/ci
+tests/unit_tests/run_ci_test.sh @NVIDIA/ci
 
 megatron/rl/ @NVIDIA/reinforcement-learning
 examples/rl/ @NVIDIA/reinforcement-learning
-test/unit_tests/test_rl_utils.py @NVIDIA/reinforcement-learning
-train_rl.py @NVIDIA/reinforcement-learning
+tests/unit_tests/test_rl_utils.py @NVIDIA/reinforcement-learning
+train_rl.py @NVIDIA/reinforcement-learning
diff --git a/.github/actions/action.yml b/.github/actions/action.yml
@@ -78,7 +78,7 @@ runs:
         export PYTHONPATH=$(pwd)
         export NEMORUN_HOME=$(pwd)
         pip install --no-cache-dir uv
-        uv sync --only-group test 
+        uv sync --only-group test
         uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
           --scope unit-tests \
           --model unit-tests \
@@ -90,7 +90,7 @@ runs:
 
         RUN_TEST_EOF
         )
-        echo "$cmd" | tee "job.sh"        
+        echo "$cmd" | tee "job.sh"
         echo "::endgroup::"
 
     - name: Get PR info
@@ -125,23 +125,34 @@ runs:
         #!/bin/bash
         set -euxo pipefail
 
+        # Pick the launcher scope from the has-run-tests-label step output:
+        # "true" selects mr-github, anything else selects mr-slim.
+        if [ "${{ steps.has-run-tests-label.outputs.main }}" == "true" ]; then
+          SCOPE=mr-github
+        else
+          SCOPE=mr-slim
+        fi
+        ARGS=(
+          --scope "$SCOPE"
+          --enable-lightweight-mode
+        )
+
         export PYTHONPATH=$(pwd)
         export NEMORUN_HOME=$(pwd)
         pip install --no-cache-dir uv
-        uv sync --only-group test 
+        uv sync --only-group test
         uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
-          --scope mr \
+          ${ARGS[@]} \
           --model ${{ inputs.model }} \
           --test-case ${{ inputs.test_case }} \
           --environment dev \
           --platform dgx_h100 \
           --container-image ${{ inputs.container-image }} \
           --data-dir /mnt/datadrive/TestData/megatron-lm/artifacts \
-          --enable-lightweight-mode
 
         RUN_TEST_EOF
         )
-        echo "$cmd" | tee "job.sh"        
+        echo "$cmd" | tee "job.sh"
         echo "::endgroup::"
 
     - name: Set timeout

diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
@@ -1,3 +1,4 @@
 enabled: true
 auto_sync_draft: false
 auto_sync_ready: true
+trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "gautham-kollu", "guyueh1", "hxbai", "jaredcasper", "jiemingz", "jkamalu", "jon-barker", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "pablo-garay", "parthmannan", "pthombre", "rogerwaleffe", "sanandaraj5597", "santhnm2", "sbak5", "shanmugamr1992", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"]
diff --git a/.github/workflows/_build_test_publish_wheel.yml b/.github/workflows/_build_test_publish_wheel.yml
@@ -0,0 +1,157 @@
+on:
+  workflow_call:  # reusable workflow: it declares no other trigger
+    secrets:  # both are required from the caller and feed the twine upload step
+      TWINE_USERNAME:
+        required: true
+      TWINE_PASSWORD:
+        required: true
+
+jobs:
+  build-and-test-wheels:
+    strategy:
+      fail-fast: false  # a failing matrix entry does not cancel the others
+      matrix:
+        include:  # megatron-core on arm64 and amd64, megatron-fsdp on amd64 only
+          - PACKAGE: megatron-core
+            PLATFORM: arm64
+            IMAGE: quay.io/pypa/manylinux_2_28_aarch64
+          - PACKAGE: megatron-core
+            PLATFORM: amd64
+            IMAGE: quay.io/pypa/manylinux_2_28_x86_64
+          - PACKAGE: megatron-fsdp
+            IMAGE: quay.io/pypa/manylinux_2_28_x86_64
+            PLATFORM: amd64
+    runs-on: ${{ matrix.PLATFORM == 'amd64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}  # amd64 -> ubuntu-22.04, any other PLATFORM -> ubuntu-22.04-arm
+    env:  # exposes the matrix values to the shell steps as $PACKAGE, $IMAGE and $PLATFORM
+      PACKAGE: ${{ matrix.PACKAGE }}
+      IMAGE: ${{ matrix.IMAGE }}
+      PLATFORM: ${{ matrix.PLATFORM }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Build wheel
+        id: build-wheel
+        run: |
+          set -x
+
+          PUBLISH_DRYRUN=yes  # hard-coded, so the dry-run branch below runs on every build
+          # ROOTDIR is the directory holding package_info.py; BUILD_DIR is where `python -m build` runs.
+          if [ "$PACKAGE" = "megatron-core" ]; then
+            ROOTDIR="megatron/core"
+            BUILD_DIR="."
+          elif [ "$PACKAGE" = "megatron-fsdp" ]; then
+            ROOTDIR="megatron/core/distributed/fsdp/src/megatron_fsdp"
+            BUILD_DIR="megatron/core/distributed/fsdp/src"
+          else
+            echo Unknown package: $PACKAGE
+            exit 1
+          fi
+          # Dry run: rewrite PRE_RELEASE in package_info.py to '<current value>.dev<pseudo-random number>'.
+          if [ "$PUBLISH_DRYRUN" = "yes" ]; then
+            PRE_RELEASE=$(sed -n "s/.*PRE_RELEASE = '\(.*\)'/\1/p" $ROOTDIR/package_info.py)
+            sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '${PRE_RELEASE}.dev$((RANDOM % 900000 + 100000))'" $ROOTDIR/package_info.py
+          fi
+          # Inside $IMAGE: install build tooling for cp310-cp313 first, then run one build per interpreter.
+          pushd $BUILD_DIR
+            rm LICENSE || true
+            docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE bash -c '\
+              for python_version in cp310 cp311 cp312 cp313; do \
+                /opt/python/${python_version}-${python_version}/bin/pip install --upgrade "setuptools>=80.0.0" build; \
+              done && \
+              for python_version in cp310 cp311 cp312 cp313; do \
+                /opt/python/${python_version}-${python_version}/bin/python -m build; \
+              done \
+            '
+            # Wheels that are not *-none-any go through auditwheel repair; the repaired files replace dist/*.whl.
+            PLATFORM_WHEELS=$(find dist -name "*.whl" -not -name "*-none-any.whl")
+            if [ -n "$PLATFORM_WHEELS" ]; then
+                echo "Found platform wheels to repair: $PLATFORM_WHEELS"
+                docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE auditwheel repair $PLATFORM_WHEELS
+                docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE rm -rf dist/*.whl
+                docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE cp -a wheelhouse/* dist/
+            fi
+          popd
+          # Read __version__ from the (possibly rewritten) package_info.py and publish it as a step output.
+          pushd $ROOTDIR
+            EXPECTED_RELEASE_NUMBER=$(python -c "import package_info; print(package_info.__version__)")
+          popd
+
+          echo "expected-release-number=$EXPECTED_RELEASE_NUMBER" | tee -a "${GITHUB_OUTPUT}"
+          # megatron-fsdp builds in a subdirectory; copy its artifacts to the top-level dist/ used by later steps.
+          if [ "$PACKAGE" = "megatron-fsdp" ]; then
+            mkdir -p dist/
+            cp -a megatron/core/distributed/fsdp/src/dist/* dist/
+          fi
+
+          ls -al dist/
+
+      - name: Test wheels
+        run: |
+          ls -al dist/
+          # ROOTPATH is the module imported for the version check; WHEEL_PREFIX selects wheel files in dist/.
+          if [ "$PACKAGE" = "megatron-core" ]; then
+            ROOTPATH="megatron.core"
+            WHEEL_PREFIX="megatron_core"
+          elif [ "$PACKAGE" = "megatron-fsdp" ]; then
+            ROOTPATH="megatron_fsdp"
+            WHEEL_PREFIX="megatron_fsdp"
+          else
+            echo Unknown package: $PACKAGE
+            exit 1
+          fi
+          # megatron-core: install only the cp310 wheels matching $PLATFORM; any other package: every matching wheel.
+          if [ "$PACKAGE" = "megatron-core" ]; then
+            if [[ "$PLATFORM" == "arm64" ]]; then
+              for file in dist/$WHEEL_PREFIX*cp310*aarch64.whl; do
+                pip install --no-cache-dir "$file"
+              done
+            else
+              for file in dist/$WHEEL_PREFIX*cp310*x86_64.whl; do
+                pip install --no-cache-dir "$file"
+              done
+            fi
+          else
+            pip install --no-cache-dir dist/$WHEEL_PREFIX*.whl
+          fi
+          # NOTE(review): presumably removes the checked-out sources so the import below hits the installed wheel - confirm.
+          sudo rm -rf megatron/
+
+          RELEASE_NUMBER=$(python -c "import $ROOTPATH; print($ROOTPATH.__version__)")
+          test "${{ steps.build-wheel.outputs.expected-release-number }}" == "$RELEASE_NUMBER"  # fails the step when the installed version differs from the build step's output
+
+      - name: Upload wheels
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}
+          path: dist/
+
+  publish-wheels:
+    needs: [build-and-test-wheels]
+    runs-on: ubuntu-latest
+    if: github.ref == 'refs/heads/main'  # the job only runs for the main branch
+    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'main' || 'public' }}  # with the job-level `if` above this always resolves to 'main'
+    strategy:
+      fail-fast: false
+      matrix:
+        include:  # underscore spelling: PACKAGE is used as the file-name prefix dist/$PACKAGE* when uploading
+          - PACKAGE: megatron_core
+          - PACKAGE: megatron_fsdp
+    env:
+      PACKAGE: ${{ matrix.PACKAGE }}
+    steps:
+      - name: Download wheels
+        uses: actions/download-artifact@v4
+        with:  # no artifact name given, so all artifacts of the run are fetched
+          path: dist/
+          merge-multiple: true  # unpack them together into dist/ instead of one sub-directory per artifact
+
+      - name: Publish wheels
+        env:  # twine reads TWINE_USERNAME, TWINE_PASSWORD and TWINE_REPOSITORY itself, keeping the secret off the command line
+          TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
+          TWINE_REPOSITORY: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'pypi' || 'testpypi' }}
+        run: |
+          ls -al dist/$PACKAGE*
+          pip install twine
+          twine upload --non-interactive dist/$PACKAGE*
diff --git a/.github/workflows/auto-update-copy-pr-bot.yml b/.github/workflows/auto-update-copy-pr-bot.yml
@@ -0,0 +1,63 @@
+name: Auto Update Copy PR Bot
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: "0 0 * * *"  # once a day at 00:00 (GitHub evaluates cron schedules in UTC)
+
+jobs:
+  auto-update-copy-pr-bot:
+    runs-on: ubuntu-latest
+    environment: nemo-ci
+    if: github.repository == 'NVIDIA/Megatron-LM'  # skipped elsewhere; the commit step pushes to this repository
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4  # v3 runs on the deprecated Node 16 action runtime
+
+      - name: Fetch list of members in mcore-reviewers team
+        shell: bash -euxo pipefail {0}
+        env:
+          GH_TOKEN: ${{ secrets.PAT }}
+        run: |
+          # Append the logins of $team and, recursively, of all its child teams to $out_file.
+          collect_members() {
+              local org=$1 team=$2 out_file=$3
+
+              gh api "/orgs/$org/teams/$team/members" --paginate --jq '.[].login' >> "$out_file"
+
+              gh api "/orgs/$org/teams/$team/teams" --paginate --jq '.[].slug' | while read -r child; do
+                  collect_members "$org" "$child" "$out_file"
+              done
+          }
+
+          # Called as plain commands (not inside an && list) so a failed API call aborts the step under `set -e`.
+          members=$(mktemp)
+          for team in mcore-engineers mcore-reviewers; do
+              collect_members "NVIDIA" "$team" "$members"
+          done
+
+          # JSON array of unique, sorted logins; assigned before export so a jq failure is not masked.
+          TRUSTEES=$(jq -csR 'split("\n") | map(select(. != "")) | unique' "$members")
+          export TRUSTEES
+          rm "$members"
+          echo "$TRUSTEES" | jq .
+
+          # Replace trustees_override with the value read from the TRUSTEES environment variable.
+          yq '.trustees_override = env(TRUSTEES)' .github/copy-pr-bot.yaml | yq -o yaml > .github/copy-pr-bot.yaml.new
+
+          mv .github/copy-pr-bot.yaml.new .github/copy-pr-bot.yaml
+
+      - name: Commit changes
+        env:
+          GH_TOKEN: ${{ secrets.PAT }}
+        run: |  # commits and pushes .github/copy-pr-bot.yaml when it changed; the PAT is embedded in the origin URL
+          git remote set-url origin https://x-access-token:${GH_TOKEN}@github.com/NVIDIA/Megatron-LM.git
+          git config --global user.name "GitHub Actions"
+          git config --global user.email "github-actions[bot]@users.noreply.github.com"
+          git add .github/copy-pr-bot.yaml
+          if git diff --cached --exit-code --quiet; then  # exit status 0: nothing is staged
+            echo "No changes to commit. Exiting gracefully."
+            exit 0
+          fi
+          git commit -m "Update copy-pr-bot.yaml [skip ci]"
+          git push -u origin main  # always targets main, whatever ref the run started from