Commit e7d09bb

Merge branch 'main' into main
2 parents: 21b9891 + a5025a2

85 files changed: +3118 additions, -1716 deletions


.github/CODEOWNERS

Lines changed: 0 additions & 1 deletion
@@ -44,7 +44,6 @@ modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners
 /examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
 /examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
 /examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
-/examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
 /examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
 /examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
 /examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners

.github/workflows/example_tests.yml

Lines changed: 2 additions & 4 deletions
@@ -54,12 +54,11 @@ jobs:
       checks: read
     secrets: inherit
     with:
-      match_pattern: '^DCO$|^linux$' # Wait for DCO and Unit tests / linux to pass
+      match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
       delay: 300s
   example-tests-pr:
     needs: [check-file-changes, wait-checks]
     if: needs.check-file-changes.outputs.any_changed == 'true'
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
     runs-on: linux-amd64-gpu-h100-latest-1
     timeout-minutes: 90
     strategy:
@@ -84,8 +83,7 @@ jobs:
         pytest -s tests/examples/${{ matrix.EXAMPLE }}
   example-tests-non-pr:
     if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
-    runs-on: linux-amd64-gpu-h100-latest-1
+    runs-on: linux-amd64-gpu-h100-latest-2
     timeout-minutes: 90
     strategy:
       matrix:

.github/workflows/gpu_tests.yml

Lines changed: 2 additions & 4 deletions
@@ -54,12 +54,11 @@ jobs:
       checks: read
     secrets: inherit
     with:
-      match_pattern: '^DCO$|^linux$' # Wait for DCO and Unit tests / linux to pass
+      match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
       delay: 300s
   gpu-tests-pr:
     needs: [check-file-changes, wait-checks]
     if: needs.check-file-changes.outputs.any_changed == 'true'
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
     runs-on: linux-amd64-gpu-l4-latest-1
     timeout-minutes: 120
     container: &gpu_container
@@ -78,8 +77,7 @@ jobs:
       run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
   gpu-tests-non-pr:
     if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
-    runs-on: linux-amd64-gpu-h100-latest-1
+    runs-on: linux-amd64-gpu-h100-latest-2
     timeout-minutes: 120
     container: *gpu_container
     steps: *gpu_steps

.github/workflows/unit_tests.yml

Lines changed: 5 additions & 5 deletions
@@ -39,7 +39,7 @@ jobs:
         with:
           python-version: "3.12"
       - name: Run unit tests
-        run: pip install tox && COV_ARGS="--cov" tox -e py312-torch28-tf_latest-unit
+        run: pip install tox && COV_ARGS="--cov" tox -e py312-torch29-tf_latest-unit
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v5
         with:
@@ -57,7 +57,7 @@ jobs:
         with:
           python-version: "3.12"
       - name: Run unit tests (without coverage)
-        run: pip install tox && tox -e py312-torch28-tf_latest-unit
+        run: pip install tox && tox -e py312-torch29-tf_latest-unit
   multi-py:
     if: github.event_name == 'pull_request'
     needs: [linux]
@@ -72,15 +72,15 @@ jobs:
         with:
           python-version: "3.${{ matrix.py }}"
       - name: Run unit tests
-        run: pip install tox && tox -e py3${{ matrix.py }}-torch28-tf_latest-unit
+        run: pip install tox && tox -e py3${{ matrix.py }}-torch29-tf_latest-unit
   multi-torch:
     if: github.event_name == 'pull_request'
     needs: [linux]
     runs-on: ubuntu-latest
     timeout-minutes: 30
     strategy:
       matrix:
-        torch: [26, 27]
+        torch: [26, 27, 28]
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-python@v5
@@ -102,7 +102,7 @@ jobs:
         with:
           python-version: "3.12"
       - name: Run unit tests
-        run: pip install tox && tox -e py312-torch28-tf_${{ matrix.tf }}-unit
+        run: pip install tox && tox -e py312-torch29-tf_${{ matrix.tf }}-unit
   partial-install:
     if: github.event_name == 'pull_request'
     needs: [linux]

.gitlab/release.yml

Lines changed: 1 addition & 1 deletion
@@ -17,8 +17,8 @@ build-and-upload-wheels:
         TWINE_PASSWORD: $ARTIFACTORY_TOKEN # Configured in GitLab > Settings > CI/CD
         REPO_URL: https://urm.nvidia.com/artifactory/api/pypi/sw-dl-algo-ammo-pypi-local
     - if: $CI_PIPELINE_SOURCE == "schedule"
+      when: manual
       variables:
-        when: manual
         RELEASE: "false"
         TWINE_USERNAME: gitlab-ci-token
         TWINE_PASSWORD: $CI_JOB_TOKEN

.gitlab/tests.yml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ unit:
   timeout: 30m
   variables:
     PYTHON: 12
-    TORCH: 28
+    TORCH: 29
     TRANSFORMERS: latest
   image: python:3.$PYTHON
   before_script:

CHANGELOG.rst

Lines changed: 15 additions & 15 deletions
@@ -1,31 +1,30 @@
 Model Optimizer Changelog (Linux)
 =================================
-0.41 (2025-12-xx)
-^^^^^^^^^^^^^^^^^
-
-**Deprecations**
-
-**New Features**
-
-- Add support for PyTorch Geometric quantization.
-
-**Misc**
-
-- Bump minimum recommended transformers version to 4.53.
-
 
-0.40 (2025-12-xx)
+0.40 (2025-12-11)
 ^^^^^^^^^^^^^^^^^
 
 **Bug Fixes**
 
 - Fix a bug in FastNAS pruning (computer vision models) where the model parameters were sorted twice, messing up the ordering.
+- Fix Q/DQ/Cast node placements in 'FP32 required' tensors in custom ops in the ONNX quantization workflow.
 
 **New Features**
 
 - Add MoE (e.g. Qwen3-30B-A3B, gpt-oss-20b) pruning support for ``num_moe_experts``, ``moe_ffn_hidden_size`` and ``moe_shared_expert_intermediate_size`` parameters in Minitron pruning (``mcore_minitron``).
 - Add ``specdec_bench`` example to benchmark speculative decoding performance. See `examples/specdec_bench/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/specdec_bench#speculative-decoding-benchmark>`_ for more details.
 - Add FP8/NVFP4 KV cache quantization support for Megatron Core models.
+- Add flag ``trt_plugins_precision`` in ONNX autocast to indicate custom ops precision. This is similar to the flag that already exists in the quantization workflow.
+- Add support for PyTorch Geometric quantization.
+- Add per-tensor and per-channel MSE calibrator support.
+
+**Documentation**
+
+- Deprecate ``examples/megatron-lm`` in favor of more detailed documentation in `Megatron-LM/examples/post_training/modelopt <https://github.com/NVIDIA/Megatron-LM/tree/main/examples/post_training/modelopt>`_.
+
+**Misc**
+
+- Bump minimum recommended transformers version to 4.53.
 
 0.39 (2025-11-11)
 ^^^^^^^^^^^^^^^^^
@@ -49,6 +48,7 @@ Model Optimizer Changelog (Linux)
 - Enabled native ModelOpt quantization support for FP8 and NVFP4 formats in SGLang. See `SGLang quantization documentation <https://github.com/sgl-project/sglang/blob/main/docs/advanced_features/quantization.md#using-nvidia-modelopt>`_ for more details.
 - Added ModelOpt quantized checkpoints in vLLM/SGLang CI/CD pipelines (PRs are under review).
 - Add support for exporting QLoRA checkpoints finetuned using ModelOpt.
+- Update NVFP4 AWQ checkpoint export. It now fuses scaling factors of o_proj and down_proj layers into the model when possible to facilitate deployment.
 
 **Documentation**
 
@@ -72,7 +72,7 @@ Model Optimizer Changelog (Linux)
 - Upgrade TensorRT-LLM dependency to 1.1.0rc2.
 - Support Phi-4-multimodal and Qwen2.5-VL quantized HF checkpoint export in ``examples/vlm_ptq``.
 - Support storing and restoring Minitron pruning activations and scores for re-pruning without running the forward loop again.
-- Add Minitron pruning example for Megatron-LM framework. See ``examples/megatron-lm`` for more details.
+- Add Minitron pruning example for Megatron-LM framework. See `Megatron-LM/examples/post_training/modelopt <https://github.com/NVIDIA/Megatron-LM/tree/main/examples/post_training/modelopt>`_ for more details.
 
 0.35 (2025-09-04)
 ^^^^^^^^^^^^^^^^^

examples/deepseek/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1 +1,2 @@
 DeepSeek-V3/
+DeepSeek-V3.2-Exp/

examples/deepseek/README.md

Lines changed: 37 additions & 7 deletions
@@ -1,39 +1,69 @@
-# Quantize Deepseek R1 to FP4
+# Quantize DeepSeek models to FP4
 
-This example will demonstrate the steps to quantize DeepSeek R1 model to FP4 and export a unified checkpoint that can be deployed with TRT-LLM.
+This example demonstrates the steps to quantize DeepSeek models to FP4 and export a unified checkpoint that can be deployed with TRT-LLM.
 
 ## Setup
 
 Due to the model size, quantizing the FP8 model currently requires 8xH200 or 16xH100; we will use 8xH200 as an example.
 
-### Convert the HF checkpoint for deepseek FP8 inference
+## Convert the HF checkpoint for DeepSeek FP8 inference
 
 ```bash
 # set up variables to run the example
 export HF_FP8_CKPT={path_to_downloaded_hf_checkpoint}
 export DS_CKPT={path_to_save_converted_checkpoint}
 export FP4_QUANT_PATH={path_to_save_quantization_results}
 export HF_FP4_PATH={path_to_save_the_final_FP4_checkpoint}
+```
+
+### DeepSeek V3 / R1 / V3.1
 
-# download the FP8 checkpoint from Hugginface
+```bash
+# download the FP8 checkpoint from Hugging Face. This example uses DeepSeek-R1
 huggingface-cli download deepseek-ai/DeepSeek-R1 --local-dir $HF_FP8_CKPT
 
 # clone the DeepSeek-V3 (base model of R1) GitHub repository for FP8 inference
 git clone https://github.com/deepseek-ai/DeepSeek-V3.git && cd DeepSeek-V3 && git checkout 1398800
+```
+
+### [Experimental] DeepSeek V3.2
 
+```bash
+# download the FP8 checkpoint from Hugging Face
+huggingface-cli download deepseek-ai/DeepSeek-V3.2-Exp --local-dir $HF_FP8_CKPT
+
+# clone the DeepSeek-V3.2 GitHub repository for FP8 inference
+git clone https://github.com/deepseek-ai/DeepSeek-V3.2-Exp.git && cd DeepSeek-V3.2-Exp && git checkout 3b99a53
+
+# install requirements
+pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git
+pip install -r inference/requirements.txt
+```
+
+### Convert the checkpoint
+
+```bash
 # convert the HF checkpoint to a specific format for DeepSeek
 python inference/convert.py --hf-ckpt-path $HF_FP8_CKPT --save-path $DS_CKPT --n-experts 256 --model-parallel 8
 ```
 
-### Post-training quantization
+## Post-training quantization
+
+### Run the calibration scripts
 
-#### Run the calibration scripts
+DeepSeek V3 / R1 / V3.1:
 
 ```bash
 torchrun --nproc-per-node 8 --master_port=12346 ptq.py --model_path $DS_CKPT --config DeepSeek-V3/inference/configs/config_671B.json --quant_cfg NVFP4_DEFAULT_CFG --output_path $FP4_QUANT_PATH
 ```
 
-#### Quantize the FP8 hf checkpoint to FP4
+DeepSeek V3.2:
+
+```bash
+torchrun --nproc-per-node 8 --master_port=12346 ptq.py --model_path $DS_CKPT --config DeepSeek-V3.2-Exp/inference/config_671B_v3.2.json --quant_cfg NVFP4_DEFAULT_CFG --output_path $FP4_QUANT_PATH
+```
+
+### Quantize the FP8 HF checkpoint to FP4
 
 We provide a one-step script which will:
 

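For orientation, the calibration commands above ultimately drive ModelOpt's PTQ API. Below is a hedged sketch (not part of this commit's diff) of the core call a script like `ptq.py` is built around: the toy model and random calibration data are placeholder assumptions, while `mtq.quantize` and `NVFP4_DEFAULT_CFG` are ModelOpt's public API, the latter matching the `--quant_cfg` flag used above.

```python
# Hedged sketch of the PTQ call behind ptq.py (not from this commit).
# The toy model and random calibration data are placeholders.
import torch
import modelopt.torch.quantization as mtq

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 64))
calib_data = [torch.randn(8, 64) for _ in range(16)]  # stand-in for a real calibration set

def forward_loop(m):
    # Run calibration batches so the inserted quantizers can collect amax statistics.
    for batch in calib_data:
        m(batch)

# NVFP4_DEFAULT_CFG is the same config selected by --quant_cfg in the commands above.
model = mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop)
```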
examples/deepseek/ds_kernel.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# MIT License
17+
18+
# Copyright (c) 2023 DeepSeek
19+
20+
# Permission is hereby granted, free of charge, to any person obtaining a copy
21+
# of this software and associated documentation files (the "Software"), to deal
22+
# in the Software without restriction, including without limitation the rights
23+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
24+
# copies of the Software, and to permit persons to whom the Software is
25+
# furnished to do so, subject to the following conditions:
26+
27+
# The above copyright notice and this permission notice shall be included in all
28+
# copies or substantial portions of the Software.
29+
30+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
33+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
34+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
35+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
36+
# SOFTWARE.
37+
38+
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
39+
# SPDX-License-Identifier: Apache-2.0
40+
#
41+
# Licensed under the Apache License, Version 2.0 (the "License");
42+
# you may not use this file except in compliance with the License.
43+
# You may obtain a copy of the License at
44+
#
45+
# http://www.apache.org/licenses/LICENSE-2.0
46+
#
47+
# Unless required by applicable law or agreed to in writing, software
48+
# distributed under the License is distributed on an "AS IS" BASIS,
49+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
50+
# See the License for the specific language governing permissions and
51+
# limitations under the License.
52+
53+
import torch
54+
import triton
55+
import triton.language as tl
56+
57+
"""Reference: https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py"""
58+
59+
60+
@triton.jit
61+
def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.constexpr):
62+
"""
63+
Dequantizes weights using the provided scaling factors and stores the result.
64+
65+
Args:
66+
x_ptr (tl.pointer): Pointer to the quantized weights.
67+
s_ptr (tl.pointer): Pointer to the scaling factors.
68+
y_ptr (tl.pointer): Pointer to the output buffer for dequantized weights.
69+
M (int): Number of rows in the weight matrix.
70+
N (int): Number of columns in the weight matrix.
71+
BLOCK_SIZE (tl.constexpr): Size of the block for tiling.
72+
73+
Returns:
74+
None
75+
"""
76+
pid_m = tl.program_id(axis=0)
77+
pid_n = tl.program_id(axis=1)
78+
n = tl.cdiv(N, BLOCK_SIZE)
79+
offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
80+
offs_n = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
81+
offs = offs_m[:, None] * N + offs_n[None, :]
82+
mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
83+
x = tl.load(x_ptr + offs, mask=mask).to(tl.float32)
84+
s = tl.load(s_ptr + pid_m * n + pid_n)
85+
y = x * s
86+
tl.store(y_ptr + offs, y, mask=mask)
87+
88+
89+
def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> torch.Tensor:
90+
"""
91+
Dequantizes the given weight tensor using the provided scale tensor.
92+
93+
Args:
94+
x (torch.Tensor): The quantized weight tensor of shape (M, N).
95+
s (torch.Tensor): The scale tensor of shape (M//block_size, N//block_size).
96+
block_size (int, optional): The block size to use for dequantization. Defaults to 128.
97+
98+
Returns:
99+
torch.Tensor: The dequantized weight tensor of the same shape as `x`.
100+
101+
Raises:
102+
AssertionError: If `x` or `s` are not contiguous or if their dimensions are not 2.
103+
"""
104+
assert x.is_contiguous() and s.is_contiguous(), "Input tensors must be contiguous"
105+
assert x.dim() == 2 and s.dim() == 2, "Input tensors must have 2 dimensions"
106+
M, N = x.size()
107+
y = torch.empty_like(x, dtype=torch.get_default_dtype())
108+
grid = lambda meta: (triton.cdiv(M, meta["BLOCK_SIZE"]), triton.cdiv(N, meta["BLOCK_SIZE"]))
109+
weight_dequant_kernel[grid](x, s, y, M, N, BLOCK_SIZE=block_size)
110+
return y

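For context, here is a hedged usage sketch of the new helper (not part of the commit). `weight_dequant` expects a contiguous 2-D weight tensor and one scale per `block_size` x `block_size` tile; the fp8 input dtype below is an assumption about the intended input format, since the kernel only requires a dtype Triton can cast to float32.

```python
# Hedged usage sketch for ds_kernel.weight_dequant (requires a CUDA GPU with Triton).
# The fp8 weight dtype is an assumption about the intended input format.
import torch
from ds_kernel import weight_dequant

M, N, block = 256, 512, 128
w_q = torch.randn(M, N, device="cuda").to(torch.float8_e4m3fn)  # block-quantized weights
scales = torch.rand(M // block, N // block, device="cuda")      # one scale per 128x128 tile

w = weight_dequant(w_q, scales, block_size=block)
print(w.shape, w.dtype)  # torch.Size([256, 512]), the default dtype (float32 unless overridden)
```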