Commit e7d09bb

Merge branch 'main' into main
2 parents: 21b9891 + a5025a2

85 files changed: +3118 additions, -1716 deletions


.github/CODEOWNERS

Lines changed: 0 additions & 1 deletion
@@ -44,7 +44,6 @@ modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners
 /examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
 /examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
 /examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
-/examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
 /examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
 /examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
 /examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners

.github/workflows/example_tests.yml

Lines changed: 2 additions & 4 deletions
@@ -54,12 +54,11 @@ jobs:
       checks: read
     secrets: inherit
     with:
-      match_pattern: '^DCO$|^linux$' # Wait for DCO and Unit tests / linux to pass
+      match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
       delay: 300s
   example-tests-pr:
     needs: [check-file-changes, wait-checks]
     if: needs.check-file-changes.outputs.any_changed == 'true'
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
     runs-on: linux-amd64-gpu-h100-latest-1
     timeout-minutes: 90
     strategy:
@@ -84,8 +83,7 @@ jobs:
         pytest -s tests/examples/${{ matrix.EXAMPLE }}
   example-tests-non-pr:
     if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
-    runs-on: linux-amd64-gpu-h100-latest-1
+    runs-on: linux-amd64-gpu-h100-latest-2
     timeout-minutes: 90
     strategy:
       matrix:

.github/workflows/gpu_tests.yml

Lines changed: 2 additions & 4 deletions
@@ -54,12 +54,11 @@ jobs:
       checks: read
     secrets: inherit
     with:
-      match_pattern: '^DCO$|^linux$' # Wait for DCO and Unit tests / linux to pass
+      match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
       delay: 300s
   gpu-tests-pr:
     needs: [check-file-changes, wait-checks]
     if: needs.check-file-changes.outputs.any_changed == 'true'
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
     runs-on: linux-amd64-gpu-l4-latest-1
     timeout-minutes: 120
     container: &gpu_container
@@ -78,8 +77,7 @@ jobs:
       run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
   gpu-tests-non-pr:
     if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
-    runs-on: linux-amd64-gpu-h100-latest-1
+    runs-on: linux-amd64-gpu-h100-latest-2
     timeout-minutes: 120
     container: *gpu_container
     steps: *gpu_steps

.github/workflows/unit_tests.yml

Lines changed: 5 additions & 5 deletions
@@ -39,7 +39,7 @@ jobs:
         with:
           python-version: "3.12"
       - name: Run unit tests
-        run: pip install tox && COV_ARGS="--cov" tox -e py312-torch28-tf_latest-unit
+        run: pip install tox && COV_ARGS="--cov" tox -e py312-torch29-tf_latest-unit
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v5
         with:
@@ -57,7 +57,7 @@ jobs:
         with:
           python-version: "3.12"
       - name: Run unit tests (without coverage)
-        run: pip install tox && tox -e py312-torch28-tf_latest-unit
+        run: pip install tox && tox -e py312-torch29-tf_latest-unit
   multi-py:
     if: github.event_name == 'pull_request'
     needs: [linux]
@@ -72,15 +72,15 @@ jobs:
         with:
           python-version: "3.${{ matrix.py }}"
       - name: Run unit tests
-        run: pip install tox && tox -e py3${{ matrix.py }}-torch28-tf_latest-unit
+        run: pip install tox && tox -e py3${{ matrix.py }}-torch29-tf_latest-unit
   multi-torch:
     if: github.event_name == 'pull_request'
     needs: [linux]
     runs-on: ubuntu-latest
     timeout-minutes: 30
     strategy:
       matrix:
-        torch: [26, 27]
+        torch: [26, 27, 28]
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-python@v5
@@ -102,7 +102,7 @@ jobs:
         with:
           python-version: "3.12"
       - name: Run unit tests
-        run: pip install tox && tox -e py312-torch28-tf_${{ matrix.tf }}-unit
+        run: pip install tox && tox -e py312-torch29-tf_${{ matrix.tf }}-unit
   partial-install:
     if: github.event_name == 'pull_request'
     needs: [linux]

.gitlab/release.yml

Lines changed: 1 addition & 1 deletion
@@ -17,8 +17,8 @@ build-and-upload-wheels:
         TWINE_PASSWORD: $ARTIFACTORY_TOKEN # Configured in GitLab > Settings > CI/CD
         REPO_URL: https://urm.nvidia.com/artifactory/api/pypi/sw-dl-algo-ammo-pypi-local
     - if: $CI_PIPELINE_SOURCE == "schedule"
+      when: manual
       variables:
-        when: manual
         RELEASE: "false"
         TWINE_USERNAME: gitlab-ci-token
         TWINE_PASSWORD: $CI_JOB_TOKEN

.gitlab/tests.yml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ unit:
   timeout: 30m
   variables:
     PYTHON: 12
-    TORCH: 28
+    TORCH: 29
     TRANSFORMERS: latest
   image: python:3.$PYTHON
   before_script:

CHANGELOG.rst

Lines changed: 15 additions & 15 deletions
@@ -1,31 +1,30 @@
 Model Optimizer Changelog (Linux)
 =================================
-0.41 (2025-12-xx)
-^^^^^^^^^^^^^^^^^
-
-**Deprecations**
-
-**New Features**
-
-- Add support for PyTorch Geometric quantization.
-
-**Misc**
-
-- Bump minimum recommended transformers version to 4.53.
-
 
-0.40 (2025-12-xx)
+0.40 (2025-12-11)
 ^^^^^^^^^^^^^^^^^
 
 **Bug Fixes**
 
 - Fix a bug in FastNAS pruning (computer vision models) where the model parameters were sorted twice, messing up the ordering.
+- Fix Q/DQ/Cast node placements in 'FP32 required' tensors in custom ops in the ONNX quantization workflow.
 
 **New Features**
 
 - Add MoE (e.g. Qwen3-30B-A3B, gpt-oss-20b) pruning support for ``num_moe_experts``, ``moe_ffn_hidden_size`` and ``moe_shared_expert_intermediate_size`` parameters in Minitron pruning (``mcore_minitron``).
 - Add ``specdec_bench`` example to benchmark speculative decoding performance. See `examples/specdec_bench/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/specdec_bench#speculative-decoding-benchmark>`_ for more details.
 - Add FP8/NVFP4 KV cache quantization support for Megatron Core models.
+- Add flag ``trt_plugins_precision`` in ONNX autocast to indicate custom ops precision. This is similar to the flag that already exists in the quantization workflow.
+- Add support for PyTorch Geometric quantization.
+- Add per-tensor and per-channel MSE calibrator support.
+
+**Documentation**
+
+- Deprecate ``examples/megatron-lm`` in favor of more detailed documentation in `Megatron-LM/examples/post_training/modelopt <https://github.com/NVIDIA/Megatron-LM/tree/main/examples/post_training/modelopt>`_.
+
+**Misc**
+
+- Bump minimum recommended transformers version to 4.53.
 
 0.39 (2025-11-11)
 ^^^^^^^^^^^^^^^^^
@@ -49,6 +48,7 @@ Model Optimizer Changelog (Linux)
 - Enabled native ModelOpt quantization support for FP8 and NVFP4 formats in SGLang. See `SGLang quantization documentation <https://github.com/sgl-project/sglang/blob/main/docs/advanced_features/quantization.md#using-nvidia-modelopt>`_ for more details.
 - Added ModelOpt quantized checkpoints in vLLM/SGLang CI/CD pipelines (PRs are under review).
 - Add support for exporting QLoRA checkpoints finetuned using ModelOpt.
+- Update NVFP4 AWQ checkpoint export. It now fuses scaling factors of o_proj and down_proj layers into the model when possible to facilitate deployment.
 
 **Documentation**
 
@@ -72,7 +72,7 @@ Model Optimizer Changelog (Linux)
 - Upgrade TensorRT-LLM dependency to 1.1.0rc2.
 - Support Phi-4-multimodal and Qwen2.5-VL quantized HF checkpoint export in ``examples/vlm_ptq``.
 - Support storing and restoring Minitron pruning activations and scores for re-pruning without running the forward loop again.
-- Add Minitron pruning example for Megatron-LM framework. See ``examples/megatron-lm`` for more details.
+- Add Minitron pruning example for Megatron-LM framework. See `Megatron-LM/examples/post_training/modelopt <https://github.com/NVIDIA/Megatron-LM/tree/main/examples/post_training/modelopt>`_ for more details.
 
 0.35 (2025-09-04)
 ^^^^^^^^^^^^^^^^^

examples/deepseek/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1 +1,2 @@
 DeepSeek-V3/
+DeepSeek-V3.2-Exp/

examples/deepseek/README.md

Lines changed: 37 additions & 7 deletions
@@ -1,39 +1,69 @@
-# Quantize Deepseek R1 to FP4
+# Quantize DeepSeek models to FP4
 
-This example will demonstrate the steps to quantize DeepSeek R1 model to FP4 and export a unified checkpoint that can be deployed with TRT-LLM.
+This example demonstrates the steps to quantize DeepSeek models to FP4 and export a unified checkpoint that can be deployed with TRT-LLM.
 
 ## Setup
 
 Due to the model size, quantizing the FP8 model currently requires 8xH200 or 16xH100; we will use 8xH200 as an example.
 
-### Convert the HF checkpoint for deepseek FP8 inference
+## Convert the HF checkpoint for DeepSeek FP8 inference
 
 ```bash
 # set up variables to run the example
 export HF_FP8_CKPT={path_to_downloaded_hf_checkpoint}
 export DS_CKPT={path_to_save_converted_checkpoint}
 export FP4_QUANT_PATH={path_to_save_quantization_results}
 export HF_FP4_PATH={path_to_save_the_final_FP4_checkpoint}
+```
+
+### DeepSeek V3 / R1 / V3.1
 
-# download the FP8 checkpoint from Hugginface
+```bash
+# download the FP8 checkpoint from Hugging Face. This example uses DeepSeek-R1
 huggingface-cli download deepseek-ai/DeepSeek-R1 --local-dir $HF_FP8_CKPT
 
 # clone the DeepSeek-V3 (base model of R1) GitHub repository for FP8 inference
 git clone https://github.com/deepseek-ai/DeepSeek-V3.git && cd DeepSeek-V3 && git checkout 1398800
+```
+
+### [Experimental] DeepSeek V3.2
 
+```bash
+# download the FP8 checkpoint from Hugging Face
+huggingface-cli download deepseek-ai/DeepSeek-V3.2-Exp --local-dir $HF_FP8_CKPT
+
+# clone the DeepSeek-V3.2 GitHub repository for FP8 inference
+git clone https://github.com/deepseek-ai/DeepSeek-V3.2-Exp.git && cd DeepSeek-V3.2-Exp && git checkout 3b99a53
+
+# install requirements
+pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git
+pip install -r inference/requirements.txt
+```
+
+### Convert the checkpoint
+
+```bash
 # convert the HF checkpoint to a specific format for DeepSeek
 python inference/convert.py --hf-ckpt-path $HF_FP8_CKPT --save-path $DS_CKPT --n-experts 256 --model-parallel 8
 ```
 
-### Post-training quantization
+## Post-training quantization
+
+### Run the calibration scripts
 
-#### Run the calibration scripts
+DeepSeek V3 / R1 / V3.1:
 
 ```bash
 torchrun --nproc-per-node 8 --master_port=12346 ptq.py --model_path $DS_CKPT --config DeepSeek-V3/inference/configs/config_671B.json --quant_cfg NVFP4_DEFAULT_CFG --output_path $FP4_QUANT_PATH
 ```
 
-#### Quantize the FP8 hf checkpoint to FP4
+DeepSeek V3.2:
+
+```bash
+torchrun --nproc-per-node 8 --master_port=12346 ptq.py --model_path $DS_CKPT --config DeepSeek-V3.2-Exp/inference/config_671B_v3.2.json --quant_cfg NVFP4_DEFAULT_CFG --output_path $FP4_QUANT_PATH
+```
+
+### Quantize the FP8 HF checkpoint to FP4
 
 We provide a one-step script which will:
 

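For orientation, the calibration commands above ultimately drive ModelOpt's PTQ API. Below is a hedged sketch (not part of this commit's diff) of the core call a script like `ptq.py` is built around: the toy model and random calibration data are placeholder assumptions, while `mtq.quantize` and `NVFP4_DEFAULT_CFG` are ModelOpt's public API, the latter matching the `--quant_cfg` flag used above.

```python
# Hedged sketch of the PTQ call behind ptq.py (not from this commit).
# The toy model and random calibration data are placeholders.
import torch
import modelopt.torch.quantization as mtq

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 64))
calib_data = [torch.randn(8, 64) for _ in range(16)]  # stand-in for a real calibration set

def forward_loop(m):
    # Run calibration batches so the inserted quantizers can collect amax statistics.
    for batch in calib_data:
        m(batch)

# NVFP4_DEFAULT_CFG is the same config selected by --quant_cfg in the commands above.
model = mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop)
```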
examples/deepseek/ds_kernel.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# MIT License
17+
18+
# Copyright (c) 2023 DeepSeek
19+
20+
# Permission is hereby granted, free of charge, to any person obtaining a copy
21+
# of this software and associated documentation files (the "Software"), to deal
22+
# in the Software without restriction, including without limitation the rights
23+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
24+
# copies of the Software, and to permit persons to whom the Software is
25+
# furnished to do so, subject to the following conditions:
26+
27+
# The above copyright notice and this permission notice shall be included in all
28+
# copies or substantial portions of the Software.
29+
30+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
33+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
34+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
35+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
36+
# SOFTWARE.
37+
38+
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
39+
# SPDX-License-Identifier: Apache-2.0
40+
#
41+
# Licensed under the Apache License, Version 2.0 (the "License");
42+
# you may not use this file except in compliance with the License.
43+
# You may obtain a copy of the License at
44+
#
45+
# http://www.apache.org/licenses/LICENSE-2.0
46+
#
47+
# Unless required by applicable law or agreed to in writing, software
48+
# distributed under the License is distributed on an "AS IS" BASIS,
49+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
50+
# See the License for the specific language governing permissions and
51+
# limitations under the License.
52+
53+
import torch
54+
import triton
55+
import triton.language as tl
56+
57+
"""Reference: https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py"""
58+
59+
60+
@triton.jit
61+
def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.constexpr):
62+
"""
63+
Dequantizes weights using the provided scaling factors and stores the result.
64+
65+
Args:
66+
x_ptr (tl.pointer): Pointer to the quantized weights.
67+
s_ptr (tl.pointer): Pointer to the scaling factors.
68+
y_ptr (tl.pointer): Pointer to the output buffer for dequantized weights.
69+
M (int): Number of rows in the weight matrix.
70+
N (int): Number of columns in the weight matrix.
71+
BLOCK_SIZE (tl.constexpr): Size of the block for tiling.
72+
73+
Returns:
74+
None
75+
"""
76+
pid_m = tl.program_id(axis=0)
77+
pid_n = tl.program_id(axis=1)
78+
n = tl.cdiv(N, BLOCK_SIZE)
79+
offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
80+
offs_n = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
81+
offs = offs_m[:, None] * N + offs_n[None, :]
82+
mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
83+
x = tl.load(x_ptr + offs, mask=mask).to(tl.float32)
84+
s = tl.load(s_ptr + pid_m * n + pid_n)
85+
y = x * s
86+
tl.store(y_ptr + offs, y, mask=mask)
87+
88+
89+
def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> torch.Tensor:
90+
"""
91+
Dequantizes the given weight tensor using the provided scale tensor.
92+
93+
Args:
94+
x (torch.Tensor): The quantized weight tensor of shape (M, N).
95+
s (torch.Tensor): The scale tensor of shape (M//block_size, N//block_size).
96+
block_size (int, optional): The block size to use for dequantization. Defaults to 128.
97+
98+
Returns:
99+
torch.Tensor: The dequantized weight tensor of the same shape as `x`.
100+
101+
Raises:
102+
AssertionError: If `x` or `s` are not contiguous or if their dimensions are not 2.
103+
"""
104+
assert x.is_contiguous() and s.is_contiguous(), "Input tensors must be contiguous"
105+
assert x.dim() == 2 and s.dim() == 2, "Input tensors must have 2 dimensions"
106+
M, N = x.size()
107+
y = torch.empty_like(x, dtype=torch.get_default_dtype())
108+
grid = lambda meta: (triton.cdiv(M, meta["BLOCK_SIZE"]), triton.cdiv(N, meta["BLOCK_SIZE"]))
109+
weight_dequant_kernel[grid](x, s, y, M, N, BLOCK_SIZE=block_size)
110+
return y

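For context, here is a hedged usage sketch of the new helper (not part of the commit). `weight_dequant` expects a contiguous 2-D weight tensor and one scale per `block_size` x `block_size` tile; the fp8 input dtype below is an assumption about the intended input format, since the kernel only requires a dtype Triton can cast to float32.

```python
# Hedged usage sketch for ds_kernel.weight_dequant (requires a CUDA GPU with Triton).
# The fp8 weight dtype is an assumption about the intended input format.
import torch
from ds_kernel import weight_dequant

M, N, block = 256, 512, 128
w_q = torch.randn(M, N, device="cuda").to(torch.float8_e4m3fn)  # block-quantized weights
scales = torch.rand(M // block, N // block, device="cuda")      # one scale per 128x128 tile

w = weight_dequant(w_q, scales, block_size=block)
print(w.shape, w.dtype)  # torch.Size([256, 512]), the default dtype (float32 unless overridden)
```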