Changes from all commits (73 commits)
6c038f9
Add modelopt/torch/_compress CODEOWNERS
kevalmorabia97 Oct 27, 2025
230cee1
Merge branch 'main' into feature/compress
kevalmorabia97 Oct 27, 2025
54c5f0f
Remove llm_ptq example tests from CICD
kevalmorabia97 Oct 27, 2025
9eeee25
E2E test for the experimental compress algorithm based on https://arx…
danielkorzekwa Oct 28, 2025
ad1d18e
Merge branch 'main' into feature/compress
kevalmorabia97 Oct 28, 2025
cef3655
Add convert_llama3_config_to_decilm_config + unit test (#465)
danielkorzekwa Oct 29, 2025
002b8b5
Implement nas.convert() api for the compress algorithm (#482)
danielkorzekwa Oct 31, 2025
1c12fd8
modelopt nas search() implementation for the compress algorithm (#490)
danielkorzekwa Nov 3, 2025
f7d547f
Add decilm modelling code (#505)
danielkorzekwa Nov 12, 2025
50a580c
Compress tutorial (PoC) (#492)
danielkorzekwa Nov 12, 2025
b121945
Add llama converter (no dependency on internal Nvidia code) - part 1/…
danielkorzekwa Nov 13, 2025
866e400
llama converter is self-contained now (no dependency on internal nvid…
danielkorzekwa Nov 14, 2025
0868f1c
Add integration test for attention pruning (#562)
danielkorzekwa Nov 14, 2025
69726cc
Merge branch 'main' into feature/compress
kevalmorabia97 Nov 15, 2025
07ca24d
Merge branch 'main' into feature/compress
kevalmorabia97 Nov 15, 2025
1dde209
Add score_pruning_activations (step 2/6) (#563)
danielkorzekwa Nov 18, 2025
2e559e7
Update README.md
kevalmorabia97 Nov 18, 2025
f10be0d
Add activation hooks used for pruning (#576)
danielkorzekwa Nov 20, 2025
194b532
Add sewing kit and utilities used for pruning scoring - pruning scori…
danielkorzekwa Nov 24, 2025
8c9cdd4
Add L2NormHook and use it in megatron.py (#599)
danielkorzekwa Nov 26, 2025
1f72466
Add pruning checkpoints for the compress algorithm (#607)
danielkorzekwa Nov 27, 2025
97fe7f0
Add build replacement library to the compress algorithm. (#616)
danielkorzekwa Dec 1, 2025
954103e
Add subblock stats to the compress algorithm (#623)
danielkorzekwa Dec 1, 2025
dcc425f
Add 1-block scoring to the compress algorithm (#625)
danielkorzekwa Dec 2, 2025
56d95de
Add checkpoint save/load to ForwardHook + add IterativeChannelContrib…
danielkorzekwa Dec 2, 2025
74aae83
Add MIP step to the compress algorithm (#627)
danielkorzekwa Dec 4, 2025
a1f63bc
Merge branch 'main' into feature/compress
kevalmorabia97 Dec 8, 2025
a99f503
Remove unused mip functions + fix multi-gpu test (#660)
kevalmorabia97 Dec 8, 2025
67489f4
Fix a bug in IterativeChannelContributionHook + tools for activation …
danielkorzekwa Dec 11, 2025
1d8bd20
Remove runtime.py and directly use torch dist utils + remove unused f…
kevalmorabia97 Dec 11, 2025
f7a0cb0
Use shared activation hooks component in the puzzle algorithm (#687)
danielkorzekwa Dec 17, 2025
db866d9
Clean up Puzzle Compress Tutorial (#711)
LianaMikael Dec 22, 2025
2e813bf
Two bug fixes: mix checkpointing and dtype (#718)
danielkorzekwa Dec 22, 2025
83ac3b1
Merge remote-tracking branch 'origin/main' into feature/compress
kevalmorabia97 Jan 13, 2026
0eecfc6
Fix test assertions for 2-gpu (#772)
kevalmorabia97 Jan 13, 2026
43b3cfa
Rename compress to puzzletron (#776)
kevalmorabia97 Jan 14, 2026
4c30bd5
Add NeMo Conversion Scripts to Puzzletron (#784)
LianaMikael Jan 15, 2026
96bb0ba
Merge branch 'main' into feature/compress
kevalmorabia97 Mar 3, 2026
8c84fee
[CI] Update to only run puzzletron tests
kevalmorabia97 Mar 3, 2026
5812777
Merge branch 'main' into feature/puzzletron
kevalmorabia97 Mar 3, 2026
5f77c81
Pin torchprofile==0.0.4 to fix CI
kevalmorabia97 Mar 10, 2026
82df595
Add anymodel-core to feature/puzzletron (#974)
danielkorzekwa Mar 11, 2026
4dc9932
Draft: anymodel activation scoring (#989)
danielkorzekwa Mar 12, 2026
d358eb3
Draft: Merge anymodel pruning (#990)
danielkorzekwa Mar 12, 2026
8e827f3
Draft: Merging anymodel:build_library_and_stats (#993)
danielkorzekwa Mar 12, 2026
eb4b210
Draft: merge any model calc one block scores (#994)
danielkorzekwa Mar 12, 2026
8fe318d
Draft: merge any_model: mip_and_realize_models (#995)
danielkorzekwa Mar 13, 2026
2fbdf0e
Update uv.lock for nspect puzzletron scanning
kevalmorabia97 Mar 13, 2026
1b42f0b
Dkorzekwa/any model other models (#1007)
danielkorzekwa Mar 17, 2026
67999eb
Dkorzekwa/anymodel gptoss (#1020)
danielkorzekwa Mar 17, 2026
660dc17
Merge any_model tutorial (#1035)
danielkorzekwa Mar 19, 2026
01cba6a
Merge mbridge distillation for any_model (#1036)
danielkorzekwa Mar 20, 2026
2b6572c
MR branch for the remaining difference between dkorzekwa/any_model an…
danielkorzekwa Mar 20, 2026
110316a
Dkorzekwa/decilm hf code cleanup (#1071)
danielkorzekwa Mar 23, 2026
4190275
Dkorzekwa/decilm hf code cleanup 2 (#1073)
danielkorzekwa Mar 23, 2026
0708ca2
Dkorzekwa/anymodel subblock stats (#1085)
danielkorzekwa Mar 24, 2026
3193f30
Dkorzekwa/anymodel subblock stats nodecilm (#1102)
danielkorzekwa Mar 24, 2026
928036e
Dkorzekwa/decilm cleanup post subblockstats (#1103)
danielkorzekwa Mar 24, 2026
e508b76
code clean up (#1110)
danielkorzekwa Mar 24, 2026
f460d16
Merge branch 'main' into feature/puzzletron
kevalmorabia97 Mar 25, 2026
2f55c73
Dkorzekwa/puzzletron use importance hooks from prune (#1115)
danielkorzekwa Mar 25, 2026
c5ec50b
Merge remote-tracking branch 'origin/main' into feature/puzzletron
kevalmorabia97 Mar 25, 2026
d257871
Merge branch 'main' into feature/puzzletron
kevalmorabia97 Mar 30, 2026
7e15fdd
Revert CICD and other config changes
kevalmorabia97 Mar 30, 2026
d0209dc
Make Qwen and QwenVL descriptor generic so can be used for other vari…
kevalmorabia97 Mar 25, 2026
d987bad
Set strict=True in distill_hf export
kevalmorabia97 Mar 30, 2026
75651cc
add basic ruff fixes
kevalmorabia97 Mar 25, 2026
03118ce
Apply coderabbit suggestions
kevalmorabia97 Mar 30, 2026
2a170b9
Set weights_only=True in checkpoint_utils.py
kevalmorabia97 Mar 30, 2026
d6f8ddb
More fixes
kevalmorabia97 Mar 30, 2026
4621b65
reuse puzzletron tokenizer in other tests
kevalmorabia97 Mar 30, 2026
be4bd3a
disable puzzletron in coverage check as its covered in gpu tests only
kevalmorabia97 Mar 30, 2026
45426ca
Remove custom DistillationProvider and simplify mbridge distillation …
kevalmorabia97 Apr 1, 2026
1 change: 1 addition & 0 deletions .github/CODEOWNERS
@@ -24,6 +24,7 @@ modelopt/torch/nas @NVIDIA/modelopt-torch-nas-prune-codeowners
 modelopt/torch/opt @NVIDIA/modelopt-torch-opt-codeowners
 modelopt/torch/peft @NVIDIA/modelopt-torch-peft-codeowners
 modelopt/torch/prune @NVIDIA/modelopt-torch-nas-prune-codeowners
+modelopt/torch/puzzletron @NVIDIA/modelopt-torch-puzzletron-codeowners
 modelopt/torch/quantization @NVIDIA/modelopt-torch-quantization-codeowners
 modelopt/torch/sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
 modelopt/torch/speculative @NVIDIA/modelopt-torch-speculative-codeowners
6 changes: 3 additions & 3 deletions .github/workflows/example_tests.yml
@@ -125,14 +125,14 @@ jobs:
     strategy: &nemo_strategy
       fail-fast: false
       matrix:
-        example: [megatron_bridge]
+        example: [megatron_bridge, puzzletron]
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
       docker_image: "nvcr.io/nvidia/nemo:26.02"
       example: ${{ matrix.example }}
       timeout_minutes: 30
-      pip_install_extras: "[hf,dev-test]"
+      pip_install_extras: "[hf,puzzletron,dev-test]"
       runner: linux-amd64-gpu-rtxpro6000-latest-1

   nemo-non-pr:
@@ -144,7 +144,7 @@ jobs:
       docker_image: "nvcr.io/nvidia/nemo:26.02"
       example: ${{ matrix.example }}
       timeout_minutes: 30
-      pip_install_extras: "[hf,dev-test]"
+      pip_install_extras: "[hf,puzzletron,dev-test]"
       runner: linux-amd64-gpu-rtxpro6000-latest-2

 ##### ONNX/TensorRT Example Tests #####
2 changes: 2 additions & 0 deletions .github/workflows/gpu_tests.yml
@@ -85,6 +85,8 @@ jobs:
       - name: Setup environment variables
         run: |
           echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
+      - name: Install dependencies for mip
+        run: apt-get update && apt-get install -y libffi-dev
       - name: Run gpu tests
         run: pip install tox-current-env && tox -e cuda13-${{ matrix.example }} --current-env
   gpu-tests-non-pr:
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
@@ -79,6 +79,7 @@ repos:
         modelopt/onnx/quantization/ort_patching.py|
         modelopt/torch/_deploy/utils/onnx_utils.py|
         modelopt/torch/export/transformer_engine.py|
+        modelopt/torch/puzzletron/anymodel/models/gpt_oss/gpt_oss_pruned_to_mxfp4.py|
         modelopt/torch/quantization/export_onnx.py|
         modelopt/torch/quantization/plugins/attention.py|
         modelopt/torch/speculative/eagle/utils.py|
@@ -95,6 +96,7 @@
         examples/llm_eval/modeling.py|
         examples/llm_qat/main.py|
         examples/llm_sparsity/weight_sparsity/finetune.py|
+        examples/puzzletron/evaluation/lm_eval_anymodel.py|
         examples/specdec_bench/specdec_bench/models/specbench_medusa.py|
         examples/speculative_decoding/main.py|
         examples/speculative_decoding/medusa_utils.py|
1 change: 1 addition & 0 deletions examples/pruning/README.md
@@ -7,6 +7,7 @@ Pruning can involve removal (prune) of Linear and Conv layers; and Transformer a
 This section focuses on applying Model Optimizer's state-of-the-art complementary pruning modes to enable you to search for the best subnet architecture from your provided base model:

 1. [Minitron](https://arxiv.org/pdf/2408.11796): A pruning method developed by NVIDIA Research for pruning GPT (and later extended to Mamba, MoE, and Hybrid Transformer Mamba) models in NVIDIA Megatron-LM (M-LM) or Megatron-Bridge (M-Bridge) framework. It uses the activation magnitudes to prune the embedding hidden size; mlp ffn hidden size; transformer attention heads; mamba heads and head dimension; MoE number of experts, ffn hidden size, and shared expert intermediate size; and number of layers of the model.
+1. [Puzzletron](../puzzletron/README.md): An advanced pruning method by NVIDIA that uses a Mixed Integer Programming (MIP)-based NAS search algorithm.
 1. FastNAS: A pruning method recommended for Computer Vision models. Given a pretrained model, FastNAS finds the subnet which maximizes the score function while meeting the given constraints.
 1. GradNAS: A light-weight pruning method recommended for language models like Hugging Face BERT, GPT-J. It uses the gradient information to prune the model's linear layers and attention heads to meet the given constraints.
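The MIP step behind a search like Puzzletron's can be pictured as a small integer program: pick exactly one variant per transformer block (e.g. keep it intact, drop its attention, or shrink its FFN) so that the summed accuracy score is maximal under a parameter budget. A minimal brute-force sketch of that selection problem follows; the block names, parameter counts, and scores are invented for illustration and this is not ModelOpt's API, which hands the same program to a real MIP solver instead of enumerating.

```python
# Illustrative sketch only: variant names and numbers are hypothetical.
from itertools import product


def select_variants(blocks, param_budget):
    """Pick one variant per block, maximizing total score within the budget.

    blocks: per block, a list of (name, num_params, score) alternatives.
    Returns (chosen_names, total_score) for the best feasible combination.
    """
    best_choice, best_score = None, float("-inf")
    for combo in product(*blocks):
        total_params = sum(params for _, params, _ in combo)
        total_score = sum(score for _, _, score in combo)
        if total_params <= param_budget and total_score > best_score:
            best_choice = [name for name, _, _ in combo]
            best_score = total_score
    return best_choice, best_score


# Three blocks, each with three candidate variants (hypothetical numbers).
blocks = [
    [("full", 100, 1.00), ("no_attn", 60, 0.93), ("ffn_half", 70, 0.95)],
    [("full", 100, 1.00), ("no_attn", 60, 0.90), ("ffn_half", 70, 0.97)],
    [("full", 100, 1.00), ("no_attn", 60, 0.97), ("ffn_half", 70, 0.96)],
]
choice, score = select_variants(blocks, param_budget=240)
# Keeping all three blocks full (300 params) exceeds the budget, so the
# search trades away the cheapest accuracy per block.
```

Exhaustive search is exponential in the number of blocks; the MIP formulation lets an off-the-shelf solver handle realistic model sizes.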
14 changes: 14 additions & 0 deletions examples/puzzletron/GPTOSS.md
@@ -0,0 +1,14 @@

## GptOss

With this release, the Puzzle algorithm supports only expert removal for `Gpt-Oss`.

This model ships as a quantized checkpoint, i.e. the MoE expert matrices are stored in the _MXFP4_ format.
During the pruning steps, Puzzle uses the decompressed model (converted back to BF16) to compute statistics and scores.
This means that during the conversion to the Puzzle format we decompress the model and store it in BF16.
Once pruning is done, i.e. the experts to remove have been identified and the process has finished, you may want to restore the _MXFP4_ format of the checkpoint.
To do so, an additional script takes the original and the pruned checkpoints and outputs the pruned checkpoint in _MXFP4_ format:

```bash
python -m modelopt.torch.puzzletron.anymodel.models.gpt_oss.gpt_oss_pruned_to_mxfp4 --student-path /workspaces/any_model_gpt_oss/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/solution_0/ --original-path /workspaces/source_model_checkpoints/openai_gpt-oss-20b/ --output-path /workspaces/any_model_gpt_oss/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/mxfp4-ckpt/ --num-layers 24
```
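As a rough illustration of what the BF16 round-trip above involves: MXFP4, from the public OCP Microscaling (MX) specification, stores each weight as a 4-bit E2M1 code plus one shared power-of-two scale per block of 32 elements. The sketch below decodes such a block to plain floats; it is a hypothetical helper following the public spec, not ModelOpt's decoder.

```python
# Illustrative MXFP4-style dequantization, not ModelOpt's code.
E2M1_MAGNITUDES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]  # magnitude codes 0..7


def dequant_mxfp4_block(codes, scale_exp):
    """Decode one MXFP4 block to plain floats (BF16 precision in practice).

    codes: 4-bit integers (bit 3 = sign, bits 0-2 = E2M1 magnitude code),
           typically 32 per block; scale_exp: the block's shared exponent.
    """
    scale = 2.0 ** scale_exp
    return [
        (-1.0 if c & 0b1000 else 1.0) * E2M1_MAGNITUDES[c & 0b0111] * scale
        for c in codes
    ]


# A block whose 32 elements all hold code 0b0101 (= +3.0), scaled by 2**-2:
block = dequant_mxfp4_block([0b0101] * 32, scale_exp=-2)
# each element decodes to 3.0 * 0.25 = 0.75
```

Because the 4-bit codes only cover a handful of magnitudes per block, pruning statistics are computed on the decompressed BF16 values, and the script above re-quantizes the surviving experts afterwards.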