2 changes: 1 addition & 1 deletion .github/workflows/all_libs.yaml
@@ -109,7 +109,7 @@ jobs:
# Install the correct torch first.
cuda_no_dot=$(echo ${{ matrix.cuda_version }} | sed 's/\.//')
pip install torch==2.9.0 --index-url https://download.pytorch.org/whl/cu${cuda_no_dot}
-pip install numpy pytest cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} lightning ml_collections mpi4py transformers quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==25.09.1
+pip install numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} lightning ml_collections mpi4py transformers quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==25.09.1
# The following tests are needed for docs/sphinx/examples/qec/python/tensor_network_decoder.py.
if [ "$(uname -m)" == "x86_64" ]; then
# Stim is not currently available on manylinux ARM wheels, so only
2 changes: 1 addition & 1 deletion .github/workflows/all_libs_release.yaml
@@ -133,7 +133,7 @@ jobs:
# Install the correct torch first.
cuda_no_dot=$(echo ${{ matrix.cuda_version }} | sed 's/\.//')
pip install torch==2.9.0 --index-url https://download.pytorch.org/whl/cu${cuda_no_dot}
-pip install numpy pytest cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} lightning ml_collections mpi4py transformers quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==25.09.1
+pip install numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} lightning ml_collections mpi4py transformers quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==25.09.1
# The following tests are needed for docs/sphinx/examples/qec/python/tensor_network_decoder.py.
if [ "$(uname -m)" == "x86_64" ]; then
# Stim is not currently available on manylinux ARM wheels, so only
2 changes: 1 addition & 1 deletion .github/workflows/lib_qec.yaml
@@ -106,7 +106,7 @@ jobs:
# Install the correct torch first.
cuda_no_dot=$(echo ${{ matrix.cuda_version }} | sed 's/\.//')
pip install torch==2.9.0 --index-url https://download.pytorch.org/whl/cu${cuda_no_dot}
-pip install numpy pytest cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==25.09.1
+pip install numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==25.09.1
# The following tests are needed for docs/sphinx/examples/qec/python/tensor_network_decoder.py.
if [ "$(uname -m)" == "x86_64" ]; then
# Stim is not currently available on manylinux ARM wheels, so only
2 changes: 1 addition & 1 deletion .github/workflows/lib_solvers.yaml
@@ -94,7 +94,7 @@ jobs:
# Install the correct torch first.
cuda_no_dot=$(echo ${{ matrix.cuda_version }} | sed 's/\.//')
pip install torch==2.9.0 --index-url https://download.pytorch.org/whl/cu${cuda_no_dot}
-pip install numpy pytest cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} lightning ml_collections mpi4py transformers pytest
+pip install numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} lightning ml_collections mpi4py transformers pytest


- name: Run Python tests
docs/sphinx/examples/qec/python/train_mlp_decoder.py
@@ -1,3 +1,20 @@
+# ============================================================================ #
+# Copyright (c) 2025 NVIDIA Corporation & Affiliates.                          #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under    #
+# the terms of the Apache License 2.0 which accompanies this distribution.    #
+# ============================================================================ #
+# [Begin Documentation]
+
+import sys
+import platform
+if platform.machine().lower() in ("arm64", "aarch64"):
+    print(
+        "Warning: stim is not supported on manylinux ARM64/aarch64. Skipping this example..."
+    )
+    sys.exit(0)
+
import stim
import torch
import torch.nn as nn
@@ -13,7 +30,7 @@
num_val_samples = 1000 # Validation samples
num_test_samples = 1000 # Test samples
hidden_dim = 128 # Larger model capacity
-error_prob = 0.18 # Balanced error rate for better learning
+error_prob = 0.005 # Physical error rate

# --------------------------
# Build the surface code circuit
@@ -30,7 +47,7 @@
# Convert to detector error model
dem = circuit.detector_error_model()
num_detectors = dem.num_detectors
-num_data_qubits = circuit.num_qubits - num_detectors # approx
+num_data_qubits = circuit.num_qubits - num_detectors

print(f"Num data qubits: {num_data_qubits}, Num detectors: {num_detectors}")

@@ -69,9 +86,6 @@ def sample_data(num_samples):
num_observables = Y_train.shape[1]
print(f"Num observables: {num_observables}")

print(f"X_test: {X_test}")
print(f"Y_test: {Y_test}")


# --------------------------
# Improved Torch NN decoder with dropout and deeper architecture
@@ -178,8 +192,8 @@ def compute_accuracy(predictions, targets, threshold=0.5):
val_correct += ((val_output > 0.5).float() == batch_Y).sum().item()
val_total += batch_Y.numel()

print(f"logical_error_rate (raw): {batch_Y.sum().item() / batch_Y.numel()}")
print(f"cum_ler: {cum_ler / len(val_loader.dataset)}")
# print(f"logical_error_rate (raw): {batch_Y.sum().item() / batch_Y.numel()}")
# print(f"cum_ler: {cum_ler / len(val_loader.dataset)}")

val_loss_avg = val_loss_total / len(val_loader.dataset)
val_acc = val_correct / val_total
183 changes: 182 additions & 1 deletion docs/sphinx/examples_rst/qec/decoders.rst
@@ -134,4 +134,185 @@ The decoder returns the probability that the logical observable has flipped for

See Also:

- ``cudaq_qec.plugins.decoders.tensor_network_decoder``

.. _deploying-ai-decoders:

Deploying AI Decoders with TensorRT
+++++++++++++++++++++++++++++++++++++++++++++++++

Starting with CUDA-Q QEC v0.5.0, the library includes a GPU-accelerated, TensorRT-based decoder.
The TensorRT decoder (``trt_decoder``) enables users to leverage custom AI models for quantum
error correction, providing a flexible framework for deploying trained models with optimized
inference performance on NVIDIA GPUs.

Unlike traditional algorithmic decoders, neural network decoders can be trained on specific error
models and code structures, potentially achieving superior performance for certain noise regimes.
The TensorRT decoder supports loading models in ONNX format and provides configurable precision
modes (fp16, bf16, int8, fp8, tf32) to balance accuracy and inference speed.
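
For example, the precision mode can be chosen when the decoder is constructed. The following is a
minimal sketch assuming the Python binding accepts the same ``precision`` parameter that appears in
the C++ example later in this section:

.. code-block:: python

   import cudaq_qec as qec
   import numpy as np

   # Placeholder parity check matrix (the AI decoder does not use it).
   H = np.eye(3, dtype=np.uint8)

   # Assumption: `precision` takes one of fp16, bf16, int8, fp8, tf32.
   decoder = qec.get_decoder("trt_decoder", H,
                             onnx_load_path="ai_decoder.onnx",
                             precision="fp16")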

This tutorial demonstrates the complete workflow for training a simple multi-layer perceptron (MLP)
to decode surface code syndromes using PyTorch and Stim, exporting the model to ONNX format, and
deploying it with the TensorRT decoder for accelerated inference.

Overview of the Training-to-Deployment Pipeline
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The workflow consists of three main stages:

1. **Data Generation**: Use Stim to generate synthetic quantum error correction data by simulating
   surface code circuits with realistic noise models. This produces detector measurements (syndromes)
   and observable flips (logical errors) that serve as training data.

2. **Model Training**: Train a neural network (in this case, an MLP) using PyTorch to learn the
   mapping from syndromes to logical error predictions. The model is trained with standard deep
   learning techniques, including dropout regularization, learning rate scheduling, and validation
   monitoring.

3. **ONNX Export and Deployment**: Export the trained PyTorch model to ONNX format, which can then
   be loaded by the TensorRT decoder for optimized GPU inference in production QEC workflows; a
   condensed sketch of this pipeline follows the list.
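
The condensed sketch below illustrates stages 1 and 3 under stated assumptions: it uses Stim's
``compile_detector_sampler`` API for data generation and ``torch.onnx.export`` for the export step;
the tiny untrained model, the file name ``ai_decoder.onnx``, and the tensor names ``detectors`` /
``observables`` are illustrative placeholders rather than names fixed by the library.

.. code-block:: python

   import stim
   import torch
   import torch.nn as nn

   # Stage 1: sample syndromes (detector events) and logical flips
   # (observables) from a noisy rotated surface code memory experiment.
   circuit = stim.Circuit.generated("surface_code:rotated_memory_z",
                                    distance=3,
                                    rounds=3,
                                    after_clifford_depolarization=0.005)
   sampler = circuit.compile_detector_sampler()
   detectors, observables = sampler.sample(10000, separate_observables=True)

   # Stage 2 (elided here; see the full example below): train a model
   # that maps detector samples to observable-flip predictions.
   model = nn.Sequential(nn.Linear(circuit.num_detectors, 128), nn.ReLU(),
                         nn.Linear(128, circuit.num_observables))

   # Stage 3: export the (trained) model to ONNX for the TensorRT decoder.
   dummy = torch.zeros(1, circuit.num_detectors)
   torch.onnx.export(model, dummy, "ai_decoder.onnx",
                     input_names=["detectors"],
                     output_names=["observables"])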

Training a Neural Network Decoder with PyTorch and Stim
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The following example shows how to generate training data using Stim's built-in surface code
generator, train an MLP decoder with PyTorch, and export the model to ONNX format.
For instructions on installing PyTorch, see :ref:`Installing PyTorch <installing-pytorch>`.

.. literalinclude:: ../../examples/qec/python/train_mlp_decoder.py
:language: python
:start-after: [Begin Documentation]

Using the TensorRT Decoder in CUDA-Q QEC
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Once you have a trained ONNX model, you can load it with the TensorRT decoder for accelerated
inference. The decoder can be used in both C++ and Python workflows.

**Loading from ONNX (with automatic TensorRT optimization)**:

.. tab:: Python

   .. code-block:: python

      import cudaq_qec as qec
      import numpy as np

      # Note: The AI decoder doesn't use the parity check matrix.
      # A placeholder matrix is provided here to satisfy the API.
      H = np.array([[1, 0, 0, 1, 0, 1, 1],
                    [0, 1, 0, 1, 1, 0, 1],
                    [0, 0, 1, 0, 1, 1, 1]], dtype=np.uint8)

      # Create TensorRT decoder from ONNX model
      decoder = qec.get_decoder("trt_decoder", H,
                                onnx_load_path="ai_decoder.onnx")

      # Decode a syndrome
      syndrome = np.array([1.0, 0.0, 1.0], dtype=np.float32)
      result = decoder.decode(syndrome)
      print(f"Predicted error: {result}")

.. tab:: C++

   .. code-block:: cpp

      #include "cudaq/qec/decoder.h"
      #include "cuda-qx/core/tensor.h"
      #include "cuda-qx/core/heterogeneous_map.h"

      int main() {
        // Note: The AI decoder doesn't use the parity check matrix.
        // A placeholder matrix is provided here to satisfy the API.
        std::vector<std::vector<uint8_t>> H_vec = {
            {1, 0, 0, 1, 0, 1, 1},
            {0, 1, 0, 1, 1, 0, 1},
            {0, 0, 1, 0, 1, 1, 1}
        };

        // Convert to tensor
        cudaqx::tensor<uint8_t> H({3, 7});
        for (size_t i = 0; i < 3; ++i) {
          for (size_t j = 0; j < 7; ++j) {
            H.at({i, j}) = H_vec[i][j];
          }
        }

        // Create decoder parameters
        cudaqx::heterogeneous_map params;
        params.insert("onnx_load_path", "ai_decoder.onnx");
        params.insert("precision", "fp16");

        // Create TensorRT decoder
        auto decoder = cudaq::qec::get_decoder("trt_decoder", H, params);

        // Decode syndrome
        std::vector<cudaq::qec::float_t> syndrome = {1.0, 0.0, 1.0};
        auto result = decoder->decode(syndrome);

        return 0;
      }

**Loading a pre-built TensorRT engine (for fastest initialization)**:

If you have already converted your ONNX model to a TensorRT engine (for example with ``trtexec``,
as shown in the next section), you can load it directly:

.. tab:: Python

   .. code-block:: python

      decoder = qec.get_decoder("trt_decoder", H,
                                engine_load_path="surface_code_decoder.trt")

Converting ONNX Models to TensorRT Engines
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

For production deployments where initialization time is critical, you can pre-build a TensorRT
engine from your ONNX model using the ``trtexec`` command-line tool that comes with TensorRT:

.. code-block:: bash

   # Build with FP16 precision
   trtexec --onnx=surface_code_decoder.onnx \
           --saveEngine=surface_code_decoder.trt \
           --fp16

   # Build with best precision for your GPU
   trtexec --onnx=surface_code_decoder.onnx \
           --saveEngine=surface_code_decoder.trt \
           --best

   # Build with specific input shape (optional, for optimization)
   trtexec --onnx=surface_code_decoder.onnx \
           --saveEngine=surface_code_decoder.trt \
           --fp16 \
           --shapes=detectors:1x24
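
The tensor name referenced by ``--shapes`` (``detectors`` in the last command) must match the input
name that was assigned when the model was exported to ONNX.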

Pre-built engines offer several advantages:

- **Faster initialization**: Engine loading is significantly faster than ONNX parsing and optimization
- **Reproducible optimization**: The same optimization decisions are made every time
- **Version control**: Engines can be versioned alongside code for reproducible deployments
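
One caveat: a serialized TensorRT engine is specific to the GPU model and TensorRT version it was
built with, so engines should be rebuilt when moving to different hardware or after a TensorRT upgrade.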


Dependencies and Requirements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The TensorRT decoder requires:

- **TensorRT**: Version 10.13.3.9 or higher
- **CUDA**: Version 12.0 or later on x86_64, or 13.0 or later on ARM
- **GPU**: NVIDIA GPU with compute capability 6.0+ (Pascal architecture or newer)
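
As a quick sanity check, the compute capability of the local GPU can be read through PyTorch
(a sketch; ``nvidia-smi`` or any other CUDA query tool works equally well):

.. code-block:: python

   import torch

   # Prints a tuple such as (8, 0); the TensorRT decoder needs (6, 0) or higher.
   print(torch.cuda.get_device_capability())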

For training:

- **PyTorch**: Version 2.0+ recommended
- **Stim**: For quantum circuit simulation and data generation

See Also
^^^^^^^^

- :class:`cudaq_qec.Decoder` - Base decoder interface
- `ONNX <https://onnx.ai/>`_ - Open Neural Network Exchange format
- `TensorRT Documentation <https://docs.nvidia.com/deeplearning/tensorrt/>`_ - NVIDIA TensorRT
- `Stim Documentation <https://github.com/quantumlib/Stim>`_ - Fast stabilizer circuit simulator
34 changes: 26 additions & 8 deletions docs/sphinx/quickstart/installation.rst
@@ -76,14 +76,32 @@ Building from Source
The instructions for building CUDA-QX from source are maintained on our GitHub
repository: `Building CUDA-QX from Source <https://github.com/NVIDIA/cudaqx/blob/main/Building.md>`__.

-Known Blackwell Issues
-----------------------
-.. note::
-   If you are attempting to use torch on Blackwell, you will need to install the nightly version of torch.
-   You can do this by running:
-
-   .. code-block:: bash
-
-      python3 -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
-
-torch is a dependency of the tensor network decoder and the GQE algorithm.
+.. _installing-pytorch:
+
+Installing PyTorch
+------------------
+
+PyTorch (``torch``) is required for several CUDA-QX features:
+
+* **Tensor Network Decoder**: Used by the QEC library for tensor network-based decoding (CPU version of PyTorch is sufficient)
+* **GQE Algorithm**: Used by the Solvers library for the Generative Quantum Eigensolver
+* **Training AI Decoders**: Optionally used for training custom neural network decoders (see :ref:`Deploying AI Decoders with TensorRT <deploying-ai-decoders>`)
+
+PyTorch is automatically installed when you install the optional components:
+
+.. code-block:: bash
+
+   # Installs PyTorch as a dependency
+   pip install cudaq-qec[tensor-network-decoder]
+   pip install cudaq-solvers[gqe]
+
+Alternatively, you can install PyTorch directly. For detailed installation instructions, visit the
+`PyTorch installation page <https://pytorch.org/get-started/locally/>`_.
+
+.. code-block:: bash
+
+   pip install torch
+
+.. note::
+   Users with NVIDIA Blackwell architecture GPUs require PyTorch with CUDA 12.8 or later support.
+   When installing PyTorch, make sure to select the appropriate CUDA version for your system.
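
For example (an illustration, not part of the change above), a CUDA 12.8 build of PyTorch can be
selected explicitly through the PyTorch wheel index; adjust ``cu128`` to your CUDA version:

.. code-block:: bash

   # Assumption: a cu128 wheel is published for your platform and torch version.
   pip install torch --index-url https://download.pytorch.org/whl/cu128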