docs/sphinx/examples/qec/python/train_mlp_decoder.py
@@ -1,3 +1,4 @@
# [Begin Documentation]
import stim
import torch
import torch.nn as nn
@@ -13,7 +14,7 @@
num_val_samples = 1000 # Validation samples
num_test_samples = 1000 # Test samples
hidden_dim = 128 # Larger model capacity
-error_prob = 0.18 # Balanced error rate for better learning
error_prob = 0.005 # Physical error rate

# --------------------------
# Build the surface code circuit
@@ -30,7 +31,7 @@
# Convert to detector error model
dem = circuit.detector_error_model()
num_detectors = dem.num_detectors
-num_data_qubits = circuit.num_qubits - num_detectors # approx
num_data_qubits = circuit.num_qubits - num_detectors

print(f"Num data qubits: {num_data_qubits}, Num detectors: {num_detectors}")

@@ -69,9 +70,6 @@ def sample_data(num_samples):
num_observables = Y_train.shape[1]
print(f"Num observables: {num_observables}")

print(f"X_test: {X_test}")
print(f"Y_test: {Y_test}")


# --------------------------
# Improved Torch NN decoder with dropout and deeper architecture
Expand Down Expand Up @@ -178,8 +176,8 @@ def compute_accuracy(predictions, targets, threshold=0.5):
val_correct += ((val_output > 0.5).float() == batch_Y).sum().item()
val_total += batch_Y.numel()

print(f"logical_error_rate (raw): {batch_Y.sum().item() / batch_Y.numel()}")
print(f"cum_ler: {cum_ler / len(val_loader.dataset)}")
# print(f"logical_error_rate (raw): {batch_Y.sum().item() / batch_Y.numel()}")
# print(f"cum_ler: {cum_ler / len(val_loader.dataset)}")

val_loss_avg = val_loss_total / len(val_loader.dataset)
val_acc = val_correct / val_total
174 changes: 173 additions & 1 deletion docs/sphinx/examples_rst/qec/decoders.rst
@@ -113,4 +113,176 @@ The decoder returns the probability that the logical observable has flipped for

See Also:

- ``cudaq_qec.plugins.decoders.tensor_network_decoder``

.. _deploying-ai-decoders:

Deploying AI Decoders with TensorRT
+++++++++++++++++++++++++++++++++++++++++++++++++

Starting with v0.5.0, the CUDA-Q QEC library includes a GPU-accelerated, TensorRT-based decoder.
The TensorRT decoder (``trt_decoder``) enables users to leverage custom AI models for quantum
error correction, providing a flexible framework for deploying trained models with optimized
inference performance on NVIDIA GPUs.

Unlike traditional algorithmic decoders, neural network decoders can be trained on specific error
models and code structures, potentially achieving superior performance for certain noise regimes.
The TensorRT decoder supports loading models in ONNX format and provides configurable precision
modes (fp16, bf16, int8, fp8, tf32) to balance accuracy and inference speed.

This tutorial demonstrates the complete workflow for training a simple multi-layer perceptron (MLP)
to decode surface code syndromes using PyTorch and Stim, exporting the model to ONNX format, and
deploying it with the TensorRT decoder for accelerated inference.

Overview of the Training-to-Deployment Pipeline
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The workflow consists of three main stages:

1. **Data Generation**: Use Stim to generate synthetic quantum error correction data by simulating
   surface code circuits with realistic noise models. This produces detector measurements (syndromes)
   and observable flips (logical errors) that serve as training data; a minimal sketch follows this list.

2. **Model Training**: Train a neural network (in this case, an MLP) using PyTorch to learn the
   mapping from syndromes to logical error predictions. The model is trained with standard deep
   learning techniques, including dropout regularization, learning rate scheduling, and validation monitoring.

3. **ONNX Export and Deployment**: Export the trained PyTorch model to ONNX format, which can then
   be loaded by the TensorRT decoder for optimized GPU inference in production QEC workflows.
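
To make the data-generation stage concrete, here is a minimal, self-contained sketch using Stim's
built-in surface code generator. The code task, distance, rounds, and noise strength are
illustrative choices, not necessarily those used by the tutorial script:

.. code-block:: python

   import stim

   # Rotated surface code memory experiment with circuit-level noise
   # (all parameters here are illustrative).
   circuit = stim.Circuit.generated(
       "surface_code:rotated_memory_z",
       distance=3,
       rounds=3,
       after_clifford_depolarization=0.005,
   )

   # Sample detector outcomes (syndromes) and logical observable flips.
   sampler = circuit.compile_detector_sampler()
   detectors, observables = sampler.sample(10_000, separate_observables=True)

   # detectors become the network inputs; observables are the labels.
   print(detectors.shape, observables.shape)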

Training a Neural Network Decoder with PyTorch and Stim
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The following example shows how to generate training data using Stim's built-in surface code
generator, train an MLP decoder with PyTorch, and export the model to ONNX format.
For instructions on installing PyTorch, see :ref:`Optional Dependencies <optional-dependencies>`.

.. literalinclude:: ../../examples/qec/python/train_mlp_decoder.py
:language: python
:start-after: [Begin Documentation]
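
The heart of the deployment handoff is the ONNX export at the end of training. The sketch below is
a hedged, self-contained illustration: the stand-in MLP, its sizes, and the tensor names
``detectors``/``observables`` are assumptions, chosen here to match the
``--shapes=detectors:...`` flag shown with ``trtexec`` later in this section:

.. code-block:: python

   import torch
   import torch.nn as nn

   # Stand-in for a trained decoder; in practice, export the model produced
   # by the training script above. Sizes here are illustrative.
   num_detectors, num_observables = 24, 1
   model = nn.Sequential(nn.Linear(num_detectors, 128), nn.ReLU(),
                         nn.Linear(128, num_observables), nn.Sigmoid())
   model.eval()

   # Export with named tensors so downstream tools can refer to them.
   dummy_input = torch.zeros(1, num_detectors)
   torch.onnx.export(model, dummy_input, "surface_code_decoder.onnx",
                     input_names=["detectors"],
                     output_names=["observables"],
                     dynamic_axes={"detectors": {0: "batch"}})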

Using the TensorRT Decoder in CUDA-Q QEC
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Once you have a trained ONNX model, you can load it with the TensorRT decoder for accelerated
inference. The decoder can be used in both C++ and Python workflows.

**Loading from ONNX (with automatic TensorRT optimization)**:

.. tab:: Python

   .. code-block:: python

      import cudaq_qec as qec
      import numpy as np

      # Note: The AI decoder doesn't use the parity check matrix.
      # A placeholder matrix is provided here to satisfy the API.
      H = np.array([[1, 0, 0, 1, 0, 1, 1],
                    [0, 1, 0, 1, 1, 0, 1],
                    [0, 0, 1, 0, 1, 1, 1]], dtype=np.uint8)

      # Create TensorRT decoder from ONNX model
      decoder = qec.get_decoder("trt_decoder", H,
                                onnx_load_path="surface_code_decoder.onnx",
                                precision="fp16")

      # Decode a syndrome
      syndrome = np.array([1, 0, 1], dtype=np.uint8)
      result = decoder.decode(syndrome)
      print(f"Predicted error: {result}")

.. tab:: C++

   .. code-block:: cpp

      #include <cudaq/qec.h>

      #include <cstdint>
      #include <vector>

      int main() {
        // Note: The AI decoder doesn't use the parity check matrix.
        // A placeholder matrix is provided here to satisfy the API.
        std::vector<std::vector<uint8_t>> H = {
            {1, 0, 0, 1, 0, 1, 1},
            {0, 1, 0, 1, 1, 0, 1},
            {0, 0, 1, 0, 1, 1, 1}
        };

        // Create decoder parameters
        cudaq::heterogeneous_map params;
        params.insert("onnx_load_path", "surface_code_decoder.onnx");
        params.insert("precision", "fp16");

        // Create TensorRT decoder
        auto decoder = cudaq::qec::get_decoder("trt_decoder", H, params);

        // Decode syndrome
        std::vector<uint8_t> syndrome = {1, 0, 1};
        auto result = decoder->decode(syndrome);

        return 0;
      }
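
In both APIs, ``decode`` returns a decoder result object rather than a raw bit string; consult the
:class:`cudaq_qec.Decoder` interface (linked under See Also below) for the exact fields your
version exposes.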

**Loading a pre-built TensorRT engine (for fastest initialization)**:

If you have already converted your ONNX model to a TensorRT engine, for example with the
``trtexec`` tool described in the next section, you can load it directly:

.. tab:: Python

   .. code-block:: python

      decoder = qec.get_decoder("trt_decoder", H,
                                engine_load_path="surface_code_decoder.trt")
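
The placeholder ``H`` matrix and the ``decode`` call are the same as in the ONNX example above;
only the initialization parameter changes.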

Converting ONNX Models to TensorRT Engines
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

For production deployments where initialization time is critical, you can pre-build a TensorRT
engine from your ONNX model using the ``trtexec`` command-line tool that comes with TensorRT:

.. code-block:: bash

   # Build with FP16 precision
   trtexec --onnx=surface_code_decoder.onnx \
           --saveEngine=surface_code_decoder.trt \
           --fp16

   # Build with best precision for your GPU
   trtexec --onnx=surface_code_decoder.onnx \
           --saveEngine=surface_code_decoder.trt \
           --best

   # Build with specific input shape (optional, for optimization)
   trtexec --onnx=surface_code_decoder.onnx \
           --saveEngine=surface_code_decoder.trt \
           --fp16 \
           --shapes=detectors:1x24
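
To sanity-check a build, the same tool can load and benchmark the resulting engine (an optional
step; ``--loadEngine`` is a standard ``trtexec`` flag):

.. code-block:: bash

   # Time inference on the pre-built engine
   trtexec --loadEngine=surface_code_decoder.trt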

Pre-built engines offer several advantages:

- **Faster initialization**: Engine loading is significantly faster than ONNX parsing and optimization
- **Reproducible optimization**: The same optimization decisions are made every time
- **Version control**: Engines can be versioned alongside code for reproducible deployments


Dependencies and Requirements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The TensorRT decoder requires:

- **TensorRT**: Version 10.13.3.9 or higher
- **CUDA**: Version 12.0 or higher
- **GPU**: NVIDIA GPU with compute capability 6.0+ (Pascal architecture or newer)

For training:

- **PyTorch**: Version 2.0+ recommended
- **Stim**: For quantum circuit simulation and data generation
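
For reference, a minimal training environment can be assembled with pip (the version pin is
illustrative):

.. code-block:: bash

   pip install "torch>=2.0" stim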

See Also
^^^^^^^^

- :class:`cudaq_qec.Decoder` - Base decoder interface
- `ONNX <https://onnx.ai/>`_ - Open Neural Network Exchange format
- `TensorRT Documentation <https://docs.nvidia.com/deeplearning/tensorrt/>`_ - NVIDIA TensorRT
- `Stim Documentation <https://github.com/quantumlib/Stim>`_ - Fast stabilizer circuit simulator
37 changes: 29 additions & 8 deletions docs/sphinx/quickstart/installation.rst
Original file line number Diff line number Diff line change
@@ -76,14 +76,35 @@ Building from Source
The instructions for building CUDA-QX from source are maintained on our GitHub
repository: `Building CUDA-QX from Source <https://github.com/NVIDIA/cudaqx/blob/main/Building.md>`__.

-Known Blackwell Issues
-----------------------
-.. note::
-   If you are attempting to use torch on Blackwell, you will need to install the nightly version of torch.
-   You can do this by running:
.. _optional-dependencies:

Optional Dependencies
---------------------

PyTorch
^^^^^^^

PyTorch (``torch``) is required for several CUDA-QX features:

* **Tensor Network Decoder**: Used by the QEC library for tensor network-based decoding
* **GQE Algorithm**: Used by the Solvers library for the Generative Quantum Eigensolver
* **Training AI Decoders**: Optionally used for training custom neural network decoders (see :ref:`Deploying AI Decoders with TensorRT <deploying-ai-decoders>`)

PyTorch is automatically installed when you install the optional components:

.. code-block:: bash

   # Installs PyTorch as a dependency
   pip install cudaq-qec[tensor-network-decoder]
   pip install cudaq-solvers[gqe]

-.. code-block:: bash
-python3 -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128

Alternatively, you can install PyTorch directly. For detailed installation instructions, visit the
`PyTorch installation page <https://pytorch.org/get-started/locally/>`_.

-torch is a dependency of the tensor network decoder and the GQE algorithm.

.. code-block:: bash

   pip install torch

.. note::
   CUDA-QX requires PyTorch with CUDA 12.8 or later support. When installing PyTorch, make sure to
   select the appropriate CUDA version for your system.