From 26f5f50bb01f0214dd65a8c15b1103d9862cc107 Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Thu, 6 Nov 2025 20:55:12 +0000
Subject: [PATCH 01/11] Added ability for surface code to run sliding_window.
 First attempt at overhaul of decoder::enqueue_syndrome

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 libs/qec/lib/decoder.cpp                      | 167 +++++++++++++++---
 .../realtime/app_examples/surface_code-1.cpp  | 129 +++++++++++---
 2 files changed, 245 insertions(+), 51 deletions(-)

diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp
index 73a38707..f6093d9d 100644
--- a/libs/qec/lib/decoder.cpp
+++ b/libs/qec/lib/decoder.cpp
@@ -23,16 +23,35 @@ INSTANTIATE_REGISTRY(cudaq::qec::decoder, const cudaqx::tensor<uint8_t> &,
 namespace cudaq::qec {
 
 struct decoder::rt_impl {
-  /// The number of measurement syndromes to be decoded per decode call (i.e.
-  /// the number of columns in the D_sparse matrix)
+  /// The number of syndromes per round (enables incremental detector computation)
+  uint32_t num_syndromes_per_round = 0;
+
+  /// The number of measurement syndromes to be decoded per decode call
+  /// (for incremental mode: one round; for batch mode: full D_sparse columns)
   uint32_t num_msyn_per_decode = 0;
 
-  /// The index of the next syndrome to be written in the msyn_buffer
-  uint32_t msyn_buffer_index = 0;
+  /// Counter of total syndromes buffered but not yet processed.
+  /// Used to detect complete rounds (when this is a multiple of num_msyn_per_decode).
+  /// Gets decremented after each round is decoded. Not a direct buffer index.
+  uint32_t num_syndromes_buffered_but_not_decoded = 0;
 
-  /// The buffer of measurement syndromes received from the client. Length is
-  /// num_msyn_per_decode.
+  /// The buffer of measurement syndromes received from the client. 
+  /// For incremental mode: size is calculated from max D_sparse column + 1
+  /// This allows buffering multiple rounds while still decoding incrementally
   std::vector<uint8_t> msyn_buffer;
+  
+  /// Total buffer capacity (max column index in D_sparse + 1)
+  uint32_t buffer_capacity = 0;
+
+  /// Track which round we're on (0 = reference round)
+  uint32_t current_round = 0;
+  
+  /// Circular buffer write position for the current round
+  // Values are 0, num_msyn_per_decode * 2, num_msyn_per_decode * 3, etc. then wrap around to 0.
+  uint32_t current_round_buffer_offset = 0;
+  
+  /// Circular buffer position of the previous round (for incremental XOR)
+  uint32_t prev_round_buffer_offset = 0;
 
   /// The current observable corrections. The length of this vector is the
   /// number of rows in the O_sparse matrix.
@@ -174,33 +193,111 @@ uint32_t decoder::get_decoder_id() const { return pimpl->decoder_id; }
 
 void decoder::set_D_sparse(const std::vector<std::vector<uint32_t>> &D_sparse) {
   this->D_sparse = D_sparse;
-  pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(D_sparse);
+  
+  // Infer num_syndromes_per_round from D_sparse timelike structure
+  // For timelike detectors, consecutive detectors XOR syndromes from consecutive rounds
+  // e.g., detector[0] = [0, 24], detector[1] = [1, 25], so num_syndromes_per_round = 24
+  if (D_sparse.size() >= 2 && D_sparse[0].size() >= 2 && D_sparse[1].size() >= 2) {
+    pimpl->num_syndromes_per_round = D_sparse[1][0] - D_sparse[0][0];
+  } else {
+    // Fallback: assume 1:1 mapping
+    pimpl->num_syndromes_per_round = 1;
+  }
+  
+  // Calculate minimum buffer capacity from max column in D_sparse
+  uint32_t min_capacity = calculate_num_msyn_per_decode(D_sparse);
+  
+  // Enable incremental mode: process one round at a time
+  pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round;
+  
+  // Add one extra round to buffer capacity to guarantee no wraparound within operations
+  // This eliminates all wraparound checks in hot loops (write and detector computation)
+  pimpl->buffer_capacity = min_capacity + pimpl->num_syndromes_per_round;
+  
+  // Allocate buffer to hold all syndromes plus extra round
   pimpl->msyn_buffer.clear();
-  pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode);
-  pimpl->msyn_buffer_index = 0;
+  pimpl->msyn_buffer.resize(pimpl->buffer_capacity);
+  
+  pimpl->num_syndromes_buffered_but_not_decoded = 0;
+  pimpl->current_round = 0;
+  pimpl->current_round_buffer_offset = 0;
+  pimpl->prev_round_buffer_offset = 0;
 }
 
 void decoder::set_D_sparse(const std::vector<int64_t> &D_sparse_vec_in) {
   set_sparse_from_vec(D_sparse_vec_in, this->D_sparse);
-  pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(D_sparse);
+  
+  // Infer num_syndromes_per_round from D_sparse timelike structure
+  // For timelike detectors, consecutive detectors XOR syndromes from consecutive rounds
+  // e.g., detector[0] = [0, 24], detector[1] = [1, 25], so num_syndromes_per_round = 24
+  if (D_sparse.size() >= 2 && D_sparse[0].size() >= 2 && D_sparse[1].size() >= 2) {
+    pimpl->num_syndromes_per_round = D_sparse[1][0] - D_sparse[0][0];
+  } else {
+    // Fallback: assume 1:1 mapping
+    pimpl->num_syndromes_per_round = 1;
+  }
+  
+  // Calculate minimum buffer capacity from max column in D_sparse
+  uint32_t min_capacity = calculate_num_msyn_per_decode(D_sparse);
+  
+  // Enable incremental mode: process one round at a time
+  pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round;
+  
+  // Add one extra round to buffer capacity to guarantee no wraparound within operations
+  // This eliminates all wraparound checks in hot loops (write and detector computation)
+  pimpl->buffer_capacity = min_capacity + pimpl->num_syndromes_per_round;
+  
+  // Allocate buffer to hold all syndromes plus extra round
   pimpl->msyn_buffer.clear();
-  pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode);
-  pimpl->msyn_buffer_index = 0;
+  pimpl->msyn_buffer.resize(pimpl->buffer_capacity);
+  
+  pimpl->num_syndromes_buffered_but_not_decoded = 0;
+  pimpl->current_round = 0;
+  pimpl->current_round_buffer_offset = 0;
+  pimpl->prev_round_buffer_offset = 0;
 }
 
 bool decoder::enqueue_syndrome(const uint8_t *syndrome,
                                std::size_t syndrome_length) {
-  if (pimpl->msyn_buffer_index + syndrome_length > pimpl->msyn_buffer.size()) {
-    // CUDAQ_WARN("Syndrome buffer overflow. Syndrome will be ignored.");
-    printf("Syndrome buffer overflow. Syndrome will be ignored.\n");
+  // position_in_round represents how many syndromes of the current round have already been buffered but not yet decoded
+  // Values range from 0 to num_msyn_per_decode - 1.
+  uint32_t position_in_round = pimpl->num_syndromes_buffered_but_not_decoded % pimpl->num_msyn_per_decode;
+  
+  // Check if this write would overwrite the previous round
+  // We need to preserve prev_round_buffer for XOR computation, so the maximum
+  // safe write from the start of the current round is buffer_capacity minus one round
+  uint32_t max_safe_from_round_start = pimpl->buffer_capacity - pimpl->num_syndromes_per_round;
+  if (position_in_round + syndrome_length > max_safe_from_round_start) {
+    // CUDAQ_WARN("Syndrome data too large - would overwrite previous round. Data will be ignored.");
+    printf("Syndrome data too large - would overwrite previous round. Data will be ignored.\n");
     return false;
   }
   bool did_decode = false;
+  // Buffer the incoming syndromes
+  // No wraparound check needed: buffer is sized to guarantee operations never wrap mid-execution
+  uint32_t write_start = pimpl->current_round_buffer_offset + position_in_round;
   for (std::size_t i = 0; i < syndrome_length; i++) {
-    pimpl->msyn_buffer[pimpl->msyn_buffer_index] = syndrome[i];
-    pimpl->msyn_buffer_index++;
+    pimpl->msyn_buffer[write_start + i] = syndrome[i];
   }
-  if (pimpl->msyn_buffer_index == pimpl->msyn_buffer.size()) {
+  pimpl->num_syndromes_buffered_but_not_decoded += syndrome_length;
+  
+  // Process all complete rounds that are now available
+  while ((pimpl->num_syndromes_buffered_but_not_decoded % pimpl->num_msyn_per_decode) == 0 && 
+         pimpl->num_syndromes_buffered_but_not_decoded > 0) {
+    pimpl->current_round++;
+    
+    // First round (round 1): store as reference, don't decode yet
+    if (pimpl->current_round == 1) {
+      // Previous round stays at current position for next round's XOR
+      pimpl->prev_round_buffer_offset = pimpl->current_round_buffer_offset;
+      // Advance to next round position in circular buffer
+      pimpl->current_round_buffer_offset += pimpl->num_msyn_per_decode;
+      if (pimpl->current_round_buffer_offset >= pimpl->buffer_capacity)
+        pimpl->current_round_buffer_offset -= pimpl->buffer_capacity;
+      pimpl->num_syndromes_buffered_but_not_decoded -= pimpl->num_msyn_per_decode;  // Decrement for next iteration
+      continue;  // Skip to next round
+    }
+    
     // These are just for logging. They are initialized in such a way to avoid
     // dynamic memory allocation if logging is disabled.
     std::vector<uint32_t> log_msyn;
@@ -222,12 +319,20 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome,
       log_observable_corrections.resize(O_sparse.size());
     }
 
-    // Decode now.
-    for (std::size_t i = 0; i < this->D_sparse.size(); i++) {
-      pimpl->persistent_detector_buffer[i] = 0;
-      for (auto col : this->D_sparse[i])
-        pimpl->persistent_detector_buffer[i] ^= pimpl->msyn_buffer[col];
+    // Compute detectors incrementally by XORing current round with previous round
+    // Using circular buffer offsets - no D_sparse access needed
+    // No wraparound checks needed: buffer is sized to guarantee operations never wrap mid-execution
+    for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) {
+      pimpl->persistent_detector_buffer[i] = 
+          pimpl->msyn_buffer[pimpl->prev_round_buffer_offset + i] ^ 
+          pimpl->msyn_buffer[pimpl->current_round_buffer_offset + i];
     }
+    
+    // Update offsets for next round: current becomes previous, advance current
+    pimpl->prev_round_buffer_offset = pimpl->current_round_buffer_offset;
+    pimpl->current_round_buffer_offset += pimpl->num_msyn_per_decode;
+    if (pimpl->current_round_buffer_offset >= pimpl->buffer_capacity)
+      pimpl->current_round_buffer_offset -= pimpl->buffer_capacity;
     if (should_log) {
       log_msyn.reserve(pimpl->msyn_buffer.size());
       for (std::size_t d = 0, D = pimpl->msyn_buffer.size(); d < D; d++) {
@@ -295,9 +400,11 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome,
         printf("%s\n", s.c_str());
     }
     did_decode = true;
-    // Prepare for more data.
-    pimpl->msyn_buffer_index = 0;
+    
+    // Decrement counter for next iteration of while loop
+    pimpl->num_syndromes_buffered_but_not_decoded -= pimpl->num_msyn_per_decode;
   }
+  
   return did_decode;
 }
 
@@ -344,9 +451,15 @@ std::size_t decoder::get_num_observables() const { return O_sparse.size(); }
 
 void decoder::reset_decoder() {
   // Zero out all data that is considered "per-shot" memory.
-  pimpl->msyn_buffer_index = 0;
+  pimpl->num_syndromes_buffered_but_not_decoded = 0;
   pimpl->msyn_buffer.clear();
-  pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode);
+  pimpl->msyn_buffer.resize(pimpl->buffer_capacity);
+  
+  // Reset incremental computation state
+  pimpl->current_round = 0;
+  pimpl->current_round_buffer_offset = 0;
+  pimpl->prev_round_buffer_offset = 0;
+  
   pimpl->corrections.clear();
   pimpl->corrections.resize(O_sparse.size());
   const bool log_due_to_log_level =
diff --git a/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp b/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp
index 61a7aef5..1d34aaa5 100644
--- a/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp
+++ b/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp
@@ -32,14 +32,16 @@
 
 void save_dem_to_file(const cudaq::qec::detector_error_model &dem,
                       std::string dem_filename, uint64_t numSyndromesPerRound,
-                      uint64_t numLogical) {
+                      uint64_t numLogical, const std::string &decoder_type,
+                      int decoder_window, int sw_window_size,
+                      int sw_step_size) {
   cudaq::qec::decoding::config::multi_decoder_config multi_config;
   for (uint64_t i = 0; i < numLogical; i++) {
     // We actually send 1 additional round in this example, so add 1.
     auto numRounds = dem.num_detectors() / numSyndromesPerRound + 1;
     cudaq::qec::decoding::config::decoder_config config;
     config.id = i;
-    config.type = "nv-qldpc-decoder";
+    config.type = decoder_type;  // Use parameter instead of hardcoded
     config.block_size = dem.num_error_mechanisms();
     config.syndrome_size = dem.num_detectors();
     config.num_syndromes_per_round = numSyndromesPerRound;
@@ -48,24 +50,53 @@ void save_dem_to_file(const cudaq::qec::detector_error_model &dem,
         cudaq::qec::pcm_to_sparse_vec(dem.observables_flips_matrix);
     config.D_sparse = cudaq::qec::generate_timelike_sparse_detector_matrix(
         numSyndromesPerRound, numRounds, /*include_first_round=*/false);
-    config.decoder_custom_args =
-        cudaq::qec::decoding::config::nv_qldpc_decoder_config();
-    auto &nv_config =
-        std::get<cudaq::qec::decoding::config::nv_qldpc_decoder_config>(
-            config.decoder_custom_args);
-    nv_config.use_sparsity = true;
-    nv_config.error_rate_vec = dem.error_rates;
-    nv_config.use_osd = true;
-    nv_config.max_iterations = 50;
-    nv_config.osd_order = 60;
-    nv_config.osd_method = 3;
+    
+    if (decoder_type == "nv-qldpc-decoder") {
+      // Original NV-QLDPC configuration
+      config.decoder_custom_args =
+          cudaq::qec::decoding::config::nv_qldpc_decoder_config();
+      auto &nv_config =
+          std::get<cudaq::qec::decoding::config::nv_qldpc_decoder_config>(
+              config.decoder_custom_args);
+      nv_config.use_sparsity = true;
+      nv_config.error_rate_vec = dem.error_rates;
+      nv_config.use_osd = true;
+      nv_config.max_iterations = 50;
+      nv_config.osd_order = 60;
+      nv_config.osd_method = 3;
+      
+    } else if (decoder_type == "sliding_window") {
+      // Sliding window configuration
+      cudaq::qec::decoding::config::sliding_window_config sw_config;
+      sw_config.window_size = sw_window_size;
+      sw_config.step_size = sw_step_size;
+      sw_config.num_syndromes_per_round = numSyndromesPerRound;
+      sw_config.straddle_start_round = false;
+      sw_config.straddle_end_round = true;
+      sw_config.inner_decoder_name = "nv-qldpc-decoder";
+      sw_config.error_rate_vec = dem.error_rates;  // Required by sliding_window
+      
+      // Configure inner NV-QLDPC decoder
+      cudaq::qec::decoding::config::nv_qldpc_decoder_config nv_config;
+      nv_config.use_sparsity = true;
+      nv_config.error_rate_vec = dem.error_rates;
+      nv_config.use_osd = true;
+      nv_config.max_iterations = 50;
+      nv_config.osd_order = 60;
+      nv_config.osd_method = 3;
+      
+      sw_config.nv_qldpc_decoder_params = nv_config;
+      config.decoder_custom_args = sw_config;
+    }
+    
     multi_config.decoders.push_back(config);
   }
   std::string config_str = multi_config.to_yaml_str(200);
   std::ofstream config_file(dem_filename);
   config_file << config_str;
   config_file.close();
-  printf("Saved config to file: %s\n", dem_filename.c_str());
+  printf("Saved %s config to file: %s\n", decoder_type.c_str(),
+         dem_filename.c_str());
   return;
 }
 
@@ -82,14 +113,31 @@ void load_dem_from_file(const std::string &dem_filename,
       cudaq::qec::decoding::config::multi_decoder_config::from_yaml_str(
           dem_str);
   if (numLogical != config.decoders.size()) {
-    printf("ERROR: numLogical [%ld] !- config.decoders.size() [%ld]\n",
+    printf("ERROR: numLogical [%ld] != config.decoders.size() [%ld]\n",
            numLogical, config.decoders.size());
     exit(1);
   }
   auto decoder_config = config.decoders[0];
-  auto nv_qldpc_config =
-      std::get<cudaq::qec::decoding::config::nv_qldpc_decoder_config>(
-          decoder_config.decoder_custom_args);
+  
+  // Extract error rates based on decoder type
+  std::vector<cudaq::qec::float_t> error_rates;
+  
+  if (decoder_config.type == "nv-qldpc-decoder") {
+    auto nv_config =
+        std::get<cudaq::qec::decoding::config::nv_qldpc_decoder_config>(
+            decoder_config.decoder_custom_args);
+    error_rates = nv_config.error_rate_vec.value();
+    
+  } else if (decoder_config.type == "sliding_window") {
+    auto sw_config =
+        std::get<cudaq::qec::decoding::config::sliding_window_config>(
+            decoder_config.decoder_custom_args);
+    // Extract from top-level error_rate_vec (required for sliding_window)
+    if (!sw_config.error_rate_vec.empty()) {
+      error_rates = sw_config.error_rate_vec;
+    }
+  }
+  
   dem.detector_error_matrix = cudaq::qec::pcm_from_sparse_vec(
       decoder_config.H_sparse, decoder_config.syndrome_size,
       decoder_config.block_size);
@@ -99,10 +147,11 @@ void load_dem_from_file(const std::string &dem_filename,
                                       decoder_config.O_sparse.end(), -1);
   dem.observables_flips_matrix = cudaq::qec::pcm_from_sparse_vec(
       decoder_config.O_sparse, num_observables, decoder_config.block_size);
-  dem.error_rates = nv_qldpc_config.error_rate_vec.value();
-  printf("Loaded dem from file: %s\n", dem_filename.c_str());
+  dem.error_rates = error_rates;
+  printf("Loaded %s config from file: %s\n", decoder_config.type.c_str(),
+         dem_filename.c_str());
 
-  // Now configure the decoders
+  // Now configure the decoders (works for both types)
   cudaq::qec::decoding::config::configure_decoders(config);
 }
 
@@ -376,7 +425,9 @@ void demo_circuit_host(const cudaq::qec::code &code, int distance,
                        double p_spam, cudaq::qec::operation statePrep,
                        std::size_t numShots, std::size_t numRounds,
                        std::size_t numLogical, std::string dem_filename,
-                       bool save_dem, bool load_dem, int decoder_window) {
+                       bool save_dem, bool load_dem, int decoder_window,
+                       const std::string &decoder_type, int sw_window_size,
+                       int sw_step_size) {
   if (!code.contains_operation(statePrep))
     throw std::runtime_error(
         "sample_memory_circuit_error - requested state prep kernel not found.");
@@ -532,7 +583,9 @@ void demo_circuit_host(const cudaq::qec::code &code, int distance,
     dem.observables_flips_matrix.dump_bits();
 
     if (save_dem) {
-      save_dem_to_file(dem, dem_filename, numSyndromesPerRound, numLogical);
+      save_dem_to_file(dem, dem_filename, numSyndromesPerRound, numLogical,
+                       decoder_type, decoder_window, sw_window_size,
+                       sw_step_size);
       return;
     }
   }
@@ -603,6 +656,11 @@ void show_help() {
          "distance\n");
   printf("  --decoder_window <int>  Number of rounds to use for the decoder "
          "window. Default: distance\n");
+  printf("  --decoder_type <string> Decoder type: 'nv-qldpc-decoder' or "
+         "'sliding_window'. Default: nv-qldpc-decoder\n");
+  printf("  --sw_window_size <int>  Sliding window size (only for "
+         "sliding_window decoder). Default: decoder_window\n");
+  printf("  --sw_step_size <int>    Sliding window step size. Default: 1\n");
   printf("  --save_dem <string> Save the detector error model to a file.\n");
   printf("  --load_dem <string> Load the detector error model from a file. "
          "(Cannot be used with --save_dem)\n");
@@ -619,6 +677,11 @@ int main(int argc, char **argv) {
   bool save_dem = false;
   bool load_dem = false;
   std::string dem_filename;
+  
+  // Decoder type selection
+  std::string decoder_type = "nv-qldpc-decoder";  // Default
+  int sw_window_size = -1;  // For sliding_window, default to decoder_window
+  int sw_step_size = 1;     // For sliding_window
 
   // Parse the command line arguments
   for (int i = 1; i < argc; i++) {
@@ -644,6 +707,15 @@ int main(int argc, char **argv) {
     } else if (arg == "--decoder_window") {
       decoder_window = std::stoi(argv[i + 1]);
       i++;
+    } else if (arg == "--decoder_type") {
+      decoder_type = argv[i + 1];
+      i++;
+    } else if (arg == "--sw_window_size") {
+      sw_window_size = std::stoi(argv[i + 1]);
+      i++;
+    } else if (arg == "--sw_step_size") {
+      sw_step_size = std::stoi(argv[i + 1]);
+      i++;
     } else if (arg == "--save_dem") {
       save_dem = true;
       dem_filename = argv[i + 1];
@@ -671,6 +743,14 @@ int main(int argc, char **argv) {
     num_rounds = distance;
   if (decoder_window == -1)
     decoder_window = distance;
+  if (sw_window_size == -1)
+    sw_window_size = decoder_window;
+  
+  // Validate decoder type
+  if (decoder_type != "nv-qldpc-decoder" && decoder_type != "sliding_window") {
+    printf("Error: --decoder_type must be 'nv-qldpc-decoder' or 'sliding_window'\n");
+    return 1;
+  }
 
   // Validate that num_rounds >= distance
   if (num_rounds < distance || num_rounds % distance != 0) {
@@ -721,7 +801,8 @@ int main(int argc, char **argv) {
 
   demo_circuit_host(*code, distance, p_spam, cudaq::qec::operation::prep0,
                     num_shots, num_rounds, num_logical, dem_filename, save_dem,
-                    load_dem, decoder_window);
+                    load_dem, decoder_window, decoder_type, sw_window_size,
+                    sw_step_size);
 
   // Ensure clean shutdown
   cudaq::qec::decoding::config::finalize_decoders();

From 5e12fd88def5281dd500f1f61f0dc837437892df Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Thu, 6 Nov 2025 22:06:36 +0000
Subject: [PATCH 02/11] Fix num_syndromes_per_round inference + formatting

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 libs/qec/lib/decoder.cpp                      | 131 ++++++++++--------
 .../realtime/app_examples/surface_code-1.cpp  |  35 ++---
 2 files changed, 92 insertions(+), 74 deletions(-)

diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp
index f6093d9d..90bf0b7e 100644
--- a/libs/qec/lib/decoder.cpp
+++ b/libs/qec/lib/decoder.cpp
@@ -23,7 +23,8 @@ INSTANTIATE_REGISTRY(cudaq::qec::decoder, const cudaqx::tensor<uint8_t> &,
 namespace cudaq::qec {
 
 struct decoder::rt_impl {
-  /// The number of syndromes per round (enables incremental detector computation)
+  /// The number of syndromes per round (enables incremental detector
+  /// computation)
   uint32_t num_syndromes_per_round = 0;
 
   /// The number of measurement syndromes to be decoded per decode call
@@ -31,25 +32,27 @@ struct decoder::rt_impl {
   uint32_t num_msyn_per_decode = 0;
 
   /// Counter of total syndromes buffered but not yet processed.
-  /// Used to detect complete rounds (when this is a multiple of num_msyn_per_decode).
-  /// Gets decremented after each round is decoded. Not a direct buffer index.
+  /// Used to detect complete rounds (when this is a multiple of
+  /// num_msyn_per_decode). Gets decremented after each round is decoded. Not a
+  /// direct buffer index.
   uint32_t num_syndromes_buffered_but_not_decoded = 0;
 
-  /// The buffer of measurement syndromes received from the client. 
+  /// The buffer of measurement syndromes received from the client.
   /// For incremental mode: size is calculated from max D_sparse column + 1
   /// This allows buffering multiple rounds while still decoding incrementally
   std::vector<uint8_t> msyn_buffer;
-  
+
   /// Total buffer capacity (max column index in D_sparse + 1)
   uint32_t buffer_capacity = 0;
 
   /// Track which round we're on (0 = reference round)
   uint32_t current_round = 0;
-  
+
   /// Circular buffer write position for the current round
-  // Values are 0, num_msyn_per_decode * 2, num_msyn_per_decode * 3, etc. then wrap around to 0.
+  // Values are 0, num_msyn_per_decode, num_msyn_per_decode * 2, etc., then
+  // wrap around at buffer_capacity.
   uint32_t current_round_buffer_offset = 0;
-  
+
   /// Circular buffer position of the previous round (for incremental XOR)
   uint32_t prev_round_buffer_offset = 0;
 
@@ -193,31 +196,33 @@ uint32_t decoder::get_decoder_id() const { return pimpl->decoder_id; }
 
 void decoder::set_D_sparse(const std::vector<std::vector<uint32_t>> &D_sparse) {
   this->D_sparse = D_sparse;
-  
+
   // Infer num_syndromes_per_round from D_sparse timelike structure
-  // For timelike detectors, consecutive detectors XOR syndromes from consecutive rounds
-  // e.g., detector[0] = [0, 24], detector[1] = [1, 25], so num_syndromes_per_round = 24
-  if (D_sparse.size() >= 2 && D_sparse[0].size() >= 2 && D_sparse[1].size() >= 2) {
-    pimpl->num_syndromes_per_round = D_sparse[1][0] - D_sparse[0][0];
+  // For timelike detectors, each detector XORs two syndromes from consecutive
+  // rounds, e.g., detector[0] = [0, 24] means XOR syndrome 0 (round 0) with
+  // syndrome 24 (round 1), so num_syndromes_per_round = 24 - 0 = 24.
+  if (D_sparse.size() >= 1 && D_sparse[0].size() >= 2) {
+    pimpl->num_syndromes_per_round = D_sparse[0][1] - D_sparse[0][0];
   } else {
     // Fallback: assume 1:1 mapping
     pimpl->num_syndromes_per_round = 1;
   }
-  
+
   // Calculate minimum buffer capacity from max column in D_sparse
   uint32_t min_capacity = calculate_num_msyn_per_decode(D_sparse);
-  
+
   // Enable incremental mode: process one round at a time
   pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round;
-  
-  // Add one extra round to buffer capacity to guarantee no wraparound within operations
-  // This eliminates all wraparound checks in hot loops (write and detector computation)
+
+  // Add one extra round to buffer capacity to guarantee no wraparound within
+  // operations. This eliminates all wraparound checks in hot loops (write and
+  // detector computation).
   pimpl->buffer_capacity = min_capacity + pimpl->num_syndromes_per_round;
-  
+
   // Allocate buffer to hold all syndromes plus extra round
   pimpl->msyn_buffer.clear();
   pimpl->msyn_buffer.resize(pimpl->buffer_capacity);
-  
+
   pimpl->num_syndromes_buffered_but_not_decoded = 0;
   pimpl->current_round = 0;
   pimpl->current_round_buffer_offset = 0;
@@ -226,31 +231,33 @@ void decoder::set_D_sparse(const std::vector<std::vector<uint32_t>> &D_sparse) {
 
 void decoder::set_D_sparse(const std::vector<int64_t> &D_sparse_vec_in) {
   set_sparse_from_vec(D_sparse_vec_in, this->D_sparse);
-  
+
   // Infer num_syndromes_per_round from D_sparse timelike structure
-  // For timelike detectors, consecutive detectors XOR syndromes from consecutive rounds
-  // e.g., detector[0] = [0, 24], detector[1] = [1, 25], so num_syndromes_per_round = 24
-  if (D_sparse.size() >= 2 && D_sparse[0].size() >= 2 && D_sparse[1].size() >= 2) {
-    pimpl->num_syndromes_per_round = D_sparse[1][0] - D_sparse[0][0];
+  // For timelike detectors, each detector XORs two syndromes from consecutive
+  // rounds, e.g., detector[0] = [0, 24] means XOR syndrome 0 (round 0) with
+  // syndrome 24 (round 1), so num_syndromes_per_round = 24 - 0 = 24.
+  if (D_sparse.size() >= 1 && D_sparse[0].size() >= 2) {
+    pimpl->num_syndromes_per_round = D_sparse[0][1] - D_sparse[0][0];
   } else {
     // Fallback: assume 1:1 mapping
     pimpl->num_syndromes_per_round = 1;
   }
-  
+
   // Calculate minimum buffer capacity from max column in D_sparse
   uint32_t min_capacity = calculate_num_msyn_per_decode(D_sparse);
-  
+
   // Enable incremental mode: process one round at a time
   pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round;
-  
-  // Add one extra round to buffer capacity to guarantee no wraparound within operations
-  // This eliminates all wraparound checks in hot loops (write and detector computation)
+
+  // Add one extra round to buffer capacity to guarantee no wraparound within
+  // operations. This eliminates all wraparound checks in hot loops (write and
+  // detector computation).
   pimpl->buffer_capacity = min_capacity + pimpl->num_syndromes_per_round;
-  
+
   // Allocate buffer to hold all syndromes plus extra round
   pimpl->msyn_buffer.clear();
   pimpl->msyn_buffer.resize(pimpl->buffer_capacity);
-  
+
   pimpl->num_syndromes_buffered_but_not_decoded = 0;
   pimpl->current_round = 0;
   pimpl->current_round_buffer_offset = 0;
@@ -259,33 +266,41 @@ void decoder::set_D_sparse(const std::vector<int64_t> &D_sparse_vec_in) {
 
 bool decoder::enqueue_syndrome(const uint8_t *syndrome,
                                std::size_t syndrome_length) {
-  // position_in_round represents how many syndromes of the current round have already been buffered but not yet decoded
-  // Values range from 0 to num_msyn_per_decode - 1.
-  uint32_t position_in_round = pimpl->num_syndromes_buffered_but_not_decoded % pimpl->num_msyn_per_decode;
-  
+  // position_in_round represents how many syndromes of the current round have
+  // already been buffered but not yet decoded. Values range from 0 to
+  // num_msyn_per_decode - 1.
+  uint32_t position_in_round = pimpl->num_syndromes_buffered_but_not_decoded %
+                               pimpl->num_msyn_per_decode;
+
   // Check if this write would overwrite the previous round
   // We need to preserve prev_round_buffer for XOR computation, so the maximum
-  // safe write from the start of the current round is buffer_capacity minus one round
-  uint32_t max_safe_from_round_start = pimpl->buffer_capacity - pimpl->num_syndromes_per_round;
+  // safe write from the start of the current round is buffer_capacity minus one
+  // round
+  uint32_t max_safe_from_round_start =
+      pimpl->buffer_capacity - pimpl->num_syndromes_per_round;
   if (position_in_round + syndrome_length > max_safe_from_round_start) {
-    // CUDAQ_WARN("Syndrome data too large - would overwrite previous round. Data will be ignored.");
-    printf("Syndrome data too large - would overwrite previous round. Data will be ignored.\n");
+    // CUDAQ_WARN("Syndrome data too large - would overwrite previous round.
+    // Data will be ignored.");
+    printf("Syndrome data too large - would overwrite previous round. Data "
+           "will be ignored.\n");
     return false;
   }
   bool did_decode = false;
   // Buffer the incoming syndromes
-  // No wraparound check needed: buffer is sized to guarantee operations never wrap mid-execution
+  // No wraparound check needed: buffer is sized to guarantee operations never
+  // wrap mid-execution
   uint32_t write_start = pimpl->current_round_buffer_offset + position_in_round;
   for (std::size_t i = 0; i < syndrome_length; i++) {
     pimpl->msyn_buffer[write_start + i] = syndrome[i];
   }
   pimpl->num_syndromes_buffered_but_not_decoded += syndrome_length;
-  
+
   // Process all complete rounds that are now available
-  while ((pimpl->num_syndromes_buffered_but_not_decoded % pimpl->num_msyn_per_decode) == 0 && 
+  while ((pimpl->num_syndromes_buffered_but_not_decoded %
+          pimpl->num_msyn_per_decode) == 0 &&
          pimpl->num_syndromes_buffered_but_not_decoded > 0) {
     pimpl->current_round++;
-    
+
     // First round (round 1): store as reference, don't decode yet
     if (pimpl->current_round == 1) {
       // Previous round stays at current position for next round's XOR
@@ -294,10 +309,11 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome,
       pimpl->current_round_buffer_offset += pimpl->num_msyn_per_decode;
       if (pimpl->current_round_buffer_offset >= pimpl->buffer_capacity)
         pimpl->current_round_buffer_offset -= pimpl->buffer_capacity;
-      pimpl->num_syndromes_buffered_but_not_decoded -= pimpl->num_msyn_per_decode;  // Decrement for next iteration
-      continue;  // Skip to next round
+      pimpl->num_syndromes_buffered_but_not_decoded -=
+          pimpl->num_msyn_per_decode; // Decrement for next iteration
+      continue;                       // Skip to next round
     }
-    
+
     // These are just for logging. They are initialized in such a way to avoid
     // dynamic memory allocation if logging is disabled.
     std::vector<uint32_t> log_msyn;
@@ -319,15 +335,16 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome,
       log_observable_corrections.resize(O_sparse.size());
     }
 
-    // Compute detectors incrementally by XORing current round with previous round
-    // Using circular buffer offsets - no D_sparse access needed
-    // No wraparound checks needed: buffer is sized to guarantee operations never wrap mid-execution
+    // Compute detectors incrementally by XORing current round with previous
+    // round Using circular buffer offsets - no D_sparse access needed No
+    // wraparound checks needed: buffer is sized to guarantee operations never
+    // wrap mid-execution
     for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) {
-      pimpl->persistent_detector_buffer[i] = 
-          pimpl->msyn_buffer[pimpl->prev_round_buffer_offset + i] ^ 
+      pimpl->persistent_detector_buffer[i] =
+          pimpl->msyn_buffer[pimpl->prev_round_buffer_offset + i] ^
           pimpl->msyn_buffer[pimpl->current_round_buffer_offset + i];
     }
-    
+
     // Update offsets for next round: current becomes previous, advance current
     pimpl->prev_round_buffer_offset = pimpl->current_round_buffer_offset;
     pimpl->current_round_buffer_offset += pimpl->num_msyn_per_decode;
@@ -400,11 +417,11 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome,
         printf("%s\n", s.c_str());
     }
     did_decode = true;
-    
+
     // Decrement counter for next iteration of while loop
     pimpl->num_syndromes_buffered_but_not_decoded -= pimpl->num_msyn_per_decode;
   }
-  
+
   return did_decode;
 }
 
@@ -454,12 +471,12 @@ void decoder::reset_decoder() {
   pimpl->num_syndromes_buffered_but_not_decoded = 0;
   pimpl->msyn_buffer.clear();
   pimpl->msyn_buffer.resize(pimpl->buffer_capacity);
-  
+
   // Reset incremental computation state
   pimpl->current_round = 0;
   pimpl->current_round_buffer_offset = 0;
   pimpl->prev_round_buffer_offset = 0;
-  
+
   pimpl->corrections.clear();
   pimpl->corrections.resize(O_sparse.size());
   const bool log_due_to_log_level =
diff --git a/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp b/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp
index 1d34aaa5..5a8e15a5 100644
--- a/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp
+++ b/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp
@@ -41,7 +41,7 @@ void save_dem_to_file(const cudaq::qec::detector_error_model &dem,
     auto numRounds = dem.num_detectors() / numSyndromesPerRound + 1;
     cudaq::qec::decoding::config::decoder_config config;
     config.id = i;
-    config.type = decoder_type;  // Use parameter instead of hardcoded
+    config.type = decoder_type; // Use parameter instead of hardcoded
     config.block_size = dem.num_error_mechanisms();
     config.syndrome_size = dem.num_detectors();
     config.num_syndromes_per_round = numSyndromesPerRound;
@@ -50,7 +50,7 @@ void save_dem_to_file(const cudaq::qec::detector_error_model &dem,
         cudaq::qec::pcm_to_sparse_vec(dem.observables_flips_matrix);
     config.D_sparse = cudaq::qec::generate_timelike_sparse_detector_matrix(
         numSyndromesPerRound, numRounds, /*include_first_round=*/false);
-    
+
     if (decoder_type == "nv-qldpc-decoder") {
       // Original NV-QLDPC configuration
       config.decoder_custom_args =
@@ -64,7 +64,7 @@ void save_dem_to_file(const cudaq::qec::detector_error_model &dem,
       nv_config.max_iterations = 50;
       nv_config.osd_order = 60;
       nv_config.osd_method = 3;
-      
+
     } else if (decoder_type == "sliding_window") {
       // Sliding window configuration
       cudaq::qec::decoding::config::sliding_window_config sw_config;
@@ -74,8 +74,8 @@ void save_dem_to_file(const cudaq::qec::detector_error_model &dem,
       sw_config.straddle_start_round = false;
       sw_config.straddle_end_round = true;
       sw_config.inner_decoder_name = "nv-qldpc-decoder";
-      sw_config.error_rate_vec = dem.error_rates;  // Required by sliding_window
-      
+      sw_config.error_rate_vec = dem.error_rates; // Required by sliding_window
+
       // Configure inner NV-QLDPC decoder
       cudaq::qec::decoding::config::nv_qldpc_decoder_config nv_config;
       nv_config.use_sparsity = true;
@@ -84,11 +84,11 @@ void save_dem_to_file(const cudaq::qec::detector_error_model &dem,
       nv_config.max_iterations = 50;
       nv_config.osd_order = 60;
       nv_config.osd_method = 3;
-      
+
       sw_config.nv_qldpc_decoder_params = nv_config;
       config.decoder_custom_args = sw_config;
     }
-    
+
     multi_config.decoders.push_back(config);
   }
   std::string config_str = multi_config.to_yaml_str(200);
@@ -118,16 +118,16 @@ void load_dem_from_file(const std::string &dem_filename,
     exit(1);
   }
   auto decoder_config = config.decoders[0];
-  
+
   // Extract error rates based on decoder type
   std::vector<cudaq::qec::float_t> error_rates;
-  
+
   if (decoder_config.type == "nv-qldpc-decoder") {
     auto nv_config =
         std::get<cudaq::qec::decoding::config::nv_qldpc_decoder_config>(
             decoder_config.decoder_custom_args);
     error_rates = nv_config.error_rate_vec.value();
-    
+
   } else if (decoder_config.type == "sliding_window") {
     auto sw_config =
         std::get<cudaq::qec::decoding::config::sliding_window_config>(
@@ -137,7 +137,7 @@ void load_dem_from_file(const std::string &dem_filename,
       error_rates = sw_config.error_rate_vec;
     }
   }
-  
+
   dem.detector_error_matrix = cudaq::qec::pcm_from_sparse_vec(
       decoder_config.H_sparse, decoder_config.syndrome_size,
       decoder_config.block_size);
@@ -677,11 +677,11 @@ int main(int argc, char **argv) {
   bool save_dem = false;
   bool load_dem = false;
   std::string dem_filename;
-  
+
   // Decoder type selection
-  std::string decoder_type = "nv-qldpc-decoder";  // Default
-  int sw_window_size = -1;  // For sliding_window, default to decoder_window
-  int sw_step_size = 1;     // For sliding_window
+  std::string decoder_type = "nv-qldpc-decoder"; // Default
+  int sw_window_size = -1; // For sliding_window, default to decoder_window
+  int sw_step_size = 1;    // For sliding_window
 
   // Parse the command line arguments
   for (int i = 1; i < argc; i++) {
@@ -745,10 +745,11 @@ int main(int argc, char **argv) {
     decoder_window = distance;
   if (sw_window_size == -1)
     sw_window_size = decoder_window;
-  
+
   // Validate decoder type
   if (decoder_type != "nv-qldpc-decoder" && decoder_type != "sliding_window") {
-    printf("Error: --decoder_type must be 'nv-qldpc-decoder' or 'sliding_window'\n");
+    printf("Error: --decoder_type must be 'nv-qldpc-decoder' or "
+           "'sliding_window'\n");
     return 1;
   }
 

From cc08dea7a78595bdf7b1531fddf7af544ff27803 Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Fri, 7 Nov 2025 21:32:30 +0000
Subject: [PATCH 03/11] Add support for first round detectors

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 libs/qec/lib/decoder.cpp | 89 +++++++++++++++++++++++++++-------------
 1 file changed, 61 insertions(+), 28 deletions(-)

diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp
index 90bf0b7e..9358d9e4 100644
--- a/libs/qec/lib/decoder.cpp
+++ b/libs/qec/lib/decoder.cpp
@@ -72,6 +72,9 @@ struct decoder::rt_impl {
 
   /// The id of the decoder (for instrumentation)
   uint32_t decoder_id = 0;
+
+  /// Whether D_sparse has first-round detectors (determined in set_D_sparse)
+  bool has_first_round_detectors = false;
 };
 
 void decoder::rt_impl_deleter::operator()(rt_impl *p) const { delete p; }
@@ -197,15 +200,24 @@ uint32_t decoder::get_decoder_id() const { return pimpl->decoder_id; }
 void decoder::set_D_sparse(const std::vector<std::vector<uint32_t>> &D_sparse) {
   this->D_sparse = D_sparse;
 
-  // Infer num_syndromes_per_round from D_sparse timelike structure
-  // For timelike detectors, each detector XORs two syndromes from consecutive
-  // rounds e.g., detector[0] = [0, 24] means XOR syndrome 0 (round 0) with
-  // syndrome 24 (round 1) So num_syndromes_per_round = 24 - 0 = 24
-  if (D_sparse.size() >= 1 && D_sparse[0].size() >= 2) {
-    pimpl->num_syndromes_per_round = D_sparse[0][1] - D_sparse[0][0];
-  } else {
-    // Fallback: assume 1:1 mapping
-    pimpl->num_syndromes_per_round = 1;
+  // Analyze D_sparse structure (assumes well-formed D_sparse from generator):
+  // 1. First-round detectors (if any) are always at the beginning
+  // 2. All timelike detectors have the same stride (num_syndromes_per_round)
+  
+  // Check if first row is a first-round detector (single syndrome index)
+  pimpl->has_first_round_detectors = (D_sparse.size() > 0 && D_sparse[0].size() == 1);
+  
+  // Find num_syndromes_per_round from first timelike detector
+  // (skip first-round detectors if present, they're all at the beginning)
+  pimpl->num_syndromes_per_round = 1; // Default fallback
+  for (const auto& detector_syndrome_indices : D_sparse) {
+    if (detector_syndrome_indices.size() >= 2) {
+      // First timelike detector found: XORs syndromes from consecutive rounds
+      // e.g., [0, 8] means XOR syndrome 0 (round 1) with syndrome 8 (round 2)
+      // so num_syndromes_per_round = 8
+      pimpl->num_syndromes_per_round = detector_syndrome_indices[1] - detector_syndrome_indices[0];
+      break; // Found it, no need to continue
+    }
   }
 
   // Calculate minimum buffer capacity from max column in D_sparse
@@ -232,15 +244,24 @@ void decoder::set_D_sparse(const std::vector<std::vector<uint32_t>> &D_sparse) {
 void decoder::set_D_sparse(const std::vector<int64_t> &D_sparse_vec_in) {
   set_sparse_from_vec(D_sparse_vec_in, this->D_sparse);
 
-  // Infer num_syndromes_per_round from D_sparse timelike structure
-  // For timelike detectors, each detector XORs two syndromes from consecutive
-  // rounds e.g., detector[0] = [0, 24] means XOR syndrome 0 (round 0) with
-  // syndrome 24 (round 1) So num_syndromes_per_round = 24 - 0 = 24
-  if (D_sparse.size() >= 1 && D_sparse[0].size() >= 2) {
-    pimpl->num_syndromes_per_round = D_sparse[0][1] - D_sparse[0][0];
-  } else {
-    // Fallback: assume 1:1 mapping
-    pimpl->num_syndromes_per_round = 1;
+  // Analyze D_sparse structure (assumes well-formed D_sparse from generator):
+  // 1. First-round detectors (if any) are always at the beginning
+  // 2. All timelike detectors have the same stride (num_syndromes_per_round)
+  
+  // Check if first row is a first-round detector (single syndrome index)
+  pimpl->has_first_round_detectors = (D_sparse.size() > 0 && D_sparse[0].size() == 1);
+  
+  // Find num_syndromes_per_round from first timelike detector
+  // (skip first-round detectors if present, they're all at the beginning)
+  pimpl->num_syndromes_per_round = 1; // Default fallback
+  for (const auto& detector_syndrome_indices : D_sparse) {
+    if (detector_syndrome_indices.size() >= 2) {
+      // First timelike detector found: XORs syndromes from consecutive rounds
+      // e.g., [0, 8] means XOR syndrome 0 (round 1) with syndrome 8 (round 2)
+      // so num_syndromes_per_round = 8
+      pimpl->num_syndromes_per_round = detector_syndrome_indices[1] - detector_syndrome_indices[0];
+      break; // Found it, no need to continue
+    }
   }
 
   // Calculate minimum buffer capacity from max column in D_sparse
@@ -301,8 +322,10 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome,
          pimpl->num_syndromes_buffered_but_not_decoded > 0) {
     pimpl->current_round++;
 
-    // First round (round 1): store as reference, don't decode yet
-    if (pimpl->current_round == 1) {
+    // First round (round 1): skip decoding (store as reference)
+    // UNLESS there are first-round detectors that need immediate decoding
+    // (first-round detector check is done once in set_D_sparse)
+    if (pimpl->current_round == 1 && !pimpl->has_first_round_detectors) {
       // Previous round stays at current position for next round's XOR
       pimpl->prev_round_buffer_offset = pimpl->current_round_buffer_offset;
       // Advance to next round position in circular buffer
@@ -335,14 +358,24 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome,
       log_observable_corrections.resize(O_sparse.size());
     }
 
-    // Compute detectors incrementally by XORing current round with previous
-    // round Using circular buffer offsets - no D_sparse access needed No
-    // wraparound checks needed: buffer is sized to guarantee operations never
-    // wrap mid-execution
-    for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) {
-      pimpl->persistent_detector_buffer[i] =
-          pimpl->msyn_buffer[pimpl->prev_round_buffer_offset + i] ^
-          pimpl->msyn_buffer[pimpl->current_round_buffer_offset + i];
+    // Compute detectors based on whether first-round detectors exist
+    if (pimpl->has_first_round_detectors) {
+      // When first-round detectors exist, must use D_sparse for all detectors
+      // because first-round detectors reference only one syndrome (not two)
+      for (std::size_t i = 0; i < this->D_sparse.size(); i++) {
+        pimpl->persistent_detector_buffer[i] = 0;
+        for (auto col : this->D_sparse[i])
+          pimpl->persistent_detector_buffer[i] ^= pimpl->msyn_buffer[col];
+      }
+    } else {
+      // Pure timelike detectors: use incremental XOR (current ⊕ previous round)
+      // Using circular buffer offsets - no D_sparse access needed
+      // No wraparound checks needed: buffer is sized to guarantee operations never wrap
+      for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) {
+        pimpl->persistent_detector_buffer[i] =
+            pimpl->msyn_buffer[pimpl->prev_round_buffer_offset + i] ^
+            pimpl->msyn_buffer[pimpl->current_round_buffer_offset + i];
+      }
     }
 
     // Update offsets for next round: current becomes previous, advance current

From 7ccde1d236216f34fb7ff615f413dd82f3db89a9 Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Fri, 7 Nov 2025 21:39:54 +0000
Subject: [PATCH 04/11] clang_format and minor function rename

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 libs/qec/lib/decoder.cpp | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp
index 9358d9e4..6d47bff6 100644
--- a/libs/qec/lib/decoder.cpp
+++ b/libs/qec/lib/decoder.cpp
@@ -148,7 +148,7 @@ decoder::get(const std::string &name, const cudaqx::tensor<uint8_t> &H,
   return iter->second(H, param_map);
 }
 
-static uint32_t calculate_num_msyn_per_decode(
+static uint32_t calculate_syndrome_buffer_capacity(
     const std::vector<std::vector<uint32_t>> &D_sparse) {
   uint32_t max_col = 0;
   for (const auto &row : D_sparse)
@@ -203,25 +203,27 @@ void decoder::set_D_sparse(const std::vector<std::vector<uint32_t>> &D_sparse) {
   // Analyze D_sparse structure (assumes well-formed D_sparse from generator):
   // 1. First-round detectors (if any) are always at the beginning
   // 2. All timelike detectors have the same stride (num_syndromes_per_round)
-  
+
   // Check if first row is a first-round detector (single syndrome index)
-  pimpl->has_first_round_detectors = (D_sparse.size() > 0 && D_sparse[0].size() == 1);
-  
+  pimpl->has_first_round_detectors =
+      (D_sparse.size() > 0 && D_sparse[0].size() == 1);
+
   // Find num_syndromes_per_round from first timelike detector
   // (skip first-round detectors if present, they're all at the beginning)
   pimpl->num_syndromes_per_round = 1; // Default fallback
-  for (const auto& detector_syndrome_indices : D_sparse) {
+  for (const auto &detector_syndrome_indices : D_sparse) {
     if (detector_syndrome_indices.size() >= 2) {
       // First timelike detector found: XORs syndromes from consecutive rounds
       // e.g., [0, 8] means XOR syndrome 0 (round 1) with syndrome 8 (round 2)
       // so num_syndromes_per_round = 8
-      pimpl->num_syndromes_per_round = detector_syndrome_indices[1] - detector_syndrome_indices[0];
+      pimpl->num_syndromes_per_round =
+          detector_syndrome_indices[1] - detector_syndrome_indices[0];
       break; // Found it, no need to continue
     }
   }
 
   // Calculate minimum buffer capacity from max column in D_sparse
-  uint32_t min_capacity = calculate_num_msyn_per_decode(D_sparse);
+  uint32_t min_capacity = calculate_syndrome_buffer_capacity(D_sparse);
 
   // Enable incremental mode: process one round at a time
   pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round;
@@ -247,25 +249,27 @@ void decoder::set_D_sparse(const std::vector<int64_t> &D_sparse_vec_in) {
   // Analyze D_sparse structure (assumes well-formed D_sparse from generator):
   // 1. First-round detectors (if any) are always at the beginning
   // 2. All timelike detectors have the same stride (num_syndromes_per_round)
-  
+
   // Check if first row is a first-round detector (single syndrome index)
-  pimpl->has_first_round_detectors = (D_sparse.size() > 0 && D_sparse[0].size() == 1);
-  
+  pimpl->has_first_round_detectors =
+      (D_sparse.size() > 0 && D_sparse[0].size() == 1);
+
   // Find num_syndromes_per_round from first timelike detector
   // (skip first-round detectors if present, they're all at the beginning)
   pimpl->num_syndromes_per_round = 1; // Default fallback
-  for (const auto& detector_syndrome_indices : D_sparse) {
+  for (const auto &detector_syndrome_indices : D_sparse) {
     if (detector_syndrome_indices.size() >= 2) {
       // First timelike detector found: XORs syndromes from consecutive rounds
       // e.g., [0, 8] means XOR syndrome 0 (round 1) with syndrome 8 (round 2)
       // so num_syndromes_per_round = 8
-      pimpl->num_syndromes_per_round = detector_syndrome_indices[1] - detector_syndrome_indices[0];
+      pimpl->num_syndromes_per_round =
+          detector_syndrome_indices[1] - detector_syndrome_indices[0];
       break; // Found it, no need to continue
     }
   }
 
   // Calculate minimum buffer capacity from max column in D_sparse
-  uint32_t min_capacity = calculate_num_msyn_per_decode(D_sparse);
+  uint32_t min_capacity = calculate_syndrome_buffer_capacity(D_sparse);
 
   // Enable incremental mode: process one round at a time
   pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round;
@@ -370,7 +374,8 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome,
     } else {
       // Pure timelike detectors: use incremental XOR (current ⊕ previous round)
       // Using circular buffer offsets - no D_sparse access needed
-      // No wraparound checks needed: buffer is sized to guarantee operations never wrap
+      // No wraparound checks needed: buffer is sized to guarantee operations
+      // never wrap
       for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) {
         pimpl->persistent_detector_buffer[i] =
             pimpl->msyn_buffer[pimpl->prev_round_buffer_offset + i] ^

From 8d8f02c8631e39e138bf85cdf550d179a008dfed Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Mon, 10 Nov 2025 18:56:55 +0000
Subject: [PATCH 05/11] Revert decoder.cpp back to original before reworking
 the approach

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 libs/qec/lib/decoder.cpp | 224 +++++----------------------------------
 1 file changed, 28 insertions(+), 196 deletions(-)

diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp
index 6d47bff6..73a38707 100644
--- a/libs/qec/lib/decoder.cpp
+++ b/libs/qec/lib/decoder.cpp
@@ -23,39 +23,17 @@ INSTANTIATE_REGISTRY(cudaq::qec::decoder, const cudaqx::tensor<uint8_t> &,
 namespace cudaq::qec {
 
 struct decoder::rt_impl {
-  /// The number of syndromes per round (enables incremental detector
-  /// computation)
-  uint32_t num_syndromes_per_round = 0;
-
-  /// The number of measurement syndromes to be decoded per decode call
-  /// (for incremental mode: one round; for batch mode: full D_sparse columns)
+  /// The number of measurement syndromes to be decoded per decode call (i.e.
+  /// the number of columns in the D_sparse matrix)
   uint32_t num_msyn_per_decode = 0;
 
-  /// Counter of total syndromes buffered but not yet processed.
-  /// Used to detect complete rounds (when this is a multiple of
-  /// num_msyn_per_decode). Gets decremented after each round is decoded. Not a
-  /// direct buffer index.
-  uint32_t num_syndromes_buffered_but_not_decoded = 0;
+  /// The index of the next syndrome to be written in the msyn_buffer
+  uint32_t msyn_buffer_index = 0;
 
-  /// The buffer of measurement syndromes received from the client.
-  /// For incremental mode: size is calculated from max D_sparse column + 1
-  /// This allows buffering multiple rounds while still decoding incrementally
+  /// The buffer of measurement syndromes received from the client. Length is
+  /// num_msyn_per_decode.
   std::vector<uint8_t> msyn_buffer;
 
-  /// Total buffer capacity (max column index in D_sparse + 1)
-  uint32_t buffer_capacity = 0;
-
-  /// Track which round we're on (0 = reference round)
-  uint32_t current_round = 0;
-
-  /// Circular buffer write position for the current round
-  // Values are 0, num_msyn_per_decode * 2, num_msyn_per_decode * 3, etc. then
-  // wrap around to 0.
-  uint32_t current_round_buffer_offset = 0;
-
-  /// Circular buffer position of the previous round (for incremental XOR)
-  uint32_t prev_round_buffer_offset = 0;
-
   /// The current observable corrections. The length of this vector is the
   /// number of rows in the O_sparse matrix.
   std::vector<uint8_t> corrections;
@@ -72,9 +50,6 @@ struct decoder::rt_impl {
 
   /// The id of the decoder (for instrumentation)
   uint32_t decoder_id = 0;
-
-  /// Whether D_sparse has first-round detectors (determined in set_D_sparse)
-  bool has_first_round_detectors = false;
 };
 
 void decoder::rt_impl_deleter::operator()(rt_impl *p) const { delete p; }
@@ -148,7 +123,7 @@ decoder::get(const std::string &name, const cudaqx::tensor<uint8_t> &H,
   return iter->second(H, param_map);
 }
 
-static uint32_t calculate_syndrome_buffer_capacity(
+static uint32_t calculate_num_msyn_per_decode(
     const std::vector<std::vector<uint32_t>> &D_sparse) {
   uint32_t max_col = 0;
   for (const auto &row : D_sparse)
@@ -199,148 +174,33 @@ uint32_t decoder::get_decoder_id() const { return pimpl->decoder_id; }
 
 void decoder::set_D_sparse(const std::vector<std::vector<uint32_t>> &D_sparse) {
   this->D_sparse = D_sparse;
-
-  // Analyze D_sparse structure (assumes well-formed D_sparse from generator):
-  // 1. First-round detectors (if any) are always at the beginning
-  // 2. All timelike detectors have the same stride (num_syndromes_per_round)
-
-  // Check if first row is a first-round detector (single syndrome index)
-  pimpl->has_first_round_detectors =
-      (D_sparse.size() > 0 && D_sparse[0].size() == 1);
-
-  // Find num_syndromes_per_round from first timelike detector
-  // (skip first-round detectors if present, they're all at the beginning)
-  pimpl->num_syndromes_per_round = 1; // Default fallback
-  for (const auto &detector_syndrome_indices : D_sparse) {
-    if (detector_syndrome_indices.size() >= 2) {
-      // First timelike detector found: XORs syndromes from consecutive rounds
-      // e.g., [0, 8] means XOR syndrome 0 (round 1) with syndrome 8 (round 2)
-      // so num_syndromes_per_round = 8
-      pimpl->num_syndromes_per_round =
-          detector_syndrome_indices[1] - detector_syndrome_indices[0];
-      break; // Found it, no need to continue
-    }
-  }
-
-  // Calculate minimum buffer capacity from max column in D_sparse
-  uint32_t min_capacity = calculate_syndrome_buffer_capacity(D_sparse);
-
-  // Enable incremental mode: process one round at a time
-  pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round;
-
-  // Add one extra round to buffer capacity to guarantee no wraparound within
-  // operations This eliminates all wraparound checks in hot loops (write and
-  // detector computation)
-  pimpl->buffer_capacity = min_capacity + pimpl->num_syndromes_per_round;
-
-  // Allocate buffer to hold all syndromes plus extra round
+  pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(D_sparse);
   pimpl->msyn_buffer.clear();
-  pimpl->msyn_buffer.resize(pimpl->buffer_capacity);
-
-  pimpl->num_syndromes_buffered_but_not_decoded = 0;
-  pimpl->current_round = 0;
-  pimpl->current_round_buffer_offset = 0;
-  pimpl->prev_round_buffer_offset = 0;
+  pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode);
+  pimpl->msyn_buffer_index = 0;
 }
 
 void decoder::set_D_sparse(const std::vector<int64_t> &D_sparse_vec_in) {
   set_sparse_from_vec(D_sparse_vec_in, this->D_sparse);
-
-  // Analyze D_sparse structure (assumes well-formed D_sparse from generator):
-  // 1. First-round detectors (if any) are always at the beginning
-  // 2. All timelike detectors have the same stride (num_syndromes_per_round)
-
-  // Check if first row is a first-round detector (single syndrome index)
-  pimpl->has_first_round_detectors =
-      (D_sparse.size() > 0 && D_sparse[0].size() == 1);
-
-  // Find num_syndromes_per_round from first timelike detector
-  // (skip first-round detectors if present, they're all at the beginning)
-  pimpl->num_syndromes_per_round = 1; // Default fallback
-  for (const auto &detector_syndrome_indices : D_sparse) {
-    if (detector_syndrome_indices.size() >= 2) {
-      // First timelike detector found: XORs syndromes from consecutive rounds
-      // e.g., [0, 8] means XOR syndrome 0 (round 1) with syndrome 8 (round 2)
-      // so num_syndromes_per_round = 8
-      pimpl->num_syndromes_per_round =
-          detector_syndrome_indices[1] - detector_syndrome_indices[0];
-      break; // Found it, no need to continue
-    }
-  }
-
-  // Calculate minimum buffer capacity from max column in D_sparse
-  uint32_t min_capacity = calculate_syndrome_buffer_capacity(D_sparse);
-
-  // Enable incremental mode: process one round at a time
-  pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round;
-
-  // Add one extra round to buffer capacity to guarantee no wraparound within
-  // operations This eliminates all wraparound checks in hot loops (write and
-  // detector computation)
-  pimpl->buffer_capacity = min_capacity + pimpl->num_syndromes_per_round;
-
-  // Allocate buffer to hold all syndromes plus extra round
+  pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(D_sparse);
   pimpl->msyn_buffer.clear();
-  pimpl->msyn_buffer.resize(pimpl->buffer_capacity);
-
-  pimpl->num_syndromes_buffered_but_not_decoded = 0;
-  pimpl->current_round = 0;
-  pimpl->current_round_buffer_offset = 0;
-  pimpl->prev_round_buffer_offset = 0;
+  pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode);
+  pimpl->msyn_buffer_index = 0;
 }
 
 bool decoder::enqueue_syndrome(const uint8_t *syndrome,
                                std::size_t syndrome_length) {
-  // position_in_round represents how many syndromes of the current round have
-  // already been buffered but not yet decoded Values range from 0 to
-  // num_msyn_per_decode - 1.
-  uint32_t position_in_round = pimpl->num_syndromes_buffered_but_not_decoded %
-                               pimpl->num_msyn_per_decode;
-
-  // Check if this write would overwrite the previous round
-  // We need to preserve prev_round_buffer for XOR computation, so the maximum
-  // safe write from the start of the current round is buffer_capacity minus one
-  // round
-  uint32_t max_safe_from_round_start =
-      pimpl->buffer_capacity - pimpl->num_syndromes_per_round;
-  if (position_in_round + syndrome_length > max_safe_from_round_start) {
-    // CUDAQ_WARN("Syndrome data too large - would overwrite previous round.
-    // Data will be ignored.");
-    printf("Syndrome data too large - would overwrite previous round. Data "
-           "will be ignored.\n");
+  if (pimpl->msyn_buffer_index + syndrome_length > pimpl->msyn_buffer.size()) {
+    // CUDAQ_WARN("Syndrome buffer overflow. Syndrome will be ignored.");
+    printf("Syndrome buffer overflow. Syndrome will be ignored.\n");
     return false;
   }
   bool did_decode = false;
-  // Buffer the incoming syndromes
-  // No wraparound check needed: buffer is sized to guarantee operations never
-  // wrap mid-execution
-  uint32_t write_start = pimpl->current_round_buffer_offset + position_in_round;
   for (std::size_t i = 0; i < syndrome_length; i++) {
-    pimpl->msyn_buffer[write_start + i] = syndrome[i];
+    pimpl->msyn_buffer[pimpl->msyn_buffer_index] = syndrome[i];
+    pimpl->msyn_buffer_index++;
   }
-  pimpl->num_syndromes_buffered_but_not_decoded += syndrome_length;
-
-  // Process all complete rounds that are now available
-  while ((pimpl->num_syndromes_buffered_but_not_decoded %
-          pimpl->num_msyn_per_decode) == 0 &&
-         pimpl->num_syndromes_buffered_but_not_decoded > 0) {
-    pimpl->current_round++;
-
-    // First round (round 1): skip decoding (store as reference)
-    // UNLESS there are first-round detectors that need immediate decoding
-    // (first-round detector check is done once in set_D_sparse)
-    if (pimpl->current_round == 1 && !pimpl->has_first_round_detectors) {
-      // Previous round stays at current position for next round's XOR
-      pimpl->prev_round_buffer_offset = pimpl->current_round_buffer_offset;
-      // Advance to next round position in circular buffer
-      pimpl->current_round_buffer_offset += pimpl->num_msyn_per_decode;
-      if (pimpl->current_round_buffer_offset >= pimpl->buffer_capacity)
-        pimpl->current_round_buffer_offset -= pimpl->buffer_capacity;
-      pimpl->num_syndromes_buffered_but_not_decoded -=
-          pimpl->num_msyn_per_decode; // Decrement for next iteration
-      continue;                       // Skip to next round
-    }
-
+  if (pimpl->msyn_buffer_index == pimpl->msyn_buffer.size()) {
     // These are just for logging. They are initialized in such a way to avoid
     // dynamic memory allocation if logging is disabled.
     std::vector<uint32_t> log_msyn;
@@ -362,32 +222,12 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome,
       log_observable_corrections.resize(O_sparse.size());
     }
 
-    // Compute detectors based on whether first-round detectors exist
-    if (pimpl->has_first_round_detectors) {
-      // When first-round detectors exist, must use D_sparse for all detectors
-      // because first-round detectors reference only one syndrome (not two)
-      for (std::size_t i = 0; i < this->D_sparse.size(); i++) {
-        pimpl->persistent_detector_buffer[i] = 0;
-        for (auto col : this->D_sparse[i])
-          pimpl->persistent_detector_buffer[i] ^= pimpl->msyn_buffer[col];
-      }
-    } else {
-      // Pure timelike detectors: use incremental XOR (current ⊕ previous round)
-      // Using circular buffer offsets - no D_sparse access needed
-      // No wraparound checks needed: buffer is sized to guarantee operations
-      // never wrap
-      for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) {
-        pimpl->persistent_detector_buffer[i] =
-            pimpl->msyn_buffer[pimpl->prev_round_buffer_offset + i] ^
-            pimpl->msyn_buffer[pimpl->current_round_buffer_offset + i];
-      }
+    // Decode now.
+    for (std::size_t i = 0; i < this->D_sparse.size(); i++) {
+      pimpl->persistent_detector_buffer[i] = 0;
+      for (auto col : this->D_sparse[i])
+        pimpl->persistent_detector_buffer[i] ^= pimpl->msyn_buffer[col];
     }
-
-    // Update offsets for next round: current becomes previous, advance current
-    pimpl->prev_round_buffer_offset = pimpl->current_round_buffer_offset;
-    pimpl->current_round_buffer_offset += pimpl->num_msyn_per_decode;
-    if (pimpl->current_round_buffer_offset >= pimpl->buffer_capacity)
-      pimpl->current_round_buffer_offset -= pimpl->buffer_capacity;
     if (should_log) {
       log_msyn.reserve(pimpl->msyn_buffer.size());
       for (std::size_t d = 0, D = pimpl->msyn_buffer.size(); d < D; d++) {
@@ -455,11 +295,9 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome,
         printf("%s\n", s.c_str());
     }
     did_decode = true;
-
-    // Decrement counter for next iteration of while loop
-    pimpl->num_syndromes_buffered_but_not_decoded -= pimpl->num_msyn_per_decode;
+    // Prepare for more data.
+    pimpl->msyn_buffer_index = 0;
   }
-
   return did_decode;
 }
 
@@ -506,15 +344,9 @@ std::size_t decoder::get_num_observables() const { return O_sparse.size(); }
 
 void decoder::reset_decoder() {
   // Zero out all data that is considered "per-shot" memory.
-  pimpl->num_syndromes_buffered_but_not_decoded = 0;
+  pimpl->msyn_buffer_index = 0;
   pimpl->msyn_buffer.clear();
-  pimpl->msyn_buffer.resize(pimpl->buffer_capacity);
-
-  // Reset incremental computation state
-  pimpl->current_round = 0;
-  pimpl->current_round_buffer_offset = 0;
-  pimpl->prev_round_buffer_offset = 0;
-
+  pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode);
   pimpl->corrections.clear();
   pimpl->corrections.resize(O_sparse.size());
   const bool log_due_to_log_level =

From 58b68f281f180676ca84e7ada2dba21a0b29c938 Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Mon, 10 Nov 2025 18:57:34 +0000
Subject: [PATCH 06/11] Added sliding window test

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 .../realtime/app_examples/CMakeLists.txt       | 14 ++++++++++++++
 .../app_examples/surface_code-1-test.sh        | 18 ++++++++++++------
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/libs/qec/unittests/realtime/app_examples/CMakeLists.txt b/libs/qec/unittests/realtime/app_examples/CMakeLists.txt
index 6b948a49..d33a636b 100644
--- a/libs/qec/unittests/realtime/app_examples/CMakeLists.txt
+++ b/libs/qec/unittests/realtime/app_examples/CMakeLists.txt
@@ -38,6 +38,20 @@ add_test(
       ${CMAKE_BINARY_DIR}/lib
   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
 )
+
+# Test with sliding_window decoder
+add_test(
+  NAME app_examples.surface_code-1-local-test-distance-3-sliding-window
+  COMMAND
+    bash "${CMAKE_CURRENT_SOURCE_DIR}/surface_code-1-test.sh"
+      ${CMAKE_CURRENT_BINARY_DIR}/surface_code-1-local
+      ${CMAKE_CURRENT_BINARY_DIR}/surface_code-1-local
+      3 40 40 NULL 12 6
+      ${CMAKE_BINARY_DIR}/lib
+      sliding_window 6 1
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+)
+
 # This must be disabled for now because the multi_error_lut decoder is not
 # powerful enough to pass this test. The nv-qldpc-decoder can pass this test,
 # but that is not available on the GitHub repo.
diff --git a/libs/qec/unittests/realtime/app_examples/surface_code-1-test.sh b/libs/qec/unittests/realtime/app_examples/surface_code-1-test.sh
index 8b2c08f8..c14c6e80 100644
--- a/libs/qec/unittests/realtime/app_examples/surface_code-1-test.sh
+++ b/libs/qec/unittests/realtime/app_examples/surface_code-1-test.sh
@@ -21,10 +21,13 @@ return_code=0
 #  num_rounds
 #  decoder_window
 #  Path to libcudaq-qec-realtime-decoding-quantinuum-private.so
+#  decoder_type (optional, defaults to multi_error_lut)
+#  sw_window_size (optional, for sliding_window decoder, defaults to decoder_window)
+#  sw_step_size (optional, for sliding_window decoder, defaults to 1)
 
-# Check that all 9 arguments are provided.
-if [[ $# -ne 9 ]]; then
-  echo "Error: Expected 9 arguments"
+# Check that at least 9 arguments are provided.
+if [[ $# -lt 9 ]]; then
+  echo "Error: Expected at least 9 arguments (got $#)"
   exit 1
 fi
 
@@ -37,6 +40,9 @@ SERVER_EXECUTABLE=$6
 NUM_ROUNDS=$7 
 DECODER_WINDOW=$8
 LIB_DIR=$9
+DECODER_TYPE=${10:-multi_error_lut}
+SW_WINDOW_SIZE=${11:-$DECODER_WINDOW}
+SW_STEP_SIZE=${12:-1}
 
 export CUDAQ_DEFAULT_SIMULATOR=stim
 
@@ -51,7 +57,7 @@ FULL_SUFFIX=$timestamp-$RNG_SUFFIX
 CONFIG_FILE=config-${FULL_SUFFIX}.yml
 
 # Generate the config file using the first executable.
-$EXE_PATH1 --distance $DISTANCE --num_rounds $NUM_ROUNDS --num_shots $NUM_SHOTS --save_dem $CONFIG_FILE --decoder_window $DECODER_WINDOW | tee save_dem-$FULL_SUFFIX.log
+$EXE_PATH1 --distance $DISTANCE --num_rounds $NUM_ROUNDS --num_shots $NUM_SHOTS --save_dem $CONFIG_FILE --decoder_window $DECODER_WINDOW --decoder_type $DECODER_TYPE --sw_window_size $SW_WINDOW_SIZE --sw_step_size $SW_STEP_SIZE | tee save_dem-$FULL_SUFFIX.log
 
 export CUDAQ_DUMP_JIT_IR=${CUDAQ_DUMP_JIT_IR:-0}
 
@@ -67,8 +73,8 @@ export CUDAQ_DUMP_JIT_IR=${CUDAQ_DUMP_JIT_IR:-0}
 
 
 # Use the config file using the second executable.
-echo Running $EXE_PATH2 --distance $DISTANCE --num_shots $NUM_SHOTS --load_dem $CONFIG_FILE --num_rounds $NUM_ROUNDS --decoder_window $DECODER_WINDOW
-$EXE_PATH2 --distance $DISTANCE --num_shots $NUM_SHOTS --load_dem $CONFIG_FILE --num_rounds $NUM_ROUNDS --decoder_window $DECODER_WINDOW |& tee load_dem-$FULL_SUFFIX.log
+echo Running $EXE_PATH2 --distance $DISTANCE --num_shots $NUM_SHOTS --load_dem $CONFIG_FILE --num_rounds $NUM_ROUNDS --decoder_window $DECODER_WINDOW --decoder_type $DECODER_TYPE --sw_window_size $SW_WINDOW_SIZE --sw_step_size $SW_STEP_SIZE
+$EXE_PATH2 --distance $DISTANCE --num_shots $NUM_SHOTS --load_dem $CONFIG_FILE --num_rounds $NUM_ROUNDS --decoder_window $DECODER_WINDOW --decoder_type $DECODER_TYPE --sw_window_size $SW_WINDOW_SIZE --sw_step_size $SW_STEP_SIZE |& tee load_dem-$FULL_SUFFIX.log
 
 
 # Look for results like this in the output:

From e441e4845f3309d190a29c8ead698641032788b7 Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Wed, 12 Nov 2025 01:16:26 +0000
Subject: [PATCH 07/11] Reworked approach in place - sliding window test passes

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 libs/qec/lib/decoder.cpp                 | 104 ++-
 libs/qec/lib/decoders/sliding_window.cpp | 915 +++++++++++------------
 libs/qec/lib/decoders/sliding_window.h   | 149 ++++
 3 files changed, 676 insertions(+), 492 deletions(-)
 create mode 100644 libs/qec/lib/decoders/sliding_window.h

diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp
index 73a38707..98200f7c 100644
--- a/libs/qec/lib/decoder.cpp
+++ b/libs/qec/lib/decoder.cpp
@@ -20,6 +20,9 @@ INSTANTIATE_REGISTRY(cudaq::qec::decoder, const cudaqx::tensor<uint8_t> &)
 INSTANTIATE_REGISTRY(cudaq::qec::decoder, const cudaqx::tensor<uint8_t> &,
                      const cudaqx::heterogeneous_map &)
 
+// Include decoder implementations AFTER registry instantiation
+#include "decoders/sliding_window.h"
+
 namespace cudaq::qec {
 
 struct decoder::rt_impl {
@@ -50,6 +53,18 @@ struct decoder::rt_impl {
 
   /// The id of the decoder (for instrumentation)
   uint32_t decoder_id = 0;
+
+  bool is_sliding_window = false;
+
+  /// The number of syndromes per round.  Only used for sliding window decoder.
+  size_t num_syndromes_per_round = 0;
+
+  /// Whether the first round detectors are included.  Only used for sliding
+  /// window decoder.
+  bool has_first_round_detectors = false;
+
+  /// Rounds enqueued since the last decode or reset (sliding window only).
+  uint32_t current_round = 0;
 };
 
 void decoder::rt_impl_deleter::operator()(rt_impl *p) const { delete p; }
@@ -174,6 +189,23 @@ uint32_t decoder::get_decoder_id() const { return pimpl->decoder_id; }
 
 void decoder::set_D_sparse(const std::vector<std::vector<uint32_t>> &D_sparse) {
   this->D_sparse = D_sparse;
+  auto *sw_decoder = dynamic_cast<sliding_window *>(this);
+
+  if (sw_decoder != nullptr) {
+    pimpl->is_sliding_window = true;
+    pimpl->num_syndromes_per_round = sw_decoder->get_num_syndromes_per_round();
+    // Check if first row is a first-round detector (single syndrome index)
+    pimpl->has_first_round_detectors =
+        (D_sparse.size() > 0 && D_sparse[0].size() == 1);
+    pimpl->current_round = 0;
+    pimpl->persistent_detector_buffer.resize(pimpl->num_syndromes_per_round);
+    pimpl->persistent_soft_detector_buffer.resize(
+        pimpl->num_syndromes_per_round);
+
+  } else {
+    pimpl->is_sliding_window = false;
+  }
+
   pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(D_sparse);
   pimpl->msyn_buffer.clear();
   pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode);
@@ -182,7 +214,23 @@ void decoder::set_D_sparse(const std::vector<std::vector<uint32_t>> &D_sparse) {
 
 void decoder::set_D_sparse(const std::vector<int64_t> &D_sparse_vec_in) {
   set_sparse_from_vec(D_sparse_vec_in, this->D_sparse);
-  pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(D_sparse);
+  auto *sw_decoder = dynamic_cast<sliding_window *>(this);
+
+  if (sw_decoder != nullptr) {
+    pimpl->is_sliding_window = true;
+    pimpl->num_syndromes_per_round = sw_decoder->get_num_syndromes_per_round();
+    // Check if first row is a first-round detector (single syndrome index)
+    pimpl->has_first_round_detectors =
+        (this->D_sparse.size() > 0 && this->D_sparse[0].size() == 1);
+    pimpl->current_round = 0;
+    pimpl->persistent_detector_buffer.resize(pimpl->num_syndromes_per_round);
+    pimpl->persistent_soft_detector_buffer.resize(
+        pimpl->num_syndromes_per_round);
+  } else {
+    pimpl->is_sliding_window = false;
+  }
+
+  pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(this->D_sparse);
   pimpl->msyn_buffer.clear();
   pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode);
   pimpl->msyn_buffer_index = 0;
@@ -195,12 +243,23 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome,
     printf("Syndrome buffer overflow. Syndrome will be ignored.\n");
     return false;
   }
+
+  pimpl->current_round++;
   bool did_decode = false;
   for (std::size_t i = 0; i < syndrome_length; i++) {
     pimpl->msyn_buffer[pimpl->msyn_buffer_index] = syndrome[i];
     pimpl->msyn_buffer_index++;
   }
-  if (pimpl->msyn_buffer_index == pimpl->msyn_buffer.size()) {
+
+  bool should_decode = false;
+  if (!pimpl->is_sliding_window) {
+    should_decode = (pimpl->msyn_buffer_index == pimpl->msyn_buffer.size());
+  } else {
+    should_decode =
+        (pimpl->current_round >= 2) ||
+        (pimpl->current_round == 1 && pimpl->has_first_round_detectors);
+  }
+  if (should_decode) {
     // These are just for logging. They are initialized in such a way to avoid
     // dynamic memory allocation if logging is disabled.
     std::vector<uint32_t> log_msyn;
@@ -223,11 +282,34 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome,
     }
 
     // Decode now.
-    for (std::size_t i = 0; i < this->D_sparse.size(); i++) {
-      pimpl->persistent_detector_buffer[i] = 0;
-      for (auto col : this->D_sparse[i])
-        pimpl->persistent_detector_buffer[i] ^= pimpl->msyn_buffer[col];
+    if (!pimpl->is_sliding_window) {
+      for (std::size_t i = 0; i < this->D_sparse.size(); i++) {
+        pimpl->persistent_detector_buffer[i] = 0;
+        for (auto col : this->D_sparse[i])
+          pimpl->persistent_detector_buffer[i] ^= pimpl->msyn_buffer[col];
+      }
+    } else {
+      // For sliding window decoder, syndrome_length must equal
+      // num_syndromes_per_round
+      assert(syndrome_length == pimpl->num_syndromes_per_round);
+      if (pimpl->current_round == 1 && pimpl->has_first_round_detectors) {
+        // First round: only compute first-round detectors (direct copy)
+        for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) {
+          pimpl->persistent_detector_buffer[i] = pimpl->msyn_buffer[i];
+        }
+      } else {
+        // Timelike detectors: XOR the two most recently buffered rounds
+        // (1-based rounds current_round - 1 and current_round)
+        for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) {
+          std::size_t index =
+              (pimpl->current_round - 2) * pimpl->num_syndromes_per_round;
+          pimpl->persistent_detector_buffer[i] =
+              pimpl->msyn_buffer[index + i] ^
+              pimpl->msyn_buffer[index + i + pimpl->num_syndromes_per_round];
+        }
+      }
     }
+
     if (should_log) {
       log_msyn.reserve(pimpl->msyn_buffer.size());
       for (std::size_t d = 0, D = pimpl->msyn_buffer.size(); d < D; d++) {
@@ -246,6 +328,14 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome,
     convert_vec_hard_to_soft(pimpl->persistent_detector_buffer,
                              pimpl->persistent_soft_detector_buffer);
     auto decoded_result = decode(pimpl->persistent_soft_detector_buffer);
+
+    // Sliding window: an empty result means no decode; keep buffering rounds.
+    if (pimpl->is_sliding_window) {
+      if (decoded_result.result.size() == 0) {
+        return false;
+      }
+    }
+
     if (should_log) {
       log_t2 = std::chrono::high_resolution_clock::now();
       for (std::size_t e = 0, E = decoded_result.result.size(); e < E; e++)
@@ -297,6 +387,7 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome,
     did_decode = true;
     // Prepare for more data.
     pimpl->msyn_buffer_index = 0;
+    pimpl->current_round = 0;
   }
   return did_decode;
 }
@@ -345,6 +436,7 @@ std::size_t decoder::get_num_observables() const { return O_sparse.size(); }
 void decoder::reset_decoder() {
   // Zero out all data that is considered "per-shot" memory.
   pimpl->msyn_buffer_index = 0;
+  pimpl->current_round = 0;
   pimpl->msyn_buffer.clear();
   pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode);
   pimpl->corrections.clear();
diff --git a/libs/qec/lib/decoders/sliding_window.cpp b/libs/qec/lib/decoders/sliding_window.cpp
index d57171e8..cb875c73 100644
--- a/libs/qec/lib/decoders/sliding_window.cpp
+++ b/libs/qec/lib/decoders/sliding_window.cpp
@@ -6,545 +6,488 @@
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  ******************************************************************************/
 
+#include "sliding_window.h"
 #include "common/Logger.h"
-#include "cudaq/qec/decoder.h"
 #include "cudaq/qec/pcm_utils.h"
 #include <cassert>
 #include <vector>
 
 namespace cudaq::qec {
 
-/// @brief This is a sliding window decoder that receives syndromes on a
-/// round-by-round basis, and decodes them according window-specific parameters
-/// provided in the decoder.
-class sliding_window : public decoder {
-private:
-  // --- Input parameters ---
+// ============================================================================
+// Private helper method implementations
+// ============================================================================
 
-  /// The number of rounds of syndrome data in each window.
-  std::size_t window_size = 1;
-  /// The number of rounds to advance the window by each time.
-  std::size_t step_size = 1;
-  /// The number of syndromes per round.
-  std::size_t num_syndromes_per_round = 0;
-  /// When forming a window, should error mechanisms that span the start round
-  /// and any preceding rounds be included?
-  bool straddle_start_round = false;
-  /// When forming a window, should error mechanisms that span the end round and
-  /// any subsequent rounds be included?
-  bool straddle_end_round = true;
-  /// The vector of error rates for the error mechanisms.
-  std::vector<cudaq::qec::float_t> error_rate_vec;
-  /// The name of the inner decoder to use.
-  std::string inner_decoder_name;
-  /// The parameters to pass to the inner decoder.
-  cudaqx::heterogeneous_map inner_decoder_params;
-
-  // Derived parameters.
-  std::size_t num_windows = 0;
-  std::size_t num_rounds = 0;
-  std::size_t num_syndromes_per_window = 0;
-  std::size_t num_rounds_since_last_decode = 0;
-  std::vector<std::unique_ptr<decoder>> inner_decoders;
-  std::vector<std::size_t> first_columns;
-  cudaqx::tensor<std::uint8_t> full_pcm;
-  cudaqx::tensor<std::uint8_t> full_pcm_T;
-
-  // Enum type for timing data.
-  enum WindowProcTimes {
-    INITIALIZE_WINDOW,     // 0
-    SLIDE_WINDOW,          // 1
-    COPY_DATA,             // 2
-    INDEX_CALCULATION,     // 3
-    MODIFY_SYNDROME_SLICE, // 4
-    INNER_DECODE,          // 5
-    CONVERT_TO_HARD,       // 6
-    COMMIT_TO_RESULT,      // 7
-    NUM_WINDOW_PROC_TIMES  // 8
-  };
-
-  // State data
-  std::vector<std::vector<cudaq::qec::float_t>>
-      rolling_window; // [batch_size, num_syndromes_per_window]
-  // rolling window read and write indices (circular buffer)
-  std::size_t rw_next_write_index = 0; // [0, num_syndromes_per_window)
-  std::size_t rw_next_read_index = 0;  // [0, num_syndromes_per_window)
-  std::size_t rw_filled = 0;
-  std::size_t num_windows_decoded = 0;
-  std::vector<std::vector<bool>> syndrome_mods; // [batch_size, syndrome_size]
-  std::vector<decoder_result> rw_results;       // [batch_size]
-  std::vector<double> window_proc_times;
-  std::array<double, WindowProcTimes::NUM_WINDOW_PROC_TIMES>
-      window_proc_times_arr = {};
-
-  void validate_inputs() {
-    if (window_size < 1 || window_size > num_rounds) {
-      throw std::invalid_argument(
-          fmt::format("sliding_window constructor: window_size ({}) must "
-                      "be between 1 and num_rounds ({})",
-                      window_size, num_rounds));
-    }
-    if (step_size < 1 || step_size > window_size) {
-      throw std::invalid_argument(
-          fmt::format("sliding_window constructor: step_size ({}) must "
-                      "be between 1 and window_size ({})",
-                      step_size, window_size));
-    }
-    if ((num_rounds - window_size) % step_size != 0) {
-      throw std::invalid_argument(
-          fmt::format("sliding_window constructor: num_rounds - "
-                      "window_size ({}) must be divisible by step_size ({})",
-                      num_rounds - window_size, step_size));
-    }
-    if (num_syndromes_per_round == 0) {
-      throw std::invalid_argument("sliding_window constructor: "
-                                  "num_syndromes_per_round must be non-zero");
-    }
-    if (H.shape()[0] % num_syndromes_per_round != 0) {
-      throw std::invalid_argument(
-          "sliding_window constructor: Number of rows in H must be divisible "
-          "by num_syndromes_per_round");
-    }
-    if (inner_decoder_name.empty()) {
-      throw std::invalid_argument(
-          "sliding_window constructor: inner_decoder_name must be non-empty");
-    }
-    if (inner_decoder_params.empty()) {
-      CUDAQ_WARN("sliding_window constructor: inner_decoder_params is empty. "
-                 "Is that intentional?");
-    }
-    if (error_rate_vec.empty()) {
-      throw std::invalid_argument(
-          "sliding_window constructor: error_rate_vec must be non-empty");
-    }
-
-    // Enforce that H is already sorted.
-    if (!cudaq::qec::pcm_is_sorted(H, num_syndromes_per_round)) {
-      throw std::invalid_argument("sliding_window constructor: PCM must be "
-                                  "sorted. See cudaq::qec::simplify_pcm.");
-    }
+/// Helper function to validate constructor inputs.
+void sliding_window::validate_inputs() {
+  if (window_size < 1 || window_size > num_rounds) {
+    throw std::invalid_argument(
+        fmt::format("sliding_window constructor: window_size ({}) must "
+                    "be between 1 and num_rounds ({})",
+                    window_size, num_rounds));
+  }
+  if (step_size < 1 || step_size > window_size) {
+    throw std::invalid_argument(
+        fmt::format("sliding_window constructor: step_size ({}) must "
+                    "be between 1 and window_size ({})",
+                    step_size, window_size));
+  }
+  if ((num_rounds - window_size) % step_size != 0) {
+    throw std::invalid_argument(
+        fmt::format("sliding_window constructor: num_rounds - "
+                    "window_size ({}) must be divisible by step_size ({})",
+                    num_rounds - window_size, step_size));
+  }
+  if (num_syndromes_per_round == 0) {
+    throw std::invalid_argument("sliding_window constructor: "
+                                "num_syndromes_per_round must be non-zero");
+  }
+  if (H.shape()[0] % num_syndromes_per_round != 0) {
+    throw std::invalid_argument(
+        "sliding_window constructor: Number of rows in H must be divisible "
+        "by num_syndromes_per_round");
+  }
+  if (inner_decoder_name.empty()) {
+    throw std::invalid_argument(
+        "sliding_window constructor: inner_decoder_name must be non-empty");
+  }
+  if (inner_decoder_params.empty()) {
+    CUDAQ_WARN("sliding_window constructor: inner_decoder_params is empty. "
+               "Is that intentional?");
+  }
+  if (error_rate_vec.empty()) {
+    throw std::invalid_argument(
+        "sliding_window constructor: error_rate_vec must be non-empty");
   }
 
-  /// Helper function to initialize the window.
-  /// @param num_syndromes The number of syndromes to initialize the window for.
-  /// This will be 1 for non-batched mode.
-  void initialize_window(std::size_t num_syndromes) {
-    // Initialize the syndrome mods and rw_results.
-    auto t0 = std::chrono::high_resolution_clock::now();
-    window_proc_times_arr.fill(0.0);
-    syndrome_mods.resize(num_syndromes);
-    for (std::size_t s = 0; s < num_syndromes; ++s) {
-      syndrome_mods[s].clear();
-      syndrome_mods[s].resize(this->syndrome_size);
-    }
-    rw_results.clear();
-    rw_results.resize(num_syndromes);
-    for (std::size_t s = 0; s < num_syndromes; ++s) {
-      rw_results[s].converged = true; // Gets set to false if we fail to decode
-      rw_results[s].result.resize(this->block_size);
-    }
-    rolling_window.resize(num_syndromes);
-    for (std::size_t s = 0; s < num_syndromes; ++s) {
-      rolling_window[s].clear();
-      rolling_window[s].resize(num_syndromes_per_window);
-    }
-    window_proc_times.resize(num_windows);
-    std::fill(window_proc_times.begin(), window_proc_times.end(), 0.0);
-    rw_next_write_index = 0;
-    rw_next_read_index = 0;
-    rw_filled = 0;
-    num_rounds_since_last_decode = 0;
-    CUDAQ_DBG("Initializing window");
-    auto t1 = std::chrono::high_resolution_clock::now();
-    window_proc_times_arr[WindowProcTimes::INITIALIZE_WINDOW] =
-        std::chrono::duration<double>(t1 - t0).count() * 1000;
+  // Enforce that H is already sorted.
+  if (!cudaq::qec::pcm_is_sorted(H, num_syndromes_per_round)) {
+    throw std::invalid_argument("sliding_window constructor: PCM must be "
+                                "sorted. See cudaq::qec::simplify_pcm.");
   }
+}
 
-  /// Helper function to add a single syndrome to the rolling window (circular
-  /// buffer).
-  void add_syndrome_to_rolling_window(const std::vector<float_t> &syndrome,
-                                      std::size_t syndrome_index,
-                                      bool update_next_write_index = true) {
-    // This assumes that the syndrome size evenly divides into the rolling
-    // window (of length num_syndromes_per_window), so verify that here.
-    if (num_syndromes_per_window % syndrome.size() != 0) {
-      throw std::invalid_argument(
-          fmt::format("add_syndrome_to_rolling_window: syndrome "
-                      "size ({}) must evenly divide into the rolling "
-                      "window size ({})",
-                      syndrome.size(), num_syndromes_per_window));
-    }
-    std::copy(syndrome.begin(), syndrome.end(),
-              rolling_window[syndrome_index].begin() + rw_next_write_index);
-    if (update_next_write_index) {
-      rw_next_write_index += syndrome.size();
-      if (rw_next_write_index >= num_syndromes_per_window)
-        rw_next_write_index = 0;
-    }
+/// Helper function to initialize the window.
+/// @param num_syndromes The number of syndromes to initialize the window for.
+/// This will be 1 for non-batched mode.
+void sliding_window::initialize_window(std::size_t num_syndromes) {
+  // Initialize the syndrome mods and rw_results.
+  auto t0 = std::chrono::high_resolution_clock::now();
+  window_proc_times_arr.fill(0.0);
+  syndrome_mods.resize(num_syndromes);
+  for (std::size_t s = 0; s < num_syndromes; ++s) {
+    syndrome_mods[s].clear();
+    syndrome_mods[s].resize(this->syndrome_size);
+  }
+  rw_results.clear();
+  rw_results.resize(num_syndromes);
+  for (std::size_t s = 0; s < num_syndromes; ++s) {
+    rw_results[s].converged = true; // Gets set to false if we fail to decode
+    rw_results[s].result.resize(this->block_size);
   }
+  rolling_window.resize(num_syndromes);
+  for (std::size_t s = 0; s < num_syndromes; ++s) {
+    rolling_window[s].clear();
+    rolling_window[s].resize(num_syndromes_per_window);
+  }
+  window_proc_times.resize(num_windows);
+  std::fill(window_proc_times.begin(), window_proc_times.end(), 0.0);
+  rw_next_write_index = 0;
+  rw_next_read_index = 0;
+  rw_filled = 0;
+  num_rounds_since_last_decode = 0;
+  CUDAQ_DBG("Initializing window");
+  auto t1 = std::chrono::high_resolution_clock::now();
+  window_proc_times_arr[WindowProcTimes::INITIALIZE_WINDOW] =
+      std::chrono::duration<double>(t1 - t0).count() * 1000;
+}
 
-  /// Helper function to add a batch of syndromes to the rolling window
-  /// (circular buffer).
-  void add_syndromes_to_rolling_window(
-      const std::vector<std::vector<float_t>> &syndromes) {
-    // Set update_next_write_index to false in the loop because we will update
-    // it once at the end.
-    for (std::size_t s = 0; s < syndromes.size(); ++s) {
-      add_syndrome_to_rolling_window(syndromes[s], s,
-                                     /*update_next_write_index=*/false);
-      if (syndromes[s].size() != syndromes[0].size()) {
-        throw std::invalid_argument(
-            fmt::format("add_syndromes_to_rolling_window: syndrome "
-                        "size ({}) must be the same as the first syndrome "
-                        "size ({})",
-                        syndromes[s].size(), syndromes[0].size()));
-      }
-    }
-    rw_next_write_index += syndromes[0].size();
+/// Helper function to add a single syndrome to the rolling window (circular
+/// buffer).
+void sliding_window::add_syndrome_to_rolling_window(
+    const std::vector<float_t> &syndrome, std::size_t syndrome_index,
+    bool update_next_write_index) {
+  // This assumes that the syndrome size evenly divides into the rolling
+  // window (of length num_syndromes_per_window), so verify that here.
+  if (num_syndromes_per_window % syndrome.size() != 0) {
+    throw std::invalid_argument(
+        fmt::format("add_syndrome_to_rolling_window: syndrome "
+                    "size ({}) must evenly divide into the rolling "
+                    "window size ({})",
+                    syndrome.size(), num_syndromes_per_window));
+  }
+  std::copy(syndrome.begin(), syndrome.end(),
+            rolling_window[syndrome_index].begin() + rw_next_write_index);
+  if (update_next_write_index) {
+    rw_next_write_index += syndrome.size();
     if (rw_next_write_index >= num_syndromes_per_window)
       rw_next_write_index = 0;
   }
+}
 
-  /// Helper function to get a single syndrome from the rolling window
-  /// (unwrapping a circular buffer).
-  std::vector<float_t>
-  get_syndrome_from_rolling_window(std::size_t syndrome_index) {
-    std::vector<float_t> syndrome(num_syndromes_per_window);
-    // Copy from rw_next_read_index to the end of the buffer.
-    std::copy(rolling_window[syndrome_index].begin() + rw_next_read_index,
-              rolling_window[syndrome_index].end(), syndrome.begin());
-    // Copy from the beginning of the rolling window to rw_next_read_index.
-    std::copy(rolling_window[syndrome_index].begin(),
-              rolling_window[syndrome_index].begin() + rw_next_read_index,
-              syndrome.end() - rw_next_read_index);
-    return syndrome;
-  }
-
-  /// Helper function to get a batch of syndromes from the rolling window
-  /// (unwrapping a circular buffer).
-  std::vector<std::vector<float_t>> get_syndromes_from_rolling_window() {
-    std::vector<std::vector<float_t>> syndromes(rolling_window.size());
-    for (std::size_t s = 0; s < rolling_window.size(); ++s) {
-      syndromes[s] = get_syndrome_from_rolling_window(s);
+/// Helper function to add a batch of syndromes to the rolling window
+/// (circular buffer).
+void sliding_window::add_syndromes_to_rolling_window(
+    const std::vector<std::vector<float_t>> &syndromes) {
+  // Set update_next_write_index to false in the loop because we will update
+  // it once at the end.
+  for (std::size_t s = 0; s < syndromes.size(); ++s) {
+    add_syndrome_to_rolling_window(syndromes[s], s,
+                                   /*update_next_write_index=*/false);
+    if (syndromes[s].size() != syndromes[0].size()) {
+      throw std::invalid_argument(
+          fmt::format("add_syndromes_to_rolling_window: syndrome "
+                      "size ({}) must be the same as the first syndrome "
+                      "size ({})",
+                      syndromes[s].size(), syndromes[0].size()));
     }
-    return syndromes;
   }
+  rw_next_write_index += syndromes[0].size();
+  if (rw_next_write_index >= num_syndromes_per_window)
+    rw_next_write_index = 0;
+}
 
-  /// Helper function to update the read index for the rolling window.
-  void update_rw_next_read_index() {
-    rw_next_read_index += step_size * num_syndromes_per_round;
-    if (rw_next_read_index >= num_syndromes_per_window)
-      rw_next_read_index -= num_syndromes_per_window;
+/// Helper function to get a single syndrome from the rolling window
+/// (unwrapping a circular buffer).
+std::vector<float_t>
+sliding_window::get_syndrome_from_rolling_window(std::size_t syndrome_index) {
+  std::vector<float_t> syndrome(num_syndromes_per_window);
+  // Copy from rw_next_read_index to the end of the buffer.
+  std::copy(rolling_window[syndrome_index].begin() + rw_next_read_index,
+            rolling_window[syndrome_index].end(), syndrome.begin());
+  // Copy from the beginning of the rolling window to rw_next_read_index.
+  std::copy(rolling_window[syndrome_index].begin(),
+            rolling_window[syndrome_index].begin() + rw_next_read_index,
+            syndrome.end() - rw_next_read_index);
+  return syndrome;
+}
+
+/// Helper function to get a batch of syndromes from the rolling window
+/// (unwrapping a circular buffer).
+std::vector<std::vector<float_t>>
+sliding_window::get_syndromes_from_rolling_window() {
+  std::vector<std::vector<float_t>> syndromes(rolling_window.size());
+  for (std::size_t s = 0; s < rolling_window.size(); ++s) {
+    syndromes[s] = get_syndrome_from_rolling_window(s);
   }
+  return syndromes;
+}
 
-public:
-  sliding_window(const cudaqx::tensor<uint8_t> &H,
-                 const cudaqx::heterogeneous_map &params)
-      : decoder(H), full_pcm(H) {
-    full_pcm_T = full_pcm.transpose();
-    // Fetch parameters from the params map.
-    window_size = params.get<std::size_t>("window_size", window_size);
-    step_size = params.get<std::size_t>("step_size", step_size);
-    num_syndromes_per_round = params.get<std::size_t>("num_syndromes_per_round",
-                                                      num_syndromes_per_round);
-    straddle_start_round =
-        params.get<bool>("straddle_start_round", straddle_start_round);
-    straddle_end_round =
-        params.get<bool>("straddle_end_round", straddle_end_round);
-    error_rate_vec = params.get<std::vector<cudaq::qec::float_t>>(
-        "error_rate_vec", error_rate_vec);
-    inner_decoder_name =
-        params.get<std::string>("inner_decoder_name", inner_decoder_name);
-    inner_decoder_params = params.get<cudaqx::heterogeneous_map>(
-        "inner_decoder_params", inner_decoder_params);
+/// Helper function to update the read index for the rolling window.
+void sliding_window::update_rw_next_read_index() {
+  rw_next_read_index += step_size * num_syndromes_per_round;
+  if (rw_next_read_index >= num_syndromes_per_window)
+    rw_next_read_index -= num_syndromes_per_window;
+}
 
-    num_rounds = H.shape()[0] / num_syndromes_per_round;
-    num_windows = (num_rounds - window_size) / step_size + 1;
-    num_syndromes_per_window = num_syndromes_per_round * window_size;
+// ============================================================================
+// Public method implementations
+// ============================================================================
 
-    validate_inputs();
+/// Constructor. NOTE(review): divisions below precede validate_inputs().
+sliding_window::sliding_window(const cudaqx::tensor<uint8_t> &H,
+                               const cudaqx::heterogeneous_map &params)
+    : decoder(H), full_pcm(H) {
+  full_pcm_T = full_pcm.transpose();
+  // Read options from params; a missing key keeps the member's default.
+  window_size = params.get<std::size_t>("window_size", window_size);
+  step_size = params.get<std::size_t>("step_size", step_size);
+  num_syndromes_per_round = params.get<std::size_t>("num_syndromes_per_round",
+                                                    num_syndromes_per_round);
+  straddle_start_round =
+      params.get<bool>("straddle_start_round", straddle_start_round);
+  straddle_end_round =
+      params.get<bool>("straddle_end_round", straddle_end_round);
+  error_rate_vec = params.get<std::vector<cudaq::qec::float_t>>(
+      "error_rate_vec", error_rate_vec);
+  inner_decoder_name =
+      params.get<std::string>("inner_decoder_name", inner_decoder_name);
+  inner_decoder_params = params.get<cudaqx::heterogeneous_map>(
+      "inner_decoder_params", inner_decoder_params);
 
-    // Create the inner decoders.
-    for (std::size_t w = 0; w < num_windows; ++w) {
-      std::size_t start_round = w * step_size;
-      std::size_t end_round = start_round + window_size - 1;
-      auto [H_round, first_column, last_column] =
-          cudaq::qec::get_pcm_for_rounds(
-              H, num_syndromes_per_round, start_round, end_round,
-              straddle_start_round, straddle_end_round);
-      first_columns.push_back(first_column);
+  num_rounds = H.shape()[0] / num_syndromes_per_round;
+  num_windows = (num_rounds - window_size) / step_size + 1;
+  num_syndromes_per_window = num_syndromes_per_round * window_size;
 
-      // Slice the error vector to only include the current window.
-      auto inner_decoder_params_mod = inner_decoder_params;
-      std::vector<cudaq::qec::float_t> error_vec_mod(
-          error_rate_vec.begin() + first_column,
-          error_rate_vec.begin() + last_column + 1);
-      inner_decoder_params_mod.insert("error_rate_vec", error_vec_mod);
+  validate_inputs();
 
-      CUDAQ_INFO("Creating a decoder for rounds {}-{} (dims {} x {}) "
-                 "first_column = {}, last_column = {}",
-                 start_round, end_round, H_round.shape()[0], H_round.shape()[1],
-                 first_column, last_column);
-      auto inner_decoder =
-          decoder::get(inner_decoder_name, H_round, inner_decoder_params_mod);
-      inner_decoders.push_back(std::move(inner_decoder));
-    }
+  // Create one inner decoder per window, each on that window's PCM slice.
+  for (std::size_t w = 0; w < num_windows; ++w) {
+    std::size_t start_round = w * step_size;
+    std::size_t end_round = start_round + window_size - 1;
+    auto [H_round, first_column, last_column] = cudaq::qec::get_pcm_for_rounds(
+        H, num_syndromes_per_round, start_round, end_round,
+        straddle_start_round, straddle_end_round);
+    first_columns.push_back(first_column);
+
+    // Keep only the error rates of this window's columns for its decoder.
+    auto inner_decoder_params_mod = inner_decoder_params;
+    std::vector<cudaq::qec::float_t> error_vec_mod(
+        error_rate_vec.begin() + first_column,
+        error_rate_vec.begin() + last_column + 1);
+    inner_decoder_params_mod.insert("error_rate_vec", error_vec_mod);
+
+    CUDAQ_INFO("Creating a decoder for rounds {}-{} (dims {} x {}) "
+               "first_column = {}, last_column = {}",
+               start_round, end_round, H_round.shape()[0], H_round.shape()[1],
+               first_column, last_column);
+    auto inner_decoder =
+        decoder::get(inner_decoder_name, H_round, inner_decoder_params_mod);
+    inner_decoders.push_back(std::move(inner_decoder));
   }
+}
 
-  virtual decoder_result decode(const std::vector<float_t> &syndrome) override {
-    if (syndrome.size() == this->syndrome_size) {
-      auto t0 = std::chrono::high_resolution_clock::now();
-      CUDAQ_DBG("Decoding whole block");
-      // Decode the whole thing, iterating over windows manually.
-      decoder_result result;
-      std::vector<float_t> syndrome_round(num_syndromes_per_round);
-      for (std::size_t r = 0; r < num_rounds; ++r) {
-        std::copy(syndrome.begin() + r * num_syndromes_per_round,
-                  syndrome.begin() + (r + 1) * num_syndromes_per_round,
-                  syndrome_round.begin());
-        result = decode(syndrome_round);
-        // Note: result will be empty until the final loop iteration.
-      }
-      auto t1 = std::chrono::high_resolution_clock::now();
-      std::chrono::duration<double> diff = t1 - t0;
-      CUDAQ_INFO("Whole block time: {:.3f} ms", diff.count() * 1000);
-      return result;
-    }
-    // Else we're receiving a single round.
-    if (rw_filled == 0) {
-      initialize_window(/*num_syndromes=*/1);
+/// Decode a full block, or buffer one round; empty result until last window.
+decoder_result sliding_window::decode(const std::vector<float_t> &syndrome) {
+  if (syndrome.size() == this->syndrome_size) {
+    auto t0 = std::chrono::high_resolution_clock::now();
+    CUDAQ_DBG("Decoding whole block");
+    // Feed the block one round at a time through the single-round path below.
+    decoder_result result;
+    std::vector<float_t> syndrome_round(num_syndromes_per_round);
+    for (std::size_t r = 0; r < num_rounds; ++r) {
+      std::copy(syndrome.begin() + r * num_syndromes_per_round,
+                syndrome.begin() + (r + 1) * num_syndromes_per_round,
+                syndrome_round.begin());
+      result = decode(syndrome_round);
+      // Note: result will be empty until the final loop iteration.
     }
-    if (this->rw_filled == num_syndromes_per_window) {
-      auto t0 = std::chrono::high_resolution_clock::now();
-      CUDAQ_DBG("Window is full, sliding the window by one round");
-      add_syndrome_to_rolling_window(syndrome, 0);
+    auto t1 = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> diff = t1 - t0;
+    CUDAQ_INFO("Whole block time: {:.3f} ms", diff.count() * 1000);
+    return result;
+  }
+  // Else we're receiving a single round.
+  if (rw_filled == 0) {
+    initialize_window(/*num_syndromes=*/1);
+  }
+  if (this->rw_filled == num_syndromes_per_window) {
+    auto t0 = std::chrono::high_resolution_clock::now();
+    CUDAQ_DBG("Window is full, sliding the window by one round");
+    add_syndrome_to_rolling_window(syndrome, 0);
 
-      auto t1 = std::chrono::high_resolution_clock::now();
-      window_proc_times_arr[WindowProcTimes::SLIDE_WINDOW] +=
-          std::chrono::duration<double>(t1 - t0).count() * 1000;
-    } else {
-      // Just copy the data to the end of the rolling window.
-      auto t0 = std::chrono::high_resolution_clock::now();
-      CUDAQ_DBG("Copying data to the end of the rolling window");
-      add_syndrome_to_rolling_window(syndrome, 0);
-      this->rw_filled += num_syndromes_per_round;
-      auto t1 = std::chrono::high_resolution_clock::now();
-      window_proc_times_arr[WindowProcTimes::COPY_DATA] +=
-          std::chrono::duration<double>(t1 - t0).count() * 1000;
-    }
-    num_rounds_since_last_decode++;
-    if (rw_filled == num_syndromes_per_window &&
-        num_rounds_since_last_decode >= step_size) {
-      CUDAQ_DBG("Decoding window {}/{}", num_windows_decoded + 1, num_windows);
-      decode_window();
-      num_rounds_since_last_decode = 0;
+    auto t1 = std::chrono::high_resolution_clock::now();
+    window_proc_times_arr[WindowProcTimes::SLIDE_WINDOW] +=
+        std::chrono::duration<double>(t1 - t0).count() * 1000;
+  } else {
+    // Window not yet full: store the round and count it in rw_filled.
+    auto t0 = std::chrono::high_resolution_clock::now();
+    CUDAQ_DBG("Copying data to the end of the rolling window");
+    add_syndrome_to_rolling_window(syndrome, 0);
+    this->rw_filled += num_syndromes_per_round;
+    auto t1 = std::chrono::high_resolution_clock::now();
+    window_proc_times_arr[WindowProcTimes::COPY_DATA] +=
+        std::chrono::duration<double>(t1 - t0).count() * 1000;
+  }
+  num_rounds_since_last_decode++;
+  if (rw_filled == num_syndromes_per_window &&
+      num_rounds_since_last_decode >= step_size) {
+    CUDAQ_DBG("Decoding window {}/{}", num_windows_decoded + 1, num_windows);
+    decode_window();
+    num_rounds_since_last_decode = 0;
 
-      num_windows_decoded++;
-      if (num_windows_decoded == num_windows) {
-        num_windows_decoded = 0;
-        rw_filled = 0;
-        // for (std::size_t w = 0; w < num_windows; ++w) {
-        //   CUDAQ_DBG("Window {} time: {} ms", w, window_proc_times[w]);
-        // }
-        CUDAQ_DBG("Returning decoder_result");
-        return std::move(this->rw_results[0]);
-      }
+    num_windows_decoded++;
+    if (num_windows_decoded == num_windows) {
+      num_windows_decoded = 0;
+      rw_filled = 0;
+      // for (std::size_t w = 0; w < num_windows; ++w) {
+      //   CUDAQ_DBG("Window {} time: {} ms", w, window_proc_times[w]);
+      // }
+      CUDAQ_DBG("Returning decoder_result");
+      return std::move(this->rw_results[0]);
     }
-    CUDAQ_DBG("Returning empty decoder_result");
-    return decoder_result(); // empty return value
   }
+  CUDAQ_DBG("Returning empty decoder_result");
+  return decoder_result(); // empty return value
+}
 
-  virtual std::vector<decoder_result>
-  decode_batch(const std::vector<std::vector<float_t>> &syndromes) override {
-    if (syndromes[0].size() == this->syndrome_size) {
-      CUDAQ_DBG("Decoding whole block");
-      // Decode the whole thing, iterating over windows manually.
-      std::vector<decoder_result> results;
-      std::vector<std::vector<float_t>> syndromes_round(syndromes.size());
-      for (std::size_t r = 0; r < num_rounds; ++r) {
-        for (std::size_t s = 0; s < syndromes.size(); ++s) {
-          syndromes_round[s].resize(num_syndromes_per_round);
-          std::copy(syndromes[s].begin() + r * num_syndromes_per_round,
-                    syndromes[s].begin() + (r + 1) * num_syndromes_per_round,
-                    syndromes_round[s].begin());
-        }
-        results = decode_batch(syndromes_round);
+/// Decode a batch of syndrome vectors (full blocks, or one round per entry).
+std::vector<decoder_result> sliding_window::decode_batch(
+    const std::vector<std::vector<float_t>> &syndromes) {
+  // An empty batch has no syndromes[0] to size-check and nothing to buffer;
+  // return no results rather than indexing into an empty vector.
+  if (syndromes.empty())
+    return {};
+  if (syndromes[0].size() == this->syndrome_size) {
+    CUDAQ_DBG("Decoding whole block");
+    // Feed the blocks one round at a time through the single-round path below.
+    std::vector<decoder_result> results;
+    std::vector<std::vector<float_t>> syndromes_round(syndromes.size());
+    for (std::size_t r = 0; r < num_rounds; ++r) {
+      for (std::size_t s = 0; s < syndromes.size(); ++s) {
+        syndromes_round[s].resize(num_syndromes_per_round);
+        std::copy(syndromes[s].begin() + r * num_syndromes_per_round,
+                  syndromes[s].begin() + (r + 1) * num_syndromes_per_round,
+                  syndromes_round[s].begin());
       }
-      return results;
-    }
-    // Else we're receiving a single round.
-    if (rw_filled == 0) {
-      initialize_window(syndromes.size());
-    }
-    if (this->rw_filled == num_syndromes_per_window) {
-      CUDAQ_DBG("Window is full, sliding the window by one round");
-      // The window is full. Slide existing data to the left and write the new
-      // data at the end.
-      add_syndromes_to_rolling_window(syndromes);
-      num_rounds_since_last_decode++;
-    } else {
-      // Just copy the data to the end of the rolling window.
-      CUDAQ_DBG("Copying data to the end of the rolling window");
-      add_syndromes_to_rolling_window(syndromes);
-      this->rw_filled += num_syndromes_per_round;
-      num_rounds_since_last_decode++;
+      results = decode_batch(syndromes_round);
     }
-    if (rw_filled == num_syndromes_per_window &&
-        num_rounds_since_last_decode >= step_size) {
-      CUDAQ_DBG("Decoding window {}/{}", num_windows_decoded + 1, num_windows);
-      decode_window();
-      num_rounds_since_last_decode = 0;
-      num_windows_decoded++;
-      if (num_windows_decoded == num_windows) {
-        num_windows_decoded = 0;
-        rw_filled = 0;
-        // Dump the per window processing times.
-        // for (std::size_t w = 0; w < num_windows; ++w) {
-        //   CUDAQ_DBG("Window {} time: {} ms", w, window_proc_times[w]);
-        // }
-        CUDAQ_DBG("Returning decoder_result");
-        return std::move(this->rw_results);
-      }
+    return results;
+  }
+  // Else we're receiving a single round.
+  if (rw_filled == 0) {
+    initialize_window(syndromes.size());
+  }
+  if (this->rw_filled == num_syndromes_per_window) {
+    CUDAQ_DBG("Window is full, sliding the window by one round");
+    // The window is full. The rolling window is a circular buffer, so the new
+    // round presumably replaces the oldest one (see rw_next_write_index).
+    add_syndromes_to_rolling_window(syndromes);
+    num_rounds_since_last_decode++;
+  } else {
+    // Window not yet full: store the round and count it in rw_filled.
+    CUDAQ_DBG("Copying data to the end of the rolling window");
+    add_syndromes_to_rolling_window(syndromes);
+    this->rw_filled += num_syndromes_per_round;
+    num_rounds_since_last_decode++;
+  }
+  if (rw_filled == num_syndromes_per_window &&
+      num_rounds_since_last_decode >= step_size) {
+    CUDAQ_DBG("Decoding window {}/{}", num_windows_decoded + 1, num_windows);
+    decode_window();
+    num_rounds_since_last_decode = 0;
+    num_windows_decoded++;
+    if (num_windows_decoded == num_windows) {
+      num_windows_decoded = 0;
+      rw_filled = 0;
+      CUDAQ_DBG("Returning decoder_result");
+      return std::move(this->rw_results);
     }
-    CUDAQ_DBG("Returning empty decoder_result");
-    return std::vector<decoder_result>(); // empty return value
   }
+  CUDAQ_DBG("Returning empty decoder_result");
+  return std::vector<decoder_result>(); // empty return value
+}
 
-  /// This is an internal helper function that decodes a single window. Regular
-  /// users should use the regular `cudaq::qec::decoder::decode` or
-  /// `cudaq::qec::decoder::decode_batch` functions instead of trying to access
-  /// this function.
-  void decode_window() {
-    auto t0 = std::chrono::high_resolution_clock::now();
-    const auto &w = this->num_windows_decoded;
-    std::size_t syndrome_start = w * step_size * num_syndromes_per_round;
-    std::size_t syndrome_end = syndrome_start + num_syndromes_per_window - 1;
-    std::size_t syndrome_start_next_window =
-        (w + 1) * step_size * num_syndromes_per_round;
-    std::size_t syndrome_end_next_window =
-        syndrome_start_next_window + num_syndromes_per_round - 1;
-    auto t3 = std::chrono::high_resolution_clock::now();
-    if (w > 0) {
-      // Modify the syndrome slice to account for the previous windows.
-      for (std::size_t s = 0; s < this->rolling_window.size(); ++s) {
-        std::size_t r2 = rw_next_read_index;
-        for (std::size_t r = 0; r < num_syndromes_per_window; ++r) {
-          auto &slice_val = this->rolling_window[s].at(r2);
-          slice_val =
-              static_cast<double>(static_cast<std::uint8_t>(slice_val) ^
-                                  syndrome_mods[s].at(r + syndrome_start));
-          r2++;
-          if (r2 >= num_syndromes_per_window)
-            r2 = 0;
-        }
+/// Internal helper: decode the window currently held in the rolling buffer
+/// with inner_decoders[num_windows_decoded], commit its bits to rw_results,
+/// and (unless it is the last window) update syndrome_mods for the next one.
+/// Callers should use `decode` or `decode_batch` rather than this function.
+void sliding_window::decode_window() {
+  auto t0 = std::chrono::high_resolution_clock::now();
+  const auto &w = this->num_windows_decoded;
+  std::size_t syndrome_start = w * step_size * num_syndromes_per_round;
+  std::size_t syndrome_end = syndrome_start + num_syndromes_per_window - 1;
+  std::size_t syndrome_start_next_window =
+      (w + 1) * step_size * num_syndromes_per_round;
+  std::size_t syndrome_end_next_window =
+      syndrome_start_next_window + num_syndromes_per_round - 1;
+  auto t3 = std::chrono::high_resolution_clock::now();
+  if (w > 0) {
+    // XOR in syndrome_mods to back out errors committed by earlier windows.
+    for (std::size_t s = 0; s < this->rolling_window.size(); ++s) {
+      std::size_t r2 = rw_next_read_index;
+      for (std::size_t r = 0; r < num_syndromes_per_window; ++r) {
+        auto &slice_val = this->rolling_window[s].at(r2);
+        slice_val =
+            static_cast<double>(static_cast<std::uint8_t>(slice_val) ^
+                                syndrome_mods[s].at(r + syndrome_start));
+        r2++;
+        if (r2 >= num_syndromes_per_window)
+          r2 = 0;
       }
     }
-    auto t4 = std::chrono::high_resolution_clock::now();
-    CUDAQ_DBG("Window {}: syndrome_start = {}, syndrome_end = {}, length1 = "
-              "{}, length2 = {}",
-              w, syndrome_start, syndrome_end, this->rolling_window[0].size(),
-              syndrome_end - syndrome_start + 1);
-    std::vector<decoder_result> inner_results;
-    if (this->rolling_window.size() == 1) {
-      inner_results.push_back(
-          inner_decoders[w]->decode(get_syndrome_from_rolling_window(0)));
-    } else {
-      inner_results =
-          inner_decoders[w]->decode_batch(get_syndromes_from_rolling_window());
-    }
-    // We've grabbed data from the rolling window, so we need to update the
-    // read index for the next call to decode_window.
-    update_rw_next_read_index();
-    if (!inner_results[0].converged) {
-      CUDAQ_DBG("Window {}: inner decoder failed to converge", w);
-    }
-    auto t5 = std::chrono::high_resolution_clock::now();
-    std::vector<std::vector<uint8_t>> window_results(
-        this->rolling_window.size());
+  }
+  auto t4 = std::chrono::high_resolution_clock::now();
+  CUDAQ_DBG("Window {}: syndrome_start = {}, syndrome_end = {}, length1 = "
+            "{}, length2 = {}",
+            w, syndrome_start, syndrome_end, this->rolling_window[0].size(),
+            syndrome_end - syndrome_start + 1);
+  std::vector<decoder_result> inner_results;
+  if (this->rolling_window.size() == 1) {
+    inner_results.push_back(
+        inner_decoders[w]->decode(get_syndrome_from_rolling_window(0)));
+  } else {
+    inner_results =
+        inner_decoders[w]->decode_batch(get_syndromes_from_rolling_window());
+  }
+  // We've grabbed data from the rolling window, so we need to update the
+  // read index for the next call to decode_window.
+  update_rw_next_read_index();
+  if (!inner_results[0].converged) {
+    CUDAQ_DBG("Window {}: inner decoder failed to converge", w);
+  }
+  auto t5 = std::chrono::high_resolution_clock::now();
+  std::vector<std::vector<uint8_t>> window_results(this->rolling_window.size());
+  for (std::size_t s = 0; s < this->rolling_window.size(); ++s) {
+    this->rw_results[s].converged &= inner_results[s].converged;
+    cudaq::qec::convert_vec_soft_to_hard(inner_results[s].result,
+                                         window_results[s]);
+  }
+  // Commit to everything up to the first column of the next window.
+  auto t6 = std::chrono::high_resolution_clock::now();
+  if (w < num_windows - 1) {
+    // Prepare for the next window.
+    auto next_window_first_column = first_columns[w + 1];
+    auto this_window_first_column = first_columns[w];
+    auto num_to_commit = next_window_first_column - this_window_first_column;
+    CUDAQ_DBG("  Committing {} bits from window {}", num_to_commit, w);
     for (std::size_t s = 0; s < this->rolling_window.size(); ++s) {
-      this->rw_results[s].converged &= inner_results[s].converged;
-      cudaq::qec::convert_vec_soft_to_hard(inner_results[s].result,
-                                           window_results[s]);
-    }
-    // Commit to everything up to the first column of the next window.
-    auto t6 = std::chrono::high_resolution_clock::now();
-    if (w < num_windows - 1) {
-      // Prepare for the next window.
-      auto next_window_first_column = first_columns[w + 1];
-      auto this_window_first_column = first_columns[w];
-      auto num_to_commit = next_window_first_column - this_window_first_column;
-      CUDAQ_DBG("  Committing {} bits from window {}", num_to_commit, w);
-      for (std::size_t s = 0; s < this->rolling_window.size(); ++s) {
-        for (std::size_t c = 0; c < num_to_commit; ++c) {
-          rw_results[s].result[c + this_window_first_column] =
-              window_results[s][c];
-        }
+      for (std::size_t c = 0; c < num_to_commit; ++c) {
+        rw_results[s].result[c + this_window_first_column] =
+            window_results[s][c];
       }
-      // We are committing to some errors that would affect the next round's
-      // syndrome measurements. Therefore, we need to modify some of the
-      // syndrome measurements for the next round to "back out" the errors
-      // that we already know about (or more specifically, the errors we think
-      // we've already accounted for).
-      for (std::size_t s = 0; s < this->rolling_window.size(); ++s) {
-        for (std::size_t c = 0; c < num_to_commit; ++c) {
-          if (rw_results[s].result[c + this_window_first_column]) {
-            // This bit is a 1, so we need to modify the syndrome measurements
-            // for the next window to account for this already-accounted-for
-            // error. We do this by flipping the bit in the syndrome
-            // measurements if the corresponding entry in the PCM is a 1.
-            auto *pcm_col = &full_pcm_T.at({c + this_window_first_column, 0});
-            for (auto r = syndrome_start_next_window;
-                 r <= syndrome_end_next_window; ++r) {
-              syndrome_mods[s][r] =
-                  syndrome_mods[s][r] ^ static_cast<bool>(pcm_col[r]);
-            }
+    }
+    // We are committing to some errors that would affect the next round's
+    // syndrome measurements. Therefore, we need to modify some of the
+    // syndrome measurements for the next round to "back out" the errors
+    // that we already know about (or more specifically, the errors we think
+    // we've already accounted for).
+    for (std::size_t s = 0; s < this->rolling_window.size(); ++s) {
+      for (std::size_t c = 0; c < num_to_commit; ++c) {
+        if (rw_results[s].result[c + this_window_first_column]) {
+          // This bit is a 1, so we need to modify the syndrome measurements
+          // for the next window to account for this already-accounted-for
+          // error. We do this by flipping the bit in the syndrome
+          // measurements if the corresponding entry in the PCM is a 1.
+          auto *pcm_col = &full_pcm_T.at({c + this_window_first_column, 0});
+          for (auto r = syndrome_start_next_window;
+               r <= syndrome_end_next_window; ++r) {
+            syndrome_mods[s][r] =
+                syndrome_mods[s][r] ^ static_cast<bool>(pcm_col[r]);
           }
         }
       }
-    } else {
-      // This is the last window. Append ALL of window_result to
-      // decoded_result.
-      auto this_window_first_column = first_columns[w];
-      auto num_to_commit = window_results[0].size();
-      CUDAQ_DBG("  Committing {} bits from window {}", num_to_commit, w);
-      for (std::size_t s = 0; s < this->rolling_window.size(); ++s) {
-        for (std::size_t c = 0; c < num_to_commit; ++c) {
-          rw_results[s].result[c + this_window_first_column] =
-              window_results[s][c];
-        }
+    }
+  } else {
+    // This is the last window. Commit ALL of window_results into
+    // rw_results.
+    auto this_window_first_column = first_columns[w];
+    auto num_to_commit = window_results[0].size();
+    CUDAQ_DBG("  Committing {} bits from window {}", num_to_commit, w);
+    for (std::size_t s = 0; s < this->rolling_window.size(); ++s) {
+      for (std::size_t c = 0; c < num_to_commit; ++c) {
+        rw_results[s].result[c + this_window_first_column] =
+            window_results[s][c];
       }
     }
-    auto t7 = std::chrono::high_resolution_clock::now();
-    window_proc_times.at(w) +=
-        std::chrono::duration<double>(t7 - t0).count() * 1000;
-    window_proc_times_arr[WindowProcTimes::INDEX_CALCULATION] =
-        std::chrono::duration<double>(t3 - t0).count() * 1000;
-    window_proc_times_arr[WindowProcTimes::MODIFY_SYNDROME_SLICE] =
-        std::chrono::duration<double>(t4 - t3).count() * 1000;
-    window_proc_times_arr[WindowProcTimes::INNER_DECODE] =
-        std::chrono::duration<double>(t5 - t4).count() * 1000;
-    window_proc_times_arr[WindowProcTimes::CONVERT_TO_HARD] =
-        std::chrono::duration<double>(t6 - t5).count() * 1000;
-    window_proc_times_arr[WindowProcTimes::COMMIT_TO_RESULT] =
-        std::chrono::duration<double>(t7 - t6).count() * 1000;
-    CUDAQ_INFO("Window {} time: {:.3f} ms (0:{:.3f}ms 1:{:.3f}ms 2:{:.3f}ms "
-               "3:{:.3f}ms 4:{:.3f}ms 5:{:.3f}ms 6:{:.3f}ms 7:{:.3f}ms)",
-               w, window_proc_times[w], window_proc_times_arr[0],
-               window_proc_times_arr[1], window_proc_times_arr[2],
-               window_proc_times_arr[3], window_proc_times_arr[4],
-               window_proc_times_arr[5], window_proc_times_arr[6],
-               window_proc_times_arr[7]);
   }
+  auto t7 = std::chrono::high_resolution_clock::now();
+  window_proc_times.at(w) +=
+      std::chrono::duration<double>(t7 - t0).count() * 1000;
+  window_proc_times_arr[WindowProcTimes::INDEX_CALCULATION] =
+      std::chrono::duration<double>(t3 - t0).count() * 1000;
+  window_proc_times_arr[WindowProcTimes::MODIFY_SYNDROME_SLICE] =
+      std::chrono::duration<double>(t4 - t3).count() * 1000;
+  window_proc_times_arr[WindowProcTimes::INNER_DECODE] =
+      std::chrono::duration<double>(t5 - t4).count() * 1000;
+  window_proc_times_arr[WindowProcTimes::CONVERT_TO_HARD] =
+      std::chrono::duration<double>(t6 - t5).count() * 1000;
+  window_proc_times_arr[WindowProcTimes::COMMIT_TO_RESULT] =
+      std::chrono::duration<double>(t7 - t6).count() * 1000;
+  CUDAQ_INFO("Window {} time: {:.3f} ms (0:{:.3f}ms 1:{:.3f}ms 2:{:.3f}ms "
+             "3:{:.3f}ms 4:{:.3f}ms 5:{:.3f}ms 6:{:.3f}ms 7:{:.3f}ms)",
+             w, window_proc_times[w], window_proc_times_arr[0],
+             window_proc_times_arr[1], window_proc_times_arr[2],
+             window_proc_times_arr[3], window_proc_times_arr[4],
+             window_proc_times_arr[5], window_proc_times_arr[6],
+             window_proc_times_arr[7]);
+}
 
-  virtual ~sliding_window() {}
+std::size_t sliding_window::get_num_syndromes_per_round() const {
+  return num_syndromes_per_round; // read from params in the constructor
+}
 
-  CUDAQ_EXTENSION_CUSTOM_CREATOR_FUNCTION(
-      sliding_window, static std::unique_ptr<decoder> create(
-                          const cudaqx::tensor<uint8_t> &H,
-                          const cudaqx::heterogeneous_map &params) {
-        return std::make_unique<sliding_window>(H, params);
-      })
-};
+sliding_window::~sliding_window() = default;
 
 CUDAQ_REGISTER_TYPE(sliding_window)
 
diff --git a/libs/qec/lib/decoders/sliding_window.h b/libs/qec/lib/decoders/sliding_window.h
new file mode 100644
index 00000000..6d5066f8
--- /dev/null
+++ b/libs/qec/lib/decoders/sliding_window.h
@@ -0,0 +1,149 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2025 NVIDIA Corporation & Affiliates.                         *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#pragma once
+
+#include "cudaq/qec/decoder.h"
+#include <vector>
+
+namespace cudaq::qec {
+
+/// @brief A sliding window decoder that processes syndromes in overlapping windows
+/// 
+/// This decoder divides the syndrome stream into overlapping windows and decodes each
+/// window independently using an inner decoder. It's designed for low-latency decoding
+/// of streaming syndrome data.
+class sliding_window : public decoder {
+private:
+  // --- Input parameters ---
+
+  /// The number of rounds of syndrome data in each window.
+  std::size_t window_size = 1;
+  /// The number of rounds to advance the window by each time.
+  std::size_t step_size = 1;
+  /// The number of syndromes per round.
+  std::size_t num_syndromes_per_round = 0;
+  /// When forming a window, should error mechanisms that span the start round
+  /// and any preceding rounds be included?
+  bool straddle_start_round = false;
+  /// When forming a window, should error mechanisms that span the end round and
+  /// any subsequent rounds be included?
+  bool straddle_end_round = true;
+  /// The vector of error rates for the error mechanisms.
+  std::vector<cudaq::qec::float_t> error_rate_vec;
+  /// The name of the inner decoder to use.
+  std::string inner_decoder_name;
+  /// The parameters to pass to the inner decoder.
+  cudaqx::heterogeneous_map inner_decoder_params;
+
+  // Derived parameters.
+  std::size_t num_windows = 0;
+  std::size_t num_rounds = 0;
+  std::size_t num_syndromes_per_window = 0;
+  std::size_t num_rounds_since_last_decode = 0;
+  std::vector<std::unique_ptr<decoder>> inner_decoders;
+  std::vector<std::size_t> first_columns;
+  cudaqx::tensor<std::uint8_t> full_pcm;
+  cudaqx::tensor<std::uint8_t> full_pcm_T;
+
+  // Indices into window_proc_times_arr, one per timed processing stage.
+  enum WindowProcTimes {
+    INITIALIZE_WINDOW,     // 0
+    SLIDE_WINDOW,          // 1
+    COPY_DATA,             // 2
+    INDEX_CALCULATION,     // 3
+    MODIFY_SYNDROME_SLICE, // 4
+    INNER_DECODE,          // 5
+    CONVERT_TO_HARD,       // 6
+    COMMIT_TO_RESULT,      // 7
+    NUM_WINDOW_PROC_TIMES  // 8
+  };
+
+  // State data
+  std::vector<std::vector<cudaq::qec::float_t>>
+      rolling_window; // [batch_size, num_syndromes_per_window]
+  // rolling window read and write indices (circular buffer)
+  std::size_t rw_next_write_index = 0; // [0, num_syndromes_per_window)
+  std::size_t rw_next_read_index = 0;  // [0, num_syndromes_per_window)
+  std::size_t rw_filled = 0;           // [0, num_syndromes_per_window]
+  std::size_t num_windows_decoded = 0;
+  std::vector<std::vector<bool>> syndrome_mods; // [batch_size, syndrome_size]
+  std::vector<decoder_result> rw_results;       // [batch_size]
+  std::vector<double> window_proc_times;
+  std::array<double, WindowProcTimes::NUM_WINDOW_PROC_TIMES>
+      window_proc_times_arr = {};
+
+  /// @brief Validate constructor inputs
+  void validate_inputs();
+  
+  /// @brief Initialize the window
+  /// @param num_syndromes The number of syndromes to initialize the window for
+  void initialize_window(std::size_t num_syndromes);
+  
+  /// @brief Add a single syndrome to the rolling window (circular buffer)
+  void add_syndrome_to_rolling_window(const std::vector<float_t> &syndrome,
+                                       std::size_t syndrome_index,
+                                       bool update_next_write_index = true);
+  
+  /// @brief Add a batch of syndromes to the rolling window (circular buffer)
+  void add_syndromes_to_rolling_window(
+      const std::vector<std::vector<float_t>> &syndromes);
+  
+  /// @brief Get a single syndrome from the rolling window (unwrapping circular buffer)
+  std::vector<float_t> get_syndrome_from_rolling_window(std::size_t syndrome_index);
+  
+  /// @brief Get a batch of syndromes from the rolling window (unwrapping circular buffer)
+  std::vector<std::vector<float_t>> get_syndromes_from_rolling_window();
+  
+  /// @brief Update the read index for the rolling window
+  void update_rw_next_read_index();
+  
+  /// @brief Decode a single window (internal helper)
+  void decode_window();
+
+public:
+  /// @brief Constructor
+  /// @param H The full parity check matrix for all rounds
+  /// @param params A heterogeneous map containing required parameters:
+  ///   - window_size: Size of each decoding window (in rounds)
+  ///   - step_size: Step size between consecutive windows (in rounds)
+  ///   - error_rate_vec: Error rate of each error mechanism (PCM column)
+  ///   - num_syndromes_per_round: Number of syndromes per round
+  ///   - inner_decoder_name: Name of the inner decoder to use
+  ///   - inner_decoder_params: Parameters for the inner decoder (optional)
+  sliding_window(const cudaqx::tensor<uint8_t> &H,
+                 const cudaqx::heterogeneous_map &params);
+
+  /// @brief Decode a syndrome vector
+  /// @param syndrome The syndrome measurements to decode
+  /// @return The decoded error correction
+  decoder_result decode(const std::vector<float_t> &syndrome) override;
+
+  /// @brief Decode multiple syndromes in batch
+  /// @param syndromes Multiple syndrome measurements to decode
+  /// @return The decoded error corrections
+  std::vector<decoder_result> decode_batch(const std::vector<std::vector<float_t>> &syndromes) override;
+
+  /// @brief Get the number of syndromes per round
+  /// @return The number of syndromes measured in each round
+  std::size_t get_num_syndromes_per_round() const;
+
+  /// @brief Destructor
+  virtual ~sliding_window();
+
+  // Plugin registration macros
+  CUDAQ_EXTENSION_CUSTOM_CREATOR_FUNCTION(
+      sliding_window, static std::unique_ptr<decoder> create(
+                          const cudaqx::tensor<uint8_t> &H,
+                          const cudaqx::heterogeneous_map &params) {
+        return std::make_unique<sliding_window>(H, params);
+      })
+};
+
+} // namespace cudaq::qec
+

From a83e424f0a967063fa21510ff4944e5aa98deba0 Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Wed, 12 Nov 2025 15:15:28 +0000
Subject: [PATCH 08/11] Formatting

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 libs/qec/lib/decoders/sliding_window.h | 42 ++++++++++++++------------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/libs/qec/lib/decoders/sliding_window.h b/libs/qec/lib/decoders/sliding_window.h
index 6d5066f8..74513c8e 100644
--- a/libs/qec/lib/decoders/sliding_window.h
+++ b/libs/qec/lib/decoders/sliding_window.h
@@ -13,11 +13,12 @@
 
 namespace cudaq::qec {
 
-/// @brief A sliding window decoder that processes syndromes in overlapping windows
-/// 
-/// This decoder divides the syndrome stream into overlapping windows and decodes each
-/// window independently using an inner decoder. It's designed for low-latency decoding
-/// of streaming syndrome data.
+/// @brief A sliding window decoder that processes syndromes in overlapping
+/// windows
+///
+/// This decoder divides the syndrome stream into overlapping windows and
+/// decodes each window independently using an inner decoder. It's designed for
+/// low-latency decoding of streaming syndrome data.
 class sliding_window : public decoder {
 private:
   // --- Input parameters ---
@@ -80,29 +81,32 @@ class sliding_window : public decoder {
 
   /// @brief Validate constructor inputs
   void validate_inputs();
-  
+
   /// @brief Initialize the window
   /// @param num_syndromes The number of syndromes to initialize the window for
   void initialize_window(std::size_t num_syndromes);
-  
+
   /// @brief Add a single syndrome to the rolling window (circular buffer)
   void add_syndrome_to_rolling_window(const std::vector<float_t> &syndrome,
-                                       std::size_t syndrome_index,
-                                       bool update_next_write_index = true);
-  
+                                      std::size_t syndrome_index,
+                                      bool update_next_write_index = true);
+
   /// @brief Add a batch of syndromes to the rolling window (circular buffer)
   void add_syndromes_to_rolling_window(
       const std::vector<std::vector<float_t>> &syndromes);
-  
-  /// @brief Get a single syndrome from the rolling window (unwrapping circular buffer)
-  std::vector<float_t> get_syndrome_from_rolling_window(std::size_t syndrome_index);
-  
-  /// @brief Get a batch of syndromes from the rolling window (unwrapping circular buffer)
+
+  /// @brief Get a single syndrome from the rolling window (unwrapping circular
+  /// buffer)
+  std::vector<float_t>
+  get_syndrome_from_rolling_window(std::size_t syndrome_index);
+
+  /// @brief Get a batch of syndromes from the rolling window (unwrapping
+  /// circular buffer)
   std::vector<std::vector<float_t>> get_syndromes_from_rolling_window();
-  
+
   /// @brief Update the read index for the rolling window
   void update_rw_next_read_index();
-  
+
   /// @brief Decode a single window (internal helper)
   void decode_window();
 
@@ -127,7 +131,8 @@ class sliding_window : public decoder {
   /// @brief Decode multiple syndromes in batch
   /// @param syndromes Multiple syndrome measurements to decode
   /// @return The decoded error corrections
-  std::vector<decoder_result> decode_batch(const std::vector<std::vector<float_t>> &syndromes) override;
+  std::vector<decoder_result>
+  decode_batch(const std::vector<std::vector<float_t>> &syndromes) override;
 
   /// @brief Get the number of syndromes per round
   /// @return The number of syndromes measured in each round
@@ -146,4 +151,3 @@ class sliding_window : public decoder {
 };
 
 } // namespace cudaq::qec
-

From fb198e1bb562616a6bd0749b4c46c7c33c60c956 Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Wed, 12 Nov 2025 16:26:55 +0000
Subject: [PATCH 09/11] Hoist loop-invariant index; drop unused decoder_window arg

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 libs/qec/lib/decoder.cpp                                    | 4 ++--
 libs/qec/unittests/realtime/app_examples/surface_code-1.cpp | 6 ++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp
index 98200f7c..9c51f903 100644
--- a/libs/qec/lib/decoder.cpp
+++ b/libs/qec/lib/decoder.cpp
@@ -300,9 +300,9 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome,
       } else {
         // Buffer is full with 2 rounds: compute timelike detectors (XOR of two
         // rounds)
+        std::size_t index =
+            (pimpl->current_round - 2) * pimpl->num_syndromes_per_round;
         for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) {
-          std::size_t index =
-              (pimpl->current_round - 2) * pimpl->num_syndromes_per_round;
           pimpl->persistent_detector_buffer[i] =
               pimpl->msyn_buffer[index + i] ^
               pimpl->msyn_buffer[index + i + pimpl->num_syndromes_per_round];
diff --git a/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp b/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp
index 8005c990..71d132fe 100644
--- a/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp
+++ b/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp
@@ -33,8 +33,7 @@
 void save_dem_to_file(const cudaq::qec::detector_error_model &dem,
                       std::string dem_filename, uint64_t numSyndromesPerRound,
                       uint64_t numLogical, const std::string &decoder_type,
-                      int decoder_window, int sw_window_size,
-                      int sw_step_size) {
+                      int sw_window_size, int sw_step_size) {
   cudaq::qec::decoding::config::multi_decoder_config multi_config;
   for (uint64_t i = 0; i < numLogical; i++) {
     // We actually send 1 additional round in this example, so add 1.
@@ -558,8 +557,7 @@ void demo_circuit_host(const cudaq::qec::code &code, int distance,
 
     if (save_dem) {
       save_dem_to_file(dem, dem_filename, numSyndromesPerRound, numLogical,
-                       decoder_type, decoder_window, sw_window_size,
-                       sw_step_size);
+                       decoder_type, sw_window_size, sw_step_size);
       return;
     }
   }

From 9bb39a4eb51fb16f096e3f9a03a27c285afb9704 Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Thu, 13 Nov 2025 12:45:47 +0000
Subject: [PATCH 10/11] Created common function for set_D_sparse

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 libs/qec/lib/decoder.cpp | 34 +++++++++++-----------------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp
index 43021613..7b03138c 100644
--- a/libs/qec/lib/decoder.cpp
+++ b/libs/qec/lib/decoder.cpp
@@ -190,9 +190,11 @@ void decoder::set_decoder_id(uint32_t decoder_id) {
 
 uint32_t decoder::get_decoder_id() const { return pimpl->decoder_id; }
 
-void decoder::set_D_sparse(const std::vector<std::vector<uint32_t>> &D_sparse) {
-  this->D_sparse = D_sparse;
-  auto *sw_decoder = dynamic_cast<sliding_window *>(this);
+template <typename PimplType>
+void set_D_sparse_common(decoder *decoder,
+                         const std::vector<std::vector<uint32_t>> &D_sparse,
+                         PimplType *pimpl) {
+  auto *sw_decoder = dynamic_cast<sliding_window *>(decoder);
 
   if (sw_decoder != nullptr) {
     pimpl->is_sliding_window = true;
@@ -215,28 +217,14 @@ void decoder::set_D_sparse(const std::vector<std::vector<uint32_t>> &D_sparse) {
   pimpl->msyn_buffer_index = 0;
 }
 
+void decoder::set_D_sparse(const std::vector<std::vector<uint32_t>> &D_sparse) {
+  this->D_sparse = D_sparse;
+  set_D_sparse_common(this, D_sparse, pimpl.get());
+}
+
 void decoder::set_D_sparse(const std::vector<int64_t> &D_sparse_vec_in) {
   set_sparse_from_vec(D_sparse_vec_in, this->D_sparse);
-  auto *sw_decoder = dynamic_cast<sliding_window *>(this);
-
-  if (sw_decoder != nullptr) {
-    pimpl->is_sliding_window = true;
-    pimpl->num_syndromes_per_round = sw_decoder->get_num_syndromes_per_round();
-    // Check if first row is a first-round detector (single syndrome index)
-    pimpl->has_first_round_detectors =
-        (this->D_sparse.size() > 0 && this->D_sparse[0].size() == 1);
-    pimpl->current_round = 0;
-    pimpl->persistent_detector_buffer.resize(pimpl->num_syndromes_per_round);
-    pimpl->persistent_soft_detector_buffer.resize(
-        pimpl->num_syndromes_per_round);
-  } else {
-    pimpl->is_sliding_window = false;
-  }
-
-  pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(this->D_sparse);
-  pimpl->msyn_buffer.clear();
-  pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode);
-  pimpl->msyn_buffer_index = 0;
+  set_D_sparse_common(this, this->D_sparse, pimpl.get());
 }
 
 bool decoder::enqueue_syndrome(const uint8_t *syndrome,

From d7dab65630ce2133befe926c4a166e72af60c54a Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Mon, 17 Nov 2025 14:33:32 +0000
Subject: [PATCH 11/11] Make sliding_window.cpp diffs easier for git diff views

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 libs/qec/lib/decoders/sliding_window.cpp | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/libs/qec/lib/decoders/sliding_window.cpp b/libs/qec/lib/decoders/sliding_window.cpp
index cb875c73..979227a3 100644
--- a/libs/qec/lib/decoders/sliding_window.cpp
+++ b/libs/qec/lib/decoders/sliding_window.cpp
@@ -14,11 +14,6 @@
 
 namespace cudaq::qec {
 
-// ============================================================================
-// Private helper method implementations
-// ============================================================================
-
-/// Helper function to validate constructor inputs.
 void sliding_window::validate_inputs() {
   if (window_size < 1 || window_size > num_rounds) {
     throw std::invalid_argument(
@@ -180,11 +175,6 @@ void sliding_window::update_rw_next_read_index() {
     rw_next_read_index -= num_syndromes_per_window;
 }
 
-// ============================================================================
-// Public method implementations
-// ============================================================================
-
-/// Constructor for the sliding window decoder.
 sliding_window::sliding_window(const cudaqx::tensor<uint8_t> &H,
                                const cudaqx::heterogeneous_map &params)
     : decoder(H), full_pcm(H) {
@@ -237,7 +227,6 @@ sliding_window::sliding_window(const cudaqx::tensor<uint8_t> &H,
   }
 }
 
-/// Decode a syndrome vector (either full block or single round).
 decoder_result sliding_window::decode(const std::vector<float_t> &syndrome) {
   if (syndrome.size() == this->syndrome_size) {
     auto t0 = std::chrono::high_resolution_clock::now();
@@ -301,7 +290,6 @@ decoder_result sliding_window::decode(const std::vector<float_t> &syndrome) {
   return decoder_result(); // empty return value
 }
 
-/// Decode a batch of syndrome vectors.
 std::vector<decoder_result> sliding_window::decode_batch(
     const std::vector<std::vector<float_t>> &syndromes) {
   if (syndromes[0].size() == this->syndrome_size) {
@@ -483,12 +471,12 @@ void sliding_window::decode_window() {
              window_proc_times_arr[7]);
 }
 
+sliding_window::~sliding_window() {}
+
 std::size_t sliding_window::get_num_syndromes_per_round() const {
   return num_syndromes_per_round;
 }
 
-sliding_window::~sliding_window() {}
-
 CUDAQ_REGISTER_TYPE(sliding_window)
 
 } // namespace cudaq::qec