From 26f5f50bb01f0214dd65a8c15b1103d9862cc107 Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Thu, 6 Nov 2025 20:55:12 +0000 Subject: [PATCH 01/11] Added ability for surface code to run sliding_window. First attempt at overhaul of decoder::enqueue_syndrome Signed-off-by: Chuck Ketcham --- libs/qec/lib/decoder.cpp | 167 +++++++++++++++--- .../realtime/app_examples/surface_code-1.cpp | 129 +++++++++++--- 2 files changed, 245 insertions(+), 51 deletions(-) diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp index 73a38707..f6093d9d 100644 --- a/libs/qec/lib/decoder.cpp +++ b/libs/qec/lib/decoder.cpp @@ -23,16 +23,35 @@ INSTANTIATE_REGISTRY(cudaq::qec::decoder, const cudaqx::tensor &, namespace cudaq::qec { struct decoder::rt_impl { - /// The number of measurement syndromes to be decoded per decode call (i.e. - /// the number of columns in the D_sparse matrix) + /// The number of syndromes per round (enables incremental detector computation) + uint32_t num_syndromes_per_round = 0; + + /// The number of measurement syndromes to be decoded per decode call + /// (for incremental mode: one round; for batch mode: full D_sparse columns) uint32_t num_msyn_per_decode = 0; - /// The index of the next syndrome to be written in the msyn_buffer - uint32_t msyn_buffer_index = 0; + /// Counter of total syndromes buffered but not yet processed. + /// Used to detect complete rounds (when this is a multiple of num_msyn_per_decode). + /// Gets decremented after each round is decoded. Not a direct buffer index. + uint32_t num_syndromes_buffered_but_not_decoded = 0; - /// The buffer of measurement syndromes received from the client. Length is - /// num_msyn_per_decode. + /// The buffer of measurement syndromes received from the client. + /// For incremental mode: size is calculated from max D_sparse column + 1 + /// This allows buffering multiple rounds while still decoding incrementally std::vector msyn_buffer; + + /// Total buffer capacity (max column index in D_sparse + 1) + uint32_t buffer_capacity = 0; + + /// Track which round we're on (0 = reference round) + uint32_t current_round = 0; + + /// Circular buffer write position for the current round + // Values are 0, num_msyn_per_decode * 2, num_msyn_per_decode * 3, etc. then wrap around to 0. + uint32_t current_round_buffer_offset = 0; + + /// Circular buffer position of the previous round (for incremental XOR) + uint32_t prev_round_buffer_offset = 0; /// The current observable corrections. The length of this vector is the /// number of rows in the O_sparse matrix. @@ -174,33 +193,111 @@ uint32_t decoder::get_decoder_id() const { return pimpl->decoder_id; } void decoder::set_D_sparse(const std::vector> &D_sparse) { this->D_sparse = D_sparse; - pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(D_sparse); + + // Infer num_syndromes_per_round from D_sparse timelike structure + // For timelike detectors, consecutive detectors XOR syndromes from consecutive rounds + // e.g., detector[0] = [0, 24], detector[1] = [1, 25], so num_syndromes_per_round = 24 + if (D_sparse.size() >= 2 && D_sparse[0].size() >= 2 && D_sparse[1].size() >= 2) { + pimpl->num_syndromes_per_round = D_sparse[1][0] - D_sparse[0][0]; + } else { + // Fallback: assume 1:1 mapping + pimpl->num_syndromes_per_round = 1; + } + + // Calculate minimum buffer capacity from max column in D_sparse + uint32_t min_capacity = calculate_num_msyn_per_decode(D_sparse); + + // Enable incremental mode: process one round at a time + pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round; + + // Add one extra round to buffer capacity to guarantee no wraparound within operations + // This eliminates all wraparound checks in hot loops (write and detector computation) + pimpl->buffer_capacity = min_capacity + pimpl->num_syndromes_per_round; + + // Allocate buffer to hold all syndromes plus extra round pimpl->msyn_buffer.clear(); - pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode); - pimpl->msyn_buffer_index = 0; + pimpl->msyn_buffer.resize(pimpl->buffer_capacity); + + pimpl->num_syndromes_buffered_but_not_decoded = 0; + pimpl->current_round = 0; + pimpl->current_round_buffer_offset = 0; + pimpl->prev_round_buffer_offset = 0; } void decoder::set_D_sparse(const std::vector &D_sparse_vec_in) { set_sparse_from_vec(D_sparse_vec_in, this->D_sparse); - pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(D_sparse); + + // Infer num_syndromes_per_round from D_sparse timelike structure + // For timelike detectors, consecutive detectors XOR syndromes from consecutive rounds + // e.g., detector[0] = [0, 24], detector[1] = [1, 25], so num_syndromes_per_round = 24 + if (D_sparse.size() >= 2 && D_sparse[0].size() >= 2 && D_sparse[1].size() >= 2) { + pimpl->num_syndromes_per_round = D_sparse[1][0] - D_sparse[0][0]; + } else { + // Fallback: assume 1:1 mapping + pimpl->num_syndromes_per_round = 1; + } + + // Calculate minimum buffer capacity from max column in D_sparse + uint32_t min_capacity = calculate_num_msyn_per_decode(D_sparse); + + // Enable incremental mode: process one round at a time + pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round; + + // Add one extra round to buffer capacity to guarantee no wraparound within operations + // This eliminates all wraparound checks in hot loops (write and detector computation) + pimpl->buffer_capacity = min_capacity + pimpl->num_syndromes_per_round; + + // Allocate buffer to hold all syndromes plus extra round pimpl->msyn_buffer.clear(); - pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode); - pimpl->msyn_buffer_index = 0; + pimpl->msyn_buffer.resize(pimpl->buffer_capacity); + + pimpl->num_syndromes_buffered_but_not_decoded = 0; + pimpl->current_round = 0; + pimpl->current_round_buffer_offset = 0; + pimpl->prev_round_buffer_offset = 0; } bool decoder::enqueue_syndrome(const uint8_t *syndrome, std::size_t syndrome_length) { - if (pimpl->msyn_buffer_index + syndrome_length > pimpl->msyn_buffer.size()) { - // CUDAQ_WARN("Syndrome buffer overflow. Syndrome will be ignored."); - printf("Syndrome buffer overflow. Syndrome will be ignored.\n"); + // position_in_round represents how many syndromes of the current round have already been buffered but not yet decoded + // Values range from 0 to num_msyn_per_decode - 1. + uint32_t position_in_round = pimpl->num_syndromes_buffered_but_not_decoded % pimpl->num_msyn_per_decode; + + // Check if this write would overwrite the previous round + // We need to preserve prev_round_buffer for XOR computation, so the maximum + // safe write from the start of the current round is buffer_capacity minus one round + uint32_t max_safe_from_round_start = pimpl->buffer_capacity - pimpl->num_syndromes_per_round; + if (position_in_round + syndrome_length > max_safe_from_round_start) { + // CUDAQ_WARN("Syndrome data too large - would overwrite previous round. Data will be ignored."); + printf("Syndrome data too large - would overwrite previous round. Data will be ignored.\n"); return false; } bool did_decode = false; + // Buffer the incoming syndromes + // No wraparound check needed: buffer is sized to guarantee operations never wrap mid-execution + uint32_t write_start = pimpl->current_round_buffer_offset + position_in_round; for (std::size_t i = 0; i < syndrome_length; i++) { - pimpl->msyn_buffer[pimpl->msyn_buffer_index] = syndrome[i]; - pimpl->msyn_buffer_index++; + pimpl->msyn_buffer[write_start + i] = syndrome[i]; } - if (pimpl->msyn_buffer_index == pimpl->msyn_buffer.size()) { + pimpl->num_syndromes_buffered_but_not_decoded += syndrome_length; + + // Process all complete rounds that are now available + while ((pimpl->num_syndromes_buffered_but_not_decoded % pimpl->num_msyn_per_decode) == 0 && + pimpl->num_syndromes_buffered_but_not_decoded > 0) { + pimpl->current_round++; + + // First round (round 1): store as reference, don't decode yet + if (pimpl->current_round == 1) { + // Previous round stays at current position for next round's XOR + pimpl->prev_round_buffer_offset = pimpl->current_round_buffer_offset; + // Advance to next round position in circular buffer + pimpl->current_round_buffer_offset += pimpl->num_msyn_per_decode; + if (pimpl->current_round_buffer_offset >= pimpl->buffer_capacity) + pimpl->current_round_buffer_offset -= pimpl->buffer_capacity; + pimpl->num_syndromes_buffered_but_not_decoded -= pimpl->num_msyn_per_decode; // Decrement for next iteration + continue; // Skip to next round + } + // These are just for logging. They are initialized in such a way to avoid // dynamic memory allocation if logging is disabled. std::vector log_msyn; @@ -222,12 +319,20 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome, log_observable_corrections.resize(O_sparse.size()); } - // Decode now. - for (std::size_t i = 0; i < this->D_sparse.size(); i++) { - pimpl->persistent_detector_buffer[i] = 0; - for (auto col : this->D_sparse[i]) - pimpl->persistent_detector_buffer[i] ^= pimpl->msyn_buffer[col]; + // Compute detectors incrementally by XORing current round with previous round + // Using circular buffer offsets - no D_sparse access needed + // No wraparound checks needed: buffer is sized to guarantee operations never wrap mid-execution + for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) { + pimpl->persistent_detector_buffer[i] = + pimpl->msyn_buffer[pimpl->prev_round_buffer_offset + i] ^ + pimpl->msyn_buffer[pimpl->current_round_buffer_offset + i]; } + + // Update offsets for next round: current becomes previous, advance current + pimpl->prev_round_buffer_offset = pimpl->current_round_buffer_offset; + pimpl->current_round_buffer_offset += pimpl->num_msyn_per_decode; + if (pimpl->current_round_buffer_offset >= pimpl->buffer_capacity) + pimpl->current_round_buffer_offset -= pimpl->buffer_capacity; if (should_log) { log_msyn.reserve(pimpl->msyn_buffer.size()); for (std::size_t d = 0, D = pimpl->msyn_buffer.size(); d < D; d++) { @@ -295,9 +400,11 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome, printf("%s\n", s.c_str()); } did_decode = true; - // Prepare for more data. - pimpl->msyn_buffer_index = 0; + + // Decrement counter for next iteration of while loop + pimpl->num_syndromes_buffered_but_not_decoded -= pimpl->num_msyn_per_decode; } + return did_decode; } @@ -344,9 +451,15 @@ std::size_t decoder::get_num_observables() const { return O_sparse.size(); } void decoder::reset_decoder() { // Zero out all data that is considered "per-shot" memory. - pimpl->msyn_buffer_index = 0; + pimpl->num_syndromes_buffered_but_not_decoded = 0; pimpl->msyn_buffer.clear(); - pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode); + pimpl->msyn_buffer.resize(pimpl->buffer_capacity); + + // Reset incremental computation state + pimpl->current_round = 0; + pimpl->current_round_buffer_offset = 0; + pimpl->prev_round_buffer_offset = 0; + pimpl->corrections.clear(); pimpl->corrections.resize(O_sparse.size()); const bool log_due_to_log_level = diff --git a/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp b/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp index 61a7aef5..1d34aaa5 100644 --- a/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp +++ b/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp @@ -32,14 +32,16 @@ void save_dem_to_file(const cudaq::qec::detector_error_model &dem, std::string dem_filename, uint64_t numSyndromesPerRound, - uint64_t numLogical) { + uint64_t numLogical, const std::string &decoder_type, + int decoder_window, int sw_window_size, + int sw_step_size) { cudaq::qec::decoding::config::multi_decoder_config multi_config; for (uint64_t i = 0; i < numLogical; i++) { // We actually send 1 additional round in this example, so add 1. auto numRounds = dem.num_detectors() / numSyndromesPerRound + 1; cudaq::qec::decoding::config::decoder_config config; config.id = i; - config.type = "nv-qldpc-decoder"; + config.type = decoder_type; // Use parameter instead of hardcoded config.block_size = dem.num_error_mechanisms(); config.syndrome_size = dem.num_detectors(); config.num_syndromes_per_round = numSyndromesPerRound; @@ -48,24 +50,53 @@ void save_dem_to_file(const cudaq::qec::detector_error_model &dem, cudaq::qec::pcm_to_sparse_vec(dem.observables_flips_matrix); config.D_sparse = cudaq::qec::generate_timelike_sparse_detector_matrix( numSyndromesPerRound, numRounds, /*include_first_round=*/false); - config.decoder_custom_args = - cudaq::qec::decoding::config::nv_qldpc_decoder_config(); - auto &nv_config = - std::get( - config.decoder_custom_args); - nv_config.use_sparsity = true; - nv_config.error_rate_vec = dem.error_rates; - nv_config.use_osd = true; - nv_config.max_iterations = 50; - nv_config.osd_order = 60; - nv_config.osd_method = 3; + + if (decoder_type == "nv-qldpc-decoder") { + // Original NV-QLDPC configuration + config.decoder_custom_args = + cudaq::qec::decoding::config::nv_qldpc_decoder_config(); + auto &nv_config = + std::get( + config.decoder_custom_args); + nv_config.use_sparsity = true; + nv_config.error_rate_vec = dem.error_rates; + nv_config.use_osd = true; + nv_config.max_iterations = 50; + nv_config.osd_order = 60; + nv_config.osd_method = 3; + + } else if (decoder_type == "sliding_window") { + // Sliding window configuration + cudaq::qec::decoding::config::sliding_window_config sw_config; + sw_config.window_size = sw_window_size; + sw_config.step_size = sw_step_size; + sw_config.num_syndromes_per_round = numSyndromesPerRound; + sw_config.straddle_start_round = false; + sw_config.straddle_end_round = true; + sw_config.inner_decoder_name = "nv-qldpc-decoder"; + sw_config.error_rate_vec = dem.error_rates; // Required by sliding_window + + // Configure inner NV-QLDPC decoder + cudaq::qec::decoding::config::nv_qldpc_decoder_config nv_config; + nv_config.use_sparsity = true; + nv_config.error_rate_vec = dem.error_rates; + nv_config.use_osd = true; + nv_config.max_iterations = 50; + nv_config.osd_order = 60; + nv_config.osd_method = 3; + + sw_config.nv_qldpc_decoder_params = nv_config; + config.decoder_custom_args = sw_config; + } + multi_config.decoders.push_back(config); } std::string config_str = multi_config.to_yaml_str(200); std::ofstream config_file(dem_filename); config_file << config_str; config_file.close(); - printf("Saved config to file: %s\n", dem_filename.c_str()); + printf("Saved %s config to file: %s\n", decoder_type.c_str(), + dem_filename.c_str()); return; } @@ -82,14 +113,31 @@ void load_dem_from_file(const std::string &dem_filename, cudaq::qec::decoding::config::multi_decoder_config::from_yaml_str( dem_str); if (numLogical != config.decoders.size()) { - printf("ERROR: numLogical [%ld] !- config.decoders.size() [%ld]\n", + printf("ERROR: numLogical [%ld] != config.decoders.size() [%ld]\n", numLogical, config.decoders.size()); exit(1); } auto decoder_config = config.decoders[0]; - auto nv_qldpc_config = - std::get( - decoder_config.decoder_custom_args); + + // Extract error rates based on decoder type + std::vector error_rates; + + if (decoder_config.type == "nv-qldpc-decoder") { + auto nv_config = + std::get( + decoder_config.decoder_custom_args); + error_rates = nv_config.error_rate_vec.value(); + + } else if (decoder_config.type == "sliding_window") { + auto sw_config = + std::get( + decoder_config.decoder_custom_args); + // Extract from top-level error_rate_vec (required for sliding_window) + if (!sw_config.error_rate_vec.empty()) { + error_rates = sw_config.error_rate_vec; + } + } + dem.detector_error_matrix = cudaq::qec::pcm_from_sparse_vec( decoder_config.H_sparse, decoder_config.syndrome_size, decoder_config.block_size); @@ -99,10 +147,11 @@ void load_dem_from_file(const std::string &dem_filename, decoder_config.O_sparse.end(), -1); dem.observables_flips_matrix = cudaq::qec::pcm_from_sparse_vec( decoder_config.O_sparse, num_observables, decoder_config.block_size); - dem.error_rates = nv_qldpc_config.error_rate_vec.value(); - printf("Loaded dem from file: %s\n", dem_filename.c_str()); + dem.error_rates = error_rates; + printf("Loaded %s config from file: %s\n", decoder_config.type.c_str(), + dem_filename.c_str()); - // Now configure the decoders + // Now configure the decoders (works for both types) cudaq::qec::decoding::config::configure_decoders(config); } @@ -376,7 +425,9 @@ void demo_circuit_host(const cudaq::qec::code &code, int distance, double p_spam, cudaq::qec::operation statePrep, std::size_t numShots, std::size_t numRounds, std::size_t numLogical, std::string dem_filename, - bool save_dem, bool load_dem, int decoder_window) { + bool save_dem, bool load_dem, int decoder_window, + const std::string &decoder_type, int sw_window_size, + int sw_step_size) { if (!code.contains_operation(statePrep)) throw std::runtime_error( "sample_memory_circuit_error - requested state prep kernel not found."); @@ -532,7 +583,9 @@ void demo_circuit_host(const cudaq::qec::code &code, int distance, dem.observables_flips_matrix.dump_bits(); if (save_dem) { - save_dem_to_file(dem, dem_filename, numSyndromesPerRound, numLogical); + save_dem_to_file(dem, dem_filename, numSyndromesPerRound, numLogical, + decoder_type, decoder_window, sw_window_size, + sw_step_size); return; } } @@ -603,6 +656,11 @@ void show_help() { "distance\n"); printf(" --decoder_window Number of rounds to use for the decoder " "window. Default: distance\n"); + printf(" --decoder_type Decoder type: 'nv-qldpc-decoder' or " + "'sliding_window'. Default: nv-qldpc-decoder\n"); + printf(" --sw_window_size Sliding window size (only for " + "sliding_window decoder). Default: decoder_window\n"); + printf(" --sw_step_size Sliding window step size. Default: 1\n"); printf(" --save_dem Save the detector error model to a file.\n"); printf(" --load_dem Load the detector error model from a file. " "(Cannot be used with --save_dem)\n"); @@ -619,6 +677,11 @@ int main(int argc, char **argv) { bool save_dem = false; bool load_dem = false; std::string dem_filename; + + // Decoder type selection + std::string decoder_type = "nv-qldpc-decoder"; // Default + int sw_window_size = -1; // For sliding_window, default to decoder_window + int sw_step_size = 1; // For sliding_window // Parse the command line arguments for (int i = 1; i < argc; i++) { @@ -644,6 +707,15 @@ int main(int argc, char **argv) { } else if (arg == "--decoder_window") { decoder_window = std::stoi(argv[i + 1]); i++; + } else if (arg == "--decoder_type") { + decoder_type = argv[i + 1]; + i++; + } else if (arg == "--sw_window_size") { + sw_window_size = std::stoi(argv[i + 1]); + i++; + } else if (arg == "--sw_step_size") { + sw_step_size = std::stoi(argv[i + 1]); + i++; } else if (arg == "--save_dem") { save_dem = true; dem_filename = argv[i + 1]; @@ -671,6 +743,14 @@ int main(int argc, char **argv) { num_rounds = distance; if (decoder_window == -1) decoder_window = distance; + if (sw_window_size == -1) + sw_window_size = decoder_window; + + // Validate decoder type + if (decoder_type != "nv-qldpc-decoder" && decoder_type != "sliding_window") { + printf("Error: --decoder_type must be 'nv-qldpc-decoder' or 'sliding_window'\n"); + return 1; + } // Validate that num_rounds >= distance if (num_rounds < distance || num_rounds % distance != 0) { @@ -721,7 +801,8 @@ int main(int argc, char **argv) { demo_circuit_host(*code, distance, p_spam, cudaq::qec::operation::prep0, num_shots, num_rounds, num_logical, dem_filename, save_dem, - load_dem, decoder_window); + load_dem, decoder_window, decoder_type, sw_window_size, + sw_step_size); // Ensure clean shutdown cudaq::qec::decoding::config::finalize_decoders(); From 5e12fd88def5281dd500f1f61f0dc837437892df Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Thu, 6 Nov 2025 22:06:36 +0000 Subject: [PATCH 02/11] Bug fix + formatting Signed-off-by: Chuck Ketcham --- libs/qec/lib/decoder.cpp | 131 ++++++++++-------- .../realtime/app_examples/surface_code-1.cpp | 35 ++--- 2 files changed, 92 insertions(+), 74 deletions(-) diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp index f6093d9d..90bf0b7e 100644 --- a/libs/qec/lib/decoder.cpp +++ b/libs/qec/lib/decoder.cpp @@ -23,7 +23,8 @@ INSTANTIATE_REGISTRY(cudaq::qec::decoder, const cudaqx::tensor &, namespace cudaq::qec { struct decoder::rt_impl { - /// The number of syndromes per round (enables incremental detector computation) + /// The number of syndromes per round (enables incremental detector + /// computation) uint32_t num_syndromes_per_round = 0; /// The number of measurement syndromes to be decoded per decode call @@ -31,25 +32,27 @@ struct decoder::rt_impl { uint32_t num_msyn_per_decode = 0; /// Counter of total syndromes buffered but not yet processed. - /// Used to detect complete rounds (when this is a multiple of num_msyn_per_decode). - /// Gets decremented after each round is decoded. Not a direct buffer index. + /// Used to detect complete rounds (when this is a multiple of + /// num_msyn_per_decode). Gets decremented after each round is decoded. Not a + /// direct buffer index. uint32_t num_syndromes_buffered_but_not_decoded = 0; - /// The buffer of measurement syndromes received from the client. + /// The buffer of measurement syndromes received from the client. /// For incremental mode: size is calculated from max D_sparse column + 1 /// This allows buffering multiple rounds while still decoding incrementally std::vector msyn_buffer; - + /// Total buffer capacity (max column index in D_sparse + 1) uint32_t buffer_capacity = 0; /// Track which round we're on (0 = reference round) uint32_t current_round = 0; - + /// Circular buffer write position for the current round - // Values are 0, num_msyn_per_decode * 2, num_msyn_per_decode * 3, etc. then wrap around to 0. + // Values are 0, num_msyn_per_decode * 2, num_msyn_per_decode * 3, etc. then + // wrap around to 0. uint32_t current_round_buffer_offset = 0; - + /// Circular buffer position of the previous round (for incremental XOR) uint32_t prev_round_buffer_offset = 0; @@ -193,31 +196,33 @@ uint32_t decoder::get_decoder_id() const { return pimpl->decoder_id; } void decoder::set_D_sparse(const std::vector> &D_sparse) { this->D_sparse = D_sparse; - + // Infer num_syndromes_per_round from D_sparse timelike structure - // For timelike detectors, consecutive detectors XOR syndromes from consecutive rounds - // e.g., detector[0] = [0, 24], detector[1] = [1, 25], so num_syndromes_per_round = 24 - if (D_sparse.size() >= 2 && D_sparse[0].size() >= 2 && D_sparse[1].size() >= 2) { - pimpl->num_syndromes_per_round = D_sparse[1][0] - D_sparse[0][0]; + // For timelike detectors, each detector XORs two syndromes from consecutive + // rounds e.g., detector[0] = [0, 24] means XOR syndrome 0 (round 0) with + // syndrome 24 (round 1) So num_syndromes_per_round = 24 - 0 = 24 + if (D_sparse.size() >= 1 && D_sparse[0].size() >= 2) { + pimpl->num_syndromes_per_round = D_sparse[0][1] - D_sparse[0][0]; } else { // Fallback: assume 1:1 mapping pimpl->num_syndromes_per_round = 1; } - + // Calculate minimum buffer capacity from max column in D_sparse uint32_t min_capacity = calculate_num_msyn_per_decode(D_sparse); - + // Enable incremental mode: process one round at a time pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round; - - // Add one extra round to buffer capacity to guarantee no wraparound within operations - // This eliminates all wraparound checks in hot loops (write and detector computation) + + // Add one extra round to buffer capacity to guarantee no wraparound within + // operations This eliminates all wraparound checks in hot loops (write and + // detector computation) pimpl->buffer_capacity = min_capacity + pimpl->num_syndromes_per_round; - + // Allocate buffer to hold all syndromes plus extra round pimpl->msyn_buffer.clear(); pimpl->msyn_buffer.resize(pimpl->buffer_capacity); - + pimpl->num_syndromes_buffered_but_not_decoded = 0; pimpl->current_round = 0; pimpl->current_round_buffer_offset = 0; @@ -226,31 +231,33 @@ void decoder::set_D_sparse(const std::vector> &D_sparse) { void decoder::set_D_sparse(const std::vector &D_sparse_vec_in) { set_sparse_from_vec(D_sparse_vec_in, this->D_sparse); - + // Infer num_syndromes_per_round from D_sparse timelike structure - // For timelike detectors, consecutive detectors XOR syndromes from consecutive rounds - // e.g., detector[0] = [0, 24], detector[1] = [1, 25], so num_syndromes_per_round = 24 - if (D_sparse.size() >= 2 && D_sparse[0].size() >= 2 && D_sparse[1].size() >= 2) { - pimpl->num_syndromes_per_round = D_sparse[1][0] - D_sparse[0][0]; + // For timelike detectors, each detector XORs two syndromes from consecutive + // rounds e.g., detector[0] = [0, 24] means XOR syndrome 0 (round 0) with + // syndrome 24 (round 1) So num_syndromes_per_round = 24 - 0 = 24 + if (D_sparse.size() >= 1 && D_sparse[0].size() >= 2) { + pimpl->num_syndromes_per_round = D_sparse[0][1] - D_sparse[0][0]; } else { // Fallback: assume 1:1 mapping pimpl->num_syndromes_per_round = 1; } - + // Calculate minimum buffer capacity from max column in D_sparse uint32_t min_capacity = calculate_num_msyn_per_decode(D_sparse); - + // Enable incremental mode: process one round at a time pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round; - - // Add one extra round to buffer capacity to guarantee no wraparound within operations - // This eliminates all wraparound checks in hot loops (write and detector computation) + + // Add one extra round to buffer capacity to guarantee no wraparound within + // operations This eliminates all wraparound checks in hot loops (write and + // detector computation) pimpl->buffer_capacity = min_capacity + pimpl->num_syndromes_per_round; - + // Allocate buffer to hold all syndromes plus extra round pimpl->msyn_buffer.clear(); pimpl->msyn_buffer.resize(pimpl->buffer_capacity); - + pimpl->num_syndromes_buffered_but_not_decoded = 0; pimpl->current_round = 0; pimpl->current_round_buffer_offset = 0; @@ -259,33 +266,41 @@ void decoder::set_D_sparse(const std::vector &D_sparse_vec_in) { bool decoder::enqueue_syndrome(const uint8_t *syndrome, std::size_t syndrome_length) { - // position_in_round represents how many syndromes of the current round have already been buffered but not yet decoded - // Values range from 0 to num_msyn_per_decode - 1. - uint32_t position_in_round = pimpl->num_syndromes_buffered_but_not_decoded % pimpl->num_msyn_per_decode; - + // position_in_round represents how many syndromes of the current round have + // already been buffered but not yet decoded Values range from 0 to + // num_msyn_per_decode - 1. + uint32_t position_in_round = pimpl->num_syndromes_buffered_but_not_decoded % + pimpl->num_msyn_per_decode; + // Check if this write would overwrite the previous round // We need to preserve prev_round_buffer for XOR computation, so the maximum - // safe write from the start of the current round is buffer_capacity minus one round - uint32_t max_safe_from_round_start = pimpl->buffer_capacity - pimpl->num_syndromes_per_round; + // safe write from the start of the current round is buffer_capacity minus one + // round + uint32_t max_safe_from_round_start = + pimpl->buffer_capacity - pimpl->num_syndromes_per_round; if (position_in_round + syndrome_length > max_safe_from_round_start) { - // CUDAQ_WARN("Syndrome data too large - would overwrite previous round. Data will be ignored."); - printf("Syndrome data too large - would overwrite previous round. Data will be ignored.\n"); + // CUDAQ_WARN("Syndrome data too large - would overwrite previous round. + // Data will be ignored."); + printf("Syndrome data too large - would overwrite previous round. Data " + "will be ignored.\n"); return false; } bool did_decode = false; // Buffer the incoming syndromes - // No wraparound check needed: buffer is sized to guarantee operations never wrap mid-execution + // No wraparound check needed: buffer is sized to guarantee operations never + // wrap mid-execution uint32_t write_start = pimpl->current_round_buffer_offset + position_in_round; for (std::size_t i = 0; i < syndrome_length; i++) { pimpl->msyn_buffer[write_start + i] = syndrome[i]; } pimpl->num_syndromes_buffered_but_not_decoded += syndrome_length; - + // Process all complete rounds that are now available - while ((pimpl->num_syndromes_buffered_but_not_decoded % pimpl->num_msyn_per_decode) == 0 && + while ((pimpl->num_syndromes_buffered_but_not_decoded % + pimpl->num_msyn_per_decode) == 0 && pimpl->num_syndromes_buffered_but_not_decoded > 0) { pimpl->current_round++; - + // First round (round 1): store as reference, don't decode yet if (pimpl->current_round == 1) { // Previous round stays at current position for next round's XOR @@ -294,10 +309,11 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome, pimpl->current_round_buffer_offset += pimpl->num_msyn_per_decode; if (pimpl->current_round_buffer_offset >= pimpl->buffer_capacity) pimpl->current_round_buffer_offset -= pimpl->buffer_capacity; - pimpl->num_syndromes_buffered_but_not_decoded -= pimpl->num_msyn_per_decode; // Decrement for next iteration - continue; // Skip to next round + pimpl->num_syndromes_buffered_but_not_decoded -= + pimpl->num_msyn_per_decode; // Decrement for next iteration + continue; // Skip to next round } - + // These are just for logging. They are initialized in such a way to avoid // dynamic memory allocation if logging is disabled. std::vector log_msyn; @@ -319,15 +335,16 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome, log_observable_corrections.resize(O_sparse.size()); } - // Compute detectors incrementally by XORing current round with previous round - // Using circular buffer offsets - no D_sparse access needed - // No wraparound checks needed: buffer is sized to guarantee operations never wrap mid-execution + // Compute detectors incrementally by XORing current round with previous + // round Using circular buffer offsets - no D_sparse access needed No + // wraparound checks needed: buffer is sized to guarantee operations never + // wrap mid-execution for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) { - pimpl->persistent_detector_buffer[i] = - pimpl->msyn_buffer[pimpl->prev_round_buffer_offset + i] ^ + pimpl->persistent_detector_buffer[i] = + pimpl->msyn_buffer[pimpl->prev_round_buffer_offset + i] ^ pimpl->msyn_buffer[pimpl->current_round_buffer_offset + i]; } - + // Update offsets for next round: current becomes previous, advance current pimpl->prev_round_buffer_offset = pimpl->current_round_buffer_offset; pimpl->current_round_buffer_offset += pimpl->num_msyn_per_decode; @@ -400,11 +417,11 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome, printf("%s\n", s.c_str()); } did_decode = true; - + // Decrement counter for next iteration of while loop pimpl->num_syndromes_buffered_but_not_decoded -= pimpl->num_msyn_per_decode; } - + return did_decode; } @@ -454,12 +471,12 @@ void decoder::reset_decoder() { pimpl->num_syndromes_buffered_but_not_decoded = 0; pimpl->msyn_buffer.clear(); pimpl->msyn_buffer.resize(pimpl->buffer_capacity); - + // Reset incremental computation state pimpl->current_round = 0; pimpl->current_round_buffer_offset = 0; pimpl->prev_round_buffer_offset = 0; - + pimpl->corrections.clear(); pimpl->corrections.resize(O_sparse.size()); const bool log_due_to_log_level = diff --git a/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp b/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp index 1d34aaa5..5a8e15a5 100644 --- a/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp +++ b/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp @@ -41,7 +41,7 @@ void save_dem_to_file(const cudaq::qec::detector_error_model &dem, auto numRounds = dem.num_detectors() / numSyndromesPerRound + 1; cudaq::qec::decoding::config::decoder_config config; config.id = i; - config.type = decoder_type; // Use parameter instead of hardcoded + config.type = decoder_type; // Use parameter instead of hardcoded config.block_size = dem.num_error_mechanisms(); config.syndrome_size = dem.num_detectors(); config.num_syndromes_per_round = numSyndromesPerRound; @@ -50,7 +50,7 @@ void save_dem_to_file(const cudaq::qec::detector_error_model &dem, cudaq::qec::pcm_to_sparse_vec(dem.observables_flips_matrix); config.D_sparse = cudaq::qec::generate_timelike_sparse_detector_matrix( numSyndromesPerRound, numRounds, /*include_first_round=*/false); - + if (decoder_type == "nv-qldpc-decoder") { // Original NV-QLDPC configuration config.decoder_custom_args = @@ -64,7 +64,7 @@ void save_dem_to_file(const cudaq::qec::detector_error_model &dem, nv_config.max_iterations = 50; nv_config.osd_order = 60; nv_config.osd_method = 3; - + } else if (decoder_type == "sliding_window") { // Sliding window configuration cudaq::qec::decoding::config::sliding_window_config sw_config; @@ -74,8 +74,8 @@ void save_dem_to_file(const cudaq::qec::detector_error_model &dem, sw_config.straddle_start_round = false; sw_config.straddle_end_round = true; sw_config.inner_decoder_name = "nv-qldpc-decoder"; - sw_config.error_rate_vec = dem.error_rates; // Required by sliding_window - + sw_config.error_rate_vec = dem.error_rates; // Required by sliding_window + // Configure inner NV-QLDPC decoder cudaq::qec::decoding::config::nv_qldpc_decoder_config nv_config; nv_config.use_sparsity = true; @@ -84,11 +84,11 @@ void save_dem_to_file(const cudaq::qec::detector_error_model &dem, nv_config.max_iterations = 50; nv_config.osd_order = 60; nv_config.osd_method = 3; - + sw_config.nv_qldpc_decoder_params = nv_config; config.decoder_custom_args = sw_config; } - + multi_config.decoders.push_back(config); } std::string config_str = multi_config.to_yaml_str(200); @@ -118,16 +118,16 @@ void load_dem_from_file(const std::string &dem_filename, exit(1); } auto decoder_config = config.decoders[0]; - + // Extract error rates based on decoder type std::vector error_rates; - + if (decoder_config.type == "nv-qldpc-decoder") { auto nv_config = std::get( decoder_config.decoder_custom_args); error_rates = nv_config.error_rate_vec.value(); - + } else if (decoder_config.type == "sliding_window") { auto sw_config = std::get( @@ -137,7 +137,7 @@ void load_dem_from_file(const std::string &dem_filename, error_rates = sw_config.error_rate_vec; } } - + dem.detector_error_matrix = cudaq::qec::pcm_from_sparse_vec( decoder_config.H_sparse, decoder_config.syndrome_size, decoder_config.block_size); @@ -677,11 +677,11 @@ int main(int argc, char **argv) { bool save_dem = false; bool load_dem = false; std::string dem_filename; - + // Decoder type selection - std::string decoder_type = "nv-qldpc-decoder"; // Default - int sw_window_size = -1; // For sliding_window, default to decoder_window - int sw_step_size = 1; // For sliding_window + std::string decoder_type = "nv-qldpc-decoder"; // Default + int sw_window_size = -1; // For sliding_window, default to decoder_window + int sw_step_size = 1; // For sliding_window // Parse the command line arguments for (int i = 1; i < argc; i++) { @@ -745,10 +745,11 @@ int main(int argc, char **argv) { decoder_window = distance; if (sw_window_size == -1) sw_window_size = decoder_window; - + // Validate decoder type if (decoder_type != "nv-qldpc-decoder" && decoder_type != "sliding_window") { - printf("Error: --decoder_type must be 'nv-qldpc-decoder' or 'sliding_window'\n"); + printf("Error: --decoder_type must be 'nv-qldpc-decoder' or " + "'sliding_window'\n"); return 1; } From cc08dea7a78595bdf7b1531fddf7af544ff27803 Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Fri, 7 Nov 2025 21:32:30 +0000 Subject: [PATCH 03/11] Add support for first round detectors Signed-off-by: Chuck Ketcham --- libs/qec/lib/decoder.cpp | 89 +++++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 28 deletions(-) diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp index 90bf0b7e..9358d9e4 100644 --- a/libs/qec/lib/decoder.cpp +++ b/libs/qec/lib/decoder.cpp @@ -72,6 +72,9 @@ struct decoder::rt_impl { /// The id of the decoder (for instrumentation) uint32_t decoder_id = 0; + + /// Whether D_sparse has first-round detectors (determined in set_D_sparse) + bool has_first_round_detectors = false; }; void decoder::rt_impl_deleter::operator()(rt_impl *p) const { delete p; } @@ -197,15 +200,24 @@ uint32_t decoder::get_decoder_id() const { return pimpl->decoder_id; } void decoder::set_D_sparse(const std::vector> &D_sparse) { this->D_sparse = D_sparse; - // Infer num_syndromes_per_round from D_sparse timelike structure - // For timelike detectors, each detector XORs two syndromes from consecutive - // rounds e.g., detector[0] = [0, 24] means XOR syndrome 0 (round 0) with - // syndrome 24 (round 1) So num_syndromes_per_round = 24 - 0 = 24 - if (D_sparse.size() >= 1 && D_sparse[0].size() >= 2) { - pimpl->num_syndromes_per_round = D_sparse[0][1] - D_sparse[0][0]; - } else { - // Fallback: assume 1:1 mapping - pimpl->num_syndromes_per_round = 1; + // Analyze D_sparse structure (assumes well-formed D_sparse from generator): + // 1. First-round detectors (if any) are always at the beginning + // 2. All timelike detectors have the same stride (num_syndromes_per_round) + + // Check if first row is a first-round detector (single syndrome index) + pimpl->has_first_round_detectors = (D_sparse.size() > 0 && D_sparse[0].size() == 1); + + // Find num_syndromes_per_round from first timelike detector + // (skip first-round detectors if present, they're all at the beginning) + pimpl->num_syndromes_per_round = 1; // Default fallback + for (const auto& detector_syndrome_indices : D_sparse) { + if (detector_syndrome_indices.size() >= 2) { + // First timelike detector found: XORs syndromes from consecutive rounds + // e.g., [0, 8] means XOR syndrome 0 (round 1) with syndrome 8 (round 2) + // so num_syndromes_per_round = 8 + pimpl->num_syndromes_per_round = detector_syndrome_indices[1] - detector_syndrome_indices[0]; + break; // Found it, no need to continue + } } // Calculate minimum buffer capacity from max column in D_sparse @@ -232,15 +244,24 @@ void decoder::set_D_sparse(const std::vector> &D_sparse) { void decoder::set_D_sparse(const std::vector &D_sparse_vec_in) { set_sparse_from_vec(D_sparse_vec_in, this->D_sparse); - // Infer num_syndromes_per_round from D_sparse timelike structure - // For timelike detectors, each detector XORs two syndromes from consecutive - // rounds e.g., detector[0] = [0, 24] means XOR syndrome 0 (round 0) with - // syndrome 24 (round 1) So num_syndromes_per_round = 24 - 0 = 24 - if (D_sparse.size() >= 1 && D_sparse[0].size() >= 2) { - pimpl->num_syndromes_per_round = D_sparse[0][1] - D_sparse[0][0]; - } else { - // Fallback: assume 1:1 mapping - pimpl->num_syndromes_per_round = 1; + // Analyze D_sparse structure (assumes well-formed D_sparse from generator): + // 1. First-round detectors (if any) are always at the beginning + // 2. All timelike detectors have the same stride (num_syndromes_per_round) + + // Check if first row is a first-round detector (single syndrome index) + pimpl->has_first_round_detectors = (D_sparse.size() > 0 && D_sparse[0].size() == 1); + + // Find num_syndromes_per_round from first timelike detector + // (skip first-round detectors if present, they're all at the beginning) + pimpl->num_syndromes_per_round = 1; // Default fallback + for (const auto& detector_syndrome_indices : D_sparse) { + if (detector_syndrome_indices.size() >= 2) { + // First timelike detector found: XORs syndromes from consecutive rounds + // e.g., [0, 8] means XOR syndrome 0 (round 1) with syndrome 8 (round 2) + // so num_syndromes_per_round = 8 + pimpl->num_syndromes_per_round = detector_syndrome_indices[1] - detector_syndrome_indices[0]; + break; // Found it, no need to continue + } } // Calculate minimum buffer capacity from max column in D_sparse @@ -301,8 +322,10 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome, pimpl->num_syndromes_buffered_but_not_decoded > 0) { pimpl->current_round++; - // First round (round 1): store as reference, don't decode yet - if (pimpl->current_round == 1) { + // First round (round 1): skip decoding (store as reference) + // UNLESS there are first-round detectors that need immediate decoding + // (first-round detector check is done once in set_D_sparse) + if (pimpl->current_round == 1 && !pimpl->has_first_round_detectors) { // Previous round stays at current position for next round's XOR pimpl->prev_round_buffer_offset = pimpl->current_round_buffer_offset; // Advance to next round position in circular buffer @@ -335,14 +358,24 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome, log_observable_corrections.resize(O_sparse.size()); } - // Compute detectors incrementally by XORing current round with previous - // round Using circular buffer offsets - no D_sparse access needed No - // wraparound checks needed: buffer is sized to guarantee operations never - // wrap mid-execution - for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) { - pimpl->persistent_detector_buffer[i] = - pimpl->msyn_buffer[pimpl->prev_round_buffer_offset + i] ^ - pimpl->msyn_buffer[pimpl->current_round_buffer_offset + i]; + // Compute detectors based on whether first-round detectors exist + if (pimpl->has_first_round_detectors) { + // When first-round detectors exist, must use D_sparse for all detectors + // because first-round detectors reference only one syndrome (not two) + for (std::size_t i = 0; i < this->D_sparse.size(); i++) { + pimpl->persistent_detector_buffer[i] = 0; + for (auto col : this->D_sparse[i]) + pimpl->persistent_detector_buffer[i] ^= pimpl->msyn_buffer[col]; + } + } else { + // Pure timelike detectors: use incremental XOR (current ⊕ previous round) + // Using circular buffer offsets - no D_sparse access needed + // No wraparound checks needed: buffer is sized to guarantee operations never wrap + for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) { + pimpl->persistent_detector_buffer[i] = + pimpl->msyn_buffer[pimpl->prev_round_buffer_offset + i] ^ + pimpl->msyn_buffer[pimpl->current_round_buffer_offset + i]; + } } // Update offsets for next round: current becomes previous, advance current From 7ccde1d236216f34fb7ff615f413dd82f3db89a9 Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Fri, 7 Nov 2025 21:39:54 +0000 Subject: [PATCH 04/11] clang_format and minor function rename Signed-off-by: Chuck Ketcham --- libs/qec/lib/decoder.cpp | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp index 9358d9e4..6d47bff6 100644 --- a/libs/qec/lib/decoder.cpp +++ b/libs/qec/lib/decoder.cpp @@ -148,7 +148,7 @@ decoder::get(const std::string &name, const cudaqx::tensor &H, return iter->second(H, param_map); } -static uint32_t calculate_num_msyn_per_decode( +static uint32_t calculate_syndrome_buffer_capacity( const std::vector> &D_sparse) { uint32_t max_col = 0; for (const auto &row : D_sparse) @@ -203,25 +203,27 @@ void decoder::set_D_sparse(const std::vector> &D_sparse) { // Analyze D_sparse structure (assumes well-formed D_sparse from generator): // 1. First-round detectors (if any) are always at the beginning // 2. All timelike detectors have the same stride (num_syndromes_per_round) - + // Check if first row is a first-round detector (single syndrome index) - pimpl->has_first_round_detectors = (D_sparse.size() > 0 && D_sparse[0].size() == 1); - + pimpl->has_first_round_detectors = + (D_sparse.size() > 0 && D_sparse[0].size() == 1); + // Find num_syndromes_per_round from first timelike detector // (skip first-round detectors if present, they're all at the beginning) pimpl->num_syndromes_per_round = 1; // Default fallback - for (const auto& detector_syndrome_indices : D_sparse) { + for (const auto &detector_syndrome_indices : D_sparse) { if (detector_syndrome_indices.size() >= 2) { // First timelike detector found: XORs syndromes from consecutive rounds // e.g., [0, 8] means XOR syndrome 0 (round 1) with syndrome 8 (round 2) // so num_syndromes_per_round = 8 - pimpl->num_syndromes_per_round = detector_syndrome_indices[1] - detector_syndrome_indices[0]; + pimpl->num_syndromes_per_round = + detector_syndrome_indices[1] - detector_syndrome_indices[0]; break; // Found it, no need to continue } } // Calculate minimum buffer capacity from max column in D_sparse - uint32_t min_capacity = calculate_num_msyn_per_decode(D_sparse); + uint32_t min_capacity = calculate_syndrome_buffer_capacity(D_sparse); // Enable incremental mode: process one round at a time pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round; @@ -247,25 +249,27 @@ void decoder::set_D_sparse(const std::vector &D_sparse_vec_in) { // Analyze D_sparse structure (assumes well-formed D_sparse from generator): // 1. First-round detectors (if any) are always at the beginning // 2. All timelike detectors have the same stride (num_syndromes_per_round) - + // Check if first row is a first-round detector (single syndrome index) - pimpl->has_first_round_detectors = (D_sparse.size() > 0 && D_sparse[0].size() == 1); - + pimpl->has_first_round_detectors = + (D_sparse.size() > 0 && D_sparse[0].size() == 1); + // Find num_syndromes_per_round from first timelike detector // (skip first-round detectors if present, they're all at the beginning) pimpl->num_syndromes_per_round = 1; // Default fallback - for (const auto& detector_syndrome_indices : D_sparse) { + for (const auto &detector_syndrome_indices : D_sparse) { if (detector_syndrome_indices.size() >= 2) { // First timelike detector found: XORs syndromes from consecutive rounds // e.g., [0, 8] means XOR syndrome 0 (round 1) with syndrome 8 (round 2) // so num_syndromes_per_round = 8 - pimpl->num_syndromes_per_round = detector_syndrome_indices[1] - detector_syndrome_indices[0]; + pimpl->num_syndromes_per_round = + detector_syndrome_indices[1] - detector_syndrome_indices[0]; break; // Found it, no need to continue } } // Calculate minimum buffer capacity from max column in D_sparse - uint32_t min_capacity = calculate_num_msyn_per_decode(D_sparse); + uint32_t min_capacity = calculate_syndrome_buffer_capacity(D_sparse); // Enable incremental mode: process one round at a time pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round; @@ -370,7 +374,8 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome, } else { // Pure timelike detectors: use incremental XOR (current ⊕ previous round) // Using circular buffer offsets - no D_sparse access needed - // No wraparound checks needed: buffer is sized to guarantee operations never wrap + // No wraparound checks needed: buffer is sized to guarantee operations + // never wrap for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) { pimpl->persistent_detector_buffer[i] = pimpl->msyn_buffer[pimpl->prev_round_buffer_offset + i] ^ From 8d8f02c8631e39e138bf85cdf550d179a008dfed Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Mon, 10 Nov 2025 18:56:55 +0000 Subject: [PATCH 05/11] Revert decoder.cpp back to original before reworking the approach Signed-off-by: Chuck Ketcham --- libs/qec/lib/decoder.cpp | 224 +++++---------------------------------- 1 file changed, 28 insertions(+), 196 deletions(-) diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp index 6d47bff6..73a38707 100644 --- a/libs/qec/lib/decoder.cpp +++ b/libs/qec/lib/decoder.cpp @@ -23,39 +23,17 @@ INSTANTIATE_REGISTRY(cudaq::qec::decoder, const cudaqx::tensor &, namespace cudaq::qec { struct decoder::rt_impl { - /// The number of syndromes per round (enables incremental detector - /// computation) - uint32_t num_syndromes_per_round = 0; - - /// The number of measurement syndromes to be decoded per decode call - /// (for incremental mode: one round; for batch mode: full D_sparse columns) + /// The number of measurement syndromes to be decoded per decode call (i.e. + /// the number of columns in the D_sparse matrix) uint32_t num_msyn_per_decode = 0; - /// Counter of total syndromes buffered but not yet processed. - /// Used to detect complete rounds (when this is a multiple of - /// num_msyn_per_decode). Gets decremented after each round is decoded. Not a - /// direct buffer index. - uint32_t num_syndromes_buffered_but_not_decoded = 0; + /// The index of the next syndrome to be written in the msyn_buffer + uint32_t msyn_buffer_index = 0; - /// The buffer of measurement syndromes received from the client. - /// For incremental mode: size is calculated from max D_sparse column + 1 - /// This allows buffering multiple rounds while still decoding incrementally + /// The buffer of measurement syndromes received from the client. Length is + /// num_msyn_per_decode. std::vector msyn_buffer; - /// Total buffer capacity (max column index in D_sparse + 1) - uint32_t buffer_capacity = 0; - - /// Track which round we're on (0 = reference round) - uint32_t current_round = 0; - - /// Circular buffer write position for the current round - // Values are 0, num_msyn_per_decode * 2, num_msyn_per_decode * 3, etc. then - // wrap around to 0. - uint32_t current_round_buffer_offset = 0; - - /// Circular buffer position of the previous round (for incremental XOR) - uint32_t prev_round_buffer_offset = 0; - /// The current observable corrections. The length of this vector is the /// number of rows in the O_sparse matrix. std::vector corrections; @@ -72,9 +50,6 @@ struct decoder::rt_impl { /// The id of the decoder (for instrumentation) uint32_t decoder_id = 0; - - /// Whether D_sparse has first-round detectors (determined in set_D_sparse) - bool has_first_round_detectors = false; }; void decoder::rt_impl_deleter::operator()(rt_impl *p) const { delete p; } @@ -148,7 +123,7 @@ decoder::get(const std::string &name, const cudaqx::tensor &H, return iter->second(H, param_map); } -static uint32_t calculate_syndrome_buffer_capacity( +static uint32_t calculate_num_msyn_per_decode( const std::vector> &D_sparse) { uint32_t max_col = 0; for (const auto &row : D_sparse) @@ -199,148 +174,33 @@ uint32_t decoder::get_decoder_id() const { return pimpl->decoder_id; } void decoder::set_D_sparse(const std::vector> &D_sparse) { this->D_sparse = D_sparse; - - // Analyze D_sparse structure (assumes well-formed D_sparse from generator): - // 1. First-round detectors (if any) are always at the beginning - // 2. All timelike detectors have the same stride (num_syndromes_per_round) - - // Check if first row is a first-round detector (single syndrome index) - pimpl->has_first_round_detectors = - (D_sparse.size() > 0 && D_sparse[0].size() == 1); - - // Find num_syndromes_per_round from first timelike detector - // (skip first-round detectors if present, they're all at the beginning) - pimpl->num_syndromes_per_round = 1; // Default fallback - for (const auto &detector_syndrome_indices : D_sparse) { - if (detector_syndrome_indices.size() >= 2) { - // First timelike detector found: XORs syndromes from consecutive rounds - // e.g., [0, 8] means XOR syndrome 0 (round 1) with syndrome 8 (round 2) - // so num_syndromes_per_round = 8 - pimpl->num_syndromes_per_round = - detector_syndrome_indices[1] - detector_syndrome_indices[0]; - break; // Found it, no need to continue - } - } - - // Calculate minimum buffer capacity from max column in D_sparse - uint32_t min_capacity = calculate_syndrome_buffer_capacity(D_sparse); - - // Enable incremental mode: process one round at a time - pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round; - - // Add one extra round to buffer capacity to guarantee no wraparound within - // operations This eliminates all wraparound checks in hot loops (write and - // detector computation) - pimpl->buffer_capacity = min_capacity + pimpl->num_syndromes_per_round; - - // Allocate buffer to hold all syndromes plus extra round + pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(D_sparse); pimpl->msyn_buffer.clear(); - pimpl->msyn_buffer.resize(pimpl->buffer_capacity); - - pimpl->num_syndromes_buffered_but_not_decoded = 0; - pimpl->current_round = 0; - pimpl->current_round_buffer_offset = 0; - pimpl->prev_round_buffer_offset = 0; + pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode); + pimpl->msyn_buffer_index = 0; } void decoder::set_D_sparse(const std::vector &D_sparse_vec_in) { set_sparse_from_vec(D_sparse_vec_in, this->D_sparse); - - // Analyze D_sparse structure (assumes well-formed D_sparse from generator): - // 1. First-round detectors (if any) are always at the beginning - // 2. All timelike detectors have the same stride (num_syndromes_per_round) - - // Check if first row is a first-round detector (single syndrome index) - pimpl->has_first_round_detectors = - (D_sparse.size() > 0 && D_sparse[0].size() == 1); - - // Find num_syndromes_per_round from first timelike detector - // (skip first-round detectors if present, they're all at the beginning) - pimpl->num_syndromes_per_round = 1; // Default fallback - for (const auto &detector_syndrome_indices : D_sparse) { - if (detector_syndrome_indices.size() >= 2) { - // First timelike detector found: XORs syndromes from consecutive rounds - // e.g., [0, 8] means XOR syndrome 0 (round 1) with syndrome 8 (round 2) - // so num_syndromes_per_round = 8 - pimpl->num_syndromes_per_round = - detector_syndrome_indices[1] - detector_syndrome_indices[0]; - break; // Found it, no need to continue - } - } - - // Calculate minimum buffer capacity from max column in D_sparse - uint32_t min_capacity = calculate_syndrome_buffer_capacity(D_sparse); - - // Enable incremental mode: process one round at a time - pimpl->num_msyn_per_decode = pimpl->num_syndromes_per_round; - - // Add one extra round to buffer capacity to guarantee no wraparound within - // operations This eliminates all wraparound checks in hot loops (write and - // detector computation) - pimpl->buffer_capacity = min_capacity + pimpl->num_syndromes_per_round; - - // Allocate buffer to hold all syndromes plus extra round + pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(D_sparse); pimpl->msyn_buffer.clear(); - pimpl->msyn_buffer.resize(pimpl->buffer_capacity); - - pimpl->num_syndromes_buffered_but_not_decoded = 0; - pimpl->current_round = 0; - pimpl->current_round_buffer_offset = 0; - pimpl->prev_round_buffer_offset = 0; + pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode); + pimpl->msyn_buffer_index = 0; } bool decoder::enqueue_syndrome(const uint8_t *syndrome, std::size_t syndrome_length) { - // position_in_round represents how many syndromes of the current round have - // already been buffered but not yet decoded Values range from 0 to - // num_msyn_per_decode - 1. - uint32_t position_in_round = pimpl->num_syndromes_buffered_but_not_decoded % - pimpl->num_msyn_per_decode; - - // Check if this write would overwrite the previous round - // We need to preserve prev_round_buffer for XOR computation, so the maximum - // safe write from the start of the current round is buffer_capacity minus one - // round - uint32_t max_safe_from_round_start = - pimpl->buffer_capacity - pimpl->num_syndromes_per_round; - if (position_in_round + syndrome_length > max_safe_from_round_start) { - // CUDAQ_WARN("Syndrome data too large - would overwrite previous round. - // Data will be ignored."); - printf("Syndrome data too large - would overwrite previous round. Data " - "will be ignored.\n"); + if (pimpl->msyn_buffer_index + syndrome_length > pimpl->msyn_buffer.size()) { + // CUDAQ_WARN("Syndrome buffer overflow. Syndrome will be ignored."); + printf("Syndrome buffer overflow. Syndrome will be ignored.\n"); return false; } bool did_decode = false; - // Buffer the incoming syndromes - // No wraparound check needed: buffer is sized to guarantee operations never - // wrap mid-execution - uint32_t write_start = pimpl->current_round_buffer_offset + position_in_round; for (std::size_t i = 0; i < syndrome_length; i++) { - pimpl->msyn_buffer[write_start + i] = syndrome[i]; + pimpl->msyn_buffer[pimpl->msyn_buffer_index] = syndrome[i]; + pimpl->msyn_buffer_index++; } - pimpl->num_syndromes_buffered_but_not_decoded += syndrome_length; - - // Process all complete rounds that are now available - while ((pimpl->num_syndromes_buffered_but_not_decoded % - pimpl->num_msyn_per_decode) == 0 && - pimpl->num_syndromes_buffered_but_not_decoded > 0) { - pimpl->current_round++; - - // First round (round 1): skip decoding (store as reference) - // UNLESS there are first-round detectors that need immediate decoding - // (first-round detector check is done once in set_D_sparse) - if (pimpl->current_round == 1 && !pimpl->has_first_round_detectors) { - // Previous round stays at current position for next round's XOR - pimpl->prev_round_buffer_offset = pimpl->current_round_buffer_offset; - // Advance to next round position in circular buffer - pimpl->current_round_buffer_offset += pimpl->num_msyn_per_decode; - if (pimpl->current_round_buffer_offset >= pimpl->buffer_capacity) - pimpl->current_round_buffer_offset -= pimpl->buffer_capacity; - pimpl->num_syndromes_buffered_but_not_decoded -= - pimpl->num_msyn_per_decode; // Decrement for next iteration - continue; // Skip to next round - } - + if (pimpl->msyn_buffer_index == pimpl->msyn_buffer.size()) { // These are just for logging. They are initialized in such a way to avoid // dynamic memory allocation if logging is disabled. std::vector log_msyn; @@ -362,32 +222,12 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome, log_observable_corrections.resize(O_sparse.size()); } - // Compute detectors based on whether first-round detectors exist - if (pimpl->has_first_round_detectors) { - // When first-round detectors exist, must use D_sparse for all detectors - // because first-round detectors reference only one syndrome (not two) - for (std::size_t i = 0; i < this->D_sparse.size(); i++) { - pimpl->persistent_detector_buffer[i] = 0; - for (auto col : this->D_sparse[i]) - pimpl->persistent_detector_buffer[i] ^= pimpl->msyn_buffer[col]; - } - } else { - // Pure timelike detectors: use incremental XOR (current ⊕ previous round) - // Using circular buffer offsets - no D_sparse access needed - // No wraparound checks needed: buffer is sized to guarantee operations - // never wrap - for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) { - pimpl->persistent_detector_buffer[i] = - pimpl->msyn_buffer[pimpl->prev_round_buffer_offset + i] ^ - pimpl->msyn_buffer[pimpl->current_round_buffer_offset + i]; - } + // Decode now. + for (std::size_t i = 0; i < this->D_sparse.size(); i++) { + pimpl->persistent_detector_buffer[i] = 0; + for (auto col : this->D_sparse[i]) + pimpl->persistent_detector_buffer[i] ^= pimpl->msyn_buffer[col]; } - - // Update offsets for next round: current becomes previous, advance current - pimpl->prev_round_buffer_offset = pimpl->current_round_buffer_offset; - pimpl->current_round_buffer_offset += pimpl->num_msyn_per_decode; - if (pimpl->current_round_buffer_offset >= pimpl->buffer_capacity) - pimpl->current_round_buffer_offset -= pimpl->buffer_capacity; if (should_log) { log_msyn.reserve(pimpl->msyn_buffer.size()); for (std::size_t d = 0, D = pimpl->msyn_buffer.size(); d < D; d++) { @@ -455,11 +295,9 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome, printf("%s\n", s.c_str()); } did_decode = true; - - // Decrement counter for next iteration of while loop - pimpl->num_syndromes_buffered_but_not_decoded -= pimpl->num_msyn_per_decode; + // Prepare for more data. + pimpl->msyn_buffer_index = 0; } - return did_decode; } @@ -506,15 +344,9 @@ std::size_t decoder::get_num_observables() const { return O_sparse.size(); } void decoder::reset_decoder() { // Zero out all data that is considered "per-shot" memory. - pimpl->num_syndromes_buffered_but_not_decoded = 0; + pimpl->msyn_buffer_index = 0; pimpl->msyn_buffer.clear(); - pimpl->msyn_buffer.resize(pimpl->buffer_capacity); - - // Reset incremental computation state - pimpl->current_round = 0; - pimpl->current_round_buffer_offset = 0; - pimpl->prev_round_buffer_offset = 0; - + pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode); pimpl->corrections.clear(); pimpl->corrections.resize(O_sparse.size()); const bool log_due_to_log_level = From 58b68f281f180676ca84e7ada2dba21a0b29c938 Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Mon, 10 Nov 2025 18:57:34 +0000 Subject: [PATCH 06/11] Added sliding window test Signed-off-by: Chuck Ketcham --- .../realtime/app_examples/CMakeLists.txt | 14 ++++++++++++++ .../app_examples/surface_code-1-test.sh | 18 ++++++++++++------ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/libs/qec/unittests/realtime/app_examples/CMakeLists.txt b/libs/qec/unittests/realtime/app_examples/CMakeLists.txt index 6b948a49..d33a636b 100644 --- a/libs/qec/unittests/realtime/app_examples/CMakeLists.txt +++ b/libs/qec/unittests/realtime/app_examples/CMakeLists.txt @@ -38,6 +38,20 @@ add_test( ${CMAKE_BINARY_DIR}/lib WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) + +# Test with sliding_window decoder +add_test( + NAME app_examples.surface_code-1-local-test-distance-3-sliding-window + COMMAND + bash "${CMAKE_CURRENT_SOURCE_DIR}/surface_code-1-test.sh" + ${CMAKE_CURRENT_BINARY_DIR}/surface_code-1-local + ${CMAKE_CURRENT_BINARY_DIR}/surface_code-1-local + 3 40 40 NULL 12 6 + ${CMAKE_BINARY_DIR}/lib + sliding_window 6 1 + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} +) + # This must be disabled for now because the multi_error_lut decoder is not # powerful enough to pass this test. The nv-qldpc-decoder can pass this test, # but that is not available on the GitHub repo. diff --git a/libs/qec/unittests/realtime/app_examples/surface_code-1-test.sh b/libs/qec/unittests/realtime/app_examples/surface_code-1-test.sh index 8b2c08f8..c14c6e80 100644 --- a/libs/qec/unittests/realtime/app_examples/surface_code-1-test.sh +++ b/libs/qec/unittests/realtime/app_examples/surface_code-1-test.sh @@ -21,10 +21,13 @@ return_code=0 # num_rounds # decoder_window # Path to libcudaq-qec-realtime-decoding-quantinuum-private.so +# decoder_type (optional, defaults to multi_error_lut) +# sw_window_size (optional, for sliding_window decoder, defaults to decoder_window) +# sw_step_size (optional, for sliding_window decoder, defaults to 1) -# Check that all 9 arguments are provided. -if [[ $# -ne 9 ]]; then - echo "Error: Expected 9 arguments" +# Check that at least 9 arguments are provided. +if [[ $# -lt 9 ]]; then + echo "Error: Expected at least 9 arguments (got $#)" exit 1 fi @@ -37,6 +40,9 @@ SERVER_EXECUTABLE=$6 NUM_ROUNDS=$7 DECODER_WINDOW=$8 LIB_DIR=$9 +DECODER_TYPE=${10:-multi_error_lut} +SW_WINDOW_SIZE=${11:-$DECODER_WINDOW} +SW_STEP_SIZE=${12:-1} export CUDAQ_DEFAULT_SIMULATOR=stim @@ -51,7 +57,7 @@ FULL_SUFFIX=$timestamp-$RNG_SUFFIX CONFIG_FILE=config-${FULL_SUFFIX}.yml # Generate the config file using the first executable. -$EXE_PATH1 --distance $DISTANCE --num_rounds $NUM_ROUNDS --num_shots $NUM_SHOTS --save_dem $CONFIG_FILE --decoder_window $DECODER_WINDOW | tee save_dem-$FULL_SUFFIX.log +$EXE_PATH1 --distance $DISTANCE --num_rounds $NUM_ROUNDS --num_shots $NUM_SHOTS --save_dem $CONFIG_FILE --decoder_window $DECODER_WINDOW --decoder_type $DECODER_TYPE --sw_window_size $SW_WINDOW_SIZE --sw_step_size $SW_STEP_SIZE | tee save_dem-$FULL_SUFFIX.log export CUDAQ_DUMP_JIT_IR=${CUDAQ_DUMP_JIT_IR:-0} @@ -67,8 +73,8 @@ export CUDAQ_DUMP_JIT_IR=${CUDAQ_DUMP_JIT_IR:-0} # Use the config file using the second executable. -echo Running $EXE_PATH2 --distance $DISTANCE --num_shots $NUM_SHOTS --load_dem $CONFIG_FILE --num_rounds $NUM_ROUNDS --decoder_window $DECODER_WINDOW -$EXE_PATH2 --distance $DISTANCE --num_shots $NUM_SHOTS --load_dem $CONFIG_FILE --num_rounds $NUM_ROUNDS --decoder_window $DECODER_WINDOW |& tee load_dem-$FULL_SUFFIX.log +echo Running $EXE_PATH2 --distance $DISTANCE --num_shots $NUM_SHOTS --load_dem $CONFIG_FILE --num_rounds $NUM_ROUNDS --decoder_window $DECODER_WINDOW --decoder_type $DECODER_TYPE --sw_window_size $SW_WINDOW_SIZE --sw_step_size $SW_STEP_SIZE +$EXE_PATH2 --distance $DISTANCE --num_shots $NUM_SHOTS --load_dem $CONFIG_FILE --num_rounds $NUM_ROUNDS --decoder_window $DECODER_WINDOW --decoder_type $DECODER_TYPE --sw_window_size $SW_WINDOW_SIZE --sw_step_size $SW_STEP_SIZE |& tee load_dem-$FULL_SUFFIX.log # Look for results like this in the output: From e441e4845f3309d190a29c8ead698641032788b7 Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Wed, 12 Nov 2025 01:16:26 +0000 Subject: [PATCH 07/11] Reworked approach in place - sliding window test passes Signed-off-by: Chuck Ketcham --- libs/qec/lib/decoder.cpp | 104 ++- libs/qec/lib/decoders/sliding_window.cpp | 915 +++++++++++------------ libs/qec/lib/decoders/sliding_window.h | 149 ++++ 3 files changed, 676 insertions(+), 492 deletions(-) create mode 100644 libs/qec/lib/decoders/sliding_window.h diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp index 73a38707..98200f7c 100644 --- a/libs/qec/lib/decoder.cpp +++ b/libs/qec/lib/decoder.cpp @@ -20,6 +20,9 @@ INSTANTIATE_REGISTRY(cudaq::qec::decoder, const cudaqx::tensor &) INSTANTIATE_REGISTRY(cudaq::qec::decoder, const cudaqx::tensor &, const cudaqx::heterogeneous_map &) +// Include decoder implementations AFTER registry instantiation +#include "decoders/sliding_window.h" + namespace cudaq::qec { struct decoder::rt_impl { @@ -50,6 +53,18 @@ struct decoder::rt_impl { /// The id of the decoder (for instrumentation) uint32_t decoder_id = 0; + + bool is_sliding_window = false; + + /// The number of syndromes per round. Only used for sliding window decoder. + size_t num_syndromes_per_round = 0; + + /// Whether the first round detectors are included. Only used for sliding + /// window decoder. + bool has_first_round_detectors = false; + + /// The current round. Only used for sliding window decoder. + uint32_t current_round = 0; }; void decoder::rt_impl_deleter::operator()(rt_impl *p) const { delete p; } @@ -174,6 +189,23 @@ uint32_t decoder::get_decoder_id() const { return pimpl->decoder_id; } void decoder::set_D_sparse(const std::vector> &D_sparse) { this->D_sparse = D_sparse; + auto *sw_decoder = dynamic_cast(this); + + if (sw_decoder != nullptr) { + pimpl->is_sliding_window = true; + pimpl->num_syndromes_per_round = sw_decoder->get_num_syndromes_per_round(); + // Check if first row is a first-round detector (single syndrome index) + pimpl->has_first_round_detectors = + (D_sparse.size() > 0 && D_sparse[0].size() == 1); + pimpl->current_round = 0; + pimpl->persistent_detector_buffer.resize(pimpl->num_syndromes_per_round); + pimpl->persistent_soft_detector_buffer.resize( + pimpl->num_syndromes_per_round); + + } else { + pimpl->is_sliding_window = false; + } + pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(D_sparse); pimpl->msyn_buffer.clear(); pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode); @@ -182,7 +214,23 @@ void decoder::set_D_sparse(const std::vector> &D_sparse) { void decoder::set_D_sparse(const std::vector &D_sparse_vec_in) { set_sparse_from_vec(D_sparse_vec_in, this->D_sparse); - pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(D_sparse); + auto *sw_decoder = dynamic_cast(this); + + if (sw_decoder != nullptr) { + pimpl->is_sliding_window = true; + pimpl->num_syndromes_per_round = sw_decoder->get_num_syndromes_per_round(); + // Check if first row is a first-round detector (single syndrome index) + pimpl->has_first_round_detectors = + (this->D_sparse.size() > 0 && this->D_sparse[0].size() == 1); + pimpl->current_round = 0; + pimpl->persistent_detector_buffer.resize(pimpl->num_syndromes_per_round); + pimpl->persistent_soft_detector_buffer.resize( + pimpl->num_syndromes_per_round); + } else { + pimpl->is_sliding_window = false; + } + + pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(this->D_sparse); pimpl->msyn_buffer.clear(); pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode); pimpl->msyn_buffer_index = 0; @@ -195,12 +243,23 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome, printf("Syndrome buffer overflow. Syndrome will be ignored.\n"); return false; } + + pimpl->current_round++; bool did_decode = false; for (std::size_t i = 0; i < syndrome_length; i++) { pimpl->msyn_buffer[pimpl->msyn_buffer_index] = syndrome[i]; pimpl->msyn_buffer_index++; } - if (pimpl->msyn_buffer_index == pimpl->msyn_buffer.size()) { + + bool should_decode = false; + if (!pimpl->is_sliding_window) { + should_decode = (pimpl->msyn_buffer_index == pimpl->msyn_buffer.size()); + } else { + should_decode = + (pimpl->current_round >= 2) || + (pimpl->current_round == 1 && pimpl->has_first_round_detectors); + } + if (should_decode) { // These are just for logging. They are initialized in such a way to avoid // dynamic memory allocation if logging is disabled. std::vector log_msyn; @@ -223,11 +282,34 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome, } // Decode now. - for (std::size_t i = 0; i < this->D_sparse.size(); i++) { - pimpl->persistent_detector_buffer[i] = 0; - for (auto col : this->D_sparse[i]) - pimpl->persistent_detector_buffer[i] ^= pimpl->msyn_buffer[col]; + if (!pimpl->is_sliding_window) { + for (std::size_t i = 0; i < this->D_sparse.size(); i++) { + pimpl->persistent_detector_buffer[i] = 0; + for (auto col : this->D_sparse[i]) + pimpl->persistent_detector_buffer[i] ^= pimpl->msyn_buffer[col]; + } + } else { + // For sliding window decoder, syndrome_length must equal + // num_syndromes_per_round + assert(syndrome_length == pimpl->num_syndromes_per_round); + if (pimpl->current_round == 1 && pimpl->has_first_round_detectors) { + // First round: only compute first-round detectors (direct copy) + for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) { + pimpl->persistent_detector_buffer[i] = pimpl->msyn_buffer[i]; + } + } else { + // Buffer is full with 2 rounds: compute timelike detectors (XOR of two + // rounds) + for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) { + std::size_t index = + (pimpl->current_round - 2) * pimpl->num_syndromes_per_round; + pimpl->persistent_detector_buffer[i] = + pimpl->msyn_buffer[index + i] ^ + pimpl->msyn_buffer[index + i + pimpl->num_syndromes_per_round]; + } + } } + if (should_log) { log_msyn.reserve(pimpl->msyn_buffer.size()); for (std::size_t d = 0, D = pimpl->msyn_buffer.size(); d < D; d++) { @@ -246,6 +328,14 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome, convert_vec_hard_to_soft(pimpl->persistent_detector_buffer, pimpl->persistent_soft_detector_buffer); auto decoded_result = decode(pimpl->persistent_soft_detector_buffer); + + // If we didn't get a decoded result, just return + if (pimpl->is_sliding_window) { + if (decoded_result.result.size() == 0) { + return false; + } + } + if (should_log) { log_t2 = std::chrono::high_resolution_clock::now(); for (std::size_t e = 0, E = decoded_result.result.size(); e < E; e++) @@ -297,6 +387,7 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome, did_decode = true; // Prepare for more data. pimpl->msyn_buffer_index = 0; + pimpl->current_round = 0; } return did_decode; } @@ -345,6 +436,7 @@ std::size_t decoder::get_num_observables() const { return O_sparse.size(); } void decoder::reset_decoder() { // Zero out all data that is considered "per-shot" memory. pimpl->msyn_buffer_index = 0; + pimpl->current_round = 0; pimpl->msyn_buffer.clear(); pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode); pimpl->corrections.clear(); diff --git a/libs/qec/lib/decoders/sliding_window.cpp b/libs/qec/lib/decoders/sliding_window.cpp index d57171e8..cb875c73 100644 --- a/libs/qec/lib/decoders/sliding_window.cpp +++ b/libs/qec/lib/decoders/sliding_window.cpp @@ -6,545 +6,488 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ +#include "sliding_window.h" #include "common/Logger.h" -#include "cudaq/qec/decoder.h" #include "cudaq/qec/pcm_utils.h" #include #include namespace cudaq::qec { -/// @brief This is a sliding window decoder that receives syndromes on a -/// round-by-round basis, and decodes them according window-specific parameters -/// provided in the decoder. -class sliding_window : public decoder { -private: - // --- Input parameters --- +// ============================================================================ +// Private helper method implementations +// ============================================================================ - /// The number of rounds of syndrome data in each window. - std::size_t window_size = 1; - /// The number of rounds to advance the window by each time. - std::size_t step_size = 1; - /// The number of syndromes per round. - std::size_t num_syndromes_per_round = 0; - /// When forming a window, should error mechanisms that span the start round - /// and any preceding rounds be included? - bool straddle_start_round = false; - /// When forming a window, should error mechanisms that span the end round and - /// any subsequent rounds be included? - bool straddle_end_round = true; - /// The vector of error rates for the error mechanisms. - std::vector error_rate_vec; - /// The name of the inner decoder to use. - std::string inner_decoder_name; - /// The parameters to pass to the inner decoder. - cudaqx::heterogeneous_map inner_decoder_params; - - // Derived parameters. - std::size_t num_windows = 0; - std::size_t num_rounds = 0; - std::size_t num_syndromes_per_window = 0; - std::size_t num_rounds_since_last_decode = 0; - std::vector> inner_decoders; - std::vector first_columns; - cudaqx::tensor full_pcm; - cudaqx::tensor full_pcm_T; - - // Enum type for timing data. - enum WindowProcTimes { - INITIALIZE_WINDOW, // 0 - SLIDE_WINDOW, // 1 - COPY_DATA, // 2 - INDEX_CALCULATION, // 3 - MODIFY_SYNDROME_SLICE, // 4 - INNER_DECODE, // 5 - CONVERT_TO_HARD, // 6 - COMMIT_TO_RESULT, // 7 - NUM_WINDOW_PROC_TIMES // 8 - }; - - // State data - std::vector> - rolling_window; // [batch_size, num_syndromes_per_window] - // rolling window read and write indices (circular buffer) - std::size_t rw_next_write_index = 0; // [0, num_syndromes_per_window) - std::size_t rw_next_read_index = 0; // [0, num_syndromes_per_window) - std::size_t rw_filled = 0; - std::size_t num_windows_decoded = 0; - std::vector> syndrome_mods; // [batch_size, syndrome_size] - std::vector rw_results; // [batch_size] - std::vector window_proc_times; - std::array - window_proc_times_arr = {}; - - void validate_inputs() { - if (window_size < 1 || window_size > num_rounds) { - throw std::invalid_argument( - fmt::format("sliding_window constructor: window_size ({}) must " - "be between 1 and num_rounds ({})", - window_size, num_rounds)); - } - if (step_size < 1 || step_size > window_size) { - throw std::invalid_argument( - fmt::format("sliding_window constructor: step_size ({}) must " - "be between 1 and window_size ({})", - step_size, window_size)); - } - if ((num_rounds - window_size) % step_size != 0) { - throw std::invalid_argument( - fmt::format("sliding_window constructor: num_rounds - " - "window_size ({}) must be divisible by step_size ({})", - num_rounds - window_size, step_size)); - } - if (num_syndromes_per_round == 0) { - throw std::invalid_argument("sliding_window constructor: " - "num_syndromes_per_round must be non-zero"); - } - if (H.shape()[0] % num_syndromes_per_round != 0) { - throw std::invalid_argument( - "sliding_window constructor: Number of rows in H must be divisible " - "by num_syndromes_per_round"); - } - if (inner_decoder_name.empty()) { - throw std::invalid_argument( - "sliding_window constructor: inner_decoder_name must be non-empty"); - } - if (inner_decoder_params.empty()) { - CUDAQ_WARN("sliding_window constructor: inner_decoder_params is empty. " - "Is that intentional?"); - } - if (error_rate_vec.empty()) { - throw std::invalid_argument( - "sliding_window constructor: error_rate_vec must be non-empty"); - } - - // Enforce that H is already sorted. - if (!cudaq::qec::pcm_is_sorted(H, num_syndromes_per_round)) { - throw std::invalid_argument("sliding_window constructor: PCM must be " - "sorted. See cudaq::qec::simplify_pcm."); - } +/// Helper function to validate constructor inputs. +void sliding_window::validate_inputs() { + if (window_size < 1 || window_size > num_rounds) { + throw std::invalid_argument( + fmt::format("sliding_window constructor: window_size ({}) must " + "be between 1 and num_rounds ({})", + window_size, num_rounds)); + } + if (step_size < 1 || step_size > window_size) { + throw std::invalid_argument( + fmt::format("sliding_window constructor: step_size ({}) must " + "be between 1 and window_size ({})", + step_size, window_size)); + } + if ((num_rounds - window_size) % step_size != 0) { + throw std::invalid_argument( + fmt::format("sliding_window constructor: num_rounds - " + "window_size ({}) must be divisible by step_size ({})", + num_rounds - window_size, step_size)); + } + if (num_syndromes_per_round == 0) { + throw std::invalid_argument("sliding_window constructor: " + "num_syndromes_per_round must be non-zero"); + } + if (H.shape()[0] % num_syndromes_per_round != 0) { + throw std::invalid_argument( + "sliding_window constructor: Number of rows in H must be divisible " + "by num_syndromes_per_round"); + } + if (inner_decoder_name.empty()) { + throw std::invalid_argument( + "sliding_window constructor: inner_decoder_name must be non-empty"); + } + if (inner_decoder_params.empty()) { + CUDAQ_WARN("sliding_window constructor: inner_decoder_params is empty. " + "Is that intentional?"); + } + if (error_rate_vec.empty()) { + throw std::invalid_argument( + "sliding_window constructor: error_rate_vec must be non-empty"); } - /// Helper function to initialize the window. - /// @param num_syndromes The number of syndromes to initialize the window for. - /// This will be 1 for non-batched mode. - void initialize_window(std::size_t num_syndromes) { - // Initialize the syndrome mods and rw_results. - auto t0 = std::chrono::high_resolution_clock::now(); - window_proc_times_arr.fill(0.0); - syndrome_mods.resize(num_syndromes); - for (std::size_t s = 0; s < num_syndromes; ++s) { - syndrome_mods[s].clear(); - syndrome_mods[s].resize(this->syndrome_size); - } - rw_results.clear(); - rw_results.resize(num_syndromes); - for (std::size_t s = 0; s < num_syndromes; ++s) { - rw_results[s].converged = true; // Gets set to false if we fail to decode - rw_results[s].result.resize(this->block_size); - } - rolling_window.resize(num_syndromes); - for (std::size_t s = 0; s < num_syndromes; ++s) { - rolling_window[s].clear(); - rolling_window[s].resize(num_syndromes_per_window); - } - window_proc_times.resize(num_windows); - std::fill(window_proc_times.begin(), window_proc_times.end(), 0.0); - rw_next_write_index = 0; - rw_next_read_index = 0; - rw_filled = 0; - num_rounds_since_last_decode = 0; - CUDAQ_DBG("Initializing window"); - auto t1 = std::chrono::high_resolution_clock::now(); - window_proc_times_arr[WindowProcTimes::INITIALIZE_WINDOW] = - std::chrono::duration(t1 - t0).count() * 1000; + // Enforce that H is already sorted. + if (!cudaq::qec::pcm_is_sorted(H, num_syndromes_per_round)) { + throw std::invalid_argument("sliding_window constructor: PCM must be " + "sorted. See cudaq::qec::simplify_pcm."); } +} - /// Helper function to add a single syndrome to the rolling window (circular - /// buffer). - void add_syndrome_to_rolling_window(const std::vector &syndrome, - std::size_t syndrome_index, - bool update_next_write_index = true) { - // This assumes that the syndrome size evenly divides into the rolling - // window (of length num_syndromes_per_window), so verify that here. - if (num_syndromes_per_window % syndrome.size() != 0) { - throw std::invalid_argument( - fmt::format("add_syndrome_to_rolling_window: syndrome " - "size ({}) must evenly divide into the rolling " - "window size ({})", - syndrome.size(), num_syndromes_per_window)); - } - std::copy(syndrome.begin(), syndrome.end(), - rolling_window[syndrome_index].begin() + rw_next_write_index); - if (update_next_write_index) { - rw_next_write_index += syndrome.size(); - if (rw_next_write_index >= num_syndromes_per_window) - rw_next_write_index = 0; - } +/// Helper function to initialize the window. +/// @param num_syndromes The number of syndromes to initialize the window for. +/// This will be 1 for non-batched mode. +void sliding_window::initialize_window(std::size_t num_syndromes) { + // Initialize the syndrome mods and rw_results. + auto t0 = std::chrono::high_resolution_clock::now(); + window_proc_times_arr.fill(0.0); + syndrome_mods.resize(num_syndromes); + for (std::size_t s = 0; s < num_syndromes; ++s) { + syndrome_mods[s].clear(); + syndrome_mods[s].resize(this->syndrome_size); + } + rw_results.clear(); + rw_results.resize(num_syndromes); + for (std::size_t s = 0; s < num_syndromes; ++s) { + rw_results[s].converged = true; // Gets set to false if we fail to decode + rw_results[s].result.resize(this->block_size); } + rolling_window.resize(num_syndromes); + for (std::size_t s = 0; s < num_syndromes; ++s) { + rolling_window[s].clear(); + rolling_window[s].resize(num_syndromes_per_window); + } + window_proc_times.resize(num_windows); + std::fill(window_proc_times.begin(), window_proc_times.end(), 0.0); + rw_next_write_index = 0; + rw_next_read_index = 0; + rw_filled = 0; + num_rounds_since_last_decode = 0; + CUDAQ_DBG("Initializing window"); + auto t1 = std::chrono::high_resolution_clock::now(); + window_proc_times_arr[WindowProcTimes::INITIALIZE_WINDOW] = + std::chrono::duration(t1 - t0).count() * 1000; +} - /// Helper function to add a batch of syndromes to the rolling window - /// (circular buffer). - void add_syndromes_to_rolling_window( - const std::vector> &syndromes) { - // Set update_next_write_index to false in the loop because we will update - // it once at the end. - for (std::size_t s = 0; s < syndromes.size(); ++s) { - add_syndrome_to_rolling_window(syndromes[s], s, - /*update_next_write_index=*/false); - if (syndromes[s].size() != syndromes[0].size()) { - throw std::invalid_argument( - fmt::format("add_syndromes_to_rolling_window: syndrome " - "size ({}) must be the same as the first syndrome " - "size ({})", - syndromes[s].size(), syndromes[0].size())); - } - } - rw_next_write_index += syndromes[0].size(); +/// Helper function to add a single syndrome to the rolling window (circular +/// buffer). +void sliding_window::add_syndrome_to_rolling_window( + const std::vector &syndrome, std::size_t syndrome_index, + bool update_next_write_index) { + // This assumes that the syndrome size evenly divides into the rolling + // window (of length num_syndromes_per_window), so verify that here. + if (num_syndromes_per_window % syndrome.size() != 0) { + throw std::invalid_argument( + fmt::format("add_syndrome_to_rolling_window: syndrome " + "size ({}) must evenly divide into the rolling " + "window size ({})", + syndrome.size(), num_syndromes_per_window)); + } + std::copy(syndrome.begin(), syndrome.end(), + rolling_window[syndrome_index].begin() + rw_next_write_index); + if (update_next_write_index) { + rw_next_write_index += syndrome.size(); if (rw_next_write_index >= num_syndromes_per_window) rw_next_write_index = 0; } +} - /// Helper function to get a single syndrome from the rolling window - /// (unwrapping a circular buffer). - std::vector - get_syndrome_from_rolling_window(std::size_t syndrome_index) { - std::vector syndrome(num_syndromes_per_window); - // Copy from rw_next_read_index to the end of the buffer. - std::copy(rolling_window[syndrome_index].begin() + rw_next_read_index, - rolling_window[syndrome_index].end(), syndrome.begin()); - // Copy from the beginning of the rolling window to rw_next_read_index. - std::copy(rolling_window[syndrome_index].begin(), - rolling_window[syndrome_index].begin() + rw_next_read_index, - syndrome.end() - rw_next_read_index); - return syndrome; - } - - /// Helper function to get a batch of syndromes from the rolling window - /// (unwrapping a circular buffer). - std::vector> get_syndromes_from_rolling_window() { - std::vector> syndromes(rolling_window.size()); - for (std::size_t s = 0; s < rolling_window.size(); ++s) { - syndromes[s] = get_syndrome_from_rolling_window(s); +/// Helper function to add a batch of syndromes to the rolling window +/// (circular buffer). +void sliding_window::add_syndromes_to_rolling_window( + const std::vector> &syndromes) { + // Set update_next_write_index to false in the loop because we will update + // it once at the end. + for (std::size_t s = 0; s < syndromes.size(); ++s) { + add_syndrome_to_rolling_window(syndromes[s], s, + /*update_next_write_index=*/false); + if (syndromes[s].size() != syndromes[0].size()) { + throw std::invalid_argument( + fmt::format("add_syndromes_to_rolling_window: syndrome " + "size ({}) must be the same as the first syndrome " + "size ({})", + syndromes[s].size(), syndromes[0].size())); } - return syndromes; } + rw_next_write_index += syndromes[0].size(); + if (rw_next_write_index >= num_syndromes_per_window) + rw_next_write_index = 0; +} - /// Helper function to update the read index for the rolling window. - void update_rw_next_read_index() { - rw_next_read_index += step_size * num_syndromes_per_round; - if (rw_next_read_index >= num_syndromes_per_window) - rw_next_read_index -= num_syndromes_per_window; +/// Helper function to get a single syndrome from the rolling window +/// (unwrapping a circular buffer). +std::vector +sliding_window::get_syndrome_from_rolling_window(std::size_t syndrome_index) { + std::vector syndrome(num_syndromes_per_window); + // Copy from rw_next_read_index to the end of the buffer. + std::copy(rolling_window[syndrome_index].begin() + rw_next_read_index, + rolling_window[syndrome_index].end(), syndrome.begin()); + // Copy from the beginning of the rolling window to rw_next_read_index. + std::copy(rolling_window[syndrome_index].begin(), + rolling_window[syndrome_index].begin() + rw_next_read_index, + syndrome.end() - rw_next_read_index); + return syndrome; +} + +/// Helper function to get a batch of syndromes from the rolling window +/// (unwrapping a circular buffer). +std::vector> +sliding_window::get_syndromes_from_rolling_window() { + std::vector> syndromes(rolling_window.size()); + for (std::size_t s = 0; s < rolling_window.size(); ++s) { + syndromes[s] = get_syndrome_from_rolling_window(s); } + return syndromes; +} -public: - sliding_window(const cudaqx::tensor &H, - const cudaqx::heterogeneous_map ¶ms) - : decoder(H), full_pcm(H) { - full_pcm_T = full_pcm.transpose(); - // Fetch parameters from the params map. - window_size = params.get("window_size", window_size); - step_size = params.get("step_size", step_size); - num_syndromes_per_round = params.get("num_syndromes_per_round", - num_syndromes_per_round); - straddle_start_round = - params.get("straddle_start_round", straddle_start_round); - straddle_end_round = - params.get("straddle_end_round", straddle_end_round); - error_rate_vec = params.get>( - "error_rate_vec", error_rate_vec); - inner_decoder_name = - params.get("inner_decoder_name", inner_decoder_name); - inner_decoder_params = params.get( - "inner_decoder_params", inner_decoder_params); +/// Helper function to update the read index for the rolling window. +void sliding_window::update_rw_next_read_index() { + rw_next_read_index += step_size * num_syndromes_per_round; + if (rw_next_read_index >= num_syndromes_per_window) + rw_next_read_index -= num_syndromes_per_window; +} - num_rounds = H.shape()[0] / num_syndromes_per_round; - num_windows = (num_rounds - window_size) / step_size + 1; - num_syndromes_per_window = num_syndromes_per_round * window_size; +// ============================================================================ +// Public method implementations +// ============================================================================ - validate_inputs(); +/// Constructor for the sliding window decoder. +sliding_window::sliding_window(const cudaqx::tensor &H, + const cudaqx::heterogeneous_map ¶ms) + : decoder(H), full_pcm(H) { + full_pcm_T = full_pcm.transpose(); + // Fetch parameters from the params map. + window_size = params.get("window_size", window_size); + step_size = params.get("step_size", step_size); + num_syndromes_per_round = params.get("num_syndromes_per_round", + num_syndromes_per_round); + straddle_start_round = + params.get("straddle_start_round", straddle_start_round); + straddle_end_round = + params.get("straddle_end_round", straddle_end_round); + error_rate_vec = params.get>( + "error_rate_vec", error_rate_vec); + inner_decoder_name = + params.get("inner_decoder_name", inner_decoder_name); + inner_decoder_params = params.get( + "inner_decoder_params", inner_decoder_params); - // Create the inner decoders. - for (std::size_t w = 0; w < num_windows; ++w) { - std::size_t start_round = w * step_size; - std::size_t end_round = start_round + window_size - 1; - auto [H_round, first_column, last_column] = - cudaq::qec::get_pcm_for_rounds( - H, num_syndromes_per_round, start_round, end_round, - straddle_start_round, straddle_end_round); - first_columns.push_back(first_column); + num_rounds = H.shape()[0] / num_syndromes_per_round; + num_windows = (num_rounds - window_size) / step_size + 1; + num_syndromes_per_window = num_syndromes_per_round * window_size; - // Slice the error vector to only include the current window. - auto inner_decoder_params_mod = inner_decoder_params; - std::vector error_vec_mod( - error_rate_vec.begin() + first_column, - error_rate_vec.begin() + last_column + 1); - inner_decoder_params_mod.insert("error_rate_vec", error_vec_mod); + validate_inputs(); - CUDAQ_INFO("Creating a decoder for rounds {}-{} (dims {} x {}) " - "first_column = {}, last_column = {}", - start_round, end_round, H_round.shape()[0], H_round.shape()[1], - first_column, last_column); - auto inner_decoder = - decoder::get(inner_decoder_name, H_round, inner_decoder_params_mod); - inner_decoders.push_back(std::move(inner_decoder)); - } + // Create the inner decoders. + for (std::size_t w = 0; w < num_windows; ++w) { + std::size_t start_round = w * step_size; + std::size_t end_round = start_round + window_size - 1; + auto [H_round, first_column, last_column] = cudaq::qec::get_pcm_for_rounds( + H, num_syndromes_per_round, start_round, end_round, + straddle_start_round, straddle_end_round); + first_columns.push_back(first_column); + + // Slice the error vector to only include the current window. + auto inner_decoder_params_mod = inner_decoder_params; + std::vector error_vec_mod( + error_rate_vec.begin() + first_column, + error_rate_vec.begin() + last_column + 1); + inner_decoder_params_mod.insert("error_rate_vec", error_vec_mod); + + CUDAQ_INFO("Creating a decoder for rounds {}-{} (dims {} x {}) " + "first_column = {}, last_column = {}", + start_round, end_round, H_round.shape()[0], H_round.shape()[1], + first_column, last_column); + auto inner_decoder = + decoder::get(inner_decoder_name, H_round, inner_decoder_params_mod); + inner_decoders.push_back(std::move(inner_decoder)); } +} - virtual decoder_result decode(const std::vector &syndrome) override { - if (syndrome.size() == this->syndrome_size) { - auto t0 = std::chrono::high_resolution_clock::now(); - CUDAQ_DBG("Decoding whole block"); - // Decode the whole thing, iterating over windows manually. - decoder_result result; - std::vector syndrome_round(num_syndromes_per_round); - for (std::size_t r = 0; r < num_rounds; ++r) { - std::copy(syndrome.begin() + r * num_syndromes_per_round, - syndrome.begin() + (r + 1) * num_syndromes_per_round, - syndrome_round.begin()); - result = decode(syndrome_round); - // Note: result will be empty until the final loop iteration. - } - auto t1 = std::chrono::high_resolution_clock::now(); - std::chrono::duration diff = t1 - t0; - CUDAQ_INFO("Whole block time: {:.3f} ms", diff.count() * 1000); - return result; - } - // Else we're receiving a single round. - if (rw_filled == 0) { - initialize_window(/*num_syndromes=*/1); +/// Decode a syndrome vector (either full block or single round). +decoder_result sliding_window::decode(const std::vector &syndrome) { + if (syndrome.size() == this->syndrome_size) { + auto t0 = std::chrono::high_resolution_clock::now(); + CUDAQ_DBG("Decoding whole block"); + // Decode the whole thing, iterating over windows manually. + decoder_result result; + std::vector syndrome_round(num_syndromes_per_round); + for (std::size_t r = 0; r < num_rounds; ++r) { + std::copy(syndrome.begin() + r * num_syndromes_per_round, + syndrome.begin() + (r + 1) * num_syndromes_per_round, + syndrome_round.begin()); + result = decode(syndrome_round); + // Note: result will be empty until the final loop iteration. } - if (this->rw_filled == num_syndromes_per_window) { - auto t0 = std::chrono::high_resolution_clock::now(); - CUDAQ_DBG("Window is full, sliding the window by one round"); - add_syndrome_to_rolling_window(syndrome, 0); + auto t1 = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = t1 - t0; + CUDAQ_INFO("Whole block time: {:.3f} ms", diff.count() * 1000); + return result; + } + // Else we're receiving a single round. + if (rw_filled == 0) { + initialize_window(/*num_syndromes=*/1); + } + if (this->rw_filled == num_syndromes_per_window) { + auto t0 = std::chrono::high_resolution_clock::now(); + CUDAQ_DBG("Window is full, sliding the window by one round"); + add_syndrome_to_rolling_window(syndrome, 0); - auto t1 = std::chrono::high_resolution_clock::now(); - window_proc_times_arr[WindowProcTimes::SLIDE_WINDOW] += - std::chrono::duration(t1 - t0).count() * 1000; - } else { - // Just copy the data to the end of the rolling window. - auto t0 = std::chrono::high_resolution_clock::now(); - CUDAQ_DBG("Copying data to the end of the rolling window"); - add_syndrome_to_rolling_window(syndrome, 0); - this->rw_filled += num_syndromes_per_round; - auto t1 = std::chrono::high_resolution_clock::now(); - window_proc_times_arr[WindowProcTimes::COPY_DATA] += - std::chrono::duration(t1 - t0).count() * 1000; - } - num_rounds_since_last_decode++; - if (rw_filled == num_syndromes_per_window && - num_rounds_since_last_decode >= step_size) { - CUDAQ_DBG("Decoding window {}/{}", num_windows_decoded + 1, num_windows); - decode_window(); - num_rounds_since_last_decode = 0; + auto t1 = std::chrono::high_resolution_clock::now(); + window_proc_times_arr[WindowProcTimes::SLIDE_WINDOW] += + std::chrono::duration(t1 - t0).count() * 1000; + } else { + // Just copy the data to the end of the rolling window. + auto t0 = std::chrono::high_resolution_clock::now(); + CUDAQ_DBG("Copying data to the end of the rolling window"); + add_syndrome_to_rolling_window(syndrome, 0); + this->rw_filled += num_syndromes_per_round; + auto t1 = std::chrono::high_resolution_clock::now(); + window_proc_times_arr[WindowProcTimes::COPY_DATA] += + std::chrono::duration(t1 - t0).count() * 1000; + } + num_rounds_since_last_decode++; + if (rw_filled == num_syndromes_per_window && + num_rounds_since_last_decode >= step_size) { + CUDAQ_DBG("Decoding window {}/{}", num_windows_decoded + 1, num_windows); + decode_window(); + num_rounds_since_last_decode = 0; - num_windows_decoded++; - if (num_windows_decoded == num_windows) { - num_windows_decoded = 0; - rw_filled = 0; - // for (std::size_t w = 0; w < num_windows; ++w) { - // CUDAQ_DBG("Window {} time: {} ms", w, window_proc_times[w]); - // } - CUDAQ_DBG("Returning decoder_result"); - return std::move(this->rw_results[0]); - } + num_windows_decoded++; + if (num_windows_decoded == num_windows) { + num_windows_decoded = 0; + rw_filled = 0; + // for (std::size_t w = 0; w < num_windows; ++w) { + // CUDAQ_DBG("Window {} time: {} ms", w, window_proc_times[w]); + // } + CUDAQ_DBG("Returning decoder_result"); + return std::move(this->rw_results[0]); } - CUDAQ_DBG("Returning empty decoder_result"); - return decoder_result(); // empty return value } + CUDAQ_DBG("Returning empty decoder_result"); + return decoder_result(); // empty return value +} - virtual std::vector - decode_batch(const std::vector> &syndromes) override { - if (syndromes[0].size() == this->syndrome_size) { - CUDAQ_DBG("Decoding whole block"); - // Decode the whole thing, iterating over windows manually. - std::vector results; - std::vector> syndromes_round(syndromes.size()); - for (std::size_t r = 0; r < num_rounds; ++r) { - for (std::size_t s = 0; s < syndromes.size(); ++s) { - syndromes_round[s].resize(num_syndromes_per_round); - std::copy(syndromes[s].begin() + r * num_syndromes_per_round, - syndromes[s].begin() + (r + 1) * num_syndromes_per_round, - syndromes_round[s].begin()); - } - results = decode_batch(syndromes_round); +/// Decode a batch of syndrome vectors. +std::vector sliding_window::decode_batch( + const std::vector> &syndromes) { + if (syndromes[0].size() == this->syndrome_size) { + CUDAQ_DBG("Decoding whole block"); + // Decode the whole thing, iterating over windows manually. + std::vector results; + std::vector> syndromes_round(syndromes.size()); + for (std::size_t r = 0; r < num_rounds; ++r) { + for (std::size_t s = 0; s < syndromes.size(); ++s) { + syndromes_round[s].resize(num_syndromes_per_round); + std::copy(syndromes[s].begin() + r * num_syndromes_per_round, + syndromes[s].begin() + (r + 1) * num_syndromes_per_round, + syndromes_round[s].begin()); } - return results; - } - // Else we're receiving a single round. - if (rw_filled == 0) { - initialize_window(syndromes.size()); - } - if (this->rw_filled == num_syndromes_per_window) { - CUDAQ_DBG("Window is full, sliding the window by one round"); - // The window is full. Slide existing data to the left and write the new - // data at the end. - add_syndromes_to_rolling_window(syndromes); - num_rounds_since_last_decode++; - } else { - // Just copy the data to the end of the rolling window. - CUDAQ_DBG("Copying data to the end of the rolling window"); - add_syndromes_to_rolling_window(syndromes); - this->rw_filled += num_syndromes_per_round; - num_rounds_since_last_decode++; + results = decode_batch(syndromes_round); } - if (rw_filled == num_syndromes_per_window && - num_rounds_since_last_decode >= step_size) { - CUDAQ_DBG("Decoding window {}/{}", num_windows_decoded + 1, num_windows); - decode_window(); - num_rounds_since_last_decode = 0; - num_windows_decoded++; - if (num_windows_decoded == num_windows) { - num_windows_decoded = 0; - rw_filled = 0; - // Dump the per window processing times. - // for (std::size_t w = 0; w < num_windows; ++w) { - // CUDAQ_DBG("Window {} time: {} ms", w, window_proc_times[w]); - // } - CUDAQ_DBG("Returning decoder_result"); - return std::move(this->rw_results); - } + return results; + } + // Else we're receiving a single round. + if (rw_filled == 0) { + initialize_window(syndromes.size()); + } + if (this->rw_filled == num_syndromes_per_window) { + CUDAQ_DBG("Window is full, sliding the window by one round"); + // The window is full. Slide existing data to the left and write the new + // data at the end. + add_syndromes_to_rolling_window(syndromes); + num_rounds_since_last_decode++; + } else { + // Just copy the data to the end of the rolling window. + CUDAQ_DBG("Copying data to the end of the rolling window"); + add_syndromes_to_rolling_window(syndromes); + this->rw_filled += num_syndromes_per_round; + num_rounds_since_last_decode++; + } + if (rw_filled == num_syndromes_per_window && + num_rounds_since_last_decode >= step_size) { + CUDAQ_DBG("Decoding window {}/{}", num_windows_decoded + 1, num_windows); + decode_window(); + num_rounds_since_last_decode = 0; + num_windows_decoded++; + if (num_windows_decoded == num_windows) { + num_windows_decoded = 0; + rw_filled = 0; + // Dump the per window processing times. + // for (std::size_t w = 0; w < num_windows; ++w) { + // CUDAQ_DBG("Window {} time: {} ms", w, window_proc_times[w]); + // } + CUDAQ_DBG("Returning decoder_result"); + return std::move(this->rw_results); } - CUDAQ_DBG("Returning empty decoder_result"); - return std::vector(); // empty return value } + CUDAQ_DBG("Returning empty decoder_result"); + return std::vector(); // empty return value +} - /// This is an internal helper function that decodes a single window. Regular - /// users should use the regular `cudaq::qec::decoder::decode` or - /// `cudaq::qec::decoder::decode_batch` functions instead of trying to access - /// this function. - void decode_window() { - auto t0 = std::chrono::high_resolution_clock::now(); - const auto &w = this->num_windows_decoded; - std::size_t syndrome_start = w * step_size * num_syndromes_per_round; - std::size_t syndrome_end = syndrome_start + num_syndromes_per_window - 1; - std::size_t syndrome_start_next_window = - (w + 1) * step_size * num_syndromes_per_round; - std::size_t syndrome_end_next_window = - syndrome_start_next_window + num_syndromes_per_round - 1; - auto t3 = std::chrono::high_resolution_clock::now(); - if (w > 0) { - // Modify the syndrome slice to account for the previous windows. - for (std::size_t s = 0; s < this->rolling_window.size(); ++s) { - std::size_t r2 = rw_next_read_index; - for (std::size_t r = 0; r < num_syndromes_per_window; ++r) { - auto &slice_val = this->rolling_window[s].at(r2); - slice_val = - static_cast(static_cast(slice_val) ^ - syndrome_mods[s].at(r + syndrome_start)); - r2++; - if (r2 >= num_syndromes_per_window) - r2 = 0; - } +/// This is an internal helper function that decodes a single window. Regular +/// users should use the regular `cudaq::qec::decoder::decode` or +/// `cudaq::qec::decoder::decode_batch` functions instead of trying to access +/// this function. +void sliding_window::decode_window() { + auto t0 = std::chrono::high_resolution_clock::now(); + const auto &w = this->num_windows_decoded; + std::size_t syndrome_start = w * step_size * num_syndromes_per_round; + std::size_t syndrome_end = syndrome_start + num_syndromes_per_window - 1; + std::size_t syndrome_start_next_window = + (w + 1) * step_size * num_syndromes_per_round; + std::size_t syndrome_end_next_window = + syndrome_start_next_window + num_syndromes_per_round - 1; + auto t3 = std::chrono::high_resolution_clock::now(); + if (w > 0) { + // Modify the syndrome slice to account for the previous windows. + for (std::size_t s = 0; s < this->rolling_window.size(); ++s) { + std::size_t r2 = rw_next_read_index; + for (std::size_t r = 0; r < num_syndromes_per_window; ++r) { + auto &slice_val = this->rolling_window[s].at(r2); + slice_val = + static_cast(static_cast(slice_val) ^ + syndrome_mods[s].at(r + syndrome_start)); + r2++; + if (r2 >= num_syndromes_per_window) + r2 = 0; } } - auto t4 = std::chrono::high_resolution_clock::now(); - CUDAQ_DBG("Window {}: syndrome_start = {}, syndrome_end = {}, length1 = " - "{}, length2 = {}", - w, syndrome_start, syndrome_end, this->rolling_window[0].size(), - syndrome_end - syndrome_start + 1); - std::vector inner_results; - if (this->rolling_window.size() == 1) { - inner_results.push_back( - inner_decoders[w]->decode(get_syndrome_from_rolling_window(0))); - } else { - inner_results = - inner_decoders[w]->decode_batch(get_syndromes_from_rolling_window()); - } - // We've grabbed data from the rolling window, so we need to update the - // read index for the next call to decode_window. - update_rw_next_read_index(); - if (!inner_results[0].converged) { - CUDAQ_DBG("Window {}: inner decoder failed to converge", w); - } - auto t5 = std::chrono::high_resolution_clock::now(); - std::vector> window_results( - this->rolling_window.size()); + } + auto t4 = std::chrono::high_resolution_clock::now(); + CUDAQ_DBG("Window {}: syndrome_start = {}, syndrome_end = {}, length1 = " + "{}, length2 = {}", + w, syndrome_start, syndrome_end, this->rolling_window[0].size(), + syndrome_end - syndrome_start + 1); + std::vector inner_results; + if (this->rolling_window.size() == 1) { + inner_results.push_back( + inner_decoders[w]->decode(get_syndrome_from_rolling_window(0))); + } else { + inner_results = + inner_decoders[w]->decode_batch(get_syndromes_from_rolling_window()); + } + // We've grabbed data from the rolling window, so we need to update the + // read index for the next call to decode_window. + update_rw_next_read_index(); + if (!inner_results[0].converged) { + CUDAQ_DBG("Window {}: inner decoder failed to converge", w); + } + auto t5 = std::chrono::high_resolution_clock::now(); + std::vector> window_results(this->rolling_window.size()); + for (std::size_t s = 0; s < this->rolling_window.size(); ++s) { + this->rw_results[s].converged &= inner_results[s].converged; + cudaq::qec::convert_vec_soft_to_hard(inner_results[s].result, + window_results[s]); + } + // Commit to everything up to the first column of the next window. + auto t6 = std::chrono::high_resolution_clock::now(); + if (w < num_windows - 1) { + // Prepare for the next window. + auto next_window_first_column = first_columns[w + 1]; + auto this_window_first_column = first_columns[w]; + auto num_to_commit = next_window_first_column - this_window_first_column; + CUDAQ_DBG(" Committing {} bits from window {}", num_to_commit, w); for (std::size_t s = 0; s < this->rolling_window.size(); ++s) { - this->rw_results[s].converged &= inner_results[s].converged; - cudaq::qec::convert_vec_soft_to_hard(inner_results[s].result, - window_results[s]); - } - // Commit to everything up to the first column of the next window. - auto t6 = std::chrono::high_resolution_clock::now(); - if (w < num_windows - 1) { - // Prepare for the next window. - auto next_window_first_column = first_columns[w + 1]; - auto this_window_first_column = first_columns[w]; - auto num_to_commit = next_window_first_column - this_window_first_column; - CUDAQ_DBG(" Committing {} bits from window {}", num_to_commit, w); - for (std::size_t s = 0; s < this->rolling_window.size(); ++s) { - for (std::size_t c = 0; c < num_to_commit; ++c) { - rw_results[s].result[c + this_window_first_column] = - window_results[s][c]; - } + for (std::size_t c = 0; c < num_to_commit; ++c) { + rw_results[s].result[c + this_window_first_column] = + window_results[s][c]; } - // We are committing to some errors that would affect the next round's - // syndrome measurements. Therefore, we need to modify some of the - // syndrome measurements for the next round to "back out" the errors - // that we already know about (or more specifically, the errors we think - // we've already accounted for). - for (std::size_t s = 0; s < this->rolling_window.size(); ++s) { - for (std::size_t c = 0; c < num_to_commit; ++c) { - if (rw_results[s].result[c + this_window_first_column]) { - // This bit is a 1, so we need to modify the syndrome measurements - // for the next window to account for this already-accounted-for - // error. We do this by flipping the bit in the syndrome - // measurements if the corresponding entry in the PCM is a 1. - auto *pcm_col = &full_pcm_T.at({c + this_window_first_column, 0}); - for (auto r = syndrome_start_next_window; - r <= syndrome_end_next_window; ++r) { - syndrome_mods[s][r] = - syndrome_mods[s][r] ^ static_cast(pcm_col[r]); - } + } + // We are committing to some errors that would affect the next round's + // syndrome measurements. Therefore, we need to modify some of the + // syndrome measurements for the next round to "back out" the errors + // that we already know about (or more specifically, the errors we think + // we've already accounted for). + for (std::size_t s = 0; s < this->rolling_window.size(); ++s) { + for (std::size_t c = 0; c < num_to_commit; ++c) { + if (rw_results[s].result[c + this_window_first_column]) { + // This bit is a 1, so we need to modify the syndrome measurements + // for the next window to account for this already-accounted-for + // error. We do this by flipping the bit in the syndrome + // measurements if the corresponding entry in the PCM is a 1. + auto *pcm_col = &full_pcm_T.at({c + this_window_first_column, 0}); + for (auto r = syndrome_start_next_window; + r <= syndrome_end_next_window; ++r) { + syndrome_mods[s][r] = + syndrome_mods[s][r] ^ static_cast(pcm_col[r]); } } } - } else { - // This is the last window. Append ALL of window_result to - // decoded_result. - auto this_window_first_column = first_columns[w]; - auto num_to_commit = window_results[0].size(); - CUDAQ_DBG(" Committing {} bits from window {}", num_to_commit, w); - for (std::size_t s = 0; s < this->rolling_window.size(); ++s) { - for (std::size_t c = 0; c < num_to_commit; ++c) { - rw_results[s].result[c + this_window_first_column] = - window_results[s][c]; - } + } + } else { + // This is the last window. Append ALL of window_result to + // decoded_result. + auto this_window_first_column = first_columns[w]; + auto num_to_commit = window_results[0].size(); + CUDAQ_DBG(" Committing {} bits from window {}", num_to_commit, w); + for (std::size_t s = 0; s < this->rolling_window.size(); ++s) { + for (std::size_t c = 0; c < num_to_commit; ++c) { + rw_results[s].result[c + this_window_first_column] = + window_results[s][c]; } } - auto t7 = std::chrono::high_resolution_clock::now(); - window_proc_times.at(w) += - std::chrono::duration(t7 - t0).count() * 1000; - window_proc_times_arr[WindowProcTimes::INDEX_CALCULATION] = - std::chrono::duration(t3 - t0).count() * 1000; - window_proc_times_arr[WindowProcTimes::MODIFY_SYNDROME_SLICE] = - std::chrono::duration(t4 - t3).count() * 1000; - window_proc_times_arr[WindowProcTimes::INNER_DECODE] = - std::chrono::duration(t5 - t4).count() * 1000; - window_proc_times_arr[WindowProcTimes::CONVERT_TO_HARD] = - std::chrono::duration(t6 - t5).count() * 1000; - window_proc_times_arr[WindowProcTimes::COMMIT_TO_RESULT] = - std::chrono::duration(t7 - t6).count() * 1000; - CUDAQ_INFO("Window {} time: {:.3f} ms (0:{:.3f}ms 1:{:.3f}ms 2:{:.3f}ms " - "3:{:.3f}ms 4:{:.3f}ms 5:{:.3f}ms 6:{:.3f}ms 7:{:.3f}ms)", - w, window_proc_times[w], window_proc_times_arr[0], - window_proc_times_arr[1], window_proc_times_arr[2], - window_proc_times_arr[3], window_proc_times_arr[4], - window_proc_times_arr[5], window_proc_times_arr[6], - window_proc_times_arr[7]); } + auto t7 = std::chrono::high_resolution_clock::now(); + window_proc_times.at(w) += + std::chrono::duration(t7 - t0).count() * 1000; + window_proc_times_arr[WindowProcTimes::INDEX_CALCULATION] = + std::chrono::duration(t3 - t0).count() * 1000; + window_proc_times_arr[WindowProcTimes::MODIFY_SYNDROME_SLICE] = + std::chrono::duration(t4 - t3).count() * 1000; + window_proc_times_arr[WindowProcTimes::INNER_DECODE] = + std::chrono::duration(t5 - t4).count() * 1000; + window_proc_times_arr[WindowProcTimes::CONVERT_TO_HARD] = + std::chrono::duration(t6 - t5).count() * 1000; + window_proc_times_arr[WindowProcTimes::COMMIT_TO_RESULT] = + std::chrono::duration(t7 - t6).count() * 1000; + CUDAQ_INFO("Window {} time: {:.3f} ms (0:{:.3f}ms 1:{:.3f}ms 2:{:.3f}ms " + "3:{:.3f}ms 4:{:.3f}ms 5:{:.3f}ms 6:{:.3f}ms 7:{:.3f}ms)", + w, window_proc_times[w], window_proc_times_arr[0], + window_proc_times_arr[1], window_proc_times_arr[2], + window_proc_times_arr[3], window_proc_times_arr[4], + window_proc_times_arr[5], window_proc_times_arr[6], + window_proc_times_arr[7]); +} - virtual ~sliding_window() {} +std::size_t sliding_window::get_num_syndromes_per_round() const { + return num_syndromes_per_round; +} - CUDAQ_EXTENSION_CUSTOM_CREATOR_FUNCTION( - sliding_window, static std::unique_ptr create( - const cudaqx::tensor &H, - const cudaqx::heterogeneous_map ¶ms) { - return std::make_unique(H, params); - }) -}; +sliding_window::~sliding_window() {} CUDAQ_REGISTER_TYPE(sliding_window) diff --git a/libs/qec/lib/decoders/sliding_window.h b/libs/qec/lib/decoders/sliding_window.h new file mode 100644 index 00000000..6d5066f8 --- /dev/null +++ b/libs/qec/lib/decoders/sliding_window.h @@ -0,0 +1,149 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include "cudaq/qec/decoder.h" +#include + +namespace cudaq::qec { + +/// @brief A sliding window decoder that processes syndromes in overlapping windows +/// +/// This decoder divides the syndrome stream into overlapping windows and decodes each +/// window independently using an inner decoder. It's designed for low-latency decoding +/// of streaming syndrome data. +class sliding_window : public decoder { +private: + // --- Input parameters --- + + /// The number of rounds of syndrome data in each window. + std::size_t window_size = 1; + /// The number of rounds to advance the window by each time. + std::size_t step_size = 1; + /// The number of syndromes per round. + std::size_t num_syndromes_per_round = 0; + /// When forming a window, should error mechanisms that span the start round + /// and any preceding rounds be included? + bool straddle_start_round = false; + /// When forming a window, should error mechanisms that span the end round and + /// any subsequent rounds be included? + bool straddle_end_round = true; + /// The vector of error rates for the error mechanisms. + std::vector error_rate_vec; + /// The name of the inner decoder to use. + std::string inner_decoder_name; + /// The parameters to pass to the inner decoder. + cudaqx::heterogeneous_map inner_decoder_params; + + // Derived parameters. + std::size_t num_windows = 0; + std::size_t num_rounds = 0; + std::size_t num_syndromes_per_window = 0; + std::size_t num_rounds_since_last_decode = 0; + std::vector> inner_decoders; + std::vector first_columns; + cudaqx::tensor full_pcm; + cudaqx::tensor full_pcm_T; + + // Enum type for timing data. + enum WindowProcTimes { + INITIALIZE_WINDOW, // 0 + SLIDE_WINDOW, // 1 + COPY_DATA, // 2 + INDEX_CALCULATION, // 3 + MODIFY_SYNDROME_SLICE, // 4 + INNER_DECODE, // 5 + CONVERT_TO_HARD, // 6 + COMMIT_TO_RESULT, // 7 + NUM_WINDOW_PROC_TIMES // 8 + }; + + // State data + std::vector> + rolling_window; // [batch_size, num_syndromes_per_window] + // rolling window read and write indices (circular buffer) + std::size_t rw_next_write_index = 0; // [0, num_syndromes_per_window) + std::size_t rw_next_read_index = 0; // [0, num_syndromes_per_window) + std::size_t rw_filled = 0; + std::size_t num_windows_decoded = 0; + std::vector> syndrome_mods; // [batch_size, syndrome_size] + std::vector rw_results; // [batch_size] + std::vector window_proc_times; + std::array + window_proc_times_arr = {}; + + /// @brief Validate constructor inputs + void validate_inputs(); + + /// @brief Initialize the window + /// @param num_syndromes The number of syndromes to initialize the window for + void initialize_window(std::size_t num_syndromes); + + /// @brief Add a single syndrome to the rolling window (circular buffer) + void add_syndrome_to_rolling_window(const std::vector &syndrome, + std::size_t syndrome_index, + bool update_next_write_index = true); + + /// @brief Add a batch of syndromes to the rolling window (circular buffer) + void add_syndromes_to_rolling_window( + const std::vector> &syndromes); + + /// @brief Get a single syndrome from the rolling window (unwrapping circular buffer) + std::vector get_syndrome_from_rolling_window(std::size_t syndrome_index); + + /// @brief Get a batch of syndromes from the rolling window (unwrapping circular buffer) + std::vector> get_syndromes_from_rolling_window(); + + /// @brief Update the read index for the rolling window + void update_rw_next_read_index(); + + /// @brief Decode a single window (internal helper) + void decode_window(); + +public: + /// @brief Constructor + /// @param H The full parity check matrix for all rounds + /// @param params A heterogeneous map containing required parameters: + /// - window_size: Size of each decoding window (in rounds) + /// - step_size: Step size between consecutive windows (in rounds) + /// - num_rounds: Total number of rounds + /// - num_syndromes_per_round: Number of syndromes per round + /// - inner_decoder: Name of the inner decoder to use + /// - inner_decoder_params: Parameters for the inner decoder (optional) + sliding_window(const cudaqx::tensor &H, + const cudaqx::heterogeneous_map ¶ms); + + /// @brief Decode a syndrome vector + /// @param syndrome The syndrome measurements to decode + /// @return The decoded error correction + decoder_result decode(const std::vector &syndrome) override; + + /// @brief Decode multiple syndromes in batch + /// @param syndromes Multiple syndrome measurements to decode + /// @return The decoded error corrections + std::vector decode_batch(const std::vector> &syndromes) override; + + /// @brief Get the number of syndromes per round + /// @return The number of syndromes measured in each round + std::size_t get_num_syndromes_per_round() const; + + /// @brief Destructor + virtual ~sliding_window(); + + // Plugin registration macros + CUDAQ_EXTENSION_CUSTOM_CREATOR_FUNCTION( + sliding_window, static std::unique_ptr create( + const cudaqx::tensor &H, + const cudaqx::heterogeneous_map ¶ms) { + return std::make_unique(H, params); + }) +}; + +} // namespace cudaq::qec + From a83e424f0a967063fa21510ff4944e5aa98deba0 Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Wed, 12 Nov 2025 15:15:28 +0000 Subject: [PATCH 08/11] Formatting Signed-off-by: Chuck Ketcham --- libs/qec/lib/decoders/sliding_window.h | 42 ++++++++++++++------------ 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/libs/qec/lib/decoders/sliding_window.h b/libs/qec/lib/decoders/sliding_window.h index 6d5066f8..74513c8e 100644 --- a/libs/qec/lib/decoders/sliding_window.h +++ b/libs/qec/lib/decoders/sliding_window.h @@ -13,11 +13,12 @@ namespace cudaq::qec { -/// @brief A sliding window decoder that processes syndromes in overlapping windows -/// -/// This decoder divides the syndrome stream into overlapping windows and decodes each -/// window independently using an inner decoder. It's designed for low-latency decoding -/// of streaming syndrome data. +/// @brief A sliding window decoder that processes syndromes in overlapping +/// windows +/// +/// This decoder divides the syndrome stream into overlapping windows and +/// decodes each window independently using an inner decoder. It's designed for +/// low-latency decoding of streaming syndrome data. class sliding_window : public decoder { private: // --- Input parameters --- @@ -80,29 +81,32 @@ class sliding_window : public decoder { /// @brief Validate constructor inputs void validate_inputs(); - + /// @brief Initialize the window /// @param num_syndromes The number of syndromes to initialize the window for void initialize_window(std::size_t num_syndromes); - + /// @brief Add a single syndrome to the rolling window (circular buffer) void add_syndrome_to_rolling_window(const std::vector &syndrome, - std::size_t syndrome_index, - bool update_next_write_index = true); - + std::size_t syndrome_index, + bool update_next_write_index = true); + /// @brief Add a batch of syndromes to the rolling window (circular buffer) void add_syndromes_to_rolling_window( const std::vector> &syndromes); - - /// @brief Get a single syndrome from the rolling window (unwrapping circular buffer) - std::vector get_syndrome_from_rolling_window(std::size_t syndrome_index); - - /// @brief Get a batch of syndromes from the rolling window (unwrapping circular buffer) + + /// @brief Get a single syndrome from the rolling window (unwrapping circular + /// buffer) + std::vector + get_syndrome_from_rolling_window(std::size_t syndrome_index); + + /// @brief Get a batch of syndromes from the rolling window (unwrapping + /// circular buffer) std::vector> get_syndromes_from_rolling_window(); - + /// @brief Update the read index for the rolling window void update_rw_next_read_index(); - + /// @brief Decode a single window (internal helper) void decode_window(); @@ -127,7 +131,8 @@ class sliding_window : public decoder { /// @brief Decode multiple syndromes in batch /// @param syndromes Multiple syndrome measurements to decode /// @return The decoded error corrections - std::vector decode_batch(const std::vector> &syndromes) override; + std::vector + decode_batch(const std::vector> &syndromes) override; /// @brief Get the number of syndromes per round /// @return The number of syndromes measured in each round @@ -146,4 +151,3 @@ class sliding_window : public decoder { }; } // namespace cudaq::qec - From fb198e1bb562616a6bd0749b4c46c7c33c60c956 Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Wed, 12 Nov 2025 16:26:55 +0000 Subject: [PATCH 09/11] Minor tweaks Signed-off-by: Chuck Ketcham --- libs/qec/lib/decoder.cpp | 4 ++-- libs/qec/unittests/realtime/app_examples/surface_code-1.cpp | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp index 98200f7c..9c51f903 100644 --- a/libs/qec/lib/decoder.cpp +++ b/libs/qec/lib/decoder.cpp @@ -300,9 +300,9 @@ bool decoder::enqueue_syndrome(const uint8_t *syndrome, } else { // Buffer is full with 2 rounds: compute timelike detectors (XOR of two // rounds) + std::size_t index = + (pimpl->current_round - 2) * pimpl->num_syndromes_per_round; for (std::size_t i = 0; i < pimpl->num_syndromes_per_round; i++) { - std::size_t index = - (pimpl->current_round - 2) * pimpl->num_syndromes_per_round; pimpl->persistent_detector_buffer[i] = pimpl->msyn_buffer[index + i] ^ pimpl->msyn_buffer[index + i + pimpl->num_syndromes_per_round]; diff --git a/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp b/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp index 8005c990..71d132fe 100644 --- a/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp +++ b/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp @@ -33,8 +33,7 @@ void save_dem_to_file(const cudaq::qec::detector_error_model &dem, std::string dem_filename, uint64_t numSyndromesPerRound, uint64_t numLogical, const std::string &decoder_type, - int decoder_window, int sw_window_size, - int sw_step_size) { + int sw_window_size, int sw_step_size) { cudaq::qec::decoding::config::multi_decoder_config multi_config; for (uint64_t i = 0; i < numLogical; i++) { // We actually send 1 additional round in this example, so add 1. @@ -558,8 +557,7 @@ void demo_circuit_host(const cudaq::qec::code &code, int distance, if (save_dem) { save_dem_to_file(dem, dem_filename, numSyndromesPerRound, numLogical, - decoder_type, decoder_window, sw_window_size, - sw_step_size); + decoder_type, sw_window_size, sw_step_size); return; } } From 9bb39a4eb51fb16f096e3f9a03a27c285afb9704 Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Thu, 13 Nov 2025 12:45:47 +0000 Subject: [PATCH 10/11] Created common function for set_D_sparse Signed-off-by: Chuck Ketcham --- libs/qec/lib/decoder.cpp | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/libs/qec/lib/decoder.cpp b/libs/qec/lib/decoder.cpp index 43021613..7b03138c 100644 --- a/libs/qec/lib/decoder.cpp +++ b/libs/qec/lib/decoder.cpp @@ -190,9 +190,11 @@ void decoder::set_decoder_id(uint32_t decoder_id) { uint32_t decoder::get_decoder_id() const { return pimpl->decoder_id; } -void decoder::set_D_sparse(const std::vector> &D_sparse) { - this->D_sparse = D_sparse; - auto *sw_decoder = dynamic_cast(this); +template +void set_D_sparse_common(decoder *decoder, + const std::vector> &D_sparse, + PimplType *pimpl) { + auto *sw_decoder = dynamic_cast(decoder); if (sw_decoder != nullptr) { pimpl->is_sliding_window = true; @@ -215,28 +217,14 @@ void decoder::set_D_sparse(const std::vector> &D_sparse) { pimpl->msyn_buffer_index = 0; } +void decoder::set_D_sparse(const std::vector> &D_sparse) { + this->D_sparse = D_sparse; + set_D_sparse_common(this, D_sparse, pimpl.get()); +} + void decoder::set_D_sparse(const std::vector &D_sparse_vec_in) { set_sparse_from_vec(D_sparse_vec_in, this->D_sparse); - auto *sw_decoder = dynamic_cast(this); - - if (sw_decoder != nullptr) { - pimpl->is_sliding_window = true; - pimpl->num_syndromes_per_round = sw_decoder->get_num_syndromes_per_round(); - // Check if first row is a first-round detector (single syndrome index) - pimpl->has_first_round_detectors = - (this->D_sparse.size() > 0 && this->D_sparse[0].size() == 1); - pimpl->current_round = 0; - pimpl->persistent_detector_buffer.resize(pimpl->num_syndromes_per_round); - pimpl->persistent_soft_detector_buffer.resize( - pimpl->num_syndromes_per_round); - } else { - pimpl->is_sliding_window = false; - } - - pimpl->num_msyn_per_decode = calculate_num_msyn_per_decode(this->D_sparse); - pimpl->msyn_buffer.clear(); - pimpl->msyn_buffer.resize(pimpl->num_msyn_per_decode); - pimpl->msyn_buffer_index = 0; + set_D_sparse_common(this, this->D_sparse, pimpl.get()); } bool decoder::enqueue_syndrome(const uint8_t *syndrome, From d7dab65630ce2133befe926c4a166e72af60c54a Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Mon, 17 Nov 2025 14:33:32 +0000 Subject: [PATCH 11/11] Make sliding_window.cpp diffs easier for git diff views Signed-off-by: Chuck Ketcham --- libs/qec/lib/decoders/sliding_window.cpp | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/libs/qec/lib/decoders/sliding_window.cpp b/libs/qec/lib/decoders/sliding_window.cpp index cb875c73..979227a3 100644 --- a/libs/qec/lib/decoders/sliding_window.cpp +++ b/libs/qec/lib/decoders/sliding_window.cpp @@ -14,11 +14,6 @@ namespace cudaq::qec { -// ============================================================================ -// Private helper method implementations -// ============================================================================ - -/// Helper function to validate constructor inputs. void sliding_window::validate_inputs() { if (window_size < 1 || window_size > num_rounds) { throw std::invalid_argument( @@ -180,11 +175,6 @@ void sliding_window::update_rw_next_read_index() { rw_next_read_index -= num_syndromes_per_window; } -// ============================================================================ -// Public method implementations -// ============================================================================ - -/// Constructor for the sliding window decoder. sliding_window::sliding_window(const cudaqx::tensor &H, const cudaqx::heterogeneous_map ¶ms) : decoder(H), full_pcm(H) { @@ -237,7 +227,6 @@ sliding_window::sliding_window(const cudaqx::tensor &H, } } -/// Decode a syndrome vector (either full block or single round). decoder_result sliding_window::decode(const std::vector &syndrome) { if (syndrome.size() == this->syndrome_size) { auto t0 = std::chrono::high_resolution_clock::now(); @@ -301,7 +290,6 @@ decoder_result sliding_window::decode(const std::vector &syndrome) { return decoder_result(); // empty return value } -/// Decode a batch of syndrome vectors. std::vector sliding_window::decode_batch( const std::vector> &syndromes) { if (syndromes[0].size() == this->syndrome_size) { @@ -483,12 +471,12 @@ void sliding_window::decode_window() { window_proc_times_arr[7]); } +sliding_window::~sliding_window() {} + std::size_t sliding_window::get_num_syndromes_per_round() const { return num_syndromes_per_round; } -sliding_window::~sliding_window() {} - CUDAQ_REGISTER_TYPE(sliding_window) } // namespace cudaq::qec