I suspect that the incorrect usage of cuda::memcpy_async has led to uncoalesced global/shared accesses. #5155

Time-Limit · 2025-07-04T11:41:04Z

Time-Limit
Jul 4, 2025

I'm a beginner in CUDA and am practicing SGEMM. When I try to optimize data transfer using cuda::pipeline and cuda::memcpy_async, the profiler reports uncoalesced global/shared accesses. I suspect that my usage is incorrect, and I hope someone can help me identify the problem. Thanks very much!

This image shows a message from NVIDIA Nsight Compute.

The following code does not use cuda::memcpy_async or cuda::pipeline:

  101   for (int iter = 0; iter < k_iter_count; ++iter) {
  102     if (iter != 0) {
  103       __syncthreads();
  104     }
  105
  106     // Read `block_tile * k_per_iter` floats from each of A and B in global
  107     // memory into shared memory per iteration.
  108     {
  109       *(float4 *)(&A_sm[warp_id * block_tile + lane_id * 4]) = __ldg(
  110           (const float4
  111                *)(&A[(C_m_top + warp_id * 16 + lane_id % 8 + lane_id / 16 * 8) *
  112                          K +
  113                      iter * k_per_iter + lane_id % 16 / 8 * 4]));
  114     }
  115     {
  116       *(float4 *)(&B_sm[warp_id * block_tile + lane_id * 4]) = __ldg(
  117           (const float4 *)(&B[(iter * k_per_iter + warp_id) * N + C_n_left +
  118                               lane_id % 8 / 2 * 16 + (lane_id & 1) * 4 +
  119                               (lane_id & 0x8) + lane_id / 16 * 64]));
  120     }
  121     __syncthreads();

With this version all memory accesses are coalesced: each thread loads a float4, so there are 16 sectors per request.

The following code uses cuda::memcpy_async and cuda::pipeline. The per-thread indices are unchanged, but there are now 24 sectors per request.

237   for (int iter = 0; iter < k_iter_count; ++iter) {
  238     if (iter != 0) {
  239       block.sync();
  240     }
  241
  242     // Read `block_tile * k_per_iter` floats from each of A and B in global
  243     // memory into shared memory per iteration.
  244     void *A_sm_dst = A_sm + warp_id * block_tile + lane_id * 4;
  245     const void *A_global_src =
  246         A + ((C_m_top + warp_id * 16 + lane_id % 8 + lane_id / 16 * 8) * K +
  247              iter * k_per_iter + lane_id % 16 / 8 * 4);
  248     void *B_sm_dst = B_sm + warp_id * block_tile + lane_id * 4;
  249     const void *B_global_src =
  250         B +
  251         ((iter * k_per_iter + warp_id) * N + C_n_left + lane_id % 8 / 2 * 16 +
  252          (lane_id & 1) * 4 + (lane_id & 0x8) + lane_id / 16 * 64);
  253     pipeline.producer_acquire();
  254     cuda::memcpy_async(A_sm_dst, A_global_src, cuda::aligned_size_t<16>(1),
  255                        pipeline);
  256     cuda::memcpy_async(B_sm_dst, B_global_src, cuda::aligned_size_t<16>(1),
  257                        pipeline);
  258     pipeline.producer_commit();
  259     pipeline.consumer_wait();
  260     block.sync();

This is the complete code.

#include <cassert>
#include <cooperative_groups/memcpy_async.h>
#include <cstdio>
#include <cstdlib>
#include <cuda/pipeline>
#include <cuda_runtime.h>
#include <random>
#include <vector>

#include "util/error.h"
#include "util/util.cuh"

template <int BLOCK_TILE>
void launch(const float *A, const float *B, float *C, int M, int N, int K);

// Test driver: fills A and B with random floats, runs `launch<128>` on the
// device, copies C back and spot-checks a sample of entries against a
// double-precision host reference.
//
// Returns 0 when every sampled entry is within EPS of the reference and
// EXIT_FAILURE otherwise.
int main() {
  static const int M = (1 << 12), N = (1 << 12), K = (1 << 12);
  // Absolute tolerance for the sampled comparison.
  const float EPS = 1e-1;

  std::vector<float> host_A(static_cast<size_t>(M) * K),
      host_B(static_cast<size_t>(K) * N), host_C(static_cast<size_t>(M) * N);
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_real_distribution<float> dis(-5.0f, 5.0f);
  for (auto *vec : {&host_A, &host_B}) {
    for (auto &data : *vec) {
      data = dis(gen);
    }
  }

  float *A = nullptr, *B = nullptr, *C = nullptr;
  // Pair each host vector with its device pointer *by address*; pairing by
  // value would deep-copy every host vector just to read its size and data.
  for (const auto &pair :
       {std::make_pair(&host_A, &A), std::make_pair(&host_B, &B),
        std::make_pair(&host_C, &C)}) {
    const std::vector<float> &host = *pair.first;
    float *&device = *pair.second;
    cudaMalloc(&device, sizeof(float) * host.size());
    CHECK_CUDA_ERROR();
    cudaMemcpy(device, host.data(), sizeof(float) * host.size(),
               cudaMemcpyDefault);
    CHECK_CUDA_ERROR();
  }

  int mismatch_count = 0;
  {
    cudaMemset(C, 0, sizeof(float) * host_C.size());
    CHECK_CUDA_ERROR();
    launch<128>(A, B, C, M, N, K);
    // Clear the host copy so stale data cannot pass the check below.
    host_C.assign(host_C.size(), 0.0f);
    cudaMemcpy(host_C.data(), C, sizeof(float) * host_C.size(),
               cudaMemcpyDefault);
    CHECK_CUDA_ERROR();

    // A full host GEMM at this size is far too slow, so verify a random
    // sample of output entries instead.
    const int sample_count = 1024;
    std::uniform_int_distribution<int> row_dis(0, M - 1), col_dis(0, N - 1);
    for (int s = 0; s < sample_count; ++s) {
      const size_t m = static_cast<size_t>(row_dis(gen));
      const size_t n = static_cast<size_t>(col_dis(gen));
      double expected = 0.0;
      for (size_t k = 0; k < static_cast<size_t>(K); ++k) {
        expected += static_cast<double>(host_A[m * K + k]) *
                    static_cast<double>(host_B[k * N + n]);
      }
      const double diff = expected - static_cast<double>(host_C[m * N + n]);
      // Written as a negated "within tolerance" test so NaN counts as a
      // mismatch.
      if (!(diff <= EPS && diff >= -EPS)) {
        if (mismatch_count < 8) {
          printf("mismatch at C[%zu][%zu]: expected %f, got %f\n", m, n,
                 expected, static_cast<double>(host_C[m * N + n]));
        }
        ++mismatch_count;
      }
    }
    printf("%d of %d sampled entries differ by more than %f\n", mismatch_count,
           sample_count, static_cast<double>(EPS));
  }

  for (float *device : {A, B, C}) {
    cudaFree(device);
  }
  CHECK_CUDA_ERROR();

  return mismatch_count == 0 ? 0 : EXIT_FAILURE;
}

// SGEMM kernel: accumulates A * B into per-thread registers and stores the
// result tile into C. A, B and C are all row-major matrices.
//
// Per K-iteration every thread copies one float4 of A and one float4 of B
// from global memory into dynamic shared memory with `__ldg`, then reads its
// operands back from shared memory into registers.
//
// Launch expectations visible in this body:
//   * only threadIdx.x is used, i.e. a 1-D thread block;
//   * dynamic shared memory holds A_sm followed by B_sm, each
//     `block_tile * k_per_iter` floats.
// NOTE(review): the shared-memory read strides below are the literal 128 and
// the register layouts use literal 4 / 8 offsets, so the body appears to
// assume block_tile == 128, thread_tile == 8 and k_per_iter == 8 with 256
// threads per block -- confirm before instantiating with other values.
// NOTE(review): global loads are not bounds-checked, and tiles that are not
// fully inside C are not written at all (the `else` branch is empty).
template <int block_tile, int thread_tile, int k_per_iter>
__global__ void
shared_memory__eliminate_bank_conflict__global_store_memory_colacesing(
    const float *A, const float *B, float *C, int M, int N, int K) {
  extern __shared__ float shared_memory_buffer[];

  // A_sm is the first `block_tile * k_per_iter` floats of the dynamic shared
  // buffer; B_sm starts immediately after it.
  float *A_sm = shared_memory_buffer;
  float *B_sm = A_sm + block_tile * k_per_iter;

  // Number of K-slices, rounded up.
  const int k_iter_count = (K + k_per_iter - 1) / k_per_iter;

  // The current block needs to compute `block_tile * block_tile` floats in
  // `C`, spanning from `C_m_top` to `C_m_bottom`(exclusive) and from
  // `C_n_left` to `C_n_right`(exclusive).
  // NOTE(review): `C_m_bottom` and `C_n_right` are computed but never read in
  // this kernel.
  int C_m_top = blockIdx.y * block_tile;
  int C_n_left = blockIdx.x * block_tile;
  int C_m_bottom = min(C_m_top + block_tile, M);
  int C_n_right = min(C_n_left + block_tile, N);

  // `C_reg` is being used to store the partial sum.
  float C_reg[thread_tile][thread_tile] = {0};

  // Lane index within the warp and warp index within the block, both derived
  // from threadIdx.x.
  const int lane_id = threadIdx.x & 31;
  const int warp_id = threadIdx.x >> 5;

  // Every thread needs to compute `thread_tile * thread_tile` floats in the
  // `C` matrix.
  // A_reg is K-major; B_reg is N-major.
  const int k_per_iter_half = k_per_iter / 2;
  float A_reg[thread_tile * k_per_iter_half];
  float B_reg[thread_tile * k_per_iter_half];
  // Per-lane base offset (in floats) for this thread's reads from A_sm.
  const int A_offset = lane_id / 16 * 64 + lane_id % 16 / 2 * 4;

  // Each thread shifts `warp_id` position to the right within its own
  // half-warp.
  const int B_shifted_lane_id =
      ((lane_id + warp_id * 4) % 16 + (lane_id & 0x10)) ^ (warp_id / 4 * 16);
  // Per-lane base offset (in floats) for this thread's reads from B_sm,
  // derived from the shifted lane id.
  const int B_offset = B_shifted_lane_id / 16 * 64 +
                       ((B_shifted_lane_id % 16) & 0xfd) / 4 * 8 +
                       (B_shifted_lane_id & 1) * 4;

  for (int iter = 0; iter < k_iter_count; ++iter) {
    // Wait until every thread has finished reading the previous iteration's
    // shared-memory tiles before they are overwritten.
    if (iter != 0) {
      __syncthreads();
    }

    // Read `block_tile * k_per_iter` floats from each of A and B in global
    // memory into shared memory per iteration.
    // Each thread moves one float4 of A ...
    {
      *(float4 *)(&A_sm[warp_id * block_tile + lane_id * 4]) = __ldg(
          (const float4
               *)(&A[(C_m_top + warp_id * 16 + lane_id % 8 + lane_id / 16 * 8) *
                         K +
                     iter * k_per_iter + lane_id % 16 / 8 * 4]));
    }
    // ... and one float4 of B; warp `warp_id` reads row
    // `iter * k_per_iter + warp_id` of B.
    {
      *(float4 *)(&B_sm[warp_id * block_tile + lane_id * 4]) = __ldg(
          (const float4 *)(&B[(iter * k_per_iter + warp_id) * N + C_n_left +
                              lane_id % 8 / 2 * 16 + (lane_id & 1) * 4 +
                              (lane_id & 0x8) + lane_id / 16 * 64]));
    }
    // Make the freshly written tiles visible to the whole block.
    __syncthreads();

    // First half of the K-slice: load operands from shared memory.
    for (int i = 0; i < thread_tile; ++i) {
      *(float4 *)&A_reg[i * k_per_iter_half] =
          *(float4 *)&A_sm[A_offset + i * 128];
    }
    for (int i = 0; i < k_per_iter_half; ++i) {
      *(float4 *)&B_reg[i * thread_tile] = *(float4 *)&B_sm[B_offset + i * 128];
      *(float4 *)&B_reg[i * thread_tile + 4] =
          *(float4 *)&B_sm[B_offset + i * 128 + 32];
    }

    // Accumulate the first half into the register tile.
    for (int i = 0; i < thread_tile; ++i) {
      for (int j = 0; j < thread_tile; ++j) {
        for (int k = 0; k < k_per_iter_half; ++k) {
          C_reg[i][j] +=
              A_reg[i * k_per_iter_half + k] * B_reg[k * thread_tile + j];
        }
      }
    }

    // Second half of the K-slice: A operands sit 32 floats further on, B
    // operands come from shared-memory rows 4 .. k_per_iter - 1.
    for (int i = 0; i < thread_tile; ++i) {
      *(float4 *)&A_reg[i * k_per_iter_half] =
          *(float4 *)&A_sm[A_offset + i * 128 + 32];
    }
    for (int i = 4; i < k_per_iter; ++i) {
      *(float4 *)&B_reg[(i - 4) * thread_tile] =
          *(float4 *)&B_sm[B_offset + i * 128];
      *(float4 *)&B_reg[(i - 4) * thread_tile + 4] =
          *(float4 *)&B_sm[B_offset + i * 128 + 32];
    }

    // Accumulate the second half into the register tile.
    for (int i = 0; i < thread_tile; ++i) {
      for (int j = 0; j < thread_tile; ++j) {
        for (int k = 0; k < k_per_iter_half; ++k) {
          C_reg[i][j] +=
              A_reg[i * k_per_iter_half + k] * B_reg[k * thread_tile + j];
        }
      }
    }
  }

  // C_reg is N-major.
  // Only tiles that lie completely inside C are stored; each thread writes
  // two float4 per register-tile row.
  if (C_m_top + block_tile <= M && C_n_left + block_tile <= N) {
    for (int i = 0; i < thread_tile; ++i) {
      int m = C_m_top + i * 16 + lane_id / 16 * 8 + lane_id % 16 / 2;
      int n = C_n_left + B_shifted_lane_id % 16 / 4 * 16 +
              B_shifted_lane_id / 16 * 64 + (B_shifted_lane_id & 1) * 4;
      *(float4 *)&C[m * N + n] = *(const float4 *)(&C_reg[i][0]);
      *(float4 *)&C[m * N + n + 8] = *(const float4 *)(&C_reg[i][4]);
    }
  } else {
    // Partial tiles are currently not stored. The disabled sketch below
    // refers to `C_sub_m` / `C_sub_n`, which are not defined in this kernel.
    // for (int i = 0; i < thread_tile; ++i) {
    //   const int m = C_m_top + C_sub_m + i;
    //   for (int j = 0; j < thread_tile; ++j) {
    //     const int n = C_n_left + C_sub_n + (j < 4 ? j : (32 + j - 4));
    //     if (m < M && n < N) {
    //       C[m * N + n] = C_reg[i][j];
    //     }
    //   }
    // }
  }
}

// SGEMM kernel that stages the A and B tiles into shared memory with
// `cuda::memcpy_async` through a single-stage, block-scoped `cuda::pipeline`.
// A, B and C are all row-major matrices.
//
// Launch requirements:
//   * 1-D thread block of 256 threads (only threadIdx.x is used; 8 warps each
//     fill one 128-float row of A_sm and B_sm);
//   * dynamic shared memory of at least `2 * block_tile * k_per_iter` floats
//     (A_sm followed by B_sm);
//   * M and N multiples of block_tile and K a multiple of k_per_iter: global
//     loads are not bounds-checked and partial C tiles are not stored;
//   * A and B rows 16-byte aligned (float4-sized asynchronous copies).
//
// With a single stage the copy of iteration `i + 1` cannot overlap the
// compute of iteration `i`; the pipeline only replaces the synchronous
// load/store pair with an asynchronous copy.
template <int block_tile, int thread_tile, int k_per_iter>
__global__ void
shared_memory__eliminate_bank_conflict__global_store_memory_colacesing__pipeline_1_stage(
    const float *__restrict__ A, const float *__restrict__ B,
    float *__restrict__ C, int M, int N, int K) {
  // The shared-memory strides (128), the half-slice split (4) and the float4
  // register layout below are hard-coded for exactly this tile shape.
  static_assert(block_tile == 128 && thread_tile == 8 && k_per_iter == 8,
                "index arithmetic is hard-coded for a 128x128 block tile, an "
                "8x8 thread tile and 8 K-values per iteration");

  extern __shared__ float shared_memory_buffer[];

  // A_sm is the first `block_tile * k_per_iter` floats of the dynamic shared
  // buffer; B_sm starts immediately after it.
  float *A_sm = shared_memory_buffer;
  float *B_sm = A_sm + block_tile * k_per_iter;

  // Number of K-slices, rounded up.
  const int k_iter_count = (K + k_per_iter - 1) / k_per_iter;

  // Top-left corner of the `block_tile * block_tile` tile of C that this
  // block computes.
  const int C_m_top = blockIdx.y * block_tile;
  const int C_n_left = blockIdx.x * block_tile;

  // `C_reg` is being used to store the partial sum.
  float C_reg[thread_tile][thread_tile] = {0};

  // Lane index within the warp and warp index within the block.
  const int lane_id = threadIdx.x & 31;
  const int warp_id = threadIdx.x >> 5;

  // Every thread needs to compute `thread_tile * thread_tile` floats in the
  // `C` matrix.
  // A_reg is K-major; B_reg is N-major.
  const int k_per_iter_half = k_per_iter / 2;
  float A_reg[thread_tile * k_per_iter_half];
  float B_reg[thread_tile * k_per_iter_half];
  // Per-lane base offset (in floats) for this thread's reads from A_sm.
  const int A_offset = lane_id / 16 * 64 + lane_id % 16 / 2 * 4;

  // Each thread shifts `warp_id` position to the right within its own
  // half-warp.
  const int B_shifted_lane_id =
      ((lane_id + warp_id * 4) % 16 + (lane_id & 0x10)) ^ (warp_id / 4 * 16);
  // Per-lane base offset (in floats) for this thread's reads from B_sm.
  const int B_offset = B_shifted_lane_id / 16 * 64 +
                       ((B_shifted_lane_id % 16) & 0xfd) / 4 * 8 +
                       (B_shifted_lane_id & 1) * 4;

  auto block = cooperative_groups::this_thread_block();

  // One pipeline stage shared by the whole block; every thread acts as both
  // producer and consumer.
  constexpr size_t stages_count = 1;
  __shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block,
                                         stages_count>
      shared_state;

  auto pipeline = cuda::make_pipeline(block, &shared_state);

  // Every thread copies exactly one float4 (16 bytes) of A and one of B per
  // iteration. `cuda::aligned_size_t<16>` takes the size in BYTES, and that
  // size must be a multiple of the alignment, so it has to be 16, not 1.
  const auto copy_bytes = cuda::aligned_size_t<16>(sizeof(float4));

  for (int iter = 0; iter < k_iter_count; ++iter) {
    // Wait until every thread has finished reading the previous iteration's
    // shared-memory tiles before they are overwritten.
    if (iter != 0) {
      block.sync();
    }

    // Read `block_tile * k_per_iter` floats from each of A and B in global
    // memory into shared memory per iteration.
    void *A_sm_dst = A_sm + warp_id * block_tile + lane_id * 4;
    const void *A_global_src =
        A + ((C_m_top + warp_id * 16 + lane_id % 8 + lane_id / 16 * 8) * K +
             iter * k_per_iter + lane_id % 16 / 8 * 4);
    void *B_sm_dst = B_sm + warp_id * block_tile + lane_id * 4;
    const void *B_global_src =
        B +
        ((iter * k_per_iter + warp_id) * N + C_n_left + lane_id % 8 / 2 * 16 +
         (lane_id & 1) * 4 + (lane_id & 0x8) + lane_id / 16 * 64);

    pipeline.producer_acquire();
    cuda::memcpy_async(A_sm_dst, A_global_src, copy_bytes, pipeline);
    cuda::memcpy_async(B_sm_dst, B_global_src, copy_bytes, pipeline);
    pipeline.producer_commit();
    // Wait for the committed copies of this stage, then make sure the whole
    // block sees the tiles before anyone reads them.
    pipeline.consumer_wait();
    block.sync();

    // First half of the K-slice: load operands from shared memory.
    for (int i = 0; i < thread_tile; ++i) {
      *(float4 *)&A_reg[i * k_per_iter_half] =
          *(float4 *)&A_sm[A_offset + i * 128];
    }
    for (int i = 0; i < k_per_iter_half; ++i) {
      *(float4 *)&B_reg[i * thread_tile] = *(float4 *)&B_sm[B_offset + i * 128];
      *(float4 *)&B_reg[i * thread_tile + 4] =
          *(float4 *)&B_sm[B_offset + i * 128 + 32];
    }

    // Accumulate the first half into the register tile.
    for (int i = 0; i < thread_tile; ++i) {
      for (int j = 0; j < thread_tile; ++j) {
        for (int k = 0; k < k_per_iter_half; ++k) {
          C_reg[i][j] +=
              A_reg[i * k_per_iter_half + k] * B_reg[k * thread_tile + j];
        }
      }
    }

    // Second half of the K-slice: A operands sit 32 floats further on, B
    // operands come from shared-memory rows 4 .. k_per_iter - 1.
    for (int i = 0; i < thread_tile; ++i) {
      *(float4 *)&A_reg[i * k_per_iter_half] =
          *(float4 *)&A_sm[A_offset + i * 128 + 32];
    }
    for (int i = 4; i < k_per_iter; ++i) {
      *(float4 *)&B_reg[(i - 4) * thread_tile] =
          *(float4 *)&B_sm[B_offset + i * 128];
      *(float4 *)&B_reg[(i - 4) * thread_tile + 4] =
          *(float4 *)&B_sm[B_offset + i * 128 + 32];
    }

    // Accumulate the second half into the register tile.
    for (int i = 0; i < thread_tile; ++i) {
      for (int j = 0; j < thread_tile; ++j) {
        for (int k = 0; k < k_per_iter_half; ++k) {
          C_reg[i][j] +=
              A_reg[i * k_per_iter_half + k] * B_reg[k * thread_tile + j];
        }
      }
    }
    // Hand the stage back so the next iteration's producer_acquire can
    // proceed.
    pipeline.consumer_release();
  }

  // C_reg is N-major.
  // Only tiles that lie completely inside C are stored; partial tiles are not
  // handled by this kernel. Each thread writes two float4 per register-tile
  // row.
  if (C_m_top + block_tile <= M && C_n_left + block_tile <= N) {
    for (int i = 0; i < thread_tile; ++i) {
      int m = C_m_top + i * 16 + lane_id / 16 * 8 + lane_id % 16 / 2;
      int n = C_n_left + B_shifted_lane_id % 16 / 4 * 16 +
              B_shifted_lane_id / 16 * 64 + (B_shifted_lane_id & 1) * 4;
      *(float4 *)&C[m * N + n] = *(const float4 *)(&C_reg[i][0]);
      *(float4 *)&C[m * N + n + 8] = *(const float4 *)(&C_reg[i][4]);
    }
  }
}

// Launches both SGEMM kernels (the cuda::pipeline variant first, then the
// plain `__ldg` variant) on the same device buffers, so the second launch
// overwrites the first one's output in C.
//
// A is M x K, B is K x N and C is M x N, all row-major device pointers.
// The kernels neither bounds-check their loads nor store partial tiles, so
// nothing is launched unless M and N are positive multiples of BLOCK_TILE and
// K is a positive multiple of k_per_iter; a message is printed instead.
template <int BLOCK_TILE>
void launch(const float *A, const float *B, float *C, int M, int N, int K) {
  constexpr int THREAD_TILE = 8;
  constexpr int k_per_iter = 8;
  static_assert(BLOCK_TILE % THREAD_TILE == 0);
  static_assert(THREAD_TILE == 8);

  // One thread per THREAD_TILE x THREAD_TILE patch of the block tile, laid out
  // as a 1-D block because the kernels only read threadIdx.x.
  constexpr int threads_per_block =
      (BLOCK_TILE / THREAD_TILE) * (BLOCK_TILE / THREAD_TILE);
  static_assert(BLOCK_TILE * k_per_iter % threads_per_block == 0);

  // Both kernels index exactly one A tile plus one B tile of
  // `BLOCK_TILE * k_per_iter` floats in dynamic shared memory. The pipeline
  // kernel has a single stage, so it needs no more than the plain kernel.
  constexpr size_t shared_memory_bytes =
      static_cast<size_t>(2) * k_per_iter * BLOCK_TILE * sizeof(float);

  const bool sizes_ok = M > 0 && N > 0 && K > 0 && M % BLOCK_TILE == 0 &&
                        N % BLOCK_TILE == 0 && K % k_per_iter == 0;
  if (!sizes_ok) {
    printf("M or N or K are not suitable\n");
    return;
  }

  const dim3 grid_dim((N + BLOCK_TILE - 1) / BLOCK_TILE,
                      (M + BLOCK_TILE - 1) / BLOCK_TILE);
  const dim3 block_dim(threads_per_block);

  shared_memory__eliminate_bank_conflict__global_store_memory_colacesing__pipeline_1_stage<
      BLOCK_TILE, THREAD_TILE, k_per_iter>
      <<<grid_dim, block_dim, shared_memory_bytes>>>(A, B, C, M, N, K);
  CHECK_CUDA_ERROR();
  shared_memory__eliminate_bank_conflict__global_store_memory_colacesing<
      BLOCK_TILE, THREAD_TILE, k_per_iter>
      <<<grid_dim, block_dim, shared_memory_bytes>>>(A, B, C, M, N, K);
  CHECK_CUDA_ERROR();
}

Time-Limit · 2025-07-04T11:52:24Z

Time-Limit
Jul 4, 2025
Author

cuda::memcpy_async(B_sm_dst, B_global_src, cuda::aligned_size_t<16>(1), pipeline);

In this line of code, must B_global_src be thread_idx * 4 plus an additional fixed offset?

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

I suspect that the incorrect usage of cuda::memcpy_async has led to uncoalesced global/shared accesses. #5155

Uh oh!

{{title}}

Uh oh!

Replies: 1 comment

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Select a reply

Uh oh!

I suspect that the incorrect usage of cuda::memcpy_async has led to uncoalesced global/shared accesses. #5155

Uh oh!

Time-Limit Jul 4, 2025

Replies: 1 comment

Uh oh!

Uh oh!

Time-Limit Jul 4, 2025 Author

Time-Limit
Jul 4, 2025

Time-Limit
Jul 4, 2025
Author