I have written a simple CUDA program to perform array reduction using thread block clusters and distributed shared memory. I am compiling it with CUDA 12.0 and running on a Hopper GPU. Below is the code I use:
#include <stdio.h>
#include <cooperative_groups.h>
#define CLUSTER_SIZE 4
#define BLOCK_SIZE 32
namespace cg = cooperative_groups;
// cluster_reduce_sum: sums the first n elements of arr into *sum using a
// thread block cluster (Hopper, sm_90+) and distributed shared memory (DSMEM).
// Launch expectations: 1-D grid whose block count is a multiple of CLUSTER_SIZE,
// exactly BLOCK_SIZE threads per block; *sum must be zeroed by the host.
__global__ void __cluster_dims__(CLUSTER_SIZE, 1, 1)
cluster_reduce_sum(int n, float *arr, float *sum)
{
// Per-block scratch buffer for the intra-block tree reduction.
__shared__ float shared_mem[BLOCK_SIZE];
// Per-cluster partial sum; only block rank 0's copy is actually accumulated into.
__shared__ float cluster_sum;
// All threads store the same 0.0f — a benign race; the cluster.sync() below
// orders this initialization before any remote atomicAdd from other blocks.
cluster_sum = 0.0f;
cg::cluster_group cluster = cg::this_cluster();
unsigned int cluster_block_rank = cluster.block_rank();
unsigned int cluster_size = cluster.dim_blocks().x; // unused below
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Zero-pad so the tail block reduces cleanly when n % BLOCK_SIZE != 0.
shared_mem[threadIdx.x] = 0.0f;
if (idx < n) {
shared_mem[threadIdx.x] = arr[idx];
}
__syncthreads();
// Power-of-two tree reduction within the block; result lands in shared_mem[0].
for (int offset = BLOCK_SIZE / 2; offset; offset /= 2) {
if (threadIdx.x < offset) {
shared_mem[threadIdx.x] += shared_mem[threadIdx.x + offset];
}
__syncthreads();
}
// Cluster-wide barrier: every block's cluster_sum is initialized before the
// remote atomics below touch rank 0's copy.
cluster.sync();
if (threadIdx.x == 0) {
// map_shared_rank yields a DSMEM pointer into block rank 0's cluster_sum.
// NOTE(review): per the answer below, float/double atomicAdd through a DSMEM
// pointer is defective in CUDA toolkits prior to 12.3 — this line is where
// the reported wrong result originates; the code itself is correct.
atomicAdd(cluster.map_shared_rank(&cluster_sum, 0), shared_mem[0]);
printf("blockIdx: %d cluster_sum: %f\n", blockIdx.x, (float)*cluster.map_shared_rank(&cluster_sum, 0));
}
// Wait for every block's contribution before rank 0 publishes the total.
cluster.sync();
if (threadIdx.x == 0 && cluster_block_rank == 0) {
atomicAdd(sum, cluster_sum);
}
cluster.sync();
}
// Host driver for the cluster reduction kernel.
// Fills h_arr with 1.0f, reduces on the GPU, and checks against the CPU sum.
// Usage: ./prog [n]   (n defaults to 128; must be > 0)
int main(int argc, char* argv[]) {
    int n = 128;
    if (argc > 1) {
        n = atoi(argv[1]);
        if (n <= 0) { // atoi returns 0 on parse failure; reject nonsense sizes
            printf("invalid array size\n");
            return -1;
        }
    }

    float *h_arr = (float*) malloc(n * sizeof(float));
    if (h_arr == NULL) {
        printf("host allocation failed\n");
        return -1;
    }

    // CPU reference sum (all elements are 1.0f, so sum == n).
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        h_arr[i] = 1.0f;
        sum += h_arr[i];
    }

    // Device buffers: input array and a single-float accumulator (zeroed,
    // since the kernel only atomically adds into it).
    float *d_arr = NULL, *d_sum = NULL;
    cudaError_t error;
    if ((error = cudaMalloc(&d_arr, n * sizeof(float))) != cudaSuccess ||
        (error = cudaMalloc(&d_sum, sizeof(float))) != cudaSuccess ||
        (error = cudaMemcpy(d_arr, h_arr, n * sizeof(float), cudaMemcpyHostToDevice)) != cudaSuccess ||
        (error = cudaMemset(d_sum, 0, sizeof(float))) != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        cudaFree(d_arr);
        cudaFree(d_sum);
        free(h_arr);
        return -1;
    }

    // Ceil-divide n by the element count one cluster covers.
    int num_clusters = (n - 1) / (CLUSTER_SIZE * BLOCK_SIZE) + 1;
    cluster_reduce_sum <<< CLUSTER_SIZE * num_clusters, BLOCK_SIZE >>> (n, d_arr, d_sum);

    // cudaGetLastError catches launch-configuration errors immediately;
    // cudaDeviceSynchronize surfaces asynchronous execution errors too.
    error = cudaGetLastError();
    if (error == cudaSuccess) {
        error = cudaDeviceSynchronize();
    }
    if (error != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        cudaFree(d_arr);
        cudaFree(d_sum);
        free(h_arr);
        return -1;
    }

    float h_sum = 0.0f;
    if ((error = cudaMemcpy(&h_sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost)) != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        cudaFree(d_arr);
        cudaFree(d_sum);
        free(h_arr);
        return -1;
    }

    // Exact comparison is safe here: summing n ones is exact in float for
    // the sizes this demo uses.
    if (h_sum != sum) {
        printf("Kernel incorrect: %f vs %f\n", sum, h_sum);
    }

    // Release device and host resources (the original leaked all of these).
    cudaFree(d_arr);
    cudaFree(d_sum);
    free(h_arr);
    return 0;
}
The outputs of the kernel and the CPU do not match, and as far as I can tell there are no bugs in my code. Please help me find the issue with this code. Thanks!
Edit: adding some debug info to the question. I added a print statement just before the per-block sum (shared_mem[0]) is added to cluster_sum. The per-block sums all appear to be correct (32). However, when I print the final cluster sum just before adding it to the total sum, it is not correct: the expected value of cluster_sum is 128, but I see 64. It seems the block sums are not being properly accumulated into the cluster sum.
Another interesting point I observed is that if I simply change the input datatype from float to int, the code works completely fine and passes the output check.
Edit: I ran the code, and it looks like atomicAdd does not work correctly on float. I added several printf statements to the original code.
The issue reported here appears to be a defect in CUDA prior to 12.3, related to distributed shared memory atomics on the float or double type. The issue should be resolved by updating to CUDA 12.3.
As an alternative "verification" or workaround, the data type can be changed from float to e.g. int in the provided code, and the code behavior should then be correct, even on CUDA 12.0.
Example of that case on CUDA 12.0:
$ cat t21.cu
#include <stdio.h>
#include <cooperative_groups.h>
#define CLUSTER_SIZE 4
#define BLOCK_SIZE 32
namespace cg = cooperative_groups;
using mt=int;
// Same reduction kernel as in the question, but parameterized on mt (int here).
// With integer atomics the DSMEM atomicAdd behaves correctly even on CUDA 12.0,
// which isolates the defect to float/double DSMEM atomics.
__global__ void __cluster_dims__(CLUSTER_SIZE, 1, 1)
cluster_reduce_sum(int n, mt *arr, mt *sum)
{
// Per-block scratch for the intra-block tree reduction.
__shared__ mt shared_mem[BLOCK_SIZE];
// Per-cluster accumulator; only block rank 0's copy receives the atomics.
__shared__ mt cluster_sum;
cluster_sum = 0; // benign race: every thread stores the same value
cg::cluster_group cluster = cg::this_cluster();
unsigned int cluster_block_rank = cluster.block_rank();
unsigned int cluster_size = cluster.dim_blocks().x; // unused below
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
shared_mem[threadIdx.x] = 0; // zero-pad the tail block when n % BLOCK_SIZE != 0
if (idx < n) {
shared_mem[threadIdx.x] = arr[idx];
}
__syncthreads();
// In-block power-of-two tree reduction; the result ends up in shared_mem[0].
for (int offset = BLOCK_SIZE / 2; offset; offset /= 2) {
if (threadIdx.x < offset) {
shared_mem[threadIdx.x] += shared_mem[threadIdx.x + offset];
}
__syncthreads();
}
// Ensure every block's cluster_sum initialization is visible before any
// block performs a remote atomic into rank 0's copy.
cluster.sync();
if (threadIdx.x == 0) {
// Accumulate this block's partial sum into block rank 0's cluster_sum (DSMEM).
atomicAdd(cluster.map_shared_rank(&cluster_sum, 0), shared_mem[0]);
}
// Wait for all contributions before rank 0 publishes the cluster total.
cluster.sync();
if (threadIdx.x == 0 && cluster_block_rank == 0) {
atomicAdd(sum, cluster_sum);
}
cluster.sync();
}
// Host driver for the int-typed (mt) variant; mirrors the question's main()
// with the unused RNG bounds removed and num_clusters pinned to 1.
int main(int argc, char* argv[]) {
int n = 128;
if (argc > 1) {
n = atoi(argv[1]);
}
mt *h_arr, *h_sum, sum;
// NOTE(review): sizeof(float) is harmless here only because sizeof(mt) ==
// sizeof(float) when mt is int; sizeof(mt) would be the robust spelling.
h_arr = (mt*) malloc(n * sizeof(float));
h_sum = (mt*) malloc(sizeof(float));
//int upper = 1024, lower = -1024;
sum = 0;
// Fill with ones so the expected reduction result is simply n.
for(int i = 0; i < n; i++)
{
h_arr[i] = 1;
sum += h_arr[i];
}
mt *d_arr, *d_sum;
cudaMalloc(&d_arr, n * sizeof(mt));
cudaMalloc(&d_sum, sizeof(mt));
cudaMemcpy(d_arr, h_arr, n * sizeof(mt), cudaMemcpyHostToDevice);
// NOTE(review): unlike the question's code (which cudaMemset d_sum to 0),
// d_sum is never zeroed here; cudaMalloc does not guarantee zeroed memory,
// so the check below could pass or fail by luck — confirm before reusing.
//int num_clusters = ceil ((float)n / (CLUSTER_SIZE * BLOCK_SIZE)) + 1;
int num_clusters = 1; // hard-coded: with n = 128 one 4-block cluster covers all data
cluster_reduce_sum <<< CLUSTER_SIZE * num_clusters, BLOCK_SIZE >>> (n, d_arr, d_sum);
// Catches launch-configuration errors; asynchronous execution errors would
// surface at the blocking cudaMemcpy below.
cudaError_t error = cudaGetLastError();
if(error != cudaSuccess)
{
printf("CUDA error: %s\n", cudaGetErrorString(error));
return -1;
}
cudaMemcpy(h_sum, d_sum, sizeof(mt), cudaMemcpyDeviceToHost);
if (*h_sum != sum) {
printf("Kernel incorrect: %f vs %f\n", (float)sum, (float)(*h_sum));
}
return 0; // demo code: device/host allocations are intentionally not freed
}
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Fri_Jan__6_16:45:21_PST_2023
Cuda compilation tools, release 12.0, V12.0.140
Build cuda_12.0.r12.0/compiler.32267302_0
$ nvcc -o t21 t21.cu -arch=sm_90
$ compute-sanitizer ./t21
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
$
(4112521)
(The num_clusters
calculation was also incorrect, but that did not materially affect the observation.)