I am trying to transpose a matrix. It works as expected for some values, but with bigger ones it starts failing, sometimes only on some executions of the program.
What I am trying to do is split the matrix into n by n blocks; each block launches SEGMENT_SIZE threads and each thread processes one column of its block. For now I am assuming that MATRIX_SIZE is divisible by SEGMENT_SIZE.
Invocation and parameters:
#define MATRIX_SIZE 64
#define SEGMENT_SIZE 32
int n_block = (MATRIX_SIZE % SEGMENT_SIZE) ? MATRIX_SIZE / SEGMENT_SIZE + 1 : MATRIX_SIZE / SEGMENT_SIZE;
dim3 blocksPerGrid(n_block, n_block);
dim3 threadsPerBlock(SEGMENT_SIZE);
transposeMatrix<<<blocksPerGrid, threadsPerBlock, threadsPerBlock.x * threadsPerBlock.x * sizeof(float)>>>(d_data, MATRIX_SIZE);
Kernel:
__global__ void transposeMatrix(float *d_data, int mat_dim)
{
    // Array in Shared Memory
    extern __shared__ float sdata[];

    for (int i = 0; i < blockDim.x; i++)
    {
        sdata[blockDim.x * i + threadIdx.x] = d_data[(i + blockIdx.y * blockDim.x) * mat_dim + (blockDim.x * blockIdx.x + threadIdx.x)];
    }
    __syncthreads();
    for (int i = 0; i < blockDim.x; i++)
    {
        d_data[(i + blockIdx.y * blockDim.x) + (threadIdx.x + blockIdx.x * blockDim.x) * mat_dim] = sdata[threadIdx.x + i * blockDim.x];
    }
}
For MATRIX_SIZE 64 and SEGMENT_SIZE 32 it works as expected and never fails, but for bigger values of MATRIX_SIZE (like 480) it starts behaving unexpectedly, and on some executions the resulting matrix is not correct.
It's doubtful that an in-place transpose could work the way you have it designed here. __syncthreads() does not synchronize the entire grid, only each block. Therefore, in a larger test case, you will have some threadblocks writing their output over input data that has not yet been read in by some other threadblock. For example, the block at (blockIdx.x, blockIdx.y) = (1, 0) writes its transposed tile over the input tile of the block at (0, 1), and nothing guarantees that block (0, 1) has already read that tile.
Two options would be to:

separate the output matrix from the input matrix, so that no block ever overwrites input data another block still needs (an out-of-place transpose),

or:

keep the transpose in-place, but have each block load both of the two tiles it must swap across the diagonal into shared memory before writing either of them back out, so that no inter-block ordering matters.
(A third option, not covered here, would be to replace your usage of __syncthreads() with an appropriate grid-wide sync via a cooperative kernel launch; a rough sketch follows.)
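For illustration only, here is a minimal sketch of what that third option could look like. It is not part of the worked example below; it assumes a device that supports cooperative launch, a grid small enough to be fully co-resident on the device (a real restriction for large matrices), and possibly -rdc=true at compile time depending on CUDA version. The kernel name transposeMatrix_grid is made up for this sketch.

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void transposeMatrix_grid(float *d_data, int mat_dim)
{
    // Array in Shared Memory
    extern __shared__ float sdata[];
    cg::grid_group grid = cg::this_grid();

    // every block reads its input tile into its own shared memory
    for (int i = 0; i < blockDim.x; i++)
    {
        sdata[blockDim.x * i + threadIdx.x] = d_data[(i + blockIdx.y * blockDim.x) * mat_dim + (blockDim.x * blockIdx.x + threadIdx.x)];
    }
    grid.sync(); // grid-wide: no block writes until every block has read its tile
    for (int i = 0; i < blockDim.x; i++)
    {
        d_data[(i + blockIdx.y * blockDim.x) + (threadIdx.x + blockIdx.x * blockDim.x) * mat_dim] = sdata[threadIdx.x + i * blockDim.x];
    }
}

// such a kernel must be launched with cudaLaunchCooperativeKernel rather than <<< >>>:
// int dim = MATRIX_SIZE;
// void *args[] = {&d_data, &dim};
// cudaLaunchCooperativeKernel((void *)transposeMatrix_grid, blocksPerGrid, threadsPerBlock,
//                             args, threadsPerBlock.x * threadsPerBlock.x * sizeof(float), 0);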
Here is a worked example demonstrating both ideas:
$ cat t24.cu
#include <iostream>

__global__ void transposeMatrix(float *o_data, float *i_data, int mat_dim)
{
    // Array in Shared Memory
    extern __shared__ float sdata[];

    for (int i = 0; i < blockDim.x; i++)
    {
        sdata[blockDim.x * i + threadIdx.x] = i_data[(i + blockIdx.y * blockDim.x) * mat_dim + (blockDim.x * blockIdx.x + threadIdx.x)];
    }
    __syncthreads();
    for (int i = 0; i < blockDim.x; i++)
    {
        o_data[(i + blockIdx.y * blockDim.x) + (threadIdx.x + blockIdx.x * blockDim.x) * mat_dim] = sdata[threadIdx.x + i * blockDim.x];
    }
}

__global__ void transposeMatrix_ip(float *d_data, int mat_dim)
{
    // Array in Shared Memory
    extern __shared__ float sdata[];

    if (blockIdx.x >= blockIdx.y){ // only need blocks on or "above" the diagonal
        for (int i = 0; i < blockDim.x; i++)
        {
            sdata[blockDim.x * i + threadIdx.x] = d_data[(i + blockIdx.y * blockDim.x) * mat_dim + (blockDim.x * blockIdx.x + threadIdx.x)];
            if (blockIdx.x != blockIdx.y) sdata[(blockDim.x*blockDim.x)+blockDim.x * i + threadIdx.x] = d_data[(i + blockIdx.x * blockDim.x) * mat_dim + (blockDim.x * blockIdx.y + threadIdx.x)];
        }
        __syncthreads();
        for (int i = 0; i < blockDim.x; i++)
        {
            d_data[(i + blockIdx.y * blockDim.x) + (threadIdx.x + blockIdx.x * blockDim.x) * mat_dim] = sdata[threadIdx.x + i * blockDim.x];
            if (blockIdx.x != blockIdx.y) d_data[(i + blockIdx.x * blockDim.x) + (threadIdx.x + blockIdx.y * blockDim.x) * mat_dim] = sdata[(blockDim.x*blockDim.x)+ threadIdx.x + i * blockDim.x];
        }
    }
}

int main(){
#define MATRIX_SIZE 4000
#define SEGMENT_SIZE 32
    int n_block = (MATRIX_SIZE % SEGMENT_SIZE) ? MATRIX_SIZE / SEGMENT_SIZE + 1 : MATRIX_SIZE / SEGMENT_SIZE;
    float *d_idata, *d_odata, *h_idata, *h_odata;
    cudaMalloc(&d_idata, MATRIX_SIZE*MATRIX_SIZE*sizeof(float));
    cudaMalloc(&d_odata, MATRIX_SIZE*MATRIX_SIZE*sizeof(float));
    h_idata = new float[MATRIX_SIZE*MATRIX_SIZE];
    h_odata = new float[MATRIX_SIZE*MATRIX_SIZE];
    int q = 0;
    for (int i = 0; i < MATRIX_SIZE; i++)
        for (int j = 0; j < MATRIX_SIZE; j++)
            h_idata[i*MATRIX_SIZE+j] = q++;
    cudaMemcpy(d_idata, h_idata, MATRIX_SIZE*MATRIX_SIZE*sizeof(float), cudaMemcpyHostToDevice);
    dim3 blocksPerGrid(n_block, n_block);
    dim3 threadsPerBlock(SEGMENT_SIZE);
    // out-of-place transpose into a separate output buffer
    transposeMatrix<<<blocksPerGrid, threadsPerBlock, threadsPerBlock.x * threadsPerBlock.x * sizeof(float)>>>(d_odata, d_idata, MATRIX_SIZE);
    cudaMemcpy(h_odata, d_odata, MATRIX_SIZE*MATRIX_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
    q = 0;
    for (int i = 0; i < MATRIX_SIZE; i++)
        for (int j = 0; j < MATRIX_SIZE; j++)
            if (h_odata[j*MATRIX_SIZE+i] != q++) {std::cout << "1mismatch at: " << i << "," << j << " was: " << h_odata[j*MATRIX_SIZE+i] << " should be: " << q-1 << std::endl; break;}
    // in-place transpose; note the doubled dynamic shared memory size
    transposeMatrix_ip<<<blocksPerGrid, threadsPerBlock, 2*threadsPerBlock.x * threadsPerBlock.x * sizeof(float)>>>(d_idata, MATRIX_SIZE);
    cudaMemcpy(h_odata, d_idata, MATRIX_SIZE*MATRIX_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
    q = 0;
    for (int i = 0; i < MATRIX_SIZE; i++)
        for (int j = 0; j < MATRIX_SIZE; j++)
            if (h_odata[j*MATRIX_SIZE+i] != q++) {std::cout << "2mismatch at: " << i << "," << j << " was: " << h_odata[j*MATRIX_SIZE+i] << " should be: " << q-1 << std::endl; break;}
}
$ nvcc -o t24 t24.cu
$ compute-sanitizer ./t24
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
$
This in-place approach has each block handling 2 tiles, so we only need the blocks on or on one side of the diagonal to act. Those blocks each load 2 tiles into shared memory and then write out those two tiles, which is why twice as much shared memory per block is needed. For the blocks on the diagonal (blockIdx.x == blockIdx.y) we only need one tile operation per block, not two.
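Not in the original example, but as a rough sanity check on that doubled request: for SEGMENT_SIZE 32 it comes to 2 * 32 * 32 * 4 = 8192 bytes per block, which can be compared against the device's per-block limit with something like:

#include <cuda_runtime.h>
#include <cstdio>

int main(){
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    size_t needed = 2 * 32 * 32 * sizeof(float); // two SEGMENT_SIZE x SEGMENT_SIZE tiles
    printf("need %zu bytes of shared memory per block, device allows %zu\n",
           needed, (size_t)prop.sharedMemPerBlock);
    return 0;
}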
Also note that the matrix size (4000) here is chosen carefully to avoid overflowing the integer-storage capacity of a float quantity: a float can represent every integer exactly only up to 2^24 = 16777216, and 4000 * 4000 = 16000000 is just below that. If you make this test case much larger, it will probably fail due to the larger integers involved in the test case, not because the method is flawed.
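A quick standalone check illustrating that limit (separate from t24.cu, for illustration only):

#include <iostream>
#include <iomanip>

int main(){
    // every integer up to 2^24 = 16777216 is exactly representable in a float;
    // 4000 * 4000 = 16000000 stays below that, so the test values above survive intact
    std::cout << std::fixed << std::setprecision(0);
    std::cout << (float)15999999 << std::endl; // 15999999 : round-trips exactly
    std::cout << (float)16777217 << std::endl; // 16777216 : 2^24 + 1 collapses to 2^24
    return 0;
}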