I have the toy code below:
#include <stdio.h>
#include <stdlib.h>

__global__ void add1InGPU( int *devArr )
{
    int i = threadIdx.x;
    devArr[i] += 1;
}

int main( void )
{
    int *h_arr = (int*)malloc( 10 * sizeof( int ) );
    int *d_arr = NULL;

    cudaMalloc( &d_arr, 10 * sizeof( int ) );
    cudaMemset( d_arr, 10, 10 * sizeof( int ) );

    add1InGPU <<<1, 10>>> ( d_arr );
    cudaDeviceSynchronize();

    cudaMemcpy( h_arr, d_arr, 10 * sizeof( int ), cudaMemcpyDeviceToHost );
    cudaFree( d_arr );

    for (int i = 0; i < 10; i++) {
        printf( "%d\n", h_arr[i] );
    }

    free( h_arr );
    return 0;
}
Why do I get 168430091 at every index of the array when I run it? It looks like a corrupted number to me.
However, if I change cudaMemset( d_arr, 10, 10 * sizeof( int ) ); to cudaMemset( d_arr, 0, 10 * sizeof( int ) );, I get the correct result: every element of the array is 1.
Could someone point out where I went wrong? Thanks in advance.
Alright, I see why now: cudaMemset, like plain memset, sets each individual byte to the target value, while an int is 4 bytes on most machines. Setting all four bytes of an int to 10 (0x0A) produces 0x0A0A0A0A = 168430090, and the kernel then adds 1, giving the 168430091 I was seeing.
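For anyone hitting the same thing, here is a minimal sketch of one way to actually set every int on the device to 10: fill a host array and copy it over with cudaMemcpy. (The variable names and the fixed size of 10 are just illustrative, mirroring the question's code.)

#include <stdio.h>
#include <stdlib.h>

int main( void )
{
    const int n = 10;
    int *h_init = (int*)malloc( n * sizeof( int ) );
    int *d_arr = NULL;

    /* cudaMemset writes bytes, so it can only produce int values whose
       four bytes are identical, e.g. 0 (0x00000000) or 0x0A0A0A0A.
       To get every int equal to 10, initialize on the host and copy. */
    for (int i = 0; i < n; i++)
        h_init[i] = 10;

    cudaMalloc( &d_arr, n * sizeof( int ) );
    cudaMemcpy( d_arr, h_init, n * sizeof( int ), cudaMemcpyHostToDevice );

    /* ... launch kernels on d_arr as before ... */

    cudaFree( d_arr );
    free( h_init );
    return 0;
}

An alternative is a tiny init kernel that writes the value element by element; either way, cudaMemset is only suitable for uniform byte patterns such as 0 or 0xFF.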