I'm trying to implement a blocking lock (mutex) in CUDA, loosely following this tutorial.
I have tried to implement that code but the loop hangs.
Similarly, I have implemented this in a kernel, where the mutex is allocated on the device:
// Host-side setup: allocate a single int in device memory to act as the
// mutex, initialise it to 0, and launch the test kernel on 1 block of 9 threads.
// NOTE(review): the cudaError_t results of the API calls and of the launch are
// not checked in this snippet.
int state = 0;   // initial mutex value that is copied to the device
int * mutex;     // receives the device address; not dereferenced on the host
// Fixed: the runtime API function is spelled cudaMalloc (lowercase 'c');
// "CudaMalloc" is not declared by the CUDA runtime and fails to compile.
cudaMalloc(&mutex, sizeof(int) );
cudaMemcpy(mutex, &state , sizeof(int), cudaMemcpyHostToDevice);
mykernel<<< 1, 9>>>(mutex);
// Debug kernel: every thread prints the value behind `mutex` before, during
// and after one atomicCAS attempt followed by one atomicExch reset.
//   mutex - pointer to a single int used as the lock word; the atomicCAS below
//           swaps it from 0 to 1, and the atomicExch writes 0 back.
// NOTE: the plain `*mutex` reads passed to printf are ordinary (non-atomic)
// loads, so each printed MUTEX value is only a snapshot that may race with
// the atomics issued by other threads.
__global__ void mykernel(int * mutex){
// Global thread index across the whole grid.
int ti = threadIdx.x+blockDim.x*blockIdx.x;
printf("TESTATOM0 -- THREAD%d MUTEX:%d\n", ti, *mutex);
// atomicCAS(mutex, 0, 1) stores 1 only if *mutex is currently 0 and returns
// the value it found, so RETURN:0 marks the one thread whose swap succeeded.
// NOTE(review): C++ does not specify whether the `*mutex` argument is read
// before or after the atomicCAS call in this argument list.
printf("TESTATOM1 -- THREAD%d MUTEX:%d RETURN:%d\n", ti, *mutex, atomicCAS(mutex, 0, 1));
printf("TESTATOM2 -- THREAD%d MUTEX:%d\n", ti, *mutex);
// Every thread writes 0 here, whether or not its own atomicCAS succeeded.
atomicExch(mutex, 0);
printf("TESTATOM3 -- THREAD%d MUTEX:%d\n", ti, *mutex);
}
The output:
TESTATOM0 -- THREAD0 MUTEX:0
TESTATOM0 -- THREAD1 MUTEX:0
TESTATOM0 -- THREAD2 MUTEX:0
TESTATOM0 -- THREAD3 MUTEX:0
TESTATOM0 -- THREAD4 MUTEX:0
TESTATOM0 -- THREAD5 MUTEX:0
TESTATOM0 -- THREAD6 MUTEX:0
TESTATOM0 -- THREAD7 MUTEX:0
TESTATOM0 -- THREAD8 MUTEX:0
TESTATOM1 -- THREAD0 MUTEX:0 RETURN:0
TESTATOM1 -- THREAD1 MUTEX:0 RETURN:1
TESTATOM1 -- THREAD2 MUTEX:0 RETURN:1
TESTATOM1 -- THREAD3 MUTEX:0 RETURN:1
TESTATOM1 -- THREAD4 MUTEX:0 RETURN:1
TESTATOM1 -- THREAD5 MUTEX:0 RETURN:1
TESTATOM1 -- THREAD6 MUTEX:0 RETURN:1
TESTATOM1 -- THREAD7 MUTEX:0 RETURN:1
TESTATOM1 -- THREAD8 MUTEX:0 RETURN:1
TESTATOM2 -- THREAD0 MUTEX:1
TESTATOM2 -- THREAD1 MUTEX:1
TESTATOM2 -- THREAD2 MUTEX:1
TESTATOM2 -- THREAD3 MUTEX:1
TESTATOM2 -- THREAD4 MUTEX:1
TESTATOM2 -- THREAD5 MUTEX:1
TESTATOM2 -- THREAD6 MUTEX:1
TESTATOM2 -- THREAD7 MUTEX:1
TESTATOM2 -- THREAD8 MUTEX:1
TESTATOM3 -- THREAD0 MUTEX:0
TESTATOM3 -- THREAD1 MUTEX:0
TESTATOM3 -- THREAD2 MUTEX:0
TESTATOM3 -- THREAD3 MUTEX:0
TESTATOM3 -- THREAD4 MUTEX:0
TESTATOM3 -- THREAD5 MUTEX:0
TESTATOM3 -- THREAD6 MUTEX:0
TESTATOM3 -- THREAD7 MUTEX:0
TESTATOM3 -- THREAD8 MUTEX:0
Notice that the first thread returns 0 and the rest return 1.
But when I try to implement this in a blocking while loop:
// Spin-lock variant of the kernel body: acquire the mutex, print while
// holding it, release it, then print again.
printf("TESTATOM0 -- THREAD%d MUTEX:%d\n", ti, *mutex);
// Busy-wait (empty loop body) for as long as atomicCAS returns 1, i.e. until
// this thread itself finds 0 in *mutex and swaps in 1.
// NOTE(review): this is the loop the author reports as hanging; the post
// states that compiling with -arch=sm_75 resolves it. Presumably the default
// build lets the spinning threads of the warp starve the thread that holds
// the lock - confirm against the CUDA documentation.
while (atomicCAS(mutex, 0, 1) == 1);
// Critical section: reached only after this thread's atomicCAS returned a
// value other than 1.
for(int i = 0; i < 5; i++){
printf("BLOCKED -- THREAD%d MUTEX:%d\n", ti, *mutex);
}
// Release: write 0 back so that another thread's atomicCAS can succeed.
atomicExch(mutex, 0);
for(int i = 0; i < 5; i++){
printf("UNBLOCKED -- THREAD%d MUTEX:%d\n", ti, *mutex);
}
The output is:
TESTATOM0 -- THREAD0 MUTEX:0
TESTATOM0 -- THREAD1 MUTEX:0
TESTATOM0 -- THREAD2 MUTEX:0
TESTATOM0 -- THREAD3 MUTEX:0
TESTATOM0 -- THREAD4 MUTEX:0
TESTATOM0 -- THREAD5 MUTEX:0
TESTATOM0 -- THREAD6 MUTEX:0
TESTATOM0 -- THREAD7 MUTEX:0
TESTATOM0 -- THREAD8 MUTEX:0
The program then hangs and I have to exit from the command line.
Compiled and run as:
nvcc -o ppmer -ccbin "D:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.29.30133\bin\HostX86\x64\cl.exe" ppmwriter.cu > .\outputs\AA_CURRENT_ERROR.txt
ppmer > .\outputs\AA_CURRENT_OUTPUT.txt
I'm using an NVIDIA GeForce GTX 1650 with Max-Q Design.
I assume the loop is not terminating, but I have no clue why.
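As per the comments from paleonix, the solution is to compile for the GPU's actual architecture (the GTX 1650 is a compute capability 7.5 device) rather than nvcc's default target, using the compile option: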
-arch=sm_75