I'm trying to understand how to work with shared memory in PyCUDA. I'm running this code to flip an input vector:
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
n = 20
input = np.random.randint(10, size=(n))
output = np.zeros_like(input)
input = input.astype(np.int32)
output = output.astype(np.int32)
mod = SourceModule(
'''
__global__ void flipVectorSM(int* in, int* out, int n) {
    extern __shared__ int sData[];
    int inOffSet = blockDim.x * blockIdx.x;
    int index = inOffSet + threadIdx.x;
    if (index < n) {
        sData[blockDim.x - 1 - threadIdx.x] = in[index];
        __syncthreads();
    }
    int outOffSet = blockDim.x * (gridDim.x - 1 - blockIdx.x);
    int outIndex = outOffSet + threadIdx.x;
    out[outIndex] = sData[threadIdx.x];
}
'''
)
flip = mod.get_function('flipVectorSM')
flip(drv.In(input), drv.InOut(output), np.int32(n), block=(4, 1, 1), grid=(1, 1), shared=4)
I get this error:
---------------------------------------------------------------------------
LogicError Traceback (most recent call last)
<ipython-input-114-5b681ffa31fc> in <cell line: 15>()
13 output = output.astype(np.int32)
14
---> 15 mod = SourceModule(
16 '''
17 __global__ void flipVectorSM(int* in, int* out, int n) {
/usr/local/lib/python3.10/dist-packages/pycuda/compiler.py in __init__(self, source, nvcc, options, keep, no_extern_c, arch, code, cache_dir, include_dirs)
367 from pycuda.driver import module_from_buffer
368
--> 369 self.module = module_from_buffer(cubin)
370
371 self._bind_module()
LogicError: cuModuleLoadDataEx failed: an illegal memory access was encountered -
The global- and shared-memory logic is the same code segment I used in a CUDA C version of this program, and there it works. How can I solve this?
I changed the thread block size to match the length of the input vector, so a single block covers the whole array. The other fix is the shared-memory size: PyCUDA's shared launch argument is given in bytes, not elements, so 20 int32 values need 20 * 4 = 80 bytes (input.nbytes below). I also moved __syncthreads() out of the conditional so that every thread reaches the barrier, and guarded the output write. With this configuration it works, thanks.
import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

n = 20
input = np.random.randint(10, size=n).astype(np.int32)
output = np.zeros_like(input)

mod = SourceModule(
'''
__global__ void flipVectorSM(int* in, int* out, int n) {
    extern __shared__ int sData[];
    // Load this block's chunk into shared memory in reversed order.
    int inOffSet = blockDim.x * blockIdx.x;
    int index = inOffSet + threadIdx.x;
    if (index < n) {
        sData[blockDim.x - 1 - threadIdx.x] = in[index];
    }
    // Barrier outside the conditional so every thread reaches it.
    __syncthreads();
    // Write the reversed chunk to the mirrored block position.
    int outOffSet = blockDim.x * (gridDim.x - 1 - blockIdx.x);
    int outIndex = outOffSet + threadIdx.x;
    if (outIndex < n) {
        out[outIndex] = sData[threadIdx.x];
    }
}
'''
)
flip = mod.get_function('flipVectorSM')
# One block covers the whole vector; 'shared' is the dynamic shared-memory
# size in bytes, so 20 int32 values need 80 bytes (input.nbytes).
flip(drv.In(input), drv.Out(output), np.int32(n),
     block=(n, 1, 1), grid=(1, 1), shared=input.nbytes)
print("Input vector:")
print(input)
print("\nOutput vector:")
print(output)
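As a quick sanity check, the output should be the input reversed; a minimal verification against NumPy, using only the arrays defined above:

# The GPU result must match NumPy's own reversal of the vector.
assert np.array_equal(output, input[::-1])

Since the kernel already mirrors block indices with gridDim.x, the same launch generalizes to several smaller blocks. A hedged sketch, assuming n is an exact multiple of the block size (a partial last block would need extra index handling); block_size and grid_size are illustrative names, not part of the original code:

block_size = 4
grid_size = (n + block_size - 1) // block_size  # 5 blocks for n = 20
flip(drv.In(input), drv.Out(output), np.int32(n),
     block=(block_size, 1, 1), grid=(grid_size, 1),
     shared=block_size * input.itemsize)  # per-block shared memory, in bytes

Note that the dynamic shared-memory allocation is per block, so here it only needs to hold one block's chunk (4 * 4 = 16 bytes), not the whole vector.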