I am trying to write code in Numba CUDA. I have seen a lot of examples that deal with device memory and shared memory separately, and I got stuck and confused: can the same code or function deal with both? For example, can the code multiply numbers using shared memory at one stage and use device memory at another?
Another question: as I build the code up step by step toward calculating a fitness function, I used an array in shared memory as an intermediate stage, sD, with a reduction step following Mark Harris's presentation, which halves the number of active threads each pass and adds elements in pairs as sdata[tid] += sdata[tid+s].
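For reference, the reduction pattern I mean from the Harris slides looks roughly like this in Numba (a sketch only, not my actual code below; sdata and tid are the names used in the presentation, and it assumes a 1-D block with a shared array of blockDim.x elements):

s = cuda.blockDim.x // 2
while s > 0:
    if tid < s:
        sdata[tid] += sdata[tid + s]  # each active thread adds a pair of elements
    cuda.syncthreads()
    s //= 2                           # halve the number of active threads each pass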
When I wrote the following code, I got an error and I can't figure out why.
import numpy as np
import math
from numba import cuda, float32
@cuda.jit
def fast_matmul(A, C):
    sA = cuda.shared.array(shape=(1, TPB), dtype=float32)
    sD = cuda.shared.array(shape=(1, TPB), dtype=float32)
    thread_idx_x = cuda.threadIdx.x
    thread_idx_y = cuda.threadIdx.y
    totla_No_of_threads_x = cuda.blockDim.x
    totla_No_of_threads_y = cuda.blockDim.y
    block_idx_x = cuda.blockIdx.x
    block_idx_y = cuda.blockIdx.y
    x, y = cuda.grid(2)
    if x >= A.shape[1]:  # and y >= C.shape[1]:
        return
    s = 0
    index_1 = 1
    for i in range(int(A.shape[1] / TPB)):
        sA[thread_idx_x, thread_idx_y] = A[x, thread_idx_y + i * TPB]
        cuda.syncthreads()
        if thread_idx_y <= (totla_No_of_threads_y - index_1):
            sD[thread_idx_x, thread_idx_y] = sA[thread_idx_x, (thread_idx_y + index_1)] - sA[thread_idx_x, thread_idx_y]
        cuda.syncthreads()
        for s in range(totla_No_of_threads_y // 2):
            if thread_idx_y < s:
                sD[thread_idx_x, thread_idx_y] += sD[thread_idx_x, thread_idx_y + s]
            cuda.syncthreads()
    C[x, y] = sD[x, y]
A = np.full((1, 16), 3, dtype=np.float32)
C = np.zeros((1, 16))
print('A:', A, 'C:', C)
TPB = 32
dA = cuda.to_device(A)
dC = cuda.to_device(C)
fast_matmul[(1, 1), (32, 32)](dA, dC)
res = dC.copy_to_host()
print(res)
The error appears as:
CudaAPIError Traceback (most recent call last)
<ipython-input-214-780fde9bbab5> in <module>
5 TPB = 32
6
----> 7 dA = cuda.to_device(A)
8 dC= cuda.to_device(C)
9 fast_matmul[(8, 8), (32, 32)](dA, dC)
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\devices.py in _require_cuda_context(*args, **kws)
222 def _require_cuda_context(*args, **kws):
223 with _runtime.ensure_context():
--> 224 return fn(*args, **kws)
225
226 return _require_cuda_context
~\Anaconda3\lib\site-packages\numba\cuda\api.py in to_device(obj, stream, copy, to)
108 """
109 if to is None:
--> 110 to, new = devicearray.auto_device(obj, stream=stream, copy=copy)
111 return to
112 if copy:
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\devicearray.py in auto_device(obj, stream, copy)
764 subok=True)
765 sentry_contiguous(obj)
--> 766 devobj = from_array_like(obj, stream=stream)
767 if copy:
768 devobj.copy_to_device(obj, stream=stream)
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\devicearray.py in from_array_like(ary, stream, gpu_data)
686 "Create a DeviceNDArray object that is like ary."
687 return DeviceNDArray(ary.shape, ary.strides, ary.dtype,
--> 688 writeback=ary, stream=stream, gpu_data=gpu_data)
689
690
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\devicearray.py in __init__(self, shape, strides, dtype, stream, writeback, gpu_data)
102 self.strides,
103 self.dtype.itemsize)
--> 104 gpu_data = devices.get_context().memalloc(self.alloc_size)
105 else:
106 self.alloc_size = _driver.device_memory_size(gpu_data)
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in memalloc(self, bytesize)
1099
1100 def memalloc(self, bytesize):
-> 1101 return self.memory_manager.memalloc(bytesize)
1102
1103 def memhostalloc(self, bytesize, mapped=False, portable=False, wc=False):
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in memalloc(self, size)
849 driver.cuMemAlloc(byref(ptr), size)
850
--> 851 self._attempt_allocation(allocator)
852
853 finalizer = _alloc_finalizer(self, ptr, size)
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in _attempt_allocation(self, allocator)
709 """
710 try:
--> 711 allocator()
712 except CudaAPIError as e:
713 # is out-of-memory?
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in allocator()
847
848 def allocator():
--> 849 driver.cuMemAlloc(byref(ptr), size)
850
851 self._attempt_allocation(allocator)
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in safe_cuda_api_call(*args)
300 _logger.debug('call driver api: %s', libfn.__name__)
301 retcode = libfn(*args)
--> 302 self._check_error(fname, retcode)
303 return safe_cuda_api_call
304
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in _check_error(self, fname, retcode)
335 _logger.critical(msg, _getpid(), self.pid)
336 raise CudaDriverError("CUDA initialized before forking")
--> 337 raise CudaAPIError(retcode, msg)
338
339 def get_device(self, devnum=0):
CudaAPIError: [700] Call to cuMemAlloc results in UNKNOWN_CUDA_ERROR
Yes, you can use both. When you copy data from host to device, it starts out in "device memory" (the GPU's global memory). Thereafter, if you want to use shared memory, you will have to explicitly copy data into it from your kernel code. Likewise, when you want to return results back to host code (copy data from device to host), that data must be in device memory.
Shared memory is a smaller, scratchpad-style resource: it is on-chip, shared only among the threads of a block, and host code cannot copy into it directly.
This provides a good example/comparison.
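Here is a minimal sketch of that flow (the kernel name sum_reduce, the sizes, and the 1-D layout are mine for illustration only): the input starts in device memory, each block stages a chunk of it in shared memory, reduces it there with the halving loop described above, and writes its partial sum back to device memory, which is finally copied back to the host.

import numpy as np
from numba import cuda, float32

TPB = 128  # threads per block; must be a compile-time constant for the shared array

@cuda.jit
def sum_reduce(arr, out):
    # arr and out live in device (global) memory
    sdata = cuda.shared.array(shape=TPB, dtype=float32)  # per-block scratchpad
    tid = cuda.threadIdx.x
    i = cuda.grid(1)

    # stage one element per thread from device memory into shared memory
    sdata[tid] = arr[i] if i < arr.shape[0] else 0.0
    cuda.syncthreads()

    # tree reduction in shared memory, halving the number of active threads each pass
    s = TPB // 2
    while s > 0:
        if tid < s:
            sdata[tid] += sdata[tid + s]
        cuda.syncthreads()
        s //= 2

    # thread 0 of each block writes the block's partial sum back to device memory
    if tid == 0:
        out[cuda.blockIdx.x] = sdata[0]

a = np.ones(1024, dtype=np.float32)
d_a = cuda.to_device(a)                      # host -> device memory
d_out = cuda.device_array(1024 // TPB, dtype=np.float32)
sum_reduce[1024 // TPB, TPB](d_a, d_out)     # 8 blocks of 128 threads each
print(d_out.copy_to_host().sum())            # device -> host; prints 1024.0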