cuda · numba · gpu · numba-pro

How to use shared memory and global memory, and is it possible to use shared memory as an intermediate stage in a calculation?


I am trying to write code in Numba CUDA. I have seen a lot of examples that deal with device memory and shared memory separately, and I got stuck and confused. Can the same code or function deal with both? For example, can the code multiply numbers using shared memory at one stage and use device memory at another?

Another question: as I build the code up step by step to calculate a fitness function, I used a shared-memory array sD as an intermediate stage, with a reduction step following Mark Harris's presentation, using half the threads at each step and adding pairs of elements as sdata[tid] += sdata[tid + s].
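The pattern from that presentation, as I understand it, looks roughly like this (just a sketch to show what I mean, not my actual code; block_reduce and the sizes are placeholders):

from numba import cuda, float32

TPB = 32  # threads per block, assumed to be a power of two

@cuda.jit
def block_reduce(x, out):
    # Each block sums TPB elements of x into one entry of out,
    # using a shared-memory array as the intermediate stage.
    sdata = cuda.shared.array(TPB, dtype=float32)
    tid = cuda.threadIdx.x
    i = cuda.grid(1)

    if i < x.shape[0]:
        sdata[tid] = x[i]        # global -> shared
    else:
        sdata[tid] = 0.0
    cuda.syncthreads()

    # tree reduction: half of the active threads drop out at each step
    s = TPB // 2
    while s > 0:
        if tid < s:
            sdata[tid] += sdata[tid + s]
        cuda.syncthreads()
        s //= 2

    if tid == 0:
        out[cuda.blockIdx.x] = sdata[0]   # shared -> global

Launched as block_reduce[num_blocks, TPB](d_x, d_out), it leaves one partial sum per block in out.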

When I wrote the following code, I got an error and I can't figure out why.

import numpy as np
import math
from numba import cuda, float32

@cuda.jit
def fast_matmul(A, C):
    
    sA = cuda.shared.array(shape=(1, TPB), dtype=float32)
    sD = cuda.shared.array(shape=(1, TPB), dtype=float32)

    thread_idx_x = cuda.threadIdx.x
    thread_idx_y = cuda.threadIdx.y
    totla_No_of_threads_x = cuda.blockDim.x
    totla_No_of_threads_y = cuda.blockDim.y
    block_idx_x = cuda.blockIdx.x
    block_idx_y = cuda.blockIdx.y
    
    x, y = cuda.grid(2)
    
    if x >= A.shape[1]: #and y >= C.shape[1]:
        return
    
    s = 0
    index_1 = 1
    for i in range(int(A.shape[1] / TPB)):
        sA[thread_idx_x, thread_idx_y] = A[x, thread_idx_y + i * TPB]
        cuda.syncthreads()

        if thread_idx_y <= (totla_No_of_threads_y-index_1):
            sD[thread_idx_x, thread_idx_y] = sA[thread_idx_x, (thread_idx_y +index_1)] - sA[thread_idx_x, thread_idx_y]
            cuda.syncthreads()
            
        for s in range(totla_No_of_threads_y//2):
            if thread_idx_y < s:
                sD[thread_idx_x, thread_idx_y] += sD[thread_idx_x, thread_idx_y+s]
            cuda.syncthreads()
            C[x, y] = sD[x,y]



A = np.full((1, 16), 3, dtype=np.float32)
C = np.zeros((1, 16))

print('A:', A, 'C:', C)
TPB = 32

dA = cuda.to_device(A)
dC= cuda.to_device(C)
fast_matmul[(1, 1), (32, 32)](dA, dC)
res= dC.copy_to_host()

print(res)

The error appears as:

CudaAPIError                              Traceback (most recent call last)
<ipython-input-214-780fde9bbab5> in <module>
      5 TPB = 32
      6 
----> 7 dA = cuda.to_device(A)
      8 dC= cuda.to_device(C)
      9 fast_matmul[(8, 8), (32, 32)](dA, dC)

~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\devices.py in _require_cuda_context(*args, **kws)
    222     def _require_cuda_context(*args, **kws):
    223         with _runtime.ensure_context():
--> 224             return fn(*args, **kws)
    225 
    226     return _require_cuda_context

~\Anaconda3\lib\site-packages\numba\cuda\api.py in to_device(obj, stream, copy, to)
    108     """
    109     if to is None:
--> 110         to, new = devicearray.auto_device(obj, stream=stream, copy=copy)
    111         return to
    112     if copy:

~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\devicearray.py in auto_device(obj, stream, copy)
    764                 subok=True)
    765             sentry_contiguous(obj)
--> 766             devobj = from_array_like(obj, stream=stream)
    767         if copy:
    768             devobj.copy_to_device(obj, stream=stream)

~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\devicearray.py in from_array_like(ary, stream, gpu_data)
    686     "Create a DeviceNDArray object that is like ary."
    687     return DeviceNDArray(ary.shape, ary.strides, ary.dtype,
--> 688                          writeback=ary, stream=stream, gpu_data=gpu_data)
    689 
    690 

~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\devicearray.py in __init__(self, shape, strides, dtype, stream, writeback, gpu_data)
    102                                                                 self.strides,
    103                                                                 self.dtype.itemsize)
--> 104                 gpu_data = devices.get_context().memalloc(self.alloc_size)
    105             else:
    106                 self.alloc_size = _driver.device_memory_size(gpu_data)

~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in memalloc(self, bytesize)
   1099 
   1100     def memalloc(self, bytesize):
-> 1101         return self.memory_manager.memalloc(bytesize)
   1102 
   1103     def memhostalloc(self, bytesize, mapped=False, portable=False, wc=False):

~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in memalloc(self, size)
    849             driver.cuMemAlloc(byref(ptr), size)
    850 
--> 851         self._attempt_allocation(allocator)
    852 
    853         finalizer = _alloc_finalizer(self, ptr, size)

~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in _attempt_allocation(self, allocator)
    709         """
    710         try:
--> 711             allocator()
    712         except CudaAPIError as e:
    713             # is out-of-memory?

~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in allocator()
    847 
    848         def allocator():
--> 849             driver.cuMemAlloc(byref(ptr), size)
    850 
    851         self._attempt_allocation(allocator)

~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in safe_cuda_api_call(*args)
    300             _logger.debug('call driver api: %s', libfn.__name__)
    301             retcode = libfn(*args)
--> 302             self._check_error(fname, retcode)
    303         return safe_cuda_api_call
    304 

~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in _check_error(self, fname, retcode)
    335                     _logger.critical(msg, _getpid(), self.pid)
    336                     raise CudaDriverError("CUDA initialized before forking")
--> 337             raise CudaAPIError(retcode, msg)
    338 
    339     def get_device(self, devnum=0):

CudaAPIError: [700] Call to cuMemAlloc results in UNKNOWN_CUDA_ERROR

Solution

  • Yes, you can use both. When you copy data from host to device, it starts out in "device memory" (global memory). Thereafter, if you want to use shared memory, you have to explicitly copy data into it from your kernel code. Likewise, when you want to return results to host code (copy data from device to host), that data must be in "device memory" (see the sketch below).

    Shared memory is a smaller, scratchpad-style resource: fast on-chip memory private to each thread block, typically used as an explicitly managed intermediate stage.

    This provides a good example/comparison.
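
    To make the flow concrete, here is a minimal sketch (not a fix for the kernel in the question; adjacent_diff, the sizes, and the launch configuration are just illustrative). The arrays the kernel receives live in global (device) memory; the kernel stages a tile of them into shared memory, computes there, and writes the result back to global memory so it can be copied to the host:

    import numpy as np
    from numba import cuda, float32

    TPB = 32  # threads per block

    @cuda.jit
    def adjacent_diff(a, out):
        # a and out live in global (device) memory; shared memory is used
        # only inside the kernel, as an explicitly managed staging area.
        tile = cuda.shared.array(TPB, dtype=float32)
        tid = cuda.threadIdx.x
        i = cuda.grid(1)

        if i < a.shape[0]:
            tile[tid] = a[i]                     # global -> shared
        cuda.syncthreads()

        if i + 1 < a.shape[0] and tid + 1 < TPB:
            out[i] = tile[tid + 1] - tile[tid]   # shared -> global
        elif i < a.shape[0]:
            out[i] = 0.0                         # last element of each tile

    a = np.arange(64, dtype=np.float32)
    d_a = cuda.to_device(a)                      # host -> device (global memory)
    d_out = cuda.device_array_like(d_a)
    adjacent_diff[2, TPB](d_a, d_out)
    print(d_out.copy_to_host())                  # 1.0 everywhere except the last element of each tile

    The only host/device transfers are cuda.to_device and copy_to_host; shared memory never appears outside the kernel.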