Tags: cuda, matrix-multiplication, numba, pycuda

CUDA out of memory error when doing matrix multiplication using Numba


I need to multiply a matrix with its transpose, and I am running out of memory on my GPU. The error message is numba.cuda.cudadrv.driver.CudaAPIError: [2] Call to cuMemAlloc results in CUDA_ERROR_OUT_OF_MEMORY.

I expect my matrix to be around 10k rows and 100k columns, so multiplying it with its transpose will give a square result of 10k rows and 10k columns. The matrix only contains 0s and 1s.
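
For reference, here is a rough estimate of the memory these arrays need at that size (assuming numpy.random.randint returns int64 and cuda.device_array defaults to float64, which I believe are the defaults):

rows, cols = 10_000, 100_000            # target size described above
a_bytes = rows * cols * 8               # A as int64 (randint default): ~8 GB
c_bytes = rows * rows * 8               # C as float64 (device_array default): ~0.8 GB
print("A: %.1f GB, C: %.1f GB" % (a_bytes / 1e9, c_bytes / 1e9))
# prints: A: 8.0 GB, C: 0.8 GB -- and B = A.transpose(), when passed to the
# kernel, likely gets its own device copy of roughly the same size as A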

This is the script that I am running.

from numba import cuda, uint16
import numba
import numpy
import math
import time

TPB = 16

@cuda.jit()
def matmul_shared_mem(A, B, C):
    sA = cuda.shared.array((TPB, TPB), dtype=uint16)
    sB = cuda.shared.array((TPB, TPB), dtype=uint16)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    if x >= C.shape[0] and y >= C.shape[1]:
        return
    tmp = 0.
    for i in range(int(A.shape[1] / TPB)):
        sA[tx, ty] = A[x, ty + i * TPB]
        sB[tx, ty] = B[tx + i * TPB, y]
        cuda.syncthreads()
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]
        cuda.syncthreads()
    C[x, y] = tmp

A = numpy.random.randint(2, size=(TPB * 625, 50000))

B = A.transpose()

C_shared_mem = cuda.device_array((A.shape[0], B.shape[1]))

threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(B.shape[1] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

start_gpu_shared_memory = time.time()
matmul_shared_mem[blocks_per_grid, threads_per_block](A, B, C_shared_mem)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))

Update 1:

Based on your suggestions, I made some changes, but I am still running out of memory.

import numpy as np
import numba as nb
colm = int(200000/8)
rows = 100000
cols = int(colm*8)
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
A = np.empty((rows,colm), dtype=np.uint8)

@nb.njit('void(uint8[:,:],int8[:,:])', parallel=True)
def compute(A, AU):
    for i in nb.prange(A.shape[0]):
        for j in range(A.shape[1]):
            offset = j * 8
            res = AU[i,offset] << 7
            res |= AU[i,offset+1] << 6
            res |= AU[i,offset+2] << 5
            res |= AU[i,offset+3] << 4
            res |= AU[i,offset+4] << 3
            res |= AU[i,offset+5] << 2
            res |= AU[i,offset+6] << 1
            res |= AU[i,offset+7]
            A[i,j] = res

compute(A, AU)

from numba import cuda, uint8, int32
import numba
import numpy as np
import math
import time

TPB = 8
TPB1 = 9

@cuda.jit()
def bit_A_AT(A, C):
    sA = cuda.shared.array((TPB, TPB), dtype=uint8)
    sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    if bx >= by:
        tmp = int32(0)
        for i in range((A.shape[1]+TPB-1) // TPB):
            if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sA[ty, tx] = A[y, i*TPB+tx]
            else:
                sA[ty, tx] = 0
            if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
            else:
                sB[ty, tx] = 0
            cuda.syncthreads()
            for j in range(TPB):
                tmp1 = sA[ty,j] & sB[tx, j]
                test = uint8(1)
                for k in range(8):
                    if (tmp1 & test) > 0:
                        tmp += 1
                    test <<= 1
            cuda.syncthreads()
        if y < C.shape[0] and x < C.shape[1]:
            C[y, x] = tmp

C = np.empty((A.shape[0], A.shape[0]), dtype=np.int32)
threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

start_gpu_shared_memory = time.time()
bit_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))

Any idea how I can fix this?


Solution

  • The following method should reduce the amount of device memory required for the calculation of A x AT. We'll use the following ideas:

    - since the matrix only contains 0 and 1, store A as int8 (one byte per element) and accumulate the result in int32, instead of the float64 defaults;
    - since B is just the transpose of A, pass only A to the kernel and load both shared-memory tiles from it, so no separate transposed copy has to be allocated or transferred;
    - since A x AT is symmetric, optionally compute only the "symmetric half" of the result (see the commented-out "if bx >= by:" test in the kernel).

    Here is a worked example:

    $ cat t62.py
    from numba import cuda, int32, int8
    import numba
    import numpy as np
    import math
    import time
    
    TPB = 32
    TPB1 = TPB+1
    @cuda.jit()
    def byte_A_AT(A, C):
        sA = cuda.shared.array((TPB, TPB),  dtype=int8)
        sB = cuda.shared.array((TPB, TPB1), dtype=int8)
        x, y = cuda.grid(2)
        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x
        by = cuda.blockIdx.y
    # uncomment and indent remainder of kernel to only do the "symmetric half" of calculation
    #    if bx >= by:
        tmp = int32(0)
        for i in range((A.shape[1]+TPB-1)// TPB):
            if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sA[ty, tx] = A[y, i*TPB+tx]
            else:
                sA[ty, tx] = 0
            if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
            else:
                sB[ty, tx] = 0
            cuda.syncthreads()
            for j in range(TPB):
                tmp += int32(sA[ty,j]) * int32(sB[tx, j])
            cuda.syncthreads()
        if y < C.shape[0] and x < C.shape[1]:
            C[y, x] = tmp
    rows = 1041
    cols = 1043
    print('host mem: ', (rows*cols*2+rows*rows*4*2)//1048576, 'MB device mem: ', (rows*cols+rows*rows*4)//1048576, 'MB')
    A = np.random.randint(2,size=(rows, cols),dtype=np.int8)
    AT = A.transpose()
    CU = np.matmul(A,AT, dtype = np.int32)
    C = np.empty((A.shape[0], A.shape[0]), dtype=np.int32)
    threads_per_block = (TPB, TPB)
    blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
    blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
    blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
    byte_A_AT[blocks_per_grid, threads_per_block](A, C)
    cuda.synchronize()
    start_gpu_shared_memory = time.time()
    byte_A_AT[blocks_per_grid, threads_per_block](A, C)
    cuda.synchronize()
    end_gpu_shared_memory = time.time()
    
    time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
    print("GPU time(shared memory):" + str(time_gpu_shared))
    test = np.array_equal(C, CU)
    print(test)
    if test == False:
        for i in range(C.shape[0]):
            for j in range(C.shape[1]):
                if C[i,j] != CU[i,j]:
                    print(i, ' ' , j ,' ' , C[i,j] , ' ' , CU[i,j])
    $ python t62.py
    host mem:  10 MB device mem:  5 MB
    GPU time(shared memory):0.019593000411987305
    True
    $
    

    Notes:

    - The kernel is launched once before the timed region, so the one-time JIT compilation cost is not included in the measurement.
    - The result C is verified against a host-side np.matmul reference (CU); any mismatching elements are printed.
    - The "if bx >= by:" test can be re-enabled (with the rest of the kernel indented) to compute only the symmetric half of the result.

    Here is the one-bit-per-element version; it requires that the number of columns be evenly divisible by 8:

    $ cat t61.py
    from numba import cuda, uint8, int32
    import numba
    import numpy as np
    import math
    import time
    
    TPB = 32
    TPB1 = 33
    
    @cuda.jit()
    def bit_A_AT(A, C):
        sA = cuda.shared.array((TPB, TPB), dtype=uint8)
        sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
        x, y = cuda.grid(2)
        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x
        by = cuda.blockIdx.y
        tmp = int32(0)
        for i in range((A.shape[1]+TPB-1) // TPB):
            if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sA[ty, tx] = A[y, i*TPB+tx]
            else:
                sA[ty, tx] = 0
            if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
            else:
                sB[ty, tx] = 0
            cuda.syncthreads()
            for j in range(TPB):
                tmp1 = sA[ty,j] & sB[tx, j]
                test = uint8(1)
                for k in range(8):
                    if (tmp1 & test) > 0:
                        tmp += 1
                    test <<= 1
            cuda.syncthreads()
        if y < C.shape[0] and x < C.shape[1]:
            C[y, x] = tmp
    colm = 131
    rows = 1041
    cols = int(colm*8)
    print('host mem: ', (rows*cols*2+rows*rows*4*2)//1048576, 'MB device mem: ', (((rows*cols)//8)+rows*rows*4)//1048576, 'MB')
    AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
    AUT = AU.transpose()
    CU = np.matmul(AU,AUT,dtype=np.int32)
    A = np.empty((rows,colm), dtype=np.uint8)
    for i in range(A.shape[0]):
        for j in range(A.shape[1]):
            A[i,j] = 0
            for k in range(8):
                if AU[i,(j*8)+k] == 1:
                    A[i,j] = A[i,j] | (1<<(7-k))
    C = np.empty((A.shape[0], A.shape[0]), dtype=np.int32)
    threads_per_block = (TPB, TPB)
    blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
    blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
    blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
    bit_A_AT[blocks_per_grid, threads_per_block](A, C)
    cuda.synchronize()
    
    start_gpu_shared_memory = time.time()
    bit_A_AT[blocks_per_grid, threads_per_block](A, C)
    cuda.synchronize()
    end_gpu_shared_memory = time.time()
    
    time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
    print("GPU time(shared memory):" + str(time_gpu_shared))
    test = np.array_equal(C, CU)
    print(test)
    if test == False:
        for i in range(C.shape[0]):
            for j in range(C.shape[1]):
                if C[i,j] != CU[i,j]:
                    print(i, ' ' , j ,' ' , C[i,j] , ' ' , CU[i,j])
                    break
    $ python t61.py
    host mem:  10 MB device mem:  4 MB
    GPU time(shared memory):0.009343624114990234
    True
    $
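
    As a side check, the host-side bit-packing loop above can be verified against numpy's own packbits, which (with its default big-endian bit order) should produce the same bytes as the 1 << (7-k) packing used above:

    import numpy as np

    rows, colm = 1041, 131
    AU = np.random.randint(2, size=(rows, colm * 8), dtype=np.int8)

    # np.packbits packs 8 values per byte along the given axis, most significant
    # bit first by default, matching the A[i,j] | (1 << (7-k)) loop above.
    A_packed = np.packbits(AU.astype(np.uint8), axis=1)
    print(A_packed.shape, A_packed.dtype)   # (1041, 131) uint8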
    

    EDIT: Responding to some questions in the comments and to the updates, and now taking into account that the A matrix may have significantly more than 30k rows (which makes the C matrix grow correspondingly): if the A matrix can fit in GPU memory, we can reduce the memory demand of the C matrix by computing it in pieces. Each piece is a group of rows computed together, which I refer to as a row_slice of the C matrix. The following code demonstrates that this can be achieved with relatively minor changes to the code above:

    $ cat t63.py
    from numba import cuda, uint8, int32
    import numba as nb
    import numpy as np
    import math
    import time
    
    TPB = 32
    TPB1 = 33
    
    @cuda.jit()
    def bit_slice_A_AT(A, C, row_offset):
        sA = cuda.shared.array((TPB, TPB), dtype=uint8)
        sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
        x, y = cuda.grid(2)
        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x
        by = cuda.blockIdx.y
        tmp = int32(0)
        for i in range((A.shape[1]+TPB-1) // TPB):
            if y+row_offset < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sA[ty, tx] = A[y+row_offset, i*TPB+tx]
            else:
                sA[ty, tx] = 0
            if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
            else:
                sB[ty, tx] = 0
            cuda.syncthreads()
            for j in range(TPB):
                tmp1 = sA[ty,j] & sB[tx, j]
                test = uint8(1)
                for k in range(8):
                    if (tmp1 & test) > 0:
                        tmp += 1
                    test <<= 1
            cuda.syncthreads()
        if y < C.shape[0] and x < C.shape[1]:
            C[y, x] = tmp
    
    @nb.njit('void(uint8[:,:],int8[:,:])', parallel=True)
    def bitpack(A, AU):
        for i in nb.prange(A.shape[0]):
            for j in range(A.shape[1]):
                offset = j * 8
                res = AU[i,offset] << 7
                res |= AU[i,offset+1] << 6
                res |= AU[i,offset+2] << 5
                res |= AU[i,offset+3] << 4
                res |= AU[i,offset+4] << 3
                res |= AU[i,offset+5] << 2
                res |= AU[i,offset+6] << 1
                res |= AU[i,offset+7]
                A[i,j] = res
    
    colm = 131
    rows = 1535
    cols = int(colm*8)
    row_slice = 512
    print('host mem: ', (rows*cols*2+rows*rows*4*2)//1048576, 'MB device mem: ', (((rows*cols)//8)+row_slice*rows*4)//1048576, 'MB')
    AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
    CU = np.matmul(AU,AU.T,dtype=np.int32)
    A = np.empty((rows,colm), dtype=np.uint8)
    bitpack(A, AU)
    A_dev = cuda.to_device(A)
    threads_per_block = (TPB, TPB)
    C = np.empty((row_slice, A.shape[0]), dtype=np.int32)
    blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
    blocks_per_grid_y = int(row_slice / threads_per_block[1])
    blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
    for i in range((A.shape[0]+row_slice-1)//row_slice):
        bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C, i*row_slice)
        lower = i*row_slice
        upper = min(lower+row_slice, CU.shape[0])
        width = upper-lower
        test = np.array_equal(C[:width,:], CU[i*row_slice:i*row_slice+width,:])
        print(test)
    cuda.synchronize()
    C_dev = cuda.device_array_like(C)
    start_gpu_shared_memory = time.time()
    for i in range((A.shape[0]+row_slice-1)//row_slice):
        bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C_dev, i*row_slice)
    cuda.synchronize()
    end_gpu_shared_memory = time.time()
    
    time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
    print("GPU time(shared memory):" + str(time_gpu_shared))
    $ python t63.py
    host mem:  21 MB device mem:  3 MB
    True
    True
    True
    GPU time(shared memory):0.010116815567016602
    $
    

    This means that, for the case of rows = 100k and columns = 200k given in the latest update to the question, we should be able to divide the C matrix into chunks of, say, 5k rows. The bit-packed A matrix would then use 2.5GB of device memory, and since we only compute a 5k-row slice of C at a time, the device storage required for C would be 100k*5k*4 bytes, i.e. 2GB for this example.
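
    Spelled out as a quick check, under those assumptions the device-side footprint works out to:

    rows, cols = 100_000, 200_000   # sizes from the latest update to the question
    row_slice = 5_000               # rows of C computed per kernel launch

    a_packed_bytes = rows * (cols // 8)     # bit-packed A: one bit per element
    c_slice_bytes = row_slice * rows * 4    # one int32 row-slice of C

    print(a_packed_bytes / 1e9, "GB for A,", c_slice_bytes / 1e9, "GB per C slice")
    # prints: 2.5 GB for A, 2.0 GB per C slice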

    After some further study, we can speed up the host-code matmul operation by switching from the int8 datatype to float32. That makes the host op quite a bit faster, but the GPU code still seems to be about 4x faster than that:

    $ cat t64.py
    from numba import cuda, uint8, int32
    import numba as nb
    import numpy as np
    import math
    import time
    
    TPB = 32
    TPB1 = 33
    
    @cuda.jit()
    def bit_slice_A_AT(A, C, row_offset):
        sA = cuda.shared.array((TPB, TPB), dtype=uint8)
        sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
        x, y = cuda.grid(2)
        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x
        by = cuda.blockIdx.y
        tmp = int32(0)
        for i in range((A.shape[1]+TPB-1) // TPB):
            if y+row_offset < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sA[ty, tx] = A[y+row_offset, i*TPB+tx]
            else:
                sA[ty, tx] = 0
            if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
            else:
                sB[ty, tx] = 0
            cuda.syncthreads()
            for j in range(TPB):
                tmp1 = sA[ty,j] & sB[tx, j]
                test = uint8(1)
                for k in range(8):
                    if (tmp1 & test) > 0:
                        tmp += 1
                    test <<= 1
            cuda.syncthreads()
        if y < C.shape[0] and x < C.shape[1]:
            C[y, x] = tmp
    
    @nb.njit('void(uint8[:,:],float32[:,:])', parallel=True)
    def bitpack(A, AU):
        for i in nb.prange(A.shape[0]):
            for j in range(A.shape[1]):
                offset = j * 8
                res = int(AU[i,offset]) << 7
                res |= int(AU[i,offset+1]) << 6
                res |= int(AU[i,offset+2]) << 5
                res |= int(AU[i,offset+3]) << 4
                res |= int(AU[i,offset+4]) << 3
                res |= int(AU[i,offset+5]) << 2
                res |= int(AU[i,offset+6]) << 1
                res |= int(AU[i,offset+7])
                A[i,j] = res
    
    colm = 1000
    rows = 6000
    cols = int(colm*8)
    row_slice = 512
    print('host mem: ', (rows*cols*4+rows*colm+rows*rows*4+rows*row_slice*4)//1048576, 'MB device mem: ', (((rows*cols)//8)+row_slice*rows*4)//1048576, 'MB')
    t1 = time.time()
    AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
    AU = AU.astype(np.float32)
    print("randint:" + str(time.time()-t1))
    t1 = time.time()
    #CU = np.empty((rows, rows), dtype=np.int32)
    CU = np.matmul(AU,AU.T,dtype=np.float32)
    print("matmul:" + str(time.time()-t1))
    t1 = time.time()
    A = np.empty((rows,colm), dtype=np.uint8)
    print("np.empty:" + str(time.time()-t1))
    t1 = time.time()
    bitpack(A, AU)
    print("bitpack:" + str(time.time()-t1))
    t1 = time.time()
    A_dev = cuda.to_device(A)
    threads_per_block = (TPB, TPB)
    C = np.empty((row_slice, A.shape[0]), dtype=np.int32)
    blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
    blocks_per_grid_y = int(row_slice / threads_per_block[1])
    blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
    for i in range((A.shape[0]+row_slice-1)//row_slice):
        bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C, i*row_slice)
        lower = i*row_slice
        upper = min(lower+row_slice, CU.shape[0])
        width = upper-lower
        test = np.array_equal(C[:width,:], CU[i*row_slice:i*row_slice+width,:])
        print(test)
    cuda.synchronize()
    C_dev = cuda.device_array_like(C)
    start_gpu_shared_memory = time.time()
    for i in range((A.shape[0]+row_slice-1)//row_slice):
        bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C_dev, i*row_slice)
    cuda.synchronize()
    end_gpu_shared_memory = time.time()
    
    time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
    print("GPU time(shared memory):" + str(time_gpu_shared))
    $ python t64.py
    host mem:  337 MB device mem:  17 MB
    randint:0.1817936897277832
    matmul:3.498671531677246
    np.empty:7.62939453125e-05
    bitpack:0.03707313537597656
    True
    True
    True
    True
    True
    True
    True
    True
    True
    True
    True
    True
    GPU time(shared memory):0.8318064212799072
    $
    

    I haven't thoroughly tested these codes. Bugs may exist. Use at your own risk.

    For attribution, the numba bit-packing code seems to have come from here.