I need to multiply a matrix with its transpose, and I am running out of memory on my GPU with the error message:
numba.cuda.cudadrv.driver.CudaAPIError: [2] Call to cuMemAlloc results in CUDA_ERROR_OUT_OF_MEMORY
I am expecting the size of my matrix to be around 10k rows and 100k columns, so multiplying it with its transpose will give a square matrix of 10k rows and 10k columns. The matrix only contains 0 and 1.
This is the script that I am running.
from numba import cuda, uint16
import numba
import numpy
import math
import time
TPB = 16
@cuda.jit()
def matmul_shared_mem(A, B, C):
    sA = cuda.shared.array((TPB, TPB), dtype=uint16)
    sB = cuda.shared.array((TPB, TPB), dtype=uint16)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    if x >= C.shape[0] and y >= C.shape[1]:
        return
    tmp = 0.
    for i in range(int(A.shape[1] / TPB)):
        sA[tx, ty] = A[x, ty + i * TPB]
        sB[tx, ty] = B[tx + i * TPB, y]
        cuda.syncthreads()
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]
        cuda.syncthreads()
    C[x, y] = tmp
A = numpy.random.randint(2, size=(TPB * 625, 50000))
B = A.transpose()
C_shared_mem = cuda.device_array((A.shape[0], B.shape[1]))
threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(B.shape[1] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
start_gpu_shared_memory = time.time()
matmul_shared_mem[blocks_per_grid, threads_per_block](A, B, C_shared_mem)
cuda.synchronize()
end_gpu_shared_memory = time.time()
time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
Update 1:
Based on your suggestions, I made the changes below, but I am still running out of memory.
import numpy as np
import numba as nb
colm = int(200000/8)
rows = 100000
cols = int(colm*8)
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
A = np.empty((rows,colm), dtype=np.uint8)
@nb.njit('void(uint8[:,:],int8[:,:])', parallel=True)
def compute(A, AU):
    for i in nb.prange(A.shape[0]):
        for j in range(A.shape[1]):
            offset = j * 8
            res = AU[i,offset] << 7
            res |= AU[i,offset+1] << 6
            res |= AU[i,offset+2] << 5
            res |= AU[i,offset+3] << 4
            res |= AU[i,offset+4] << 3
            res |= AU[i,offset+5] << 2
            res |= AU[i,offset+6] << 1
            res |= AU[i,offset+7]
            A[i,j] = res
compute(A, AU)
from numba import cuda, uint8, int32
import numba
import numpy as np
import math
import time
TPB = 8
TPB1 = 9
@cuda.jit()
def bit_A_AT(A, C):
    sA = cuda.shared.array((TPB, TPB), dtype=uint8)
    sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    if bx >= by:
        tmp = int32(0)
        for i in range((A.shape[1]+TPB-1) // TPB):
            if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sA[ty, tx] = A[y, i*TPB+tx]
            else:
                sA[ty, tx] = 0
            if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
            else:
                sB[ty, tx] = 0
            cuda.syncthreads()
            for j in range(TPB):
                tmp1 = sA[ty,j] & sB[tx, j]
                test = uint8(1)
                for k in range(8):
                    if (tmp1 & test) > 0:
                        tmp += 1
                    test <<= 1
            cuda.syncthreads()
        if y < C.shape[0] and x < C.shape[1]:
            C[y, x] = tmp
C = np.empty((A.shape[0], A.shape[0]), dtype=np.int32)
threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
start_gpu_shared_memory = time.time()
bit_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
end_gpu_shared_memory = time.time()
time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
Any idea how I can fix this?
The following method should reduce the amount of device memory required for the calculation of A x AT. We'll use the following ideas:

- Since the A matrix only takes on values of 0,1, we'll reduce the storage for that array down to the minimum convenient size, int8, i.e. one byte per element.
- Since the B array is just the transpose of the A array, there is no need to handle it explicitly. We can derive it from the A array, somewhat similar to here, although that is performing AT x A.
- The shared-memory tile for the second operand (sB) is loaded from the A array using adjusted indexing.
- The syncthreads usage is adjusted, and the code is modified to allow arbitrary values for the row and column dimensions.

Here is a worked example:
$ cat t62.py
from numba import cuda, int32, int8
import numba
import numpy as np
import math
import time
TPB = 32
TPB1 = TPB+1
@cuda.jit()
def byte_A_AT(A, C):
    sA = cuda.shared.array((TPB, TPB), dtype=int8)
    sB = cuda.shared.array((TPB, TPB1), dtype=int8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    # uncomment and indent remainder of kernel to only do the "symmetric half" of calculation
    # if bx >= by:
    tmp = int32(0)
    for i in range((A.shape[1]+TPB-1)// TPB):
        if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sA[ty, tx] = A[y, i*TPB+tx]
        else:
            sA[ty, tx] = 0
        if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
        else:
            sB[ty, tx] = 0
        cuda.syncthreads()
        for j in range(TPB):
            tmp += int32(sA[ty,j]) * int32(sB[tx, j])
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = tmp
rows = 1041
cols = 1043
print('host mem: ', (rows*cols*2+rows*rows*4*2)//1048576, 'MB device mem: ', (rows*cols+rows*rows*4)//1048576, 'MB')
A = np.random.randint(2,size=(rows, cols),dtype=np.int8)
AT = A.transpose()
CU = np.matmul(A,AT, dtype = np.int32)
C = np.empty((A.shape[0], A.shape[0]), dtype=np.int32)
threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
byte_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
start_gpu_shared_memory = time.time()
byte_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
end_gpu_shared_memory = time.time()
time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
test = np.array_equal(C, CU)
print(test)
if test == False:
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            if C[i,j] != CU[i,j]:
                print(i, ' ' , j ,' ' , C[i,j] , ' ' , CU[i,j])
$ python t62.py
host mem: 10 MB device mem: 5 MB
GPU time(shared memory):0.019593000411987305
True
$
Notes:

- Most of the run time of the above code is spent on the host (in the np.matmul() operation, which is really only used to verify results; it should not be necessary for an actual implementation), not in the GPU portion. As the matrices are made larger, the code will run much more slowly.
- The result of A x AT is symmetric, and the kernel can partially take advantage of this by uncommenting the if bx >= by: test near the beginning of the kernel and indenting the remainder of the kernel. Doing so leaves part of C unwritten and would cause the np.array_equal test to fail, of course.
- It would be possible to bit-pack the A matrix, which would further reduce memory demand; however, writing this code to handle the case of arbitrary column dimensions (vs. multiples of 8) proves to be rather messy. However, that code could get the total device memory consumption down to about 5GB for the 30k rows and 200k columns case.

Here is the one-bit-per-element version; it has a requirement that the number of columns be whole-number divisible by 8:
$ cat t61.py
from numba import cuda, uint8, int32
import numba
import numpy as np
import math
import time
TPB = 32
TPB1 = 33
@cuda.jit()
def bit_A_AT(A, C):
    sA = cuda.shared.array((TPB, TPB), dtype=uint8)
    sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    tmp = int32(0)
    for i in range((A.shape[1]+TPB-1) // TPB):
        if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sA[ty, tx] = A[y, i*TPB+tx]
        else:
            sA[ty, tx] = 0
        if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
        else:
            sB[ty, tx] = 0
        cuda.syncthreads()
        for j in range(TPB):
            tmp1 = sA[ty,j] & sB[tx, j]
            test = uint8(1)
            for k in range(8):
                if (tmp1 & test) > 0:
                    tmp += 1
                test <<= 1
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = tmp
colm = 131
rows = 1041
cols = int(colm*8)
print('host mem: ', (rows*cols*2+rows*rows*4*2)//1048576, 'MB device mem: ', (((rows*cols)//8)+rows*rows*4)//1048576, 'MB')
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
AUT = AU.transpose()
CU = np.matmul(AU,AUT,dtype=np.int32)
A = np.empty((rows,colm), dtype=np.uint8)
for i in range(A.shape[0]):
    for j in range(A.shape[1]):
        A[i,j] = 0
        for k in range(8):
            if AU[i,(j*8)+k] == 1:
                A[i,j] = A[i,j] | (1<<(7-k))
C = np.empty((A.shape[0], A.shape[0]), dtype=np.int32)
threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
bit_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
start_gpu_shared_memory = time.time()
bit_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
end_gpu_shared_memory = time.time()
time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
test = np.array_equal(C, CU)
print(test)
if test == False:
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            if C[i,j] != CU[i,j]:
                print(i, ' ' , j ,' ' , C[i,j] , ' ' , CU[i,j])
                break
$ python t61.py
host mem: 10 MB device mem: 4 MB
GPU time(shared memory):0.009343624114990234
True
$
EDIT: Responding to some questions in the comments and updates, and now taking into account that the A matrix may have significantly more than 30k rows: this will cause the C matrix to increase in size as well. If the A matrix can fit in GPU memory, we can reduce the memory demand for the C matrix by computing it in pieces. Each piece is a group of rows computed together, which I refer to as a row_slice of the C matrix. The following code demonstrates that this can be achieved with relatively minor changes to the code above:
$ cat t63.py
from numba import cuda, uint8, int32
import numba as nb
import numpy as np
import math
import time
TPB = 32
TPB1 = 33
@cuda.jit()
def bit_slice_A_AT(A, C, row_offset):
    sA = cuda.shared.array((TPB, TPB), dtype=uint8)
    sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    tmp = int32(0)
    for i in range((A.shape[1]+TPB-1) // TPB):
        if y+row_offset < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sA[ty, tx] = A[y+row_offset, i*TPB+tx]
        else:
            sA[ty, tx] = 0
        if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
        else:
            sB[ty, tx] = 0
        cuda.syncthreads()
        for j in range(TPB):
            tmp1 = sA[ty,j] & sB[tx, j]
            test = uint8(1)
            for k in range(8):
                if (tmp1 & test) > 0:
                    tmp += 1
                test <<= 1
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = tmp
@nb.njit('void(uint8[:,:],int8[:,:])', parallel=True)
def bitpack(A, AU):
    for i in nb.prange(A.shape[0]):
        for j in range(A.shape[1]):
            offset = j * 8
            res = AU[i,offset] << 7
            res |= AU[i,offset+1] << 6
            res |= AU[i,offset+2] << 5
            res |= AU[i,offset+3] << 4
            res |= AU[i,offset+4] << 3
            res |= AU[i,offset+5] << 2
            res |= AU[i,offset+6] << 1
            res |= AU[i,offset+7]
            A[i,j] = res
colm = 131
rows = 1535
cols = int(colm*8)
row_slice = 512
print('host mem: ', (rows*cols*2+rows*rows*4*2)//1048576, 'MB device mem: ', (((rows*cols)//8)+row_slice*rows*4)//1048576, 'MB')
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
CU = np.matmul(AU,AU.T,dtype=np.int32)
A = np.empty((rows,colm), dtype=np.uint8)
bitpack(A, AU)
A_dev = cuda.to_device(A)
threads_per_block = (TPB, TPB)
C = np.empty((row_slice, A.shape[0]), dtype=np.int32)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(row_slice / threads_per_block[1])
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C, i*row_slice)
    lower = i*row_slice
    upper = min(lower+row_slice, CU.shape[0])
    width = upper-lower
    test = np.array_equal(C[:width,:], CU[i*row_slice:i*row_slice+width,:])
    print(test)
cuda.synchronize()
C_dev = cuda.device_array_like(C)
start_gpu_shared_memory = time.time()
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C_dev, i*row_slice)
cuda.synchronize()
end_gpu_shared_memory = time.time()
time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
$ python t63.py
host mem: 21 MB device mem: 3 MB
True
True
True
GPU time(shared memory):0.010116815567016602
$
This means, as suggested, for the case of rows = 100k and columns = 200k as given in the latest update to the question, we should be able to divide the C matrix into chunks of, say, 5k rows. The memory usage for the A matrix would be 2.5GB, but for the C matrix, since we are only computing a 5k-row slice at a time, the device memory required would be 100k*5k*4 bytes, so 2GB for this example.
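To spell out that arithmetic, here is a quick sanity-check sketch (the row_slice of 5000 is just the example value chosen above):
# Device-memory estimate for the sliced approach at the sizes from the latest update.
# Sketch only; row_slice = 5000 is just the example value discussed above.
rows, cols, row_slice = 100000, 200000, 5000

a_packed_bytes = rows * (cols // 8)                 # bit-packed A: 8 columns per byte
c_slice_bytes = row_slice * rows * 4                # one int32 row slice of C at a time
num_slices = (rows + row_slice - 1) // row_slice    # kernel launches needed

print(a_packed_bytes / 1e9, "GB for A")             # 2.5 GB
print(c_slice_bytes / 1e9, "GB per C slice")        # 2.0 GB
print(num_slices, "slices")                         # 20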
After some further study, we can speed up the host code matmul operation by switching from the int8 datatype to the float32 datatype. This makes that op quite a bit faster, but the GPU code still seems to be about 4x faster than that:
$ cat t64.py
from numba import cuda, uint8, int32
import numba as nb
import numpy as np
import math
import time
TPB = 32
TPB1 = 33
@cuda.jit()
def bit_slice_A_AT(A, C, row_offset):
    sA = cuda.shared.array((TPB, TPB), dtype=uint8)
    sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    tmp = int32(0)
    for i in range((A.shape[1]+TPB-1) // TPB):
        if y+row_offset < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sA[ty, tx] = A[y+row_offset, i*TPB+tx]
        else:
            sA[ty, tx] = 0
        if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
        else:
            sB[ty, tx] = 0
        cuda.syncthreads()
        for j in range(TPB):
            tmp1 = sA[ty,j] & sB[tx, j]
            test = uint8(1)
            for k in range(8):
                if (tmp1 & test) > 0:
                    tmp += 1
                test <<= 1
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = tmp
@nb.njit('void(uint8[:,:],float32[:,:])', parallel=True)
def bitpack(A, AU):
    for i in nb.prange(A.shape[0]):
        for j in range(A.shape[1]):
            offset = j * 8
            res = int(AU[i,offset]) << 7
            res |= int(AU[i,offset+1]) << 6
            res |= int(AU[i,offset+2]) << 5
            res |= int(AU[i,offset+3]) << 4
            res |= int(AU[i,offset+4]) << 3
            res |= int(AU[i,offset+5]) << 2
            res |= int(AU[i,offset+6]) << 1
            res |= int(AU[i,offset+7])
            A[i,j] = res
colm = 1000
rows = 6000
cols = int(colm*8)
row_slice = 512
print('host mem: ', (rows*cols*4+rows*colm+rows*rows*4+rows*row_slice*4)//1048576, 'MB device mem: ', (((rows*cols)//8)+row_slice*rows*4)//1048576, 'MB')
t1 = time.time()
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
AU = AU.astype(np.float32)
print("randint:" + str(time.time()-t1))
t1 = time.time()
#CU = np.empty((rows, rows), dtype=np.int32)
CU = np.matmul(AU,AU.T,dtype=np.float32)
print("matmul:" + str(time.time()-t1))
t1 = time.time()
A = np.empty((rows,colm), dtype=np.uint8)
print("np.empty:" + str(time.time()-t1))
t1 = time.time()
bitpack(A, AU)
print("bitpack:" + str(time.time()-t1))
t1 = time.time()
A_dev = cuda.to_device(A)
threads_per_block = (TPB, TPB)
C = np.empty((row_slice, A.shape[0]), dtype=np.int32)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(row_slice / threads_per_block[1])
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C, i*row_slice)
    lower = i*row_slice
    upper = min(lower+row_slice, CU.shape[0])
    width = upper-lower
    test = np.array_equal(C[:width,:], CU[i*row_slice:i*row_slice+width,:])
    print(test)
cuda.synchronize()
C_dev = cuda.device_array_like(C)
start_gpu_shared_memory = time.time()
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C_dev, i*row_slice)
cuda.synchronize()
end_gpu_shared_memory = time.time()
time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
$ python t64.py
host mem: 337 MB device mem: 17 MB
randint:0.1817936897277832
matmul:3.498671531677246
np.empty:7.62939453125e-05
bitpack:0.03707313537597656
True
True
True
True
True
True
True
True
True
True
True
True
GPU time(shared memory):0.8318064212799072
$
I haven't thoroughly tested these codes. Bugs may exist. Use at your own risk.
For attribution, the numba bit-packing code seems to have come from here.