I am doing 2D or 3D binary masks around given coordinates and then identifying them as labels with scipy.ndimage.label. Now, I have a cupy solution, a numpy solution. Cupy is fast, numpy is very slow, both do their job. I'm trying to make it run on other non nvidia GPUs or CPUs with openCl or any other form that would be more "cross platform" such as pyopencl or numba, or tensorflow even (or all of the above)
This is my current function:
def make_labels_links(shape,j,radius=5,_algo="GPU"):
import scipy.ndimage as ndi
if _algo == "GPU":
import cupy as cp
if 'z' in j:
pos = cp.dstack((j.z,j.y,j.x))[0]#.astype(int)
print("3D",j)
else:
pos = cp.dstack((j.y,j.x))[0]#.astype(int)
print("2D",j)
ndim = len(shape)
# radius = validate_tuple(radius, ndim)
pos = cp.atleast_2d(pos)
# if include_edge:
in_mask = cp.array([cp.sum(((cp.indices(shape).T - p) / radius)**2, -1) <= 1
for p in pos])
# else:
# in_mask = [np.sum(((np.indices(shape).T - p) / radius)**2, -1) < 1
# for p in pos]
mask_total = cp.any(in_mask, axis=0).T
##if they overlap the labels won't match the points
#we can make np.ones * ID of the point and then np.max(axis=-1)
labels, nb = ndi.label(cp.asnumpy(mask_total))
#this is super slow
elif _algo=='CPU':
if 'z' in j:
# "Need to loop each t and do one at a time"
pos = np.dstack((j.z,j.y,j.x))[0]#.astype(int)
print("3D",j)
else:
pos = np.dstack((j.y,j.x))[0]#.astype(int)
print("2D",j)
ndim = len(shape)
pos = np.atleast_2d(pos)
in_mask = np.array([np.sum(((np.indices(shape).T - p) / radius)**2, -1) <= 1
for p in pos])
mask_total = np.any(in_mask, axis=0).T
labels, nb = ndi.label(mask_total)
shape is for example (1024,1024) or (20,1024,1024) j is the positions as a pandas dataframe for example:
ID Frame z y x mass size ecc
0 14 0 9.121369 24.139295 297.156056 5311.661985 1.383264 NaN
1 7 0 8.270742 24.880990 308.757851 6341.082129 1.403242 NaN
2 1 0 4.881552 28.963021 291.900410 6587.918081 1.330116 NaN
3 8 0 8.015537 31.137655 304.999176 5423.140504 1.341228 NaN
4 15 0 10.910010 32.275774 298.940183 6033.069086 1.333235 NaN
5 4 0 7.057360 33.958352 313.252877 8123.989610 1.395905 NaN
6 9 0 7.922786 39.966382 308.092925 6683.742777 1.398941 NaN
7 5 0 7.163435 40.774834 329.106137 5225.643216 1.402079 NaN
8 10 0 7.665061 49.844910 306.998559 6054.428176 1.408336 NaN
9 11 0 7.909787 50.951857 320.125061 5057.551819 1.314976 NaN
10 6 0 7.148229 52.051917 312.929513 9427.345719 1.351461 NaN
11 12 0 7.599882 55.917379 303.916436 6220.891194 1.401507 NaN
12 0 0 3.992557 59.060168 298.188021 7489.421410 1.391892 NaN
13 13 0 7.684166 62.945226 303.940392 7970.470413 1.384929 NaN
14 3 0 6.102017 70.796726 316.141957 5091.926046 1.393204 NaN
15 2 0 4.937488 87.916144 192.836998 5068.320508 1.330015 NaN
I tried to do it with pyopencl but I was not able to implement it, any suggestions? Thanks
This works for pyopencl
def make_labels_links_opencl(shape, j, radius=5):
import numpy as np
import pyopencl as cl
from scipy.ndimage import label
if 'z' in j:
# "Need to loop each t and do one at a time"
positions = np.dstack((j.z,j.y,j.x))[0]#.astype(int)
print("3D",j)
else:
positions = np.dstack((j.y,j.x))[0]#.astype(int)
print("2D",j)
# Prepare data
mask = np.zeros(shape, dtype=np.uint8)
positions_flat = positions.flatten().astype(np.float32)
radius = np.float32(radius)
# platform = cl.get_platforms()
# my_gpu_devices = platform[1].get_devices(device_type=cl.device_type.GPU)
# ctx = cl.Context(devices=my_gpu_devices)
# PyOpenCL setup
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
# Create buffers
mask_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=mask)
positions_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=positions_flat)
# Kernel code
kernel_code = """
__kernel void fill_mask(__global uchar *mask, __global const float *positions, const float radius, const int num_positions) {
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int width = get_global_size(0);
int height = get_global_size(1);
int depth = get_global_size(2);
int idx = x + y * width + z * width * height;
for (int i = 0; i < num_positions; ++i) {
float pz = positions[i * 3];
float py = positions[i * 3 + 1];
float px = positions[i * 3 + 2];
float distance = sqrt(pow(px - x, 2) + pow(py - y, 2) + pow(pz - z, 2));
if (distance <= radius) {
mask[idx] = 1;
break;
}
}
}
"""
# Build kernel
prg = cl.Program(ctx, kernel_code).build()
# Execute kernel
global_size = shape[::-1] # Note: PyOpenCL uses column-major order, so we reverse the dimensions
prg.fill_mask(queue, global_size, None, mask_buf, positions_buf, radius, np.int32(len(positions)))
# Read back the results
cl.enqueue_copy(queue, mask, mask_buf)
labels,_=label(mask)
return labels,positions