I am trying to load an array of u16s from memory and find the first element that is less than some number, as fast as possible on an M1 mac. I have been looking through the NEON instructions, but I wasn't able to find a good way to do it. There are comparison vector instructions, but they leave you with vectors of elements of all 1 or 0. How would you get that into a general purpose register? And is there a way to break a loop by testing a vector instruction?
// int32_t searchArrayU16(uint16_t *pArray, uint16_t threshold, uint32_t len);
// assert(len & 127 == 0);
// assert(len >= 128);
.arch armv8-a
.global searchArrayU16
.text
pArray0 .req x0
thresh .req w1
len .req x2
stride .req x3
pArray1 .req x4
count .req w5
val0 .req x6
val0w .req w6
val1 .req x7
val1w .req w7
.balign 64
.func
searchArrayU16:
adr x15, 2f
dup v0.8h, thresh
ld1r {v1.2d}, [x15]
mov stride, #128
add pArray1, pArray0, #64
mov count, #0
b 1f
.balign 8
2:
.byte 1, 2, 4, 8, 16, 32, 64, 128
.balign 64
1:
ld1 {v16.8h-v19.8h}, [pArray0], stride
ld1 {v20.8h-v23.8h}, [pArray1], stride
ld1 {v24.8h-v27.8h}, [pArray0], stride
ld1 {v28.8h-v31.8h}, [pArray1], stride
cmhi v16.8h, v0.8h, v16.8h
cmhi v17.8h, v0.8h, v17.8h
cmhi v18.8h, v0.8h, v18.8h
cmhi v19.8h, v0.8h, v19.8h
cmhi v20.8h, v0.8h, v20.8h
cmhi v21.8h, v0.8h, v21.8h
cmhi v22.8h, v0.8h, v22.8h
cmhi v23.8h, v0.8h, v23.8h
cmhi v24.8h, v0.8h, v24.8h
cmhi v25.8h, v0.8h, v25.8h
cmhi v26.8h, v0.8h, v26.8h
cmhi v27.8h, v0.8h, v27.8h
cmhi v28.8h, v0.8h, v28.8h
cmhi v29.8h, v0.8h, v29.8h
cmhi v30.8h, v0.8h, v30.8h
cmhi v31.8h, v0.8h, v31.8h
uzp1 v16.16b, v16.16b, v17.16b
uzp1 v18.16b, v18.16b, v19.16b
uzp1 v20.16b, v20.16b, v21.16b
uzp1 v22.16b, v22.16b, v23.16b
uzp1 v24.16b, v24.16b, v25.16b
uzp1 v26.16b, v26.16b, v27.16b
uzp1 v28.16b, v28.16b, v29.16b
uzp1 v30.16b, v30.16b, v31.16b
and v16.16b, v16.16b, v1.16b
and v18.16b, v18.16b, v1.16b
and v20.16b, v20.16b, v1.16b
and v22.16b, v22.16b, v1.16b
and v24.16b, v24.16b, v1.16b
and v26.16b, v26.16b, v1.16b
and v28.16b, v28.16b, v1.16b
and v30.16b, v30.16b, v1.16b
addp v16.16b, v16.16b, v18.16b
addp v20.16b, v20.16b, v22.16b
addp v24.16b, v24.16b, v26.16b
addp v28.16b, v28.16b, v30.16b
addp v16.16b, v16.16b, v20.16b
addp v24.16b, v24.16b, v28.16b
add count, count, #128
addp v16.16b, v16.16b, v24.16b
// total pipeline stall here
mov val0, v16.d[0]
mov val1, v16.d[1]
orr x15, val0, val1
cbnz x15, 1f // found a match!!!
cmp len, count, uxtw
b.hi 1b
.balign 16
mov w0, #-1 // no match found
ret
.balign 16
1:
rbit val0, val0
rbit val1, val1
cmp val0, #0
sub w0, count, #128
sub w1, count, #64
clz val0, val0
clz val1, val1
add w0, w0, val0w
add w1, w1, val1w
csel w0, w0, w1, ne
ret
.endfunc
.end
Here you are. It returns -1 when no match is found.
It should work on all armv8-a
cores or above.