I am trying to implement an algorithm using ARM intrinsics.
One step of the algorithm requires a right shift of a signed integer, but it needs to round negative values upwards (i.e. towards less negative values). For example, when shifting right by one, -8 should give -4, but -1 should give 0.
In other words, I want something that rounds negative values towards zero instead of rounding down:
#include <stdlib.h>

int rightshift(int number, unsigned int shift)
{
    return ((number < 0) ? -1 : 1) * (abs(number) >> shift);
}
I am not able to find a suitable function to do this in SIMD fashion. Is there a way to do this in one function call, or some trick that could be used?
I don't think there's a single-instruction shift with round-toward-zero behaviour.
However, you can do it fairly simply with a couple of shift, mask and add instructions. What we need to do is to add one to the result if we started with a negative number and there is a 'carry' out (i.e. any bit shifted out of the right-hand end would have been 1). Equivalently, we can add (1 << shift) - 1 to the value before shifting, but only when it is negative.
I can demonstrate this with the following pure C code:
#include <stddef.h>
#include <limits.h>

int rightshift(int number, unsigned int shift)
{
    /* the number of bits in an int (32 on ARM) */
    static const size_t bits = sizeof number * CHAR_BIT;
    /* number >> (bits - 1) is all-ones for negative inputs and zero
       otherwise, so this adds the bias only to negative values */
    number += ((1 << shift) - 1) & (number >> (bits - 1));
    return number >> shift;
}
#include <stdio.h>

int main() {
    for (int i = -16; i <= 16; ++i) {
        printf(" %3d: ", i);
        for (int j = 0; j < 4; ++j)
            printf("%4d", rightshift(i, j));
        puts("");
    }
}
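As a quick sanity check (these rows are worked out by hand from the function above, not captured program output), the lines for -8 and -1 should read:

  -8:   -8  -4  -2  -1
  -1:   -1   0   0   0

which matches the required behaviour: -8 shifted right by one gives -4, while -1 gives 0.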
The rightshift function compiles to some nice branch-free assembly, which looks amenable to inlining (especially when shift is a compile-time constant):
rightshift:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
movs r3, #1
lsls r3, r3, r1
subs r3, r3, #1
and r3, r3, r0, asr #31
add r0, r0, r3
asrs r0, r0, r1
bx lr
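When shift is a compile-time constant, the (1 << shift) - 1 mask folds to a literal and the whole operation reduces to a handful of data-processing instructions. A minimal sketch, assuming a fixed shift of 3 (rightshift3 is a hypothetical name, not part of the code above):

static inline int rightshift3(int number)
{
    number += 7 & (number >> 31);   /* bias is (1 << 3) - 1, negatives only */
    return number >> 3;
}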
To target Neon, I wrote another function that exercises rightshift across an array of data:

#include <stdint.h>

void do_shift(int16_t *restrict dest, const int16_t *restrict src,
              size_t count, unsigned int shift)
{
    for (size_t j = 0; j < count; ++j) {
        dest[j] = rightshift(src[j], shift);
    }
}
And a test program for it:
#include <stdio.h>

int main() {
    static const int16_t src[] = {
        -32768, -32767, -32766, -32765, -32764,
        -16384, -16383, -16382, -16381, -16380,
        -8193, -8192, -8191, -8190, -8189,
        -16, -15, -14, -13, -12, -10, -9,
        -8, -7, -6, -5, -4, -3, -2, -1, 0,
        1, 2, 3, 4, 5, 6, 7, 8,
        9, 10, 11, 12, 13, 14, 15, 16,
        1023, 1024, 32767,
    };
    static const size_t count = sizeof src / sizeof *src;
    int16_t dest[16][count];
    for (unsigned int i = 0; i < 16; ++i) {
        do_shift(dest[i], src, count, i);
    }
    for (size_t i = 0; i < count; ++i) {
        printf("%7d: ", src[i]);
        for (int j = 0; j < 16; ++j)
            printf("%7d", dest[j][i]);
        puts("");
    }
}
I compiled this with gcc -O3 -march=armv7 -mfpu=neon. I confess I'm not familiar with the Neon instructions, but the results may be instructive:
do_shift:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
cmp r2, #0
beq .L21
push {r4, r5, r6, r7, r8, lr}
ubfx r4, r1, #1, #2
negs r4, r4
movs r5, #1
and r4, r4, #7
lsls r5, r5, r3
adds r7, r4, #7
subs r6, r2, #1
subs r5, r5, #1
cmp r6, r7
sxth r5, r5
bcc .L8
cmp r4, #0
beq .L9
ldrsh r7, [r1]
cmp r4, #1
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0] @ movhi
beq .L9
ldrsh r7, [r1, #2]
cmp r4, #2
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #2] @ movhi
beq .L9
ldrsh r7, [r1, #4]
cmp r4, #3
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #4] @ movhi
beq .L9
ldrsh r7, [r1, #6]
cmp r4, #4
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #6] @ movhi
beq .L9
ldrsh r7, [r1, #8]
cmp r4, #5
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #8] @ movhi
beq .L9
ldrsh r7, [r1, #10]
cmp r4, #7
ite eq
moveq r8, r4
movne r8, #6
and r6, r5, r7, asr #31
add r6, r6, r7
it eq
ldrsheq r7, [r1, #12]
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #10] @ movhi
itttt eq
andeq r6, r5, r7, asr #31
addeq r6, r6, r7
sxtheq r6, r6
asreq r6, r6, r3
it eq
strheq r6, [r0, #12] @ movhi
.L4:
vdup.32 q10, r3
sub lr, r2, r4
lsls r4, r4, #1
movs r7, #0
vneg.s32 q10, q10
adds r6, r1, r4
lsr ip, lr, #3
add r4, r4, r0
vdup.16 q12, r5
.L6:
adds r7, r7, #1
adds r6, r6, #16
vldr d18, [r6, #-16]
vldr d19, [r6, #-8]
cmp r7, ip
vshr.s16 q8, q9, #15
vand q8, q8, q12
vadd.i16 q8, q8, q9
vmovl.s16 q9, d16
vmovl.s16 q8, d17
vshl.s32 q9, q9, q10
vshl.s32 q8, q8, q10
vmovn.i32 d22, q9
vmovn.i32 d23, q8
vst1.16 {q11}, [r4]
add r4, r4, #16
bcc .L6
bic r6, lr, #7
cmp lr, r6
add r4, r8, r6
beq .L1
.L3:
ldrsh ip, [r1, r4, lsl #1]
adds r7, r4, #1
cmp r2, r7
and r6, r5, ip, asr #31
add r6, r6, ip
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r4, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #2
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #3
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #4
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #5
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #6
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #7
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #8
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
add r7, r4, #9
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #10
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
add r7, r4, #11
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #12
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh r7, [r1, ip, lsl #1]
adds r4, r4, #13
cmp r2, r4
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh r1, [r1, r4, lsl #1]
and r2, r5, r1, asr #31
add r2, r2, r1
sxth r2, r2
asr r3, r2, r3
strh r3, [r0, r4, lsl #1] @ movhi
.L1:
pop {r4, r5, r6, r7, r8, pc}
.L9:
mov r8, r4
b .L4
.L21:
bx lr
.L8:
movs r4, #0
b .L3
There's a lot of loop unrolling for the scalar head and tail, which makes the code longer, but the vectorised inner loop at .L6 shows the pattern clearly: vshr.s16 builds the sign mask, vand selects the bias, vadd.i16 applies it, and the vshl.s32 instructions (with a negated shift count) perform the arithmetic right shift.
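If you'd rather not lean on autovectorisation, the same pattern is straightforward to write with Neon intrinsics directly. Here's a minimal sketch under a couple of assumptions: do_shift_neon is my name, not from the code above, and shift must be less than 16 so the bias fits in an int16_t. Because the bias is only added to negative lanes, the 16-bit addition cannot overflow, which lets us skip the widen-to-32-bit dance (vmovl/vshl.s32/vmovn) the compiler generated:

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical intrinsics version of do_shift; assumes 0 <= shift < 16. */
void do_shift_neon(int16_t *restrict dest, const int16_t *restrict src,
                   size_t count, unsigned int shift)
{
    const int16x8_t bias = vdupq_n_s16((int16_t)((1 << shift) - 1));
    const int16x8_t neg_shift = vdupq_n_s16(-(int16_t)shift);
    size_t j = 0;
    for (; j + 8 <= count; j += 8) {
        int16x8_t v = vld1q_s16(src + j);
        /* all-ones in negative lanes, zero elsewhere */
        int16x8_t sign = vshrq_n_s16(v, 15);
        /* add (1 << shift) - 1 to negative lanes only */
        v = vaddq_s16(v, vandq_s16(sign, bias));
        /* VSHL by a negative per-lane count is an arithmetic shift right */
        v = vshlq_s16(v, neg_shift);
        vst1q_s16(dest + j, v);
    }
    for (; j < count; ++j)      /* scalar tail, using rightshift() from above */
        dest[j] = rightshift(src[j], shift);
}

vshlq_s16 shifts right when its count operand is negative, and for signed elements that right shift is arithmetic, so a single splatted -shift handles the variable-shift case without any immediate-operand restriction.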