I am trying to implement an algorithm using ARM intrinsics.
One step of the algorithm requires a right shift of a signed integer, but it needs to round negative values upwards (i.e. towards less negative values). For example, when shifting right by one, -8 should give -4, but -1 should give 0.
In other words, I want something that rounds negative values towards zero instead of rounding down:
#include <stdlib.h>

int rightshift(int number, unsigned int shift)
{
    return ((number < 0) ? -1 : 1) * (abs(number) >> shift);
}
I am not able to find a suitable function to do this in SIMD fashion. Is there a way to do this in one function call, or some trick that could be used?
I don't think there's a single-instruction shift with round-toward-zero behaviour.
However, you can do it fairly simply with a couple of shift, mask and add instructions. What we need to do is to add one to the result if we started with a negative number and there is a 'carry' out (i.e. any bit shifted out of the right-hand end would have been 1). Equivalently, we can add (1 << shift) - 1 to the value before shifting, but only when it is negative.
I can demonstrate this with the following pure C code:
#include <stddef.h>
#include <limits.h>

int rightshift(int number, unsigned int shift)
{
    /* the number of bits in an int (32 on ARM) */
    static const size_t bits = sizeof number * CHAR_BIT;
    /* number >> (bits - 1) is all-ones for negative inputs and zero
       otherwise, so this adds the bias only to negative values */
    number += ((1 << shift) - 1) & (number >> (bits - 1));
    return number >> shift;
}
#include <stdio.h>

int main() {
    for (int i = -16; i <= 16; ++i) {
        printf(" %3d: ", i);
        for (int j = 0; j < 4; ++j)
            printf("%4d", rightshift(i, j));
        puts("");
    }
}
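As a quick sanity check (these rows are worked out by hand from the function above, not captured program output), the lines for -8 and -1 should read:

  -8:   -8  -4  -2  -1
  -1:   -1   0   0   0

which matches the required behaviour: -8 shifted right by one gives -4, while -1 gives 0.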
The rightshift function compiles to some nice branch-free assembly, which looks amenable to inlining (especially when shift is a compile-time constant):
rightshift:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
movs r3, #1
lsls r3, r3, r1
subs r3, r3, #1
and r3, r3, r0, asr #31
add r0, r0, r3
asrs r0, r0, r1
bx lr
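When shift is a compile-time constant, the (1 << shift) - 1 mask folds to a literal and the whole operation reduces to a handful of data-processing instructions. A minimal sketch, assuming a fixed shift of 3 (rightshift3 is a hypothetical name, not part of the code above):

static inline int rightshift3(int number)
{
    number += 7 & (number >> 31);   /* bias is (1 << 3) - 1, negatives only */
    return number >> 3;
}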
To target Neon, I wrote another function that exercises rightshift across an array of data:

#include <stdint.h>

void do_shift(int16_t *restrict dest, const int16_t *restrict src,
              size_t count, unsigned int shift)
{
    for (size_t j = 0; j < count; ++j) {
        dest[j] = rightshift(src[j], shift);
    }
}
And a test program for it:
#include <stdio.h>

int main() {
    static const int16_t src[] = {
        -32768, -32767, -32766, -32765, -32764,
        -16384, -16383, -16382, -16381, -16380,
        -8193, -8192, -8191, -8190, -8189,
        -16, -15, -14, -13, -12, -10, -9,
        -8, -7, -6, -5, -4, -3, -2, -1, 0,
        1, 2, 3, 4, 5, 6, 7, 8,
        9, 10, 11, 12, 13, 14, 15, 16,
        1023, 1024, 32767,
    };
    static const size_t count = sizeof src / sizeof *src;
    int16_t dest[16][count];
    for (unsigned int i = 0; i < 16; ++i) {
        do_shift(dest[i], src, count, i);
    }
    for (size_t i = 0; i < count; ++i) {
        printf("%7d: ", src[i]);
        for (int j = 0; j < 16; ++j)
            printf("%7d", dest[j][i]);
        puts("");
    }
}
I compiled this with gcc -O3 -march=armv7 -mfpu=neon. I confess I'm not familiar with the Neon instructions, but the results may be instructive:
do_shift:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
cmp r2, #0
beq .L21
push {r4, r5, r6, r7, r8, lr}
ubfx r4, r1, #1, #2
negs r4, r4
movs r5, #1
and r4, r4, #7
lsls r5, r5, r3
adds r7, r4, #7
subs r6, r2, #1
subs r5, r5, #1
cmp r6, r7
sxth r5, r5
bcc .L8
cmp r4, #0
beq .L9
ldrsh r7, [r1]
cmp r4, #1
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0] @ movhi
beq .L9
ldrsh r7, [r1, #2]
cmp r4, #2
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #2] @ movhi
beq .L9
ldrsh r7, [r1, #4]
cmp r4, #3
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #4] @ movhi
beq .L9
ldrsh r7, [r1, #6]
cmp r4, #4
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #6] @ movhi
beq .L9
ldrsh r7, [r1, #8]
cmp r4, #5
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #8] @ movhi
beq .L9
ldrsh r7, [r1, #10]
cmp r4, #7
ite eq
moveq r8, r4
movne r8, #6
and r6, r5, r7, asr #31
add r6, r6, r7
it eq
ldrsheq r7, [r1, #12]
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #10] @ movhi
itttt eq
andeq r6, r5, r7, asr #31
addeq r6, r6, r7
sxtheq r6, r6
asreq r6, r6, r3
it eq
strheq r6, [r0, #12] @ movhi
.L4:
vdup.32 q10, r3
sub lr, r2, r4
lsls r4, r4, #1
movs r7, #0
vneg.s32 q10, q10
adds r6, r1, r4
lsr ip, lr, #3
add r4, r4, r0
vdup.16 q12, r5
.L6:
adds r7, r7, #1
adds r6, r6, #16
vldr d18, [r6, #-16]
vldr d19, [r6, #-8]
cmp r7, ip
vshr.s16 q8, q9, #15
vand q8, q8, q12
vadd.i16 q8, q8, q9
vmovl.s16 q9, d16
vmovl.s16 q8, d17
vshl.s32 q9, q9, q10
vshl.s32 q8, q8, q10
vmovn.i32 d22, q9
vmovn.i32 d23, q8
vst1.16 {q11}, [r4]
add r4, r4, #16
bcc .L6
bic r6, lr, #7
cmp lr, r6
add r4, r8, r6
beq .L1
.L3:
ldrsh ip, [r1, r4, lsl #1]
adds r7, r4, #1
cmp r2, r7
and r6, r5, ip, asr #31
add r6, r6, ip
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r4, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #2
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #3
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #4
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #5
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #6
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #7
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #8
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
add r7, r4, #9
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #10
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
add r7, r4, #11
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #12
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh r7, [r1, ip, lsl #1]
adds r4, r4, #13
cmp r2, r4
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh r1, [r1, r4, lsl #1]
and r2, r5, r1, asr #31
add r2, r2, r1
sxth r2, r2
asr r3, r2, r3
strh r3, [r0, r4, lsl #1] @ movhi
.L1:
pop {r4, r5, r6, r7, r8, pc}
.L9:
mov r8, r4
b .L4
.L21:
bx lr
.L8:
movs r4, #0
b .L3
There's a lot of loop unrolling for the scalar head and tail, which makes the code longer, but the vectorised inner loop at .L6 shows the pattern clearly: vshr.s16 builds the sign mask, vand selects the bias, vadd.i16 applies it, and the vshl.s32 instructions (with a negated shift count) perform the arithmetic right shift.
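If you'd rather not lean on autovectorisation, the same pattern is straightforward to write with Neon intrinsics directly. Here's a minimal sketch under a couple of assumptions: do_shift_neon is my name, not from the code above, and shift must be less than 16 so the bias fits in an int16_t. Because the bias is only added to negative lanes, the 16-bit addition cannot overflow, which lets us skip the widen-to-32-bit dance (vmovl/vshl.s32/vmovn) the compiler generated:

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical intrinsics version of do_shift; assumes 0 <= shift < 16. */
void do_shift_neon(int16_t *restrict dest, const int16_t *restrict src,
                   size_t count, unsigned int shift)
{
    const int16x8_t bias = vdupq_n_s16((int16_t)((1 << shift) - 1));
    const int16x8_t neg_shift = vdupq_n_s16(-(int16_t)shift);
    size_t j = 0;
    for (; j + 8 <= count; j += 8) {
        int16x8_t v = vld1q_s16(src + j);
        /* all-ones in negative lanes, zero elsewhere */
        int16x8_t sign = vshrq_n_s16(v, 15);
        /* add (1 << shift) - 1 to negative lanes only */
        v = vaddq_s16(v, vandq_s16(sign, bias));
        /* VSHL by a negative per-lane count is an arithmetic shift right */
        v = vshlq_s16(v, neg_shift);
        vst1q_s16(dest + j, v);
    }
    for (; j < count; ++j)      /* scalar tail, using rightshift() from above */
        dest[j] = rightshift(src[j], shift);
}

vshlq_s16 shifts right when its count operand is negative, and for signed elements that right shift is arithmetic, so a single splatted -shift handles the variable-shift case without any immediate-operand restriction.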