c++ · c · bit-shift · avx2

How to implement lane-crossing logical bit-wise shift/rotate (left and right) in AVX2


How to implement lane-crossing logical bit-wise shift (left and right) in AVX2? I want to shift a whole __m256i as if it were a single 256-bit integer, with no element or lane boundaries.
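
To make the intended semantics concrete, a plain scalar reference might look like the sketch below (the helper name and the limb layout, with v[0] as the least-significant qword, are just for illustration):

    #include <stdint.h>
    
    // Reference semantics only: treat four 64-bit limbs as one 256-bit integer
    // (v[0] = least significant) and shift it left by n bits, 0 <= n < 256,
    // filling with zeros.
    static void ref_shift_left_256 ( uint64_t v [ 4 ], int n ) {
    
        uint64_t r [ 4 ] = { 0, 0, 0, 0 };
        int q = n / 64, b = n % 64;
    
        for ( int i = 3; i >= q; --i ) {
            r [ i ] = v [ i - q ] << b;
            if ( b != 0 && i - q - 1 >= 0 ) r [ i ] |= v [ i - q - 1 ] >> ( 64 - b );
        }
        for ( int i = 0; i < 4; ++i ) v [ i ] = r [ i ];
    }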


An answer on another Q&A looked useful, but it turned out to be about byte shifts, using _mm256_alignr_epi8 and _mm256_permute2x128_si256 with operands that depend on the compile-time-constant shift count. (See the revision history of this question for a full test program written before I realized it was only doing byte shifts, and therefore only useful for bit-shift counts that are multiples of 8.)
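
For reference, a minimal sketch of that byte-granular approach for a compile-time-constant byte count N with 0 < N < 16 (the helper name is mine, not from that answer); it moves whole bytes only, which is exactly its limitation:

    #include <immintrin.h>
    
    template < int N > // 0 < N < 16, must be a compile-time constant
    __m256i left_byte_shift ( __m256i a ) {
    
        // t: low 128-bit lane zeroed, high lane = low lane of a
        __m256i t = _mm256_permute2x128_si256 ( a, a, 0x08 );
    
        // per-lane concatenation picks up the bytes crossing the 128-bit boundary
        return _mm256_alignr_epi8 ( a, t, 16 - N );
    }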


Solution

  • The following code implements lane-crossing logical bit-wise shift/rotate (left and right) in AVX2:

    #include <immintrin.h>
    
    // Prototypes... (the whole __m256i is treated as one 256-bit value)
    
    __m256i _mm256_sli_si256 ( __m256i, int ); // logical shift left,  0 <= n < 256
    __m256i _mm256_sri_si256 ( __m256i, int ); // logical shift right, 0 <= n < 256
    __m256i _mm256_rli_si256 ( __m256i, int ); // rotate left,         0 <= n < 256
    __m256i _mm256_rri_si256 ( __m256i, int ); // rotate right,        0 <= n < 256
    
    
    // Implementations...
    
    __m256i left_shift_000_063 ( __m256i a, int n ) { // 6
    
        return _mm256_or_si256 ( _mm256_slli_epi64 ( a, n ), _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), _mm256_permute4x64_epi64 ( _mm256_srli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 2, 1, 0, 0 ) ), _MM_SHUFFLE ( 3, 3, 3, 0 ) ) );
    }
    
    __m256i left_shift_064_127 ( __m256i a, int n ) { // 7
    
        __m256i b = _mm256_slli_epi64 ( a, n );
        __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 2, 1, 0, 0 ) );
    
        __m256i c = _mm256_srli_epi64 ( a, 64 - n );
        __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 1, 0, 0, 0 ) );
    
        __m256i f = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), d, _MM_SHUFFLE ( 3, 3, 3, 0 ) );
        __m256i g = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), e, _MM_SHUFFLE ( 3, 3, 0, 0 ) ); // 6
    
        return _mm256_or_si256 ( f, g );
    }
    
    __m256i left_shift_128_191 ( __m256i a, int n ) { // 7
    
        __m256i b = _mm256_slli_epi64 ( a, n );
        __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 1, 0, 0, 0 ) );
    
        __m256i c = _mm256_srli_epi64 ( a, 64 - n );
        __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 1, 0, 0, 0 ) );
    
        __m256i f = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), d, _MM_SHUFFLE ( 3, 3, 0, 0 ) );
        __m256i g = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), e, _MM_SHUFFLE ( 3, 0, 0, 0 ) );
    
        return _mm256_or_si256 ( f, g );
    }
    
    __m256i left_shift_192_255 ( __m256i a, int n ) { // 5
    
        return _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), _mm256_slli_epi64 ( _mm256_permute4x64_epi64 ( a, _MM_SHUFFLE ( 0, 0, 0, 0 ) ), n ), _MM_SHUFFLE ( 3, 0, 0, 0 ) );
    }
    
    __m256i _mm256_sli_si256 ( __m256i a, int n ) {
    
        if ( n < 128 ) return n <  64 ? left_shift_000_063 ( a, n ) : left_shift_064_127 ( a, n % 64 );
        else           return n < 192 ? left_shift_128_191 ( a, n % 64 ) : left_shift_192_255 ( a, n % 64 );
    }
    
    
    __m256i right_shift_000_063 ( __m256i a, int n ) { // 6
    
        return _mm256_or_si256 ( _mm256_srli_epi64 ( a, n ), _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), _mm256_permute4x64_epi64 ( _mm256_slli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 0, 3, 2, 1 ) ), _MM_SHUFFLE ( 0, 3, 3, 3 ) ) );
    }
    
    __m256i right_shift_064_127 ( __m256i a, int n ) { // 7
    
        __m256i b = _mm256_srli_epi64 ( a, n );
        __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 3, 3, 2, 1 ) );
    
        __m256i c = _mm256_slli_epi64 ( a, 64 - n );
        __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 3, 3, 3, 2 ) );
    
        __m256i f = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), d, _MM_SHUFFLE ( 0, 3, 3, 3 ) );
        __m256i g = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), e, _MM_SHUFFLE ( 0, 0, 3, 3 ) );
    
        return _mm256_or_si256 ( f, g );
    }
    
    __m256i right_shift_128_191 ( __m256i a, int n ) { // 7
    
        __m256i b = _mm256_srli_epi64 ( a, n );
        __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 3, 2, 3, 2 ) );
    
        __m256i c = _mm256_slli_epi64 ( a, 64 - n );
        __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 3, 2, 1, 3 ) );
    
        __m256i f = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), d, _MM_SHUFFLE ( 0, 0, 3, 3 ) );
        __m256i g = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), e, _MM_SHUFFLE ( 0, 0, 0, 3 ) );
    
        return _mm256_or_si256 ( f, g );
    }
    
    __m256i right_shift_192_255 ( __m256i a, int n ) { // 5
    
        return _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), _mm256_srli_epi64 ( _mm256_permute4x64_epi64 ( a, _MM_SHUFFLE ( 0, 0, 0, 3 ) ), n ), _MM_SHUFFLE ( 0, 0, 0, 3 ) );
    }
    
    __m256i _mm256_sri_si256 ( __m256i a, int n ) {
    
        if ( n < 128 ) return n <  64 ? right_shift_000_063 ( a, n ) : right_shift_064_127 ( a, n % 64 );
        else           return n < 192 ? right_shift_128_191 ( a, n % 64 ) : right_shift_192_255 ( a, n % 64 );
    }
    
    
    __m256i left_rotate_000_063 ( __m256i a, int n ) { // 5
    
        return _mm256_or_si256 ( _mm256_slli_epi64 ( a, n ), _mm256_permute4x64_epi64 ( _mm256_srli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 2, 1, 0, 3 ) ) );
    }
    
    __m256i left_rotate_064_127 ( __m256i a, int n ) { // 6
    
        __m256i b = _mm256_slli_epi64 ( a, n );
        __m256i c = _mm256_srli_epi64 ( a, 64 - n );
    
        __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 2, 1, 0, 3 ) );
        __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 1, 0, 3, 2 ) );
    
        return _mm256_or_si256 ( d, e );
    }
    
    __m256i left_rotate_128_191 ( __m256i a, int n ) { // 6
    
        __m256i b = _mm256_slli_epi64 ( a, n );
        __m256i c = _mm256_srli_epi64 ( a, 64 - n );
    
        __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 1, 0, 3, 2 ) );
        __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 0, 3, 2, 1 ) );
    
        return _mm256_or_si256 ( d, e );
    }
    
    __m256i left_rotate_192_255 ( __m256i a, int n ) { // 5
    
        return _mm256_or_si256 ( _mm256_srli_epi64 ( a, 64 - n ), _mm256_permute4x64_epi64 ( _mm256_slli_epi64 ( a, n ), _MM_SHUFFLE ( 0, 3, 2, 1 ) ) );
    }
    
    __m256i _mm256_rli_si256 ( __m256i a, int n ) {
    
        if ( n < 128 ) return n <  64 ? left_rotate_000_063 ( a, n ) : left_rotate_064_127 ( a, n % 64 );
        else           return n < 192 ? left_rotate_128_191 ( a, n % 64 ) : left_rotate_192_255 ( a, n % 64 );
    }
    
    
    __m256i right_rotate_000_063 ( __m256i a, int n ) { // 5
    
        return _mm256_or_si256 ( _mm256_srli_epi64 ( a, n ), _mm256_permute4x64_epi64 ( _mm256_slli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 0, 3, 2, 1 ) ) );
    }
    
    __m256i right_rotate_064_127 ( __m256i a, int n ) { // 6
    
        __m256i b = _mm256_srli_epi64 ( a, n );
        __m256i c = _mm256_slli_epi64 ( a, 64 - n );
    
        __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 0, 3, 2, 1 ) );
        __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 1, 0, 3, 2 ) );
    
        return _mm256_or_si256 ( d, e );
    }
    
    __m256i right_rotate_128_191 ( __m256i a, int n ) { // 6
    
        __m256i b = _mm256_srli_epi64 ( a, n );
        __m256i c = _mm256_slli_epi64 ( a, 64 - n );
    
        __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 1, 0, 3, 2 ) );
        __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 2, 1, 0, 3 ) );
    
        return _mm256_or_si256 ( d, e );
    }
    
    __m256i right_rotate_192_255 ( __m256i a, int n ) { // 5
    
        return _mm256_or_si256 ( _mm256_slli_epi64 ( a, 64 - n ), _mm256_permute4x64_epi64 ( _mm256_srli_epi64 ( a, n ), _MM_SHUFFLE ( 2, 1, 0, 3 ) ) );
    }
    
    __m256i _mm256_rri_si256 ( __m256i a, int n ) {
    
        if ( n < 128 ) return n <  64 ? right_rotate_000_063 ( a, n      ) : right_rotate_064_127 ( a, n % 64 );
        else           return n < 192 ? right_rotate_128_191 ( a, n % 64 ) : right_rotate_192_255 ( a, n % 64 );
    }
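    
    As a quick sanity check, the wrappers might be exercised with a small test program like the one below (the expected values are worked out by hand; it assumes the functions above are in scope and an AVX2-capable target):
    
    #include <stdint.h>
    #include <stdio.h>
    
    int main ( void ) {
    
        __m256i a = _mm256_set_epi64x ( 0, 0, 0, 0xFF );
        __m256i r = _mm256_sli_si256 ( a, 60 ); // shift the whole value left by 60 bits
    
        uint64_t out [ 4 ];
        _mm256_storeu_si256 ( ( __m256i * ) out, r );
    
        // expected: out [ 0 ] == 0xF000000000000000, out [ 1 ] == 0x000000000000000F
        printf ( "%016llx %016llx %016llx %016llx\n",
                 ( unsigned long long ) out [ 3 ], ( unsigned long long ) out [ 2 ],
                 ( unsigned long long ) out [ 1 ], ( unsigned long long ) out [ 0 ] );
    
        return 0;
    }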
    

    I have tried to arrange the _mm256_permute4x64_epi64 ops (in the cases where two of them are needed anyway) so that they partially overlap, which should keep the overall latency to a minimum.

    Most of the suggestions and clues given by commenters were helpful in putting the code together; thanks to those. Improvements and any other comments are of course welcome.

    I think that Mystical's answer is interesting, but too complicated to be used effectively for generalized shifting/rotating, e.g. in a library.