How to implement a lane-crossing logical bit-wise shift (left and right) in AVX2? I want to shift a whole __m256i
as if it were a single 256-bit integer, with no element or lane boundaries.
An answer on another Q&A looked useful but turned out to actually be about byte shifts: it uses _mm256_alignr_epi8
and _mm256_permute2x128_si256
with operands that depend on a compile-time-constant shift count, so it only helps for bit-shift counts that are multiples of 8 (roughly the idea sketched below). (See the revision history of this question for a full test program written before I realized it was just byte shifts.)
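For reference, that byte-granularity trick boils down to something like the following. This is my own sketch, not the linked answer verbatim; N is a compile-time-constant byte count in 1..15 (N == 16 would be just the lane-swapping permute on its own, and N in 17..31 needs an extra _mm256_slli_si256 on top of it):
// Sketch: shift a whole __m256i left by N bytes, N a compile-time constant in 1..15.
// The permute2x128 with _MM_SHUFFLE ( 0, 0, 2, 0 ) copies the low 128-bit lane into the
// high lane and zeroes the low lane, so the per-lane alignr can pull in the bytes that
// cross the 128-bit lane boundary.
#define mm256_byteshift_left( a, N ) \
    _mm256_alignr_epi8 ( ( a ), \
                         _mm256_permute2x128_si256 ( ( a ), ( a ), _MM_SHUFFLE ( 0, 0, 2, 0 ) ), \
                         16 - ( N ) )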
The following code implements lane-crossing logical bit-wise shift/rotate (left and right) in AVX2:
#include <immintrin.h>

// Prototypes...
__m256i _mm256_sli_si256 ( __m256i, int );
__m256i _mm256_sri_si256 ( __m256i, int );
__m256i _mm256_rli_si256 ( __m256i, int );
__m256i _mm256_rri_si256 ( __m256i, int );
// Implementations...
__m256i left_shift_000_063 ( __m256i a, int n ) { // 6
return _mm256_or_si256 ( _mm256_slli_epi64 ( a, n ), _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), _mm256_permute4x64_epi64 ( _mm256_srli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 2, 1, 0, 0 ) ), _MM_SHUFFLE ( 3, 3, 3, 0 ) ) );
}
__m256i left_shift_064_127 ( __m256i a, int n ) { // 7
__m256i b = _mm256_slli_epi64 ( a, n );
__m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 2, 1, 0, 0 ) );
__m256i c = _mm256_srli_epi64 ( a, 64 - n );
__m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 1, 0, 0, 0 ) );
__m256i f = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), d, _MM_SHUFFLE ( 3, 3, 3, 0 ) );
__m256i g = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), e, _MM_SHUFFLE ( 3, 3, 0, 0 ) ); // 6
return _mm256_or_si256 ( f, g );
}
__m256i left_shift_128_191 ( __m256i a, int n ) { // 7
__m256i b = _mm256_slli_epi64 ( a, n );
__m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 1, 0, 0, 0 ) );
__m256i c = _mm256_srli_epi64 ( a, 64 - n );
__m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 1, 0, 0, 0 ) );
__m256i f = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), d, _MM_SHUFFLE ( 3, 3, 0, 0 ) );
__m256i g = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), e, _MM_SHUFFLE ( 3, 0, 0, 0 ) );
return _mm256_or_si256 ( f, g );
}
__m256i left_shift_192_255 ( __m256i a, int n ) { // 5
return _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), _mm256_slli_epi64 ( _mm256_permute4x64_epi64 ( a, _MM_SHUFFLE ( 0, 0, 0, 0 ) ), n ), _MM_SHUFFLE ( 3, 0, 0, 0 ) );
}
__m256i _mm256_sli_si256 ( __m256i a, int n ) { // logical left shift of the full 256 bits by n, 0 <= n < 256
if ( n < 128 ) return n < 64 ? left_shift_000_063 ( a, n ) : left_shift_064_127 ( a, n % 64 );
else return n < 192 ? left_shift_128_191 ( a, n % 64 ) : left_shift_192_255 ( a, n % 64 );
}
__m256i right_shift_000_063 ( __m256i a, int n ) { // 6
return _mm256_or_si256 ( _mm256_srli_epi64 ( a, n ), _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), _mm256_permute4x64_epi64 ( _mm256_slli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 0, 3, 2, 1 ) ), _MM_SHUFFLE ( 0, 3, 3, 3 ) ) );
}
__m256i right_shift_064_127 ( __m256i a, int n ) { // 7
__m256i b = _mm256_srli_epi64 ( a, n );
__m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 3, 3, 2, 1 ) );
__m256i c = _mm256_slli_epi64 ( a, 64 - n );
__m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 3, 3, 3, 2 ) );
__m256i f = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), d, _MM_SHUFFLE ( 0, 3, 3, 3 ) );
__m256i g = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), e, _MM_SHUFFLE ( 0, 0, 3, 3 ) );
return _mm256_or_si256 ( f, g );
}
__m256i right_shift_128_191 ( __m256i a, int n ) { // 7
__m256i b = _mm256_srli_epi64 ( a, n );
__m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 3, 2, 3, 2 ) );
__m256i c = _mm256_slli_epi64 ( a, 64 - n );
__m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 3, 2, 1, 3 ) );
__m256i f = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), d, _MM_SHUFFLE ( 0, 0, 3, 3 ) );
__m256i g = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), e, _MM_SHUFFLE ( 0, 0, 0, 3 ) );
return _mm256_or_si256 ( f, g );
}
__m256i right_shift_192_255 ( __m256i a, int n ) { // 5
return _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), _mm256_srli_epi64 ( _mm256_permute4x64_epi64 ( a, _MM_SHUFFLE ( 0, 0, 0, 3 ) ), n ), _MM_SHUFFLE ( 0, 0, 0, 3 ) );
}
__m256i _mm256_sri_si256 ( __m256i a, int n ) { // logical right shift of the full 256 bits by n, 0 <= n < 256
if ( n < 128 ) return n < 64 ? right_shift_000_063 ( a, n ) : right_shift_064_127 ( a, n % 64 );
else return n < 192 ? right_shift_128_191 ( a, n % 64 ) : right_shift_192_255 ( a, n % 64 );
}
__m256i left_rotate_000_063 ( __m256i a, int n ) { // 5
return _mm256_or_si256 ( _mm256_slli_epi64 ( a, n ), _mm256_permute4x64_epi64 ( _mm256_srli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 2, 1, 0, 3 ) ) );
}
__m256i left_rotate_064_127 ( __m256i a, int n ) { // 6
__m256i b = _mm256_slli_epi64 ( a, n );
__m256i c = _mm256_srli_epi64 ( a, 64 - n );
__m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 2, 1, 0, 3 ) );
__m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 1, 0, 3, 2 ) );
return _mm256_or_si256 ( d, e );
}
__m256i left_rotate_128_191 ( __m256i a, int n ) { // 6
__m256i b = _mm256_slli_epi64 ( a, n );
__m256i c = _mm256_srli_epi64 ( a, 64 - n );
__m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 1, 0, 3, 2 ) );
__m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 0, 3, 2, 1 ) );
return _mm256_or_si256 ( d, e );
}
__m256i left_rotate_192_255 ( __m256i a, int n ) { // 5
return _mm256_or_si256 ( _mm256_srli_epi64 ( a, 64 - n ), _mm256_permute4x64_epi64 ( _mm256_slli_epi64 ( a, n ), _MM_SHUFFLE ( 0, 3, 2, 1 ) ) );
}
__m256i _mm256_rli_si256 ( __m256i a, int n ) { // left rotate of the full 256 bits by n, 0 <= n < 256
if ( n < 128 ) return n < 64 ? left_rotate_000_063 ( a, n ) : left_rotate_064_127 ( a, n % 64 );
else return n < 192 ? left_rotate_128_191 ( a, n % 64 ) : left_rotate_192_255 ( a, n % 64 );
}
__m256i right_rotate_000_063 ( __m256i a, int n ) { // 5
return _mm256_or_si256 ( _mm256_srli_epi64 ( a, n ), _mm256_permute4x64_epi64 ( _mm256_slli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 0, 3, 2, 1 ) ) );
}
__m256i right_rotate_064_127 ( __m256i a, int n ) { // 6
__m256i b = _mm256_srli_epi64 ( a, n );
__m256i c = _mm256_slli_epi64 ( a, 64 - n );
__m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 0, 3, 2, 1 ) );
__m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 1, 0, 3, 2 ) );
return _mm256_or_si256 ( d, e );
}
__m256i right_rotate_128_191 ( __m256i a, int n ) { // 6
__m256i b = _mm256_srli_epi64 ( a, n );
__m256i c = _mm256_slli_epi64 ( a, 64 - n );
__m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 1, 0, 3, 2 ) );
__m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 2, 1, 0, 3 ) );
return _mm256_or_si256 ( d, e );
}
__m256i right_rotate_192_255 ( __m256i a, int n ) { // 5
return _mm256_or_si256 ( _mm256_slli_epi64 ( a, 64 - n ), _mm256_permute4x64_epi64 ( _mm256_srli_epi64 ( a, n ), _MM_SHUFFLE ( 2, 1, 0, 3 ) ) );
}
__m256i _mm256_rri_si256 ( __m256i a, int n ) { // right rotate of the full 256 bits by n, 0 <= n < 256
if ( n < 128 ) return n < 64 ? right_rotate_000_063 ( a, n ) : right_rotate_064_127 ( a, n % 64 );
else return n < 192 ? right_rotate_128_191 ( a, n % 64 ) : right_rotate_192_255 ( a, n % 64 );
}
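For what it's worth, here is the kind of quick sanity check I would append after the code above (a sketch only: ref_shift_left is an ad-hoc scalar reference of my own, it only exercises _mm256_sli_si256, and the other three functions can be checked the same way with matching references; compile with e.g. gcc -O2 -mavx2):
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Ad-hoc scalar reference: logical left shift of a 256-bit little-endian value
// ( four uint64_t limbs, least significant first ) by n bits, 0 <= n < 256.
static void ref_shift_left ( uint64_t r [ 4 ], const uint64_t a [ 4 ], int n ) {
    int q = n / 64, s = n % 64;
    for ( int i = 0; i < 4; ++i ) {
        uint64_t hi = ( i >= q ) ? a [ i - q ] << s : 0;
        uint64_t lo = ( s != 0 && i >= q + 1 ) ? a [ i - q - 1 ] >> ( 64 - s ) : 0;
        r [ i ] = hi | lo;
    }
}

int main ( void ) {
    const uint64_t in [ 4 ] = { 0x0123456789ABCDEFULL, 0xFEDCBA9876543210ULL,
                                0x0F1E2D3C4B5A6978ULL, 0x8796A5B4C3D2E1F0ULL };
    for ( int n = 0; n < 256; ++n ) {
        uint64_t want [ 4 ], got [ 4 ];
        ref_shift_left ( want, in, n );
        _mm256_storeu_si256 ( ( __m256i * ) got,
                              _mm256_sli_si256 ( _mm256_loadu_si256 ( ( const __m256i * ) in ), n ) );
        if ( memcmp ( want, got, sizeof got ) != 0 ) printf ( "left shift by %d differs\n", n );
    }
    puts ( "done" );
    return 0;
}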
Where two _mm256_permute4x64_epi64 ops are unavoidable, I have tried to arrange them so that they can partially overlap in execution, which should keep the overall latency to a minimum.
Most of the suggestions and clues given by commenters were helpful in putting this code together; thanks for those. Improvements and other comments are, of course, welcome.
I think that Mystical's answer is interesting, but too complicated to use effectively for generalized shifting/rotating, e.g. in a library.