I've come across some code whose performance blocker is the "kernel" below. Since I have access to the latest Intel(R) Xeon Phi(TM) CPU 7210 (KNL), I'd like to speed it up with AVX-512 intrinsics.
for( int y = starty; y <= endy; y++)
{
// here data[][] is "unsigned char" while result[] is "int"
for( int x = startx; x <= endx; x++)
{
if( (data[y][x]&0x1) == 0 )
result[x] += data[y][x];
}
}
After analyzing the code's behavior, I found that the inner loop's trip count is mostly less than 16, so I wrote the following:
register int xlen = endx - startx + 1;
__m512i zero5 = _mm512_setzero_si512();
__m256i zero2 = _mm512_castsi512_si256(zero5);
__m128i zero1 = _mm512_castsi512_si128(zero5);
__m256i mask2 = _mm256_set1_epi8(0x1);
__m128i mask1 = _mm256_castsi256_si128(mask2);
register __m512i psprof0 = zero5;
// shift the 0x1 byte mask right so that only the first xlen bytes are tested
for( int i = 0; i < ((16-xlen)&(~0x1)); i += 2 ) mask1 = _mm_srli_si128(mask1, 2);
if( (16-xlen)&(0x1) ) mask1 = _mm_srli_si128(mask1, 1);
#pragma vector nontemporal
#pragma prefetch data
for( int y = starty; y <= endy; y++ )
{
__m128i pixel16 = _mm_loadu_si128((__m128i*)&data[y][startx]);
// if ( _mm_testc_si128(pixel16, mask1) ) continue;
__m128i mask16 = _mm_andnot_si128(pixel16, mask1);   // 0x1 where the low bit of the pixel is clear
__m128i pixel16n = _mm_sign_epi8(pixel16, mask16);   // zero out the odd-valued bytes
psprof0 = _mm512_add_epi32(psprof0, _mm512_cvtepu8_epi32(pixel16n)); // widen to epi32 and accumulate
}
_mm512_storeu_si512(&result[startx], psprof0);
A few questions here:
Point 3 above:
static inline __m256i vec_256_combine_128(__m128i a, __m128i b)
{
// combine two __m128i into one __m256i
return _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1);
}
static inline __m128i vec_256_add_128(__m256i a)
{
// add the lower and upper 128-bit halves of a __m256i holding epi16 lanes
return _mm_add_epi16(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1));
}
for( int y = starty; y <= endy; y += 2 )
{
__m128i pixel16a = _mm_load_si128((__m128i*)&pEdgeImage[y][sx]);
__m128i pixel16b = _mm_load_si128((__m128i*)&pEdgeImage[y+1][sx]);
if ( y == ye )
pixel16b = zero1;
__m256i pixel16 = vec_256_combine_128(pixel16a, pixel16b);
if ( _mm256_testc_si256(pixel16, mask2) ) continue;   // skip the row pair if every byte has its low bit set
__m256i mask16 = _mm256_andnot_si256(pixel16, mask2); // 0x1 where the low bit is clear
__m256i pixel16n = _mm256_sign_epi8(pixel16, mask16); // zero out the odd-valued bytes
__m256i pixel16lo = _mm256_unpacklo_epi8(pixel16n, zero2);
__m256i pixel16hi = _mm256_unpackhi_epi8(pixel16n, zero2);
psprof0 = _mm256_add_epi16(psprof0, vec_256_combine_128(vec_256_add_128(pixel16lo), vec_256_add_128(pixel16hi)));
}
Here is a linear version that returns a normalized bit count (8 floats), normalized by the number of entries. (Typed in by hand, so there is likely a typo or two.)
PURE FUNCTION BITS8(nGot, DataIn, nBits) BIND(C, NAME='BITS8')
USE OMP_LIB
USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_FLOAT, C_INT8_T, C_INT
IMPLICIT NONE
INTEGER(C_INT) , INTENT(IN) :: nGot
INTEGER(C_INT8_T) , INTENT(IN) :: nBits !Which should come in as 8
INTEGER(C_INT8_T), DIMENSION(nGot), INTENT(IN) :: DataIn
!DIR$ ATTRIBUTES ALIGN : 64 :: Bits8
REAL(C_FLOAT), DIMENSION(nBits) :: Bits8
INTEGER(C_INT) :: I, J
Bits8 = 0.0E0
!DIR$ ASSUME_ALIGNED DataIn:2
!DIR$ PREFETCH DataIn:1:64
Sum_Loop: DO I = 1, nGot
!$OMP SIMD REDUCTION(+:Bits8) SAFELEN(64)
Bit_Loop: DO J = 0, nBits-1
Bits8(J+1) = IBITS(DataIn(I),J, 1) + Bits8(J+1)
ENDDO Bit_Loop
!$OMP END SIMD
ENDDO Sum_Loop
!DIR$ SIMD
Norm_Loop: DO J = 1, nBits
Bits8(J) = Bits8(J)/nGot
ENDDO Norm_Loop
RETURN
END FUNCTION Bits8
You compile it with 'ifort -openmp -O3' etc. Obviously for a 2-D array you will need the number of rows and columns, plus where the start and end of the rows and columns you want to check are located. I am sure you know that rows and columns are reversed in C versus Fortran.
To sort out the trailing-underscore dramas, use 'nm' on the .o files; the BIND(C, NAME=) also helps.
Probably you could use something even more skinnied down, then inline the function in your C and put the SIMD REDUCTION on the C side. That has the advantage that you need not worry about the row/column difference, since the loop that walks the array stays on the C side.
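For example, a minimal sketch of that C-side reduction (untested; the function name, signature and pragma choice are mine, not part of the Fortran above) might look like:
/* count, per bit position, how many bytes of buf[0..n-1] have that bit set */
static void bits8_c(const unsigned char *buf, int n, int counts[8])
{
    int c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0;
    #pragma omp simd reduction(+:c0,c1,c2,c3,c4,c5,c6,c7)
    for (int i = 0; i < n; i++) {
        unsigned char v = buf[i];
        c0 += (v >> 0) & 1;  c1 += (v >> 1) & 1;
        c2 += (v >> 2) & 1;  c3 += (v >> 3) & 1;
        c4 += (v >> 4) & 1;  c5 += (v >> 5) & 1;
        c6 += (v >> 6) & 1;  c7 += (v >> 7) & 1;
    }
    counts[0] = c0; counts[1] = c1; counts[2] = c2; counts[3] = c3;
    counts[4] = c4; counts[5] = c5; counts[6] = c6; counts[7] = c7;
}
Built with the same sort of '-openmp -O3' flags, the compiler should turn the eight scalar accumulators into a vector reduction.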
PURE FUNCTION BITS8(DataIn) BIND(C, NAME='BITS8')
USE OMP_LIB
USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT8_T, C_INT
IMPLICIT NONE
INTEGER(C_INT8_T), PARAMETER :: nBits = 8
INTEGER(C_INT8_T) , INTENT(IN) :: DataIn
INTEGER(C_INT), DIMENSION(nBits) :: Bits8
INTEGER(C_INT) :: J
! Bits8 = 0
!DIR$ ASSUME_ALIGNED DataIn:2
!DIR$ PREFETCH DataIn:1:64
Bit_Loop: DO J = 0, nBits-1
Bits8(J+1) = IBITS(DataIn,J, 1)
ENDDO Bit_Loop
RETURN
END FUNCTION Bits8
Another way would be something like:
PURE FUNCTION BITS8(nrows, ncols, startrow, startCol, EndRow, EndCol, DataIn) BIND(C, NAME='BITS8')
USE OMP_LIB
USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_FLOAT, C_INT8_T, C_INT
IMPLICIT NONE
INTEGER(C_INT8_T) , PARAMETER :: nBits = 8
INTEGER(C_INT) , INTENT(IN) :: nRows
INTEGER(C_INT) , INTENT(IN) :: nCols
INTEGER(C_INT) , INTENT(IN) :: StartRow
INTEGER(C_INT) , INTENT(IN) :: StartCol
INTEGER(C_INT) , INTENT(IN) :: EndRow
INTEGER(C_INT) , INTENT(IN) :: EndCol
INTEGER(C_INT8_T), DIMENSION(ncols,nrows), INTENT(IN) :: DataIn
!DIR$ ATTRIBUTES ALIGN : 64 :: Bits8
INTEGER(C_INT), DIMENSION(nBits) :: Bits8
INTEGER(C_INT) :: I, J, K
!DIR$ ASSUME_ALIGNED DataIn:64
Bits8 = 0
Row_Loop: DO J = StartCol, EndCol
!DIR$ PREFETCH DataIn:1:64
Col_Loop: DO I = StartRow, EndRow
!$OMP SIMD REDUCTION(+:Bits8) SAFELEN(64)
Bit_Loop: DO K = 0, nBits-1
Bits8(K+1) = IBITS(DataIn(I,J),K, 1) + Bits8(K+1)
ENDDO Bit_Loop
!$OMP END SIMD
ENDDO Col_Loop
ENDDO Row_Loop
RETURN
END FUNCTION Bits8
All that aside, I think your data[y][x]&0x1 test should be able to work with some #pragma vector always or #pragma simd (etc.)... The -vec-report3 output should let you work it out.
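Concretely, I mean something along these lines (untested, and the exact pragma spelling depends on the compiler version):
for( int y = starty; y <= endy; y++ )
{
    #pragma vector always
    for( int x = startx; x <= endx; x++ )
    {
        // written branchlessly so the compiler can emit a masked add
        result[x] += ( (data[y][x]&0x1) == 0 ) ? data[y][x] : 0;
    }
}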
If not, then inlining that small section might be best?
I do not know what rate you need, but I get >250 MB/second of bit throughput with the first example above on a single core... so you know what to expect.
I am pretty convinced that the best way is just to histogram the data, then do the bit testing on each histogram index value and multiply by the histogram bin count at that bin. It is certainly faster for large counts: once you know the bit pattern per histogram index, that part never changes. So for large 'summing count numbers' and small 'byte sizes' that would certainly be faster; for a small count and 64-bit or wider data, IBITS may be faster.
The histogram approach was covered in the Intel webinar from around September '16 (C and Fortran).
The one advantage of Fortran is that, for a single byte, one can dimension the histogram as (-128:127), which makes it straightforward to drop each value into the right bin.
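In C, where the bytes are unsigned char and the bins simply run 0..255, a rough sketch of the histogram-then-bit-test idea (function name and signature are mine) would be:
/* one pass builds the 256-bin histogram; the per-bit counts then come from the bins */
static void bits8_hist(const unsigned char *buf, int n, int counts[8])
{
    int hist[256] = {0};
    for (int i = 0; i < n; i++)
        hist[buf[i]]++;
    for (int b = 0; b < 8; b++) {
        int c = 0;
        for (int v = 0; v < 256; v++)   /* the bit pattern of each bin value never changes */
            if (v & (1 << b))
                c += hist[v];
        counts[b] = c;
    }
}
The second stage is a fixed 8x256 amount of work, so for large n the cost is essentially the single histogram pass.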