I want to multiply with SSE4 a __m128i
object with 16 unsigned 8 bit integers, but I could only find an intrinsic for multiplying 16 bit integers. Is there nothing such as _mm_mult_epi8
?
There is no 8-bit multiplication in MMX/SSE/AVX. However, you can emulate 8-bit multiplication intrinsic using 16-bit multiplication as follows:
inline __m128i _mm_mullo_epi8(__m128i a, __m128i b)
{
__m128i zero = _mm_setzero_si128();
__m128i Alo = _mm_cvtepu8_epi16(a);
__m128i Ahi = _mm_unpackhi_epi8(a, zero);
__m128i Blo = _mm_cvtepu8_epi16(b);
__m128i Bhi = _mm_unpackhi_epi8(b, zero);
__m128i Clo = _mm_mullo_epi16(Alo, Blo);
__m128i Chi = _mm_mullo_epi16(Ahi, Bhi);
__m128i maskLo = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
__m128i maskHi = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
__m128i C = _mm_or_si128(_mm_shuffle_epi8(Clo, maskLo), _mm_shuffle_epi8(Chi, maskHi));
return C;
}