What is the difference between, say, _mm512_mask_mov_epi64
and _mm512_mask_blend_epi64?
Besides the order and names of the arguments, I cannot see any difference. The pseudo-code in Intel's Intrinsics Guide looks completely equivalent as well:
Blend:
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := b[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
Mov:
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
To check this, I wrote two small wrapper functions plus a main()
function:
#include <immintrin.h>
#include <cstdint>
#include <iomanip>
#include <iostream>
// Masked move over eight 64-bit lanes: `a` is the fallback (src) operand and
// `b` is the operand copied where the mask bit is set.
// With mask 0x0F, lanes 0-3 come from `b` and lanes 4-7 come from `a`.
__m512i mov_mask(__m512i & a, __m512i & b)
{
    const __mmask8 low_four_lanes = 0x0F;
    const __m512i merged = _mm512_mask_mov_epi64(a, low_four_lanes, b);
    return merged;
}
// Masked blend over eight 64-bit lanes: `d` is taken where the mask bit is
// set and `c` where it is clear.
// With mask 0x0F, lanes 0-3 come from `d` and lanes 4-7 come from `c`.
__m512i blend_mask(__m512i & c, __m512i & d)
{
    const __mmask8 low_four_lanes = 0x0F;
    const __m512i merged = _mm512_mask_blend_epi64(low_four_lanes, c, d);
    return merged;
}
// Feeds the same input vectors to mov_mask() and blend_mask() and prints, for
// every 64-bit lane, both operands next to the result so that the two
// intrinsics can be compared row by row.
int main(int argc, char * argv[])
{
    __m512i a = { 1, 2, 3, 4, 5, 6, 7, 8 };
    __m512i b = { 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28 };
    __m512i c = a;
    __m512i d = b;

    // Prints one row per lane: "<lhs>: 0x... -- <rhs>: 0x... -- r: 0x...".
    // Each vector is first stored into a plain array, so no lane is read
    // through a cast pointer to the vector object.
    auto print_lanes = [](const char * lhs_name, const __m512i & lhs,
                          const char * rhs_name, const __m512i & rhs,
                          const __m512i & result)
    {
        constexpr int lane_count = 512 / 64;
        uint64_t lhs_lanes[lane_count];
        uint64_t rhs_lanes[lane_count];
        uint64_t result_lanes[lane_count];
        _mm512_storeu_si512(lhs_lanes, lhs);
        _mm512_storeu_si512(rhs_lanes, rhs);
        _mm512_storeu_si512(result_lanes, result);

        for(int i = 0; i < lane_count; ++i)
        {
            // std::setw() only applies to the next insertion, so it is
            // repeated in front of every number.
            std::cout << std::hex << std::setfill('0')
                      << lhs_name << ": 0x" << std::setw(16) << lhs_lanes[i]
                      << " -- " << rhs_name << ": 0x" << std::setw(16) << rhs_lanes[i]
                      << " -- r: 0x" << std::setw(16) << result_lanes[i]
                      << '\n';
        }
    };

    __m512i r = mov_mask(a, b);
    print_lanes("a", a, "b", b, r);

    r = blend_mask(c, d);
    print_lanes("c", c, "d", d, r);

    return 0;
}
I compiled with:
g++ -std=gnu++23 -mavx512dq -O3 -o a a.cpp
I checked the output of the two functions with
objdump -d a | less
and it is exactly the same:
00000000000015e0 <_Z8mov_maskRDv8_xS0_>:
15e0: f3 0f 1e fa endbr64
15e4: 62 f1 fd 48 6f 0f vmovdqa64 (%rdi),%zmm1
15ea: b8 0f 00 00 00 mov $0xf,%eax
15ef: c5 f9 92 c8 kmovb %eax,%k1
15f3: 62 f2 f5 49 64 06 vpblendmq (%rsi),%zmm1,%zmm0{%k1}
15f9: c3 ret
0000000000001600 <_Z10blend_maskRDv8_xS0_>:
1600: f3 0f 1e fa endbr64
1604: 62 f1 fd 48 6f 0f vmovdqa64 (%rdi),%zmm1
160a: b8 0f 00 00 00 mov $0xf,%eax
160f: c5 f9 92 c8 kmovb %eax,%k1
1613: 62 f2 f5 49 64 06 vpblendmq (%rsi),%zmm1,%zmm0{%k1}
1619: c3 ret
(I used g++, hence the mangled function names.)
So you are right: they are exactly the same thing, except for the position of the mask in the list of parameters.
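Since Peter Cordes mentioned icc
(which is really icpx
now), I thought I would give that a try. It is not even using VPBLENDMQ
or some MOV
instruction. It uses VSHUFI64X2
instead: with the immediate 0xE4 it takes the two low 128-bit lanes from the second argument and the two high 128-bit lanes from the first, which is exactly the selection that mask 0x0F describes, so no mask register has to be set up. Better optimized for sure. Two instead of four instructions (not counting the ENDBR64
and RET
instructions).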
Compiled with:
icpx -mavx512dq -oa ~/tmp/a.cpp
Output:
00000000004011c0 <_Z8mov_maskRDv8_xS0_>:
4011c0: 62 f1 fe 48 6f 06 vmovdqu64 (%rsi),%zmm0
4011c6: 62 f3 fd 48 43 07 e4 vshufi64x2 $0xe4,(%rdi),%zmm0,%zmm0
4011cd: c3 ret
00000000004011d0 <_Z10blend_maskRDv8_xS0_>:
4011d0: 62 f1 fe 48 6f 06 vmovdqu64 (%rsi),%zmm0
4011d6: 62 f3 fd 48 43 07 e4 vshufi64x2 $0xe4,(%rdi),%zmm0,%zmm0
4011dd: c3 ret
That being said, again, the two functions are exactly the same, bit for bit.