intrinsicsavx512

What is the difference between "mask_mov" and "mask_blend" when using intrinsics / AVX?


What is the difference between, say, _mm512_mask_mov_epi64 and _mm512_mask_blend_epi64? Besides the order and names of the arguments, I cannot see any difference. The pseudo-code in Intel's Intrinsics Guide looks completely equivalent as well:

Blend:

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := b[i+63:i]
    ELSE
        dst[i+63:i] := a[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0

Mov:

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := a[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0

Solution

  • I wrote two functions plus a main() function:

    #include    <immintrin.h>
    #include    <cstdint>
    #include    <iomanip>
    #include    <iostream>
    
    // Merge-masking move with the fixed mask 0x0F: the four low 64-bit lanes
    // (mask bits set) are taken from b, the four high lanes (mask bits clear)
    // keep the value of a, which plays the role of "src" in the pseudo-code.
    __m512i mov_mask(__m512i & a, __m512i & b)
    {
        __mmask8 const low_four_lanes(0x0F);
        __m512i const merged(_mm512_mask_mov_epi64(a, low_four_lanes, b));
        return merged;
    }
    
    // Masked blend with the fixed mask 0x0F: the four low 64-bit lanes
    // (mask bits set) are taken from d, the four high lanes (mask bits clear)
    // are taken from c. Only the position of the mask argument differs from
    // the mask_mov form of the same operation.
    __m512i blend_mask(__m512i & c, __m512i & d)
    {
        __mmask8 const low_four_lanes(0x0F);
        __m512i const blended(_mm512_mask_blend_epi64(low_four_lanes, c, d));
        return blended;
    }
    
    // Demo driver: feeds the same pair of vectors to mov_mask() and
    // blend_mask() and prints every 64-bit lane of both inputs and of the
    // result, one lane per line, so the two results can be compared.
    // The command-line arguments are not used.
    int main(int /* argc */, char * /* argv */[])
    {
        // _mm512_setr_epi64() places its first argument in lane 0, i.e. the
        // same lane order as a brace initialiser, without relying on the
        // GCC/Clang vector-initialiser extension.
        __m512i a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        __m512i b = _mm512_setr_epi64(0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28);
        __m512i c = a;
        __m512i d = b;

        // Prints one line per lane: left input, right input, result.
        // Each vector is copied into a plain array with an unaligned store
        // rather than being read through a uint64_t* cast of its address,
        // which would access a __m512i object through an unrelated type.
        auto const dump = [](char const * lhs_label, __m512i const & lhs,
                             char const * rhs_label, __m512i const & rhs,
                             __m512i const & result)
        {
            constexpr int lane_count = 512 / 64;

            std::uint64_t lhs_lanes[lane_count];
            std::uint64_t rhs_lanes[lane_count];
            std::uint64_t result_lanes[lane_count];
            _mm512_storeu_si512(lhs_lanes, lhs);
            _mm512_storeu_si512(rhs_lanes, rhs);
            _mm512_storeu_si512(result_lanes, result);

            for(int i(0); i < lane_count; ++i)
            {
                std::cout << std::hex << std::setfill('0')
                    << lhs_label << std::setw(16) << lhs_lanes[i]
                    << rhs_label << std::setw(16) << rhs_lanes[i]
                    << " -- r: 0x" << std::setw(16) << result_lanes[i]
                    << '\n';
            }
        };

        __m512i r = mov_mask(a, b);
        dump("a: 0x", a, " -- b: 0x", b, r);

        r = blend_mask(c, d);
        dump("c: 0x", c, " -- d: 0x", d, r);

        return 0;
    }
    

    I compiled with:

    g++ -std=gnu++23 -mavx512dq -O3 -o a a.cpp
    

    I checked the machine code generated for the two functions with

    objdump -d a | less
    

    and it is exactly the same:

    00000000000015e0 <_Z8mov_maskRDv8_xS0_>:
        15e0:       f3 0f 1e fa             endbr64
        15e4:       62 f1 fd 48 6f 0f       vmovdqa64 (%rdi),%zmm1
        15ea:       b8 0f 00 00 00          mov    $0xf,%eax
        15ef:       c5 f9 92 c8             kmovb  %eax,%k1
        15f3:       62 f2 f5 49 64 06       vpblendmq (%rsi),%zmm1,%zmm0{%k1}
        15f9:       c3                      ret
    
    0000000000001600 <_Z10blend_maskRDv8_xS0_>:
        1600:       f3 0f 1e fa             endbr64
        1604:       62 f1 fd 48 6f 0f       vmovdqa64 (%rdi),%zmm1
        160a:       b8 0f 00 00 00          mov    $0xf,%eax
        160f:       c5 f9 92 c8             kmovb  %eax,%k1
        1613:       62 f2 f5 49 64 06       vpblendmq (%rsi),%zmm1,%zmm0{%k1}
        1619:       c3                      ret
    

    (I compiled with g++, hence the C++ name mangling of the function names.)

    So you are right, they are exactly the same thing (except for the position of the mask in the list of parameters).


    Since Peter Cordes mentioned icc (which is really icpx now), I thought I would give that a try. It uses neither VPBLENDMQ nor a masked move instruction; it uses VSHUFI64X2 instead. Better optimized for sure: two instructions instead of four (not counting the ENDBR64 and RET instructions).

    Compiled with:

    icpx -mavx512dq -oa ~/tmp/a.cpp
    

    Output:

    00000000004011c0 <_Z8mov_maskRDv8_xS0_>:
      4011c0:       62 f1 fe 48 6f 06       vmovdqu64 (%rsi),%zmm0
      4011c6:       62 f3 fd 48 43 07 e4    vshufi64x2 $0xe4,(%rdi),%zmm0,%zmm0
      4011cd:       c3                      ret    
    
    00000000004011d0 <_Z10blend_maskRDv8_xS0_>:
      4011d0:       62 f1 fe 48 6f 06       vmovdqu64 (%rsi),%zmm0
      4011d6:       62 f3 fd 48 43 07 e4    vshufi64x2 $0xe4,(%rdi),%zmm0,%zmm0
      4011dd:       c3                      ret
    

    That being said, again, the two functions are exactly the same, bit for bit.