Is there a reason that 8 byte std::array comparisons seem to be producing different assembly for char vs. std::byte?

I noticed 8 byte std::array comparisons seem to be producing assembly different from bit_casting. GCC seems to do what I expect for a char array, but clang generates an extra mov instruction (spilling the by-value array<> arg from an 8-byte register to the red zone, but still comparing the register arg with the memory pointed-to by the other arg).

In the std::byte case we get 8 separate single-byte cmp instructions vs. a single efficient qword compare for array<char>. Is there a reason for this difference?

#include <array>
#include <bit>
#include <cstdint>

// Compares the referenced array with the by-value array using std::array's operator==.
// produces completely different asm than the other 2 functions
// NOTE(review): the clang and gcc listings below label compare1 as taking both
// arrays by value - confirm which signature was actually compiled.
bool compare1(const std::array<std::byte, 8> &p, std::array<std::byte, 8> r)
{
    return p == r;
}

// Same comparison as compare1, but with char elements instead of std::byte.
// seems to be similar to bit_casting, but clang generates 1 more instruction
bool compare2(const std::array<char, 8> &p, std::array<char, 8> r)
{
    return p == r;
}

// Reinterprets each 8-byte array as a uint64_t with std::bit_cast and compares
// the two integers, instead of going through std::array's operator==.
// same assembly if you use char instead of byte
bool compare3(const std::array<std::byte, 8> &p, std::array<std::byte, 8> r)
{
    return std::bit_cast<uint64_t>(p) == std::bit_cast<uint64_t>(r);
}

link to compiler explorer

clang asm:

compare1(std::array<std::byte, 8ul>, std::array<std::byte, 8ul>):    # @compare1(std::array<std::byte, 8ul>, std::array<std::byte, 8ul>)
        cmp     dil, sil
        sete    al
        jne     .LBB0_8
        mov     eax, edi
        shr     eax, 8
        mov     ecx, esi
        shr     ecx, 8
        cmp     al, cl
        sete    al
        jne     .LBB0_8
        mov     eax, edi
        shr     eax, 16
        mov     ecx, esi
        shr     ecx, 16
        cmp     al, cl
        sete    al
        jne     .LBB0_8
        mov     eax, edi
        shr     eax, 24
        mov     ecx, esi
        shr     ecx, 24
        cmp     al, cl
        sete    al
        jne     .LBB0_8
        mov     rax, rdi
        shr     rax, 32
        mov     rcx, rsi
        shr     rcx, 32
        cmp     al, cl
        sete    al
        jne     .LBB0_8
        mov     rax, rdi
        shr     rax, 40
        mov     rcx, rsi
        shr     rcx, 40
        cmp     al, cl
        sete    al
        jne     .LBB0_8
        mov     rax, rdi
        shr     rax, 48
        mov     rcx, rsi
        shr     rcx, 48
        cmp     al, cl
        sete    al
        jne     .LBB0_8
        xor     rdi, rsi
        shr     rdi, 56
        sete    al
.LBB0_8:
        ret
compare2(std::array<char, 8ul> const&, std::array<char, 8ul>):        # @compare2(std::array<char, 8ul> const&, std::array<char, 8ul>)
        mov     qword ptr [rsp - 8], rsi
        cmp     qword ptr [rdi], rsi
        sete    al
        ret
compare3(std::array<std::byte, 8ul> const&, std::array<std::byte, 8ul>):  # @compare3(std::array<std::byte, 8ul> const&, std::array<std::byte, 8ul>)
        cmp     qword ptr [rdi], rsi
        sete    al
        ret

gcc asm:

compare1(std::array<std::byte, 8ul>, std::array<std::byte, 8ul>):
        mov     rdx, rdi
        mov     rax, rsi
        cmp     sil, dil
        jne     .L9
        movzx   ecx, ah
        cmp     dh, cl
        jne     .L9
        mov     rsi, rdi
        mov     rcx, rax
        shr     rsi, 16
        shr     rcx, 16
        cmp     sil, cl
        jne     .L9
        mov     rsi, rdi
        mov     rcx, rax
        shr     rsi, 24
        shr     rcx, 24
        cmp     sil, cl
        jne     .L9
        mov     rsi, rdi
        mov     rcx, rax
        shr     rsi, 32
        shr     rcx, 32
        cmp     sil, cl
        jne     .L9
        mov     rsi, rdi
        mov     rcx, rax
        shr     rsi, 40
        shr     rcx, 40
        cmp     sil, cl
        jne     .L9
        mov     rsi, rdi
        mov     rcx, rax
        shr     rsi, 48
        shr     rcx, 48
        cmp     sil, cl
        jne     .L9
        shr     rdx, 56
        shr     rax, 56
        cmp     dl, al
        sete    al
        ret
.L9:
        xor     eax, eax
        ret
compare2(std::array<char, 8ul> const&, std::array<char, 8ul>):
        cmp     QWORD PTR [rdi], rsi
        sete    al
        ret
compare3(std::array<std::byte, 8ul> const&, std::array<std::byte, 8ul>):
        cmp     QWORD PTR [rdi], rsi
        sete    al
        ret

Solution

There is a missed optimization for both Clang and GCC. For GCC, this issue has already been reported as Bug 101485 - "Calling std::equal with std::byte* does not use memcmp". The difference between char and std::byte boils down to a library issue.

As Jonathan Wakely (libstdc++ maintainer, LWG chair) put it in your bug report:

we already have the hack in libstdc++ it just doesn't work for std::byte

std::array::operator== is implemented in terms of std::equal, which is implemented using (see bits/stl_algobase.h):

  // Dispatch helper for std::equal: computes a compile-time flag (__simple)
  // from the iterator types and forwards the range to the matching
  // std::__equal<__simple>::equal specialization.
  template<typename _II1, typename _II2>
    _GLIBCXX20_CONSTEXPR
    inline bool
    __equal_aux1(_II1 __first1, _II1 __last1, _II2 __first2)
    {
      typedef typename iterator_traits<_II1>::value_type _ValueType1;
      // __simple requires BOTH of:
      //   - the first range's value type satisfies __is_integer (or is a
      //     pointer, when the __is_pointer builtin trait is available), and
      //   - __memcmpable holds for the two iterator types.
      const bool __simple = ((__is_integer<_ValueType1>::__value
#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_pointer)
                  || __is_pointer(_ValueType1)
#endif
                 ) && __memcmpable<_II1, _II2>::__value);
      // The boolean template argument selects the comparison strategy.
      return std::__equal<__simple>::equal(__first1, __last1, __first2);
    }

When __simple is true, this function uses std::__memcmp to compare the objects; otherwise, it compares them naively, one element at a time. std::byte is an enumeration, so (__is_integer<_ValueType1>::__value || __is_pointer(_ValueType1)) is false and the naive implementation is used instead.

Optimizing the naive implementation down to a single cmp is possible in theory, but sadly, both GCC and Clang miss this optimization.