I noticed that comparing 8-byte std::array objects with operator== seems to produce different assembly from comparing them via std::bit_cast.
GCC does what I expect for a char array, but clang generates an extra mov instruction (spilling the by-value array<>
arg from an 8-byte register to the red zone, but still comparing the register arg with the memory pointed to by the other arg).
In the std::byte case we get 8 separate single-byte cmp instructions vs. a single efficient qword compare for array<char>.
Is there a reason for this difference?
#include <array>
#include <bit>
#include <cstdint>
// Compares two 8-element std::byte arrays using std::array's operator==.
// p is taken by const reference, r by value.
// Produces completely different asm than the other 2 functions: in the
// listings below, both clang and GCC emit a chain of single-byte compares.
bool compare1(const std::array<std::byte, 8> &p, std::array<std::byte, 8> r)
{
return p == r;
}
// Same comparison as compare1, but with char elements instead of std::byte.
// Seems to be similar to bit_casting (a single qword cmp in the listings
// below), but clang generates 1 more instruction (an extra mov).
bool compare2(const std::array<char, 8> &p, std::array<char, 8> r)
{
return p == r;
}
// Compares two 8-element std::byte arrays by converting each one to a
// uint64_t with std::bit_cast (C++20, <bit>) and comparing the two integers.
// Same assembly if you use char instead of byte.
bool compare3(const std::array<std::byte, 8> &p, std::array<std::byte, 8> r)
{
return std::bit_cast<uint64_t>(p) == std::bit_cast<uint64_t>(r);
}
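clang asm (note: as its signature shows, the compare1 listing was generated from a variant taking both arrays by value):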
compare1(std::array<std::byte, 8ul>, std::array<std::byte, 8ul>): # @compare1(std::array<std::byte, 8ul>, std::array<std::byte, 8ul>)
cmp dil, sil
sete al
jne .LBB0_8
mov eax, edi
shr eax, 8
mov ecx, esi
shr ecx, 8
cmp al, cl
sete al
jne .LBB0_8
mov eax, edi
shr eax, 16
mov ecx, esi
shr ecx, 16
cmp al, cl
sete al
jne .LBB0_8
mov eax, edi
shr eax, 24
mov ecx, esi
shr ecx, 24
cmp al, cl
sete al
jne .LBB0_8
mov rax, rdi
shr rax, 32
mov rcx, rsi
shr rcx, 32
cmp al, cl
sete al
jne .LBB0_8
mov rax, rdi
shr rax, 40
mov rcx, rsi
shr rcx, 40
cmp al, cl
sete al
jne .LBB0_8
mov rax, rdi
shr rax, 48
mov rcx, rsi
shr rcx, 48
cmp al, cl
sete al
jne .LBB0_8
xor rdi, rsi
shr rdi, 56
sete al
.LBB0_8:
ret
compare2(std::array<char, 8ul> const&, std::array<char, 8ul>): # @compare2(std::array<char, 8ul> const&, std::array<char, 8ul>)
mov qword ptr [rsp - 8], rsi
cmp qword ptr [rdi], rsi
sete al
ret
compare3(std::array<std::byte, 8ul> const&, std::array<std::byte, 8ul>): # @compare3(std::array<std::byte, 8ul> const&, std::array<std::byte, 8ul>)
cmp qword ptr [rdi], rsi
sete al
ret
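gcc asm (note: as its signature shows, the compare1 listing was generated from a variant taking both arrays by value):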
compare1(std::array<std::byte, 8ul>, std::array<std::byte, 8ul>):
mov rdx, rdi
mov rax, rsi
cmp sil, dil
jne .L9
movzx ecx, ah
cmp dh, cl
jne .L9
mov rsi, rdi
mov rcx, rax
shr rsi, 16
shr rcx, 16
cmp sil, cl
jne .L9
mov rsi, rdi
mov rcx, rax
shr rsi, 24
shr rcx, 24
cmp sil, cl
jne .L9
mov rsi, rdi
mov rcx, rax
shr rsi, 32
shr rcx, 32
cmp sil, cl
jne .L9
mov rsi, rdi
mov rcx, rax
shr rsi, 40
shr rcx, 40
cmp sil, cl
jne .L9
mov rsi, rdi
mov rcx, rax
shr rsi, 48
shr rcx, 48
cmp sil, cl
jne .L9
shr rdx, 56
shr rax, 56
cmp dl, al
sete al
ret
.L9:
xor eax, eax
ret
compare2(std::array<char, 8ul> const&, std::array<char, 8ul>):
cmp QWORD PTR [rdi], rsi
sete al
ret
compare3(std::array<std::byte, 8ul> const&, std::array<std::byte, 8ul>):
cmp QWORD PTR [rdi], rsi
sete al
ret
There is a missed optimization for both Clang and GCC.
For GCC, this issue has already been reported at Bug 101485 - Calling std::equal
with std::byte*
does not use memcmp
The difference between char
and std::byte
boils down to a library issue.
As Jonathan Wakely (libstdc++ maintainer, LWG chair) put it in your bug report:
we already have the hack in libstdc++ it just doesn't work for
std::byte
std::array::operator== is implemented in terms of std::equal, which is implemented using (see bits/stl_algobase.h):
// Equality helper: picks an implementation of std::__equal at compile time
// and forwards the range [__first1, __last1) and the start of the second
// range (__first2) to it.
template<typename _II1, typename _II2>
_GLIBCXX20_CONSTEXPR
inline bool
__equal_aux1(_II1 __first1, _II1 __last1, _II2 __first2)
{
// Element type of the first range; the trait checks below are made on it.
typedef typename iterator_traits<_II1>::value_type _ValueType1;
// __simple is true only when the element type satisfies __is_integer (or,
// where the __is_pointer builtin trait is usable, is a pointer) AND the
// iterator pair satisfies __memcmpable. An enumeration type such as
// std::byte is neither, so __simple is false for it.
const bool __simple = ((__is_integer<_ValueType1>::__value
#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_pointer)
|| __is_pointer(_ValueType1)
#endif
) && __memcmpable<_II1, _II2>::__value);
// Dispatch on __simple: the specialization chosen here does the comparison.
return std::__equal<__simple>::equal(__first1, __last1, __first2);
}
When __simple is true, this function uses std::__memcmp to compare objects; otherwise, it does so naively by going through each iterator element.
std::byte is an enumeration, so (__is_integer<_ValueType1>::__value || __is_pointer(_ValueType1)) is false and the naive implementation is used instead.
Optimizing the naive implementation down to a single cmp
is possible in theory, but sadly, both GCC and Clang miss this optimization.