I'm looking at the woothash hash function, a reiteration of wyhash
- one of the best hash functions all-around according to the SMHasher project.
Both GCC and clang are able to perform a very deep optimization at -O1
(or higher levels, of course), and I absolutely don't understand how they go from 900+ lines of asm with -Og
, which closely follows the source code, to just 25 asm instructions (23 with GCC trunk, which is presumably GCC 13!).
The code is a bit too long to paste here in entirety, so here's the link to Compiler Explorer: https://godbolt.org/z/qK16EW4zT
Here's the gist of the question: the hash function has the main loop that processes data in batches 32 bytes wide, followed by a switch with 31 labels to process the remaining tail of the data. _wootp1 ... _wootp5
are compile-time constants.
inline constexpr uint64_t ROTL64(uint64_t x,int r) { return (x << r) | (x >> (64 - r)); }
inline uint64_t _wootmum(const uint64_t A, const uint64_t B) {
uint64_t r = (A ^ ROTL64(B, 39)) * (B ^ ROTL64(A, 39));
return r - (r >> 32);
}
inline uint64_t _wootr16(const uint8_t *p){ uint16_t v; memcpy(&v, p, 2); return v; }
inline uint64_t _wootr08(const uint8_t *p){ uint8_t v; memcpy(&v, p, 1); return v; }
inline uint64_t _wootr32(const uint8_t *p){ uint32_t v; memcpy(&v, p, 4); return v; }
inline uint64_t _wootr64(const uint8_t *p){ uint64_t v; memcpy(&v, p, 8); return v; }
uint64_t woothash(const uint8_t* p, uint64_t len, uint64_t seed) {
for (i = 0; i + 32 <= len; i += 32, p += 32)
{
a = (_wootr64(p ) ^ a) * _wootp1; a = ROTL64(a, 22); a *= _wootp3;
b = (_wootr64(p + 8) ^ b) * _wootp2; b = ROTL64(b, 25); b *= _wootp4;
c = (_wootr64(p + 16) ^ c) * _wootp3; c = ROTL64(c, 28); c *= _wootp5;
d = (_wootr64(p + 24) ^ d) * _wootp4; d = ROTL64(d, 31); d *= _wootp1;
seed += a + b + c + d;
}
switch (len & 31) {
case 1 : seed = _wootmum(seed, _wootr08(p) ^ _wootp1); break;
case 2 : seed = _wootmum(seed, _wootr16(p) ^ _wootp1); break;
case 3 : seed = _wootmum(seed, ((_wootr16(p) << 8) | _wootr08(p + 2)) ^ _wootp1); break;
case 4 : seed = _wootmum(seed, _wootr32(p) ^ _wootp1); break;
...
}
}
The main loop is quite small and seems well-optimized even at Og, with all the helper functions inlined. But then follow the 31 branches of the switch, for a total of 900+ lines of asm. This is easy to understand and follows the source code closely.
But here's the optimized assembly for main
at O1..O3 - this is the complete code, not an excerpt:
main:
movabs rdx, -1800455987208640293
movsx rax, edi
sal rdi, 32
movabs rcx, -6884282663029611481
shr rax, 32
or rdi, rax
movabs rax, 6239426704749895748
xor rdx, rdi
ror rdx, 25
xor rdx, rax
movabs rax, -4822408543216407806
xor rdi, rax
imul rdx, rdi
mov rax, rdx
shr rax, 32
sub rdx, rax
mov rax, rdx
sal rax, 16
xor rax, rdx
shr rdx, 32
xor rdx, rcx
imul rax, rdx
mov rdx, rax
shr rdx, 31
sub eax, edx
ret
from this code which calls the above woothash
that has the switch
:
[[nodiscard]] inline uint64_t woothash64(const void* data, uint64_t len) noexcept
{
return detail::woothash(data, len, 7733305894521163487ULL /* Completely fair and random seed*/);
}
[[nodiscard]] inline uint64_t woothash64i(uint64_t value) noexcept
{
return woothash64(&value, sizeof(uint64_t));
}
int main(int argc, char**)
{
return woothash64i(argc);
}
How do GCC and clang do it? I thought maybe the source code has UB and some part was unexpectedly removed, but the resulting hash values are the same with and without optimizations, with GCC or MSVC. What kind of sorcery is this? How is it possible to simplify the tail switch so well?
And more importantly, since the 25 assembly instructions do seem to correctly represent this function, it should be possible to simplify the source code in the same way, so that it's always optimized?
When you call this below, the compiler knows in woothash64
that len is a constant equal to 8 (sizeof(uint64_t)).
[[nodiscard]] inline uint64_t woothash64(const void* data, uint64_t len) noexcept
{
return detail::woothash(data, len, 7733305894521163487ULL /* Completely fair and random seed*/);
}
[[nodiscard]] inline uint64_t woothash64i(uint64_t value) noexcept
{
return woothash64(&value, sizeof(uint64_t));
}
int main(int argc, char**)
{
return woothash64i(argc);
}
int main(int argc, char**)
{
return woothash64i(argc);
}
So when it calls detail::woothash
it discards everything, becoming this:
inline uint64_t woothash(const void* key, uint64_t len, uint64_t seed) {
const uint8_t *p = (const uint8_t*)key;
// switch (len & 31) {
//case 8 :
seed = _wootmum(seed, __wootr64(p) ^ _wootp1); break;
seed = (seed ^ seed << 16) * (len ^ _wootp0 ^ seed >> 32);
return seed - (seed >> 31) + (seed << 33);
}
You can see that Godbolt itself highlights in color only the lines that were not optimized out