assemblygcccompiler-optimizationinlining

How does this compiler optimization work?


I'm looking at the woothash hash function, a reiteration of wyhash - one of the best hash functions all-around according to the SMHasher project.

Both GCC and clang are able to perform a very deep optimization at -O1 (or higher levels, of course), and I absolutely don't understand how they go from 900+ lines of asm with -Og, which closely follows the source code, to just 25 asm instructions (23 with GCC trunk, which is presumably GCC 13!).

The code is a bit too long to paste here in entirety, so here's the link to Compiler Explorer: https://godbolt.org/z/qK16EW4zT

Here's the gist of the question: the hash function has the main loop that processes data in batches 32 bytes wide, followed by a switch with 31 labels to process the remaining tail of the data. _wootp1 ... _wootp5 are compile-time constants.

inline constexpr uint64_t ROTL64(uint64_t x,int r) { return (x << r) | (x >> (64 - r)); }

inline uint64_t _wootmum(const uint64_t A, const uint64_t B) {
    uint64_t r = (A ^ ROTL64(B, 39)) * (B ^ ROTL64(A, 39));
    return r - (r >> 32);
}

inline uint64_t _wootr16(const uint8_t *p){ uint16_t v; memcpy(&v, p, 2); return v; }
inline uint64_t _wootr08(const uint8_t *p){ uint8_t v; memcpy(&v, p, 1); return v; }
inline uint64_t _wootr32(const uint8_t *p){ uint32_t v; memcpy(&v, p, 4); return v; }
inline uint64_t _wootr64(const uint8_t *p){ uint64_t v; memcpy(&v, p, 8); return v; }

uint64_t woothash(const uint8_t* p, uint64_t len, uint64_t seed) {

   for (i = 0; i + 32 <= len; i += 32, p += 32)
   {
       a = (_wootr64(p     ) ^ a) * _wootp1; a = ROTL64(a, 22); a *= _wootp3;
       b = (_wootr64(p +  8) ^ b) * _wootp2; b = ROTL64(b, 25); b *= _wootp4;
       c = (_wootr64(p + 16) ^ c) * _wootp3; c = ROTL64(c, 28); c *= _wootp5;
       d = (_wootr64(p + 24) ^ d) * _wootp4; d = ROTL64(d, 31); d *= _wootp1;
       seed += a + b + c + d;
   }

   switch (len & 31) {
       case 1 : seed = _wootmum(seed, _wootr08(p) ^ _wootp1); break;
       case 2 : seed = _wootmum(seed, _wootr16(p) ^ _wootp1); break;
       case 3 : seed = _wootmum(seed, ((_wootr16(p) << 8) | _wootr08(p + 2)) ^ _wootp1); break;
       case 4 : seed = _wootmum(seed, _wootr32(p) ^ _wootp1); break;
   ...
   }

}

The main loop is quite small and seems well-optimized even at Og, with all the helper functions inlined. But then follow the 31 branches of the switch, for a total of 900+ lines of asm. This is easy to understand and follows the source code closely.

But here's the optimized assembly for main at O1..O3 - this is the complete code, not an excerpt:

main:
        movabs  rdx, -1800455987208640293
        movsx   rax, edi
        sal     rdi, 32
        movabs  rcx, -6884282663029611481
        shr     rax, 32
        or      rdi, rax
        movabs  rax, 6239426704749895748
        xor     rdx, rdi
        ror     rdx, 25
        xor     rdx, rax
        movabs  rax, -4822408543216407806
        xor     rdi, rax
        imul    rdx, rdi
        mov     rax, rdx
        shr     rax, 32
        sub     rdx, rax
        mov     rax, rdx
        sal     rax, 16
        xor     rax, rdx
        shr     rdx, 32
        xor     rdx, rcx
        imul    rax, rdx
        mov     rdx, rax
        shr     rdx, 31
        sub     eax, edx
        ret

from this code which calls the above woothash that has the switch:

[[nodiscard]] inline uint64_t woothash64(const void* data, uint64_t len) noexcept
{
    return detail::woothash(data, len, 7733305894521163487ULL /* Completely fair and random seed*/);
}

[[nodiscard]] inline uint64_t woothash64i(uint64_t value) noexcept
{
    return woothash64(&value, sizeof(uint64_t));
}

int main(int argc, char**)
{
    return woothash64i(argc);
}

How do GCC and clang do it? I thought maybe the source code has UB and some part was unexpectedly removed, but the resulting hash values are the same with and without optimizations, with GCC or MSVC. What kind of sorcery is this? How is it possible to simplify the tail switch so well?

And more importantly, since the 25 assembly instructions do seem to correctly represent this function, it should be possible to simplify the source code in the same way, so that it's always optimized?


Solution

  • When you call this below, the compiler knows in woothash64 that len is a constant equal to 8 (sizeof(uint64_t)).

    [[nodiscard]] inline uint64_t woothash64(const void* data, uint64_t len) noexcept
    {
        return detail::woothash(data, len, 7733305894521163487ULL /* Completely fair and random seed*/);
    }
    
    [[nodiscard]] inline uint64_t woothash64i(uint64_t value) noexcept
    {
        return woothash64(&value, sizeof(uint64_t));
    }
    
    int main(int argc, char**)
    {
        return woothash64i(argc);
    }
    int main(int argc, char**)
    {
        return woothash64i(argc);
    }
    

    So when it calls detail::woothash it discards everything, becoming this:

    inline uint64_t woothash(const void* key, uint64_t len, uint64_t seed) {
        const uint8_t *p = (const uint8_t*)key;
        
        // switch (len & 31) {
        //case 8 : 
        seed = _wootmum(seed, __wootr64(p) ^ _wootp1); break;
        
        seed = (seed ^ seed << 16) * (len ^ _wootp0 ^ seed >> 32);
        return seed - (seed >> 31) + (seed << 33);
    }
    
    

    You can see that Godbolt itself highlights in color only the lines that were not optimized out

    enter image description here