Tags: c, x86-64, compiler-optimization, modular-arithmetic, 128-bit

Modulo operator slower than manual implementation?


I have found that manually calculating the % operator on __int128 is significantly faster than the built-in compiler operator. I will show you how to calculate modulo 9, but the method can be used to calculate modulo any other number.

First, consider the built-in compiler operator:

/* Reference implementation: rely on the compiler's built-in 128-bit modulo.
 * On GCC this lowers to a call to the generic libgcc routine __umodti3. */
uint64_t mod9_v1(unsigned __int128 n)
{
    unsigned __int128 rem = n % 9;
    return (uint64_t)rem;
}

Now consider my manual implementation:

/* Manual mod-9 of a 128-bit value: split n into four 32-bit limbs and use
 * the identity n mod 9 == (sum of limb_k * (2^(32k) mod 9)) mod 9.
 *
 * The limb weights were hard-coded as 4 and 7; derive them at compile time
 * instead so the intent is visible and the scheme generalizes to other
 * divisors by just changing the 9s.
 *
 * No overflow: r <= (2^32 - 1) * (1 + 4 + 7 + 1) < 2^64. */
uint64_t mod9_v2(unsigned __int128 n)
{
    uint64_t r = 0;

    r += (uint32_t)(n);                                                      /* weight 2^0  % 9 == 1 */
    r += (uint32_t)(n >> 32) * (uint64_t)(((uint64_t)1 << 32) % 9);          /* weight 2^32 % 9 == 4 */
    r += (uint32_t)(n >> 64) * (uint64_t)(((unsigned __int128)1 << 64) % 9); /* weight 2^64 % 9 == 7 */
    r += (uint32_t)(n >> 96);                                                /* weight 2^96 % 9 == 1 */

    return r % 9;
}

Measuring over 100,000,000 random numbers gives the following results:

mod9_v1 | 3.986052 secs
mod9_v2 | 1.814339 secs

GCC 9.3.0 with -march=native -O3 was used on AMD Ryzen Threadripper 2990WX. Here is a link to godbolt.

I would like to ask if it behaves the same way on your side? (Before reporting a bug to GCC Bugzilla).

UPDATE: On request, I supply a generated assembly:

mod9_v1:
        sub     rsp, 8
        mov     edx, 9
        xor     ecx, ecx
        call    __umodti3
        add     rsp, 8
        ret
mod9_v2:
        mov     rax, rdi
        shrd    rax, rsi, 32
        mov     rdx, rsi
        mov     r8d, eax
        shr     rdx, 32
        mov     eax, edi
        add     rax, rdx
        lea     rax, [rax+r8*4]
        mov     esi, esi
        lea     rcx, [rax+rsi*8]
        sub     rcx, rsi
        mov     rax, rcx
        movabs  rdx, -2049638230412172401
        mul     rdx
        mov     rax, rdx
        shr     rax, 3
        and     rdx, -8
        add     rdx, rax
        mov     rax, rcx
        sub     rax, rdx
        ret

Solution

  • The reason for this difference is clear from the assembly listings: the % operator applied to 128-bit integers is implemented via a library call to a generic function that cannot take advantage of compile time knowledge of the divisor value, which makes it possible to turn division and modulo operations into much faster multiplications.

    The timing difference is even more significant on my old Macbook-pro using clang, where mod9_v2() is about 15 times faster than mod9_v1().

    Note however these remarks:

    Using your slicing approach, I extended your code to reduce the number of steps using slices of 42, 42 and 44 bits, which further improves the timings (because 2^42 % 9 == 1):

    #pragma GCC diagnostic ignored "-Wpedantic"
    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <assert.h>
    #include <inttypes.h>
    #include <stdio.h>
    #include <time.h>
    
    /* Baseline: the compiler's 128-bit % (a libgcc/compiler-rt library call). */
    static uint64_t mod9_v1(unsigned __int128 n) {
        unsigned __int128 remainder = n % 9;
        return (uint64_t)remainder;
    }
    
    /* Mod 9 via four 32-bit limbs: n mod 9 == sum(limb_k * (2^(32k) mod 9)) mod 9.
     * The limb weights are folded to constants (4 and 7) at compile time.
     * The weighted sum cannot overflow: it is at most (2^32 - 1) * 13 < 2^64. */
    static uint64_t mod9_v2(unsigned __int128 n) {
        const uint64_t w32 = ((uint64_t)1ULL << 32) % 9;                    /* == 4 */
        const uint64_t w64 = (uint64_t)(((unsigned __int128)1 << 64) % 9); /* == 7 */

        uint64_t acc = (uint32_t)(n);
        acc += (uint32_t)(n >> 32) * w32;
        acc += (uint32_t)(n >> 64) * w64;
        acc += (uint32_t)(n >> 96);

        return acc % 9;
    }
    
    /* Mod 9 via three slices of 42, 42 and 44 bits: every slice weight is
     * 2^0, 2^42 or 2^84, and each is congruent to 1 mod 9 (since 2^6 == 1
     * mod 9 and 42, 84 are multiples of 6), so a plain sum suffices. */
    static uint64_t mod9_v3(unsigned __int128 n) {
        const uint64_t mask42 = 0x3ffffffffff; /* low 42 bits */
        uint64_t lo  = (uint64_t)(n >>  0) & mask42;
        uint64_t mid = (uint64_t)(n >> 42) & mask42;
        uint64_t hi  = (uint64_t)(n >> 84);
        return (lo + mid + hi) % 9;
    }
    
    /* Build a pseudo-random 128-bit value from five rand() calls, XORed at
     * overlapping shifts so all 128 bits get coverage even though rand()
     * yields at most 31 bits.
     *
     * FIX: the original combined all five calls in a single expression, so
     * the order in which rand() was invoked was unspecified (C17 6.5p3) and
     * the generated sequence could differ between compilers. Sequence the
     * calls explicitly so the output is reproducible for a given seed. */
    unsigned __int128 rand_u128() {
        unsigned __int128 r = (unsigned __int128)rand() << 97;
        r ^= (unsigned __int128)rand() << 66;
        r ^= (unsigned __int128)rand() << 35;
        r ^= (unsigned __int128)rand() << 4;
        r ^= (unsigned __int128)rand();
        return r;
    }
    
    #define N 100000000
    
    /* Benchmark driver: fills an array with N random 128-bit values, checks
     * that all three mod-9 implementations agree, then times each one over
     * the whole array and prints the per-variant checksum and elapsed time.
     * Returns 0 on success, 1 if the array allocation fails. */
    int main() {
        /* Fixed seed so every run (and every compiler) sees the same inputs. */
        srand(42);
    
        /* N * 16 bytes (~1.6 GB for N = 1e8); bail out if allocation fails. */
        unsigned __int128 *arr = malloc(sizeof(unsigned __int128) * N);
        if (arr == NULL) {
            return 1;
        }
    
        for (size_t n = 0; n < N; ++n) {
            arr[n] = rand_u128();
        }
    
    #if 1
        /* check that modulo 9 is calculated correctly */
        for (size_t n = 0; n < N; ++n) {
            uint64_t m = mod9_v1(arr[n]);
            assert(m == mod9_v2(arr[n]));
            assert(m == mod9_v3(arr[n]));
        }
    #endif
    
        /* Timing idiom: clk = -clock() before the loop, clk += clock() after,
         * leaving the elapsed tick count in clk. Each loop accumulates the
         * results into sumN so the compiler cannot discard the work. */
        clock_t clk1 = -clock();
        uint64_t sum1 = 0;
        for (size_t n = 0; n < N; ++n) {
            sum1 += mod9_v1(arr[n]);
        }
        clk1 += clock();
    
        clock_t clk2 = -clock();
        uint64_t sum2 = 0;
        for (size_t n = 0; n < N; ++n) {
            sum2 += mod9_v2(arr[n]);
        }
        clk2 += clock();
    
        clock_t clk3 = -clock();
        uint64_t sum3 = 0;
        for (size_t n = 0; n < N; ++n) {
            sum3 += mod9_v3(arr[n]);
        }
        clk3 += clock();
    
        /* Identical sums across variants double-check agreement; times are
         * converted from clock ticks to seconds via CLOCKS_PER_SEC. */
        printf("mod9_v1: sum=%"PRIu64", elapsed time: %.3f secs\n", sum1, clk1 / (double)CLOCKS_PER_SEC);
        printf("mod9_v2: sum=%"PRIu64", elapsed time: %.3f secs\n", sum2, clk2 / (double)CLOCKS_PER_SEC);
        printf("mod9_v3: sum=%"PRIu64", elapsed time: %.3f secs\n", sum3, clk3 / (double)CLOCKS_PER_SEC);
    
        free(arr);
        return 0;
    }
    

    Here are the timings on my linux server (gcc):

    mod9_v1: sum=400041273, elapsed time: 7.992 secs
    mod9_v2: sum=400041273, elapsed time: 1.295 secs
    mod9_v3: sum=400041273, elapsed time: 1.131 secs
    

    The same code on my Macbook (clang):

    mod9_v1: sum=399978071, elapsed time: 32.900 secs
    mod9_v2: sum=399978071, elapsed time: 0.204 secs
    mod9_v3: sum=399978071, elapsed time: 0.185 secs