Tags: c, performance, benchmarking, compiler-optimization, auto-vectorization

Benchmarks for generating hex characters


I have just been trying to benchmark some PRNG code for generating hex characters - I have an input of random values buf, which I want to turn into a series of hex characters, in-place.

Code

#define _POSIX_C_SOURCE 199309L

#include <stdio.h>
#include <stdalign.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <stdint.h>

// Lookup table mapping a nibble value (0-15) to its ASCII hex digit.
// Deliberately sized 16: the literal exactly fills the array, so no
// trailing NUL is stored (legal in C; this is a byte table, not a string).
static const char HEX_CHARS[16] = "0123456789ABCDEF";

static void a(size_t n, char buf[static n]) {
    // single loop, branch: mask each byte down to its low nibble, then
    // map 0-9 -> '0'-'9' and 10-15 -> 'A'-'F' in the same pass
    for (size_t k = 0; k < n; k++) {
        buf[k] &= 15;
        if (buf[k] < 10)
            buf[k] += '0';
        else
            buf[k] += 'A' - 10;
    }
}

static void b(size_t n, char buf[static n]) {
    // double loop, branch: pass 1 masks every byte to a nibble,
    // pass 2 maps each nibble to its ASCII hex digit
    for (size_t k = 0; k < n; k++)
        buf[k] &= 15;
    for (size_t k = 0; k < n; k++) {
        if (buf[k] < 10)
            buf[k] += '0';
        else
            buf[k] += 'A' - 10;
    }
}

static void c(size_t n, char buf[static n]) {
    // single loop, lookup: translate the low nibble of each byte
    // through the HEX_CHARS table
    for (size_t k = 0; k < n; k++) {
        int nib = buf[k] & 15;
        buf[k] = HEX_CHARS[nib];
    }
}

static void d(size_t n, char buf[static n]) {
    // double loop, lookup: pass 1 masks each byte in place,
    // pass 2 translates the resulting nibbles through HEX_CHARS
    for (size_t k = 0; k < n; k++)
        buf[k] &= 0xF;
    for (size_t k = 0; k < n; k++)
        buf[k] = HEX_CHARS[(unsigned char)buf[k]];
}

/*
 * Time M in-place runs of fn over an n-byte copy of input and print the
 * total and per-iteration average.
 *
 * NOTE(fix): fn converts buf in place, so without refreshing buf from
 * input before every timed call, iterations 2..M would run on
 * already-converted hex characters instead of random bytes. For the
 * branchy variants that makes the branch suddenly predictable and
 * silently changes what is being measured. The reset is done outside
 * the timed region.
 */
static void test(
    const size_t n,
    const char input[const static n],
    const char *const name,
    void (*const fn)(size_t n, char buf[static n])
) {
    const size_t M = 1 << 16;   // number of timed repetitions

    char *buf = malloc(n);
    if (buf == NULL) {          // unchecked allocation was UB on OOM
        perror("malloc");
        abort();
    }

    double ns_total = 0;
    for (size_t i = 0; i < M; i++) {
        // refresh the working buffer with the original random bytes;
        // input and buf never overlap, so memcpy is sufficient
        memcpy(buf, input, n);

        struct timespec t_start, t_end;
        clock_gettime(CLOCK_MONOTONIC, &t_start);
        fn(n, buf);
        clock_gettime(CLOCK_MONOTONIC, &t_end);

        double ns_start = (double)t_start.tv_sec * 1e9 + (double)t_start.tv_nsec;
        double ns_end = (double)t_end.tv_sec * 1e9 + (double)t_end.tv_nsec;
        ns_total += ns_end - ns_start;
    }

    printf("%s() took %f sec (avg %f ms)\n", name, ns_total * 1e-9, ns_total / (double) M * 1e-6);

    free(buf);
}

int main(void) {
    // 16 MiB of random bytes to convert per benchmark call
    const size_t N = 1UL << 24;

    char *input = malloc(N);
    if (input == NULL) {        // previously unchecked: UB on OOM
        perror("malloc");
        return EXIT_FAILURE;
    }

    FILE *file = fopen("/dev/urandom", "rb");
    if (file == NULL) {         // previously unchecked: fread(NULL) is UB
        perror("fopen");
        free(input);
        return EXIT_FAILURE;
    }
    if (fread(input, 1, N, file) != N) {
        // fread does not reliably set errno, so report generically
        fprintf(stderr, "short read from /dev/urandom\n");
        fclose(file);
        free(input);
        return EXIT_FAILURE;
    }
    fclose(file);

    test(N, input, "a", a);
    test(N, input, "b", b);
    test(N, input, "c", c);
    test(N, input, "d", d);

    free(input);
    return 0;
}

Disassembly

https://c.godbolt.org/z/WK95x4x86

Results

tested on Ryzen 9 7940HS

-O3

a() took 41.452250 sec (avg 0.632511 ms)
b() took 68.869114 sec (avg 1.050859 ms)
c() took 323.392966 sec (avg 4.934585 ms)
d() took 365.612924 sec (avg 5.578810 ms)

-O3 -march=native

a() took 30.370241 sec (avg 0.463413 ms)
b() took 59.591973 sec (avg 0.909301 ms)
c() took 315.819261 sec (avg 4.819019 ms)
d() took 332.411909 sec (avg 5.072203 ms)

Questions

1.

Why is it so much faster to do a branch (a(), b()), compared to a lookup (c(), d())? Since the input data is fully random, the branch predictor shouldn't be able to predict which branch will be taken, so I would expect a lookup to win.

2.

Why does -march=native cause a() and b() to be roughly the same, but c() to be much slower and d() so much faster? I especially don't understand how this can cause c() to be twice as slow as without.
This turned out to be a measurement error.


Solution

  • The function a is getting vectorized. Notice that in the Assembly code you linked to the operations in the loop are performed using xmm registers:

            movdqu  xmm0, XMMWORD PTR [rax]
            add     rax, 16
            pand    xmm0, xmm6
            movdqa  xmm2, xmm0
            movdqa  xmm1, xmm0
            pcmpgtb xmm0, xmm5
            paddb   xmm2, xmm4
            paddb   xmm1, xmm3
            pand    xmm1, xmm0
            pandn   xmm0, xmm2
            por     xmm0, xmm1
            movups  XMMWORD PTR [rax-16], xmm0
    

    Meanwhile c is not vectorized at all:

    c:
            test    rdi, rdi
            je      .L1
            add     rdi, rsi
    .L3:
            movzx   eax, BYTE PTR [rsi]
            add     rsi, 1
            and     eax, 15
            movzx   eax, BYTE PTR HEX_CHARS[rax]
            mov     BYTE PTR [rsi-1], al
            cmp     rdi, rsi
            jne     .L3
    .L1:
            ret
    HEX_CHARS:
            .ascii  "0123456789ABCDEF"
    

    With xmm registers in SSE the CPU performs the byte operations in a (paddb, pcmpgtb) on 16 bytes at once, which is consistent with your results (notice how the function a finishes in a small fraction — roughly a quarter or less — of the time of the function c). The function c can’t be vectorized because its loop is a random look-up. So this answers your first question.

    I tested your code on my computer (Ryzen 5 5500U) and got the following results:

    -O3 without -march=native:

    a() took 1.089963 sec
    b() took 2.084695 sec
    c() took 3.504775 sec
    d() took 3.938886 sec
    

    -O3 with -march=native:

    a() took 0.988197 sec
    b() took 1.905577 sec
    c() took 3.507487 sec
    d() took 3.866236 sec
    

    As you can see c performed the same in both cases, so I’m not sure why in your tests it got so much slower with -march=native. d can get a bit of a speed-up because of AVX (you can see the generated code on Godbolt).