I have been trying to benchmark some PRNG code for generating hex characters. I have an input buffer `buf` of random bytes, which I want to turn into a series of hex characters, in-place.
#define _POSIX_C_SOURCE 199309L
#include <stdio.h>
#include <stdalign.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <stdint.h>
static const char HEX_CHARS[16] = "0123456789ABCDEF";
static void a(size_t n, char buf[static n]) {
    // single loop, branch: mask each byte to its low nibble, then pick the
    // ASCII digit or letter with a conditional.
    for (size_t i = 0; i < n; i++) {
        char nibble = buf[i] & 15;
        if (nibble < 10)
            buf[i] = '0' + nibble;
        else
            buf[i] = 'A' + (nibble - 10);
    }
}
static void b(size_t n, char buf[static n]) {
    // double loop, branch: first pass masks every byte to its low nibble,
    // second pass converts the nibble to an ASCII hex character.
    size_t i = 0;
    while (i < n) {
        buf[i] &= 15;
        i++;
    }
    for (i = 0; i < n; i++) {
        if (buf[i] < 10)
            buf[i] += '0';
        else
            buf[i] += 'A' - 10;
    }
}
static void c(size_t n, char buf[static n]) {
    // single loop, lookup: translate each byte's low nibble through the
    // shared HEX_CHARS table, iterating by pointer.
    char *const end = buf + n;
    for (char *p = buf; p != end; p++) {
        *p = HEX_CHARS[*p & 15];
    }
}
static void d(size_t n, char buf[static n]) {
    // double loop, lookup: first pass masks every byte to 0..15, second pass
    // translates each value through the shared HEX_CHARS table.
    size_t i = 0;
    while (i < n)
        buf[i++] &= 0xF;
    i = 0;
    while (i < n) {
        buf[i] = HEX_CHARS[(size_t)buf[i]];
        i++;
    }
}
static void test(
    const size_t n,
    const char input[const static n],
    const char *const name,
    void (*const fn)(size_t n, char buf[static n])
) {
    // Benchmark harness: run fn() M times over an n-byte working copy of
    // input and print the total and per-iteration average wall time.
    //
    // NOTE(review): buf is NOT restored from input between iterations, so
    // from the second iteration onward fn() processes its own previous
    // output rather than the original random data — confirm this is the
    // intended benchmark design.
    const size_t M = 1 << 16;
    char *buf = calloc(n, 1);
    if (!buf) {
        // calloc may fail for a 16 MiB request; don't pass NULL to memcpy.
        perror("calloc");
        abort();
    }
    // Source and destination never overlap, so memcpy is sufficient.
    memcpy(buf, input, n);
    double ns_total = 0;
    for (size_t i = 0; i < M; i++) {
        struct timespec t_start, t_end;
        clock_gettime(CLOCK_MONOTONIC, &t_start);
        fn(n, buf);
        clock_gettime(CLOCK_MONOTONIC, &t_end);
        double ns_start = (double)t_start.tv_sec * 1e9 + (double)t_start.tv_nsec;
        double ns_end = (double)t_end.tv_sec * 1e9 + (double)t_end.tv_nsec;
        ns_total += ns_end - ns_start;
    }
    printf("%s() took %f sec (avg %f ms)\n", name, ns_total * 1e-9, ns_total / (double)M * 1e-6);
    free(buf);
}
int main(void) {
    const size_t N = 1UL << 24;
    // generate random data
    char *input = calloc(N, 1);
    if (!input) {
        perror("calloc");
        abort();
    }
    // "rb": binary mode, so the read is not mangled on non-POSIX platforms.
    FILE *file = fopen("/dev/urandom", "rb");
    if (!file) {
        perror("fopen");
        abort();
    }
    if (fread(input, 1, N, file) != N) {
        // fread does not reliably set errno on a short read, so perror()
        // would print a stale/unrelated message; report directly instead.
        fprintf(stderr, "fread: short read from /dev/urandom\n");
        abort();
    }
    fclose(file);
    test(N, input, "a", a);
    test(N, input, "b", b);
    test(N, input, "c", c);
    test(N, input, "d", d);
    free(input);
}
https://c.godbolt.org/z/WK95x4x86
tested on Ryzen 9 7940HS
a() took 41.452250 sec (avg 0.632511 ms)
b() took 68.869114 sec (avg 1.050859 ms)
c() took 323.392966 sec (avg 4.934585 ms)
d() took 365.612924 sec (avg 5.578810 ms)
a() took 30.370241 sec (avg 0.463413 ms)
b() took 59.591973 sec (avg 0.909301 ms)
c() took 315.819261 sec (avg 4.819019 ms)
d() took 332.411909 sec (avg 5.072203 ms)
Why is it so much faster to do a branch (`a()`, `b()`) compared to a lookup (`c()`, `d()`)? Since the input data is fully random, the branch predictor shouldn't be able to predict which branch will be taken, so I would expect a lookup to win.
Why does `-march=native` cause `a()` and `b()` to stay roughly the same, but `c()` to become much slower and `d()` so much faster? I especially don't understand how this flag can cause `c()` to be twice as slow as without it.
This turned out to be a measurement error.
The function `a` is getting vectorized. Notice that in the assembly code you linked to, the operations in the loop are performed using `xmm` registers:
movdqu xmm0, XMMWORD PTR [rax]
add rax, 16
pand xmm0, xmm6
movdqa xmm2, xmm0
movdqa xmm1, xmm0
pcmpgtb xmm0, xmm5
paddb xmm2, xmm4
paddb xmm1, xmm3
pand xmm1, xmm0
pandn xmm0, xmm2
por xmm0, xmm1
movups XMMWORD PTR [rax-16], xmm0
Meanwhile `c` is not vectorized at all:
c:
test rdi, rdi
je .L1
add rdi, rsi
.L3:
movzx eax, BYTE PTR [rsi]
add rsi, 1
and eax, 15
movzx eax, BYTE PTR HEX_CHARS[rax]
mov BYTE PTR [rsi-1], al
cmp rdi, rsi
jne .L3
.L1:
ret
HEX_CHARS:
.ascii "0123456789ABCDEF"
With `xmm` registers in SSE the CPU can perform the byte-wise operations in `a` (`pand`, `paddb`, `pcmpgtb`) on sixteen bytes at once, which is consistent with the large speed-up in your results (notice how the function `a` finishes in a fraction of the time of the function `c`). The function `c` can't be vectorized because its loop body contains a data-dependent table look-up. So this answers your first question.
I tested your code on my computer (Ryzen 5 5500U) and got the following results:
-O3
without -march=native
:
a() took 1.089963 sec
b() took 2.084695 sec
c() took 3.504775 sec
d() took 3.938886 sec
-O3
with -march=native
:
a() took 0.988197 sec
b() took 1.905577 sec
c() took 3.507487 sec
d() took 3.866236 sec
As you can see, `c` performed the same in both cases, so I'm not sure why in your tests it got so much slower with `-march=native`. `d` can get a bit of a speed-up because of AVX (you can see the generated code on Godbolt).