I'm trying to implement an efficient segmented prime sieve in C. It is basically a sieve of Eratosthenes, but each segment is sized so that it fits comfortably in cache.
In my version there is a bit array of flags in which each bit represents a consecutive odd number. A bit is cleared by ANDing with a mask whenever the corresponding number is a multiple of a known prime.
This single piece of code consumes about 90% of the runtime. Every dirty bit of code has a reason behind it, which I explain in the comments, but the overall operation is very simple.
And that's it.
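To make the representation concrete: for an odd number `q` in the segment `[st, st + C * 16)` its flag is bit `(q - st) / 2`, so clearing it looks roughly like this (a sketch of the convention only; the real routine below folds this into a macro with a precomputed mask table):
static inline void clear_composite(unsigned *f, unsigned st, unsigned q) {
    //`st` is the (even) start of the segment, `q` an odd multiple of a prime.
    unsigned i = (q - st) / 2;       //bit index of the odd number `q`
    f[i / 32] &= ~(1u << (i % 32));  //erase the flag: `q` is composite
}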
There is a program called primesieve which can do this operation very fast: it is about 3 times faster than my version. I read its documentation on the algorithm and also its code, and applied whatever seemed plausible to mine.
Since there is a known program that is a lot faster than mine, I will keep investigating what it does that I don't, but before that I'm posting this question in case you can help me find out which part is not running efficiently.
To repeat, this single routine consumes 90% of the runtime, so I'm really focused on making this part run faster.
This is the old version; I've made some modifications since posting, and the newer one follows below it. The comments still apply.
#include <stdlib.h>
#include <stdio.h>
//size of the cache block (64K)
#define C 0x10000
static unsigned sq(unsigned a) {
return a * a;
}
//`f` is the array of flags. `st` is the starting point, a multiple of
//`C * 16`. Each byte covers 16 consecutive numbers (8 odd-number flags), hence
//the segment spans `C * 16` numbers. `p` is the array of prime numbers from 67
//up to `sqrt(st + C * 16)`. Primes below 67 are handled specially by other
//routines.
__attribute__((noinline)) //Don't inline for testing.
static void sieve_bit(unsigned *f, unsigned st, unsigned *p) {
//A table access rather than computing the mask at runtime was about 15% faster
//on my machine. I'm guessing that x86 bit-shifts only being able to use the CL
//register as a variable count creates a dependency when there is a series of
//shifts, making them slower than multiple L1 cache accesses.
static const unsigned m[] = {
~(1u << 0), ~(1u << 1), ~(1u << 2), ~(1u << 3), ~(1u << 4),
~(1u << 5), ~(1u << 6), ~(1u << 7), ~(1u << 8), ~(1u << 9),
~(1u << 10), ~(1u << 11), ~(1u << 12), ~(1u << 13), ~(1u << 14),
~(1u << 15), ~(1u << 16), ~(1u << 17), ~(1u << 18), ~(1u << 19),
~(1u << 20), ~(1u << 21), ~(1u << 22), ~(1u << 23), ~(1u << 24),
~(1u << 25), ~(1u << 26), ~(1u << 27), ~(1u << 28), ~(1u << 29),
~(1u << 30), ~(1u << 31)
};
unsigned p2 = sq(*p);
do {
unsigned r = st % *p;
//This calculates the starting point: `n` will be an odd multiple of `*p`
//slightly larger than `st`.
unsigned n = st + *p - r + (-(r & 1) & *p);
//If the square of `*p` is larger than `n`, use it instead.
if (p2 > n) {
n = p2;
}
//The loop is unrolled 8 times, which gave the fastest result on my
//machine.
for (;; n += *p * 16) {
//Jumps to the next stage when there is no room left for 8 more operations.
//This could simply be a `break` followed by a loop, but it was slightly
//(1~3%) faster than the simpler alternative.
int d = st + C * 16 - n;
//As mentioned in the comments, the value of `d` relies on
//implementation-defined behaviour on 2's-complement machines:
//`n` can be larger than `st + C * 16`, in which case the wrapped
//unsigned value is converted to a negative `int`.
if (d <= (int)*p * 14) {
if (d <= (int)*p * 6) {
if (d <= (int)*p * 2) {
if (d <= (int)*p * 0) goto L0; else goto L1;
} else {
if (d <= (int)*p * 4) goto L2; else goto L3;
}
} else {
if (d <= (int)*p * 10) {
if (d <= (int)*p * 8) goto L4; else goto L5;
} else {
if (d <= (int)*p * 12) goto L6; else goto L7;
}
}
}
//The multiples of primes are erased.
#define ur(i) do {\
unsigned _n = (n - st) / 2 + *p * i;\
f[_n / 32] &= m[_n % 32];\
} while (0)
ur(0); ur(1); ur(2); ur(3); ur(4); ur(5); ur(6); ur(7);
}
//Erase the leftover multiples.
L7: ur(6);
L6: ur(5);
L5: ur(4);
L4: ur(3);
L3: ur(2);
L2: ur(1);
L1: ur(0);
L0:
p2 = sq(*++p);
#undef ur
} while (p2 < st + C * 16);
}
//This could break if the `TscInvariant` CPUID flag is false, which is probably
//rare on modern machines.
static inline unsigned long long rdtscp() {
unsigned _;
return __builtin_ia32_rdtscp(&_);
}
//This isn't used in the actual code; it's just a simple routine for the test.
//It fills `p` with enough prime numbers to sieve all 32-bit primes. The
//`c ^= 6` trick toggles the increment between 2 and 4, skipping multiples of
//2 and 3.
static inline void fillPrimes(unsigned *p) {
for (unsigned n = 67, c = 2; n <= 65521; n += c ^= 6) {
for (unsigned d = 5, c = 4; d * d <= n; d += c ^= 6) {
if (!(n % d)) goto next;
}
*p++ = n;
next:;
}
}
int main() {
unsigned p[8000];
fillPrimes(p);
unsigned f[C / sizeof(unsigned)];
puts("start sieve");
unsigned long long c = rdtscp();
for (int i = 0; i < 2000; ++i) {
volatile unsigned *vf = f;
sieve_bit((unsigned *)vf, C * 16 * i, p);
}
c = rdtscp() - c;
printf("%llu\n", c);
return 0;
}
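Note that the benchmark leaves `f` uninitialized, which is fine for timing the clearing loop alone; an actual run would first mark every odd number in the segment as a prime candidate, along the lines of the snippet below (the real code presumably does something smarter for the primes below 67 that are handled elsewhere):
//Reset the segment before sieving (needs <string.h>): every odd number starts
//as "possibly prime" and sieve_bit() then clears the composites.
memset(f, 0xff, sizeof f);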
After taking a break, I found some redundant calculations inside the loop, and taking them out gained about 6~7% in speed. Mainly, the conversion from number to bit position (`(n - st) / 2`) is now done once before the loop, so the inner loop works directly with bit indices.
static void sieve_bit(unsigned *f, unsigned st, unsigned *p) {
static const unsigned m[] = {
~(1u << 0), ~(1u << 1), ~(1u << 2), ~(1u << 3), ~(1u << 4),
~(1u << 5), ~(1u << 6), ~(1u << 7), ~(1u << 8), ~(1u << 9),
~(1u << 10), ~(1u << 11), ~(1u << 12), ~(1u << 13), ~(1u << 14),
~(1u << 15), ~(1u << 16), ~(1u << 17), ~(1u << 18), ~(1u << 19),
~(1u << 20), ~(1u << 21), ~(1u << 22), ~(1u << 23), ~(1u << 24),
~(1u << 25), ~(1u << 26), ~(1u << 27), ~(1u << 28), ~(1u << 29),
~(1u << 30), ~(1u << 31)
};
unsigned p2 = sq(*p);
do {
unsigned n =
(p2 > st + *p * 2 ? p2 - st : *p - st % *p + (-(st % *p & 1) & *p)) / 2;
for (;; n += *p * 8) {
int d = C * 8 - (int)n;
if (d <= (int)*p * 7) {
if (d <= (int)*p * 3) {
if (d <= (int)*p * 1) {
if (d <= (int)*p * 0) goto L0; else goto L1;
} else {
if (d <= (int)*p * 2) goto L2; else goto L3;
}
} else {
if (d <= (int)*p * 5) {
if (d <= (int)*p * 4) goto L4; else goto L5;
} else {
if (d <= (int)*p * 6) goto L6; else goto L7;
}
}
}
#define ur(i) f[(n + *p * i) / 32] &= m[(n + *p * i) % 32]
ur(0); ur(1); ur(2); ur(3); ur(4); ur(5); ur(6); ur(7);
}
L7: ur(6);
L6: ur(5);
L5: ur(4);
L4: ur(3);
L3: ur(2);
L2: ur(1);
L1: ur(0);
L0:
p2 = sq(*++p);
#undef ur
} while (p2 < st + C * 16);
}
I added a simple benchmarking framework to test various approaches. It turns out that the unrolling does not improve performance. The reason is probably that modern processors predict branches well: a simple loop that runs many iterations runs at full speed, which is the case for small primes, and the same simple loop that exits almost immediately is also predicted correctly once the primes become large enough.
Here is a simplified version with identical performance on my laptop:
static void sieve_bit_3(unsigned *f, unsigned st, unsigned *pp) {
static const unsigned m[] = {
~(1u << 0), ~(1u << 1), ~(1u << 2), ~(1u << 3), ~(1u << 4),
~(1u << 5), ~(1u << 6), ~(1u << 7), ~(1u << 8), ~(1u << 9),
~(1u << 10), ~(1u << 11), ~(1u << 12), ~(1u << 13), ~(1u << 14),
~(1u << 15), ~(1u << 16), ~(1u << 17), ~(1u << 18), ~(1u << 19),
~(1u << 20), ~(1u << 21), ~(1u << 22), ~(1u << 23), ~(1u << 24),
~(1u << 25), ~(1u << 26), ~(1u << 27), ~(1u << 28), ~(1u << 29),
~(1u << 30), ~(1u << 31)
};
unsigned p = *pp;
unsigned p2 = sq(p);
do {
unsigned n = (p2 > st + p * 2 ? p2 - st : p - st % p + (-(st % p & 1) & p)) / 2;
for (; n < C * 8; n += p) {
f[n / 32] &= m[n % 32];
}
p2 = sq(p = *++pp);
} while (p2 < st + C * 16);
}
Here is a modified version with fewer tests, which is a little faster:
static void sieve_bit_6(unsigned *f, unsigned st, unsigned *pp) {
static const unsigned m[] = {
~(1u << 0), ~(1u << 1), ~(1u << 2), ~(1u << 3), ~(1u << 4),
~(1u << 5), ~(1u << 6), ~(1u << 7), ~(1u << 8), ~(1u << 9),
~(1u << 10), ~(1u << 11), ~(1u << 12), ~(1u << 13), ~(1u << 14),
~(1u << 15), ~(1u << 16), ~(1u << 17), ~(1u << 18), ~(1u << 19),
~(1u << 20), ~(1u << 21), ~(1u << 22), ~(1u << 23), ~(1u << 24),
~(1u << 25), ~(1u << 26), ~(1u << 27), ~(1u << 28), ~(1u << 29),
~(1u << 30), ~(1u << 31)
};
unsigned p, p2;
while (p = *pp++, (p2 = sq(p)) <= st + 2 * p) {
unsigned mod = st % p;
unsigned n = (p - mod + (-(mod & 1) & p)) / 2;
for (; n < C * 8; n += p) {
f[n / 32] &= m[n % 32];
}
}
while (p2 < st + C * 16) {
unsigned n = (p2 - st) / 2;
for (; n < C * 8; n += p) {
f[n / 32] &= m[n % 32];
}
p2 = sq(p = *pp++);
}
}
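For completeness, reading the surviving bits back out of a sieved segment would look roughly like this (a sketch based on the bit layout described above; it assumes the buffer was pre-filled with ones and that the primes below 67, and the number 1 in the very first segment, are dealt with by the other routines mentioned earlier):
static void report_primes(const unsigned *f, unsigned st) {
    //Bit `i` stands for the odd number `st + 2 * i + 1`. A bit that survived
    //the sieve has no prime factor from 67 up to sqrt(st + C * 16); with the
    //smaller primes handled elsewhere, the number is prime.
    for (unsigned i = 0; i < C * 8; ++i) {
        if (f[i / 32] & (1u << (i % 32))) {
            printf("%u\n", st + 2 * i + 1);
        }
    }
}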