cprimes

Sieve eratosthenes parallelisation in c


I just had the solution for Sieve of Eratosthenes parallelisation in C from my teacher. What do these lines mean ?

if (!(a[i / 64] & ((uint_fast64_t)1 << (i % 64))))       
    continue;

a[j / 64] &= ~(((uint_fast64_t)1) << (j % 64));

Full code:

#include <math.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#pragma GCC optimize("unroll-loops")
#define N 100
#define K 7

uint_fast64_t a[N];

void *threadBehaviour(void *) {
    static uint16_t k = 0;
    k = (k + 1) % K;
    uint_fast64_t nb_max_i;
    for (uint_fast64_t i = 2; i <= sqrt(N); i += 1) {
        if (!(a[i/64 ] & ((uint_fast64_t)1 << (i %64 ))))
            continue;
        nb_max_i = ((N) - (i * i)) / i;
        uint_fast64_t exitCondition = i * i + (nb_max_i * (k + 1) / K) * i;
        for (uint_fast64_t j = i * i + (nb_max_i * k / K) * i; j <= exitCondition; j += i) {
            a[j / 64] &= ~(((uint_fast64_t)1) << (j % 64));
        }
    }
    pthread_exit(NULL);
}

int main() {
    pthread_t k[K];
    for (uint_fast64_t i = 0; i < N; i++) {
        a[i] = 0xAAAAAAAAAAAAAAAA;
    }
    for (uint16_t u = 0; u < K; u++) {
        pthread_create(&k[u], NULL, &threadBehaviour, NULL);
    }
    clock_t t;
    t = clock();
    for (uint16_t j = 0; j < K; j++) {
        pthread_join(k[j], NULL);
    }
    t = clock() - t;
    double time_taken = ((double)t) / CLOCKS_PER_SEC;
    printf("%f seconds", time_taken);

    for (uint_fast64_t i = 2; i < (N+1); i++) {
        if (a[i / 64] & ((uint_fast64_t)1 << i)) {
            printf("| %lu ", i);
        }
    }
    printf("\n");
    return 0;
}

especially for a[i/64] I don't understand what this does


Solution

  • The test (a[i / 64] & ((uint_fast64_t)1 << (i % 64))) checks if a bit is set in an element of the global array a. The bit number is the low 6 bits of i and the element number is the higher bits of i. This initial part of the outer loop selects the next prime number whose multiples to flag as composite in the rest of the array (or the slice thereof assigned to this thread).

    Similarly a[j / 64] &= ~(((uint_fast64_t)1) << (j % 64)); clears the corresponding bit for index j, marking j as a composite number.

    This implementation creates multiple threads, each of which updates a different portion of the array. Yet there are significant issues in this approach:

    These lines at the beginning of the thread function are absolutely horrible:

    static uint16_t k = 0;
    k = (k + 1) % K;
    

    The purpose is to determine which part of the global array is modified by the current thread. Yet the static variable k is modified by concurrent threads without a synchronisation mechanism and this modification is not atomic so there is no guarantee every thread will get a different value of k. You should instead allocate a structure with the appropriate information and pass a pointer as the opaque argument, removing the need for global or static variables.

    There is another major problem in this approach: all threads access the beginning of the global array, that is modified concurrently by the first thread. This is ugly at best and potentially incorrect.

    It is also unclear why the array has N elements to compute prime numbers below N, N / 64 should suffice.

    The expressions for the initial and maximum values of j are too complicated and difficult to verify. I am almost certain the last value of j may cause undefined behavior in some cases.

    Also note that clock() measures cpu time, not elapsed time. You should use timespec_get(), gettimeofday() or another precise time function.

    Here is a modified version:

    #include <math.h>
    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <unistd.h>
    #include <sys/time.h>
    
    //#pragma GCC optimize("unroll-loops")
    struct sieve_args {
        uint64_t *a;
        uint64_t maxp, begin, end;
    };
    
    void *threadBehaviour(void *opaque) {
        struct sieve_args *args = opaque;
        uint64_t *a = args->a;
    
        for (uint64_t p = 3; p <= args->maxp; p += 2) {
            if (!(a[p / 64] & ((uint64_t)1 << (p % 64))))
                continue;
            /* clear odd multiples of p greater or equal to p * p
               in the slice as composites */
            uint64_t start = (args->begin + p - 1) / p * p;
            if (start < p * p)
                start = p * p;
            if (start % 2 == 0)
                start += p;
            uint64_t stop = args->end;
            for (uint64_t j = start; j < stop; j += p + p) {
                a[j / 64] &= ~((uint64_t)1 << (j % 64));
            }
        }
        pthread_exit(NULL);
    }
    
    /* cannot use clock() to measure elapsed time in a multi-threaded program */
    static uint64_t clock_us(void) {
    #ifdef TIME_UTC
        struct timespec ts;
        timespec_get(&ts, TIME_UTC);
        return (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
    #else
        struct timeval tt;
        gettimeofday(&tt, NULL);
        return (uint64_t)tt.tv_sec * 1000000 + tt.tv_usec;
    #endif
    }
    
    /* count prime numbers up to and including N */
    int main(int argc, char *argv[]) {
    
        uint64_t N = argc > 1 ? strtoull(argv[1], NULL, 0) : 1000000;
        int K = argc > 2 ? strtoul(argv[2], NULL, 0) : 7;
    
        /* size of the array of uint64_t */
        size_t NW = N / 64 + 1;
    
        pthread_t tid[K];
        struct sieve_args args[K];
    
        /* allocate at least N+1 bits */
        uint64_t *a = calloc(sizeof(*a), NW);
    
        uint64_t t = clock_us();
    
        /* initialize a with all even numbers composite */
        for (size_t i = 0; i < NW; i++) {
            a[i] = 0xAAAAAAAAAAAAAAAA;
        }
        for (int k = 0; k < K; k++) {
            args[k].a = a;
            args[k].maxp = (uint64_t)+sqrt(N);
            args[k].begin = 64 * ((uint64_t)NW * k / K);
            args[k].end = 64 * ((uint64_t)NW * (k + 1) / K);
            pthread_create(&tid[k], NULL, threadBehaviour, &args[k]);
        }
        for (int k = 0; k < K; k++) {
            pthread_join(tid[k], NULL);
        }
        t = clock_us() - t;
        printf("%f ms\n", t / 1000.0);
    
        unsigned long long count = 0;
    
        /* count prime numbers */
        a[0] &= ~(1 << 1); /* set 1 as not prime */
        a[0] |= 1 << 2; /* set 2 as prime */
        for (uint64_t i = 2; i <= N; i++) {
            if (a[i / 64] & ((uint64_t)1 << (i % 64))) {
                //printf("| %llu ", i);
                count++;
            }
        }
        //printf("\n");
        printf("%llu: %llu primes\n", (unsigned long long)N, count);
        free(a);
        return 0;
    }