c++mathoptimizationimplementationnumber-theory

A Program to Find Absolute Euler Pseudoprimes


I am now trying to make a program to find the Absolute Euler Pseudoprimes ('AEPSP' in short, not Euler-Jacobi Pseudoprimes), with the definition that n is an AEPSP if

a(n-1)/2 ≡ ±1 (mod n)

for all positive integers a satisfying that the GCD of a and n is 1.

I used a C++ code to generate AEPSPs, which is based on a code to generate Carmichael numbers:

#include <iostream>
#include <cmath>
#include <algorithm>
#include <numeric>

using namespace std;

unsigned int bm(unsigned int a, unsigned int n, unsigned int p){
    unsigned long long b;
    switch (n) {
        case 0:
            return 1;
        case 1:
            return a % p;
        default:
            b = bm(a,n/2,p);
            b = (b*b) % p;
            if (n % 2 == 1) b = (b*a) % p;
            return b;
        }
} 

int numc(unsigned int n){
    int a, s;
    int found = 0;
    if (n % 2 == 0) return 0;
    s = sqrt(n);
    a = 2;
    while (a < n) {
        if (a > s && !found) {
            return 0;
        }
        if (gcd(a, n) > 1) {
            found = 1;
        }
        else {
            if (bm(a, (n-1)/2, n) != 1) {
                return 0;
            }
        }
        a++;
    }
    return 1;
}

int main(void) {
    unsigned int n;
    for (n = 3; n < 1e9; n += 2){
    if (numc(n)) printf("%u\n",n);
    }
    return 0; 
}  

Yet, the program is very slow. It generates AEPSPs up to 1.5e6 in 20 minutes. Does anyone have any ideas on optimizing the program?

Any help is most appreciated. :)


Solution

  • I've come up with a different algorithm, based on sieving for primes upfront while simultaneously marking off non-squarefree numbers. I've applied a few optimizations to pack the information into memory a bit tighter, to help with cache-friendliness as well. Here is the code:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>
    
    #define PRIME_BIT  (1UL << 31)
    #define SQUARE_BIT (1UL << 30)
    #define FACTOR_MASK (SQUARE_BIT - 1)
    
    void sieve(uint64_t size, uint32_t *buffer) {
        for (uint64_t i = 3; i * i < size; i += 2) {
            if (buffer[i/2] & PRIME_BIT) {
                for (uint64_t j = i * i; j < size; j += 2 * i) {
                    buffer[j/2] &= SQUARE_BIT;
                    buffer[j/2] |= i;
                }
                for (uint64_t j = i * i; j < size; j += 2 * i * i) {
                    buffer[j/2] |= SQUARE_BIT;
                }
            }
        }
    }
    
    int main(int argc, char **argv) {
        if (argc < 2) {
            printf("Usage: prog LIMIT\n");
            return 1;
        }
    
        uint64_t size = atoi(argv[1]);
    
        uint32_t *buffer = malloc(size * sizeof(uint32_t) / 2);
        memset(buffer, 0x80, size * sizeof(uint32_t) / 2);
    
        sieve(size, buffer);
    
        for (uint64_t i = 5; i < size; i += 4) {
            if (buffer[i/2] & PRIME_BIT)
                continue;
            if (buffer[i/2] & SQUARE_BIT)
                continue;
    
            uint64_t num = i;
            uint64_t factor;
    
            while (num > 1) {
                if (buffer[num/2] & PRIME_BIT)
                    factor = num;
                else
                    factor = buffer[num/2] & FACTOR_MASK;
                
                if ((i / 2) % (factor - 1) != 0) {
                    break;
                }
                num /= factor;
            }
    
            if (num == 1)
                printf("Found pseudo-prime: %ld\n", i);
        }
    }
    

    This produces the pseudo-primes up to 1.5e6 in about 8ms on my machine, and for 2e9 it takes 1.8sec.

    The time complexity of the solution is O(n log n) - the sieve is O(n log n), and for each number we either do constant time checks or do a divisibility test for each of its factors, which there are at most log n. So, the main loop is also O(n log n), resulting in O(n log n) overall.