python performance optimization pypy sieve-algorithm

Optimizing sieving code in the Self Initializing Quadratic Sieve for PyPy

I've coded up the Self Initializing Quadratic Sieve (SIQS) in Python, but it has been coded with respect to being as fast as possible in PyPy(not native Python).

Here is the complete code:

import logging
import time
import math
from math import sqrt, ceil, floor, exp, log2, log, isqrt
from rich.live import Live
from rich.table import Table
from rich.console import Console
import random
import sys

LOWER_BOUND_SIQS = 1000
UPPER_BOUND_SIQS = 4000
logging.basicConfig(
    format='[%(levelname)s] %(asctime)s - %(message)s',
    level=logging.INFO
)
def get_gray_code(n):
    gray = [0] * (1 << (n - 1))
    gray[0] = (0, 0)
    for i in range(1, 1 << (n - 1)):
        v = 1
        j = i
        while (j & 1) == 0:
            v += 1
            j >>= 1
        tmp = i + ((1 << v) - 1)
        tmp >>= v
        if (tmp & 1) == 1:
            gray[i] = (v - 1, -1)
        else:
            gray[i] = (v - 1, 1)
    return gray

MULT_LIST = [
     1,   2,   3,   5,   7,   9,  10,  11,  13,  14,
    15,  17,  19,  21,  23,  25,  29,  31,
    33,  35,  37,  39,  41,  43,  45,   
    47,  49,  51,  53,  55,  57,  59,  61,  63,
    65,  67,  69,  71,  73,  75,  77,  79,  83,
    85,  87,  89,  91,  93,  95,  97, 101, 103, 105, 107,
   109, 111, 113, 115, 119, 121, 123, 127, 129, 131, 133,
   137, 139, 141, 143, 145, 147, 149, 151, 155, 157, 159,
   161, 163, 165, 167, 173, 177, 179, 181, 183, 185, 187,
   191, 193, 195, 197, 199, 201, 203, 205, 209, 211, 213,
   215, 217, 219, 223, 227, 229, 231, 233, 235, 237, 239,
   241, 249, 251, 253, 255
]

def create_table(relations, target_relations, num_poly, start_time):
    end = time.time()
    elapsed = end - start_time

    relations_per_second = len(relations) / elapsed if elapsed > 0 else 0
    poly_per_second = num_poly / elapsed if elapsed > 0 else 0
    percent = (len(relations) / target_relations) * 100 if target_relations > 0 else 0
    percent_per_second = percent / elapsed if elapsed > 0 else 0
    remaining_percent = 100.0 - percent
    seconds = int(remaining_percent / percent_per_second) if percent_per_second > 0 else 0
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)

    table = Table(title="Processing Status")
    table.add_column("Metric", style="cyan", no_wrap=True)
    table.add_column("Value", style="magenta")

    table.add_row("Relations per second", f"{relations_per_second:,.2f}")
    table.add_row("Poly per second", f"{poly_per_second:,.2f}")
    table.add_row("Percent", f"{percent:,.2f}%")
    table.add_row("Percent per second", f"{percent_per_second:,.4f}%")
    table.add_row("Estimated Time", f"{h:d}:{m:02d}:{s:02d}")

    return table

class QuadraticSieve:
    def __init__(self, M, B=None, T=2, prime_limit=20, eps=30, lp_multiplier=20, multiplier=None):
        self.logger = logging.getLogger(__name__)
        self.prime_log_map = {}
        self.root_map = {}
        
        self.M = M
        self.B = B
        self.T = T
        self.prime_limit = prime_limit
        self.eps = eps
        self.lp_multiplier = lp_multiplier
        self.multiplier = multiplier

        self.console = Console()

        print(f"B: {B}")
        print(f"M: {M}")
        print(f"prime_limit: {prime_limit}")
        print(f"eps: {eps}")
        print(f"lp_multiplier: {lp_multiplier}")

    @staticmethod
    def gcd(a, b):
        a, b = abs(a), abs(b)
        while a:
            a, b = b % a, a
        return b

    @staticmethod
    def legendre(n, p):
        val = pow(n, (p - 1) // 2, p)
        return val - p if val > 1 else val

    @staticmethod
    def jacobi(a, m):
        a = a % m
        t = 1
        while a != 0:
            while a % 2 == 0:
                a //= 2
                if m % 8 in [3, 5]:
                    t = -t
            a, m = m, a
            if a % 4 == 3 and m % 4 == 3:
                t = -t
            a %= m
        return t if m == 1 else 0

    @staticmethod
    def modinv(n, p):
        n = n % p
        x, u = 0, 1
        while n:
            x, u = u, x - (p // n) * u
            p, n = n, p % n
        return x

    def factorise_fast(self, value, factor_base):
        factors = set()
        if value < 0:
            factors ^= {-1}
            value = -value
        for factor in factor_base[1:]:
            while value % factor == 0:
                factors ^= {factor}
                value //= factor
        return factors, value

    @staticmethod
    def tonelli_shanks(a, p):
        a %= p
        if p % 8 in [3, 7]:
            x = pow(a, (p + 1) // 4, p)
            return x, p - x

        if p % 8 == 5:
            x = pow(a, (p + 3) // 8, p)
            if pow(x, 2, p) != a % p:
                x = (x * pow(2, (p - 1) // 4, p)) % p
            return x, p - x

        d = 2
        symb = 0
        while symb != -1:
            symb = QuadraticSieve.jacobi(d, p)
            d += 1
        d -= 1

        n = p - 1
        s = 0
        while n % 2 == 0:
            n //= 2
            s += 1
        t = n

        A = pow(a, t, p)
        D = pow(d, t, p)
        m = 0
        for i in range(s):
            i1 = pow(2, s - 1 - i)
            i2 = (A * pow(D, m, p)) % p
            i3 = pow(i2, i1, p)
            if i3 == p - 1:
                m += pow(2, i)
        x = (pow(a, (t + 1) // 2, p) * pow(D, m // 2, p)) % p
        return x, p - x

    @staticmethod
    def prime_sieve(n):
        sieve = [True] * (n + 1)
        sieve[0], sieve[1] = False, False

        for i in range(2, int(n**0.5) + 1):
            if sieve[i]:
                for j in range(i * 2, n + 1, i):
                    sieve[j] = False

        return [i for i, is_prime in enumerate(sieve) if is_prime]

    def find_b(self, N):
        x = ceil(exp(0.5 * sqrt(log(N) * log(log(N)))))
        return x

    def choose_multiplier(self, N, B):
        prime_list = self.prime_sieve(B)
        if self.multiplier is not None:
            self.logger.info("Using multiplier k = %d", self.multiplier)
            return prime_list
        NUM_TEST_PRIMES = 300
        LN2 = math.log(2)
        num_primes = min(len(prime_list), NUM_TEST_PRIMES)
        log2n = math.log(N)
        scores = [0.0 for _ in MULT_LIST]
        num_multipliers = 0

        for i, curr_mult in enumerate(MULT_LIST):
            knmod8 = (curr_mult * (N % 8)) % 8
            logmult = math.log(curr_mult)
            scores[i] = 0.5 * logmult

            if knmod8 == 1:
                scores[i] -= 2 * LN2
            elif knmod8 == 5:
                scores[i] -= LN2
            elif knmod8 in (3, 7):
                scores[i] -= 0.5 * LN2

            num_multipliers += 1

        for i in range(1, num_primes):
            prime = prime_list[i]
            contrib = math.log(prime) / (prime - 1)
            modp = N % prime

            for j in range(num_multipliers):
                curr_mult = MULT_LIST[j]
                knmodp = (modp * curr_mult) % prime

                if knmodp == 0 or self.legendre(knmodp, prime) == 1:
                    if knmodp == 0:
                        scores[j] -= contrib
                    else:
                        scores[j] -= 2 * contrib

        best_score = float('inf')
        best_mult = 1

        for i in range(num_multipliers):
            if scores[i] < best_score:
                best_score = scores[i]
                best_mult = MULT_LIST[i]

        self.multiplier = best_mult
        self.logger.info("Using multiplier k = %d", best_mult)
        return prime_list

    def get_smooth_b(self, N, B, prime_list):
        factor_base = [-1, 2]
        self.prime_log_map[2] = 1
        for p in prime_list[1:]:
            if self.legendre(N, p) == 1:
                factor_base.append(p)
                self.prime_log_map[p] = round(log2(p))
                self.root_map[p] = self.tonelli_shanks(N, p)

        return factor_base

    def decide_bound(self, N, B=None):
        if B is None:
            B = self.find_b(N)
        self.B = B
        self.logger.info("Using B = %d", B)
        return B

    def build_factor_base(self, N, B, prime_list):
        fb = self.get_smooth_b(N, B, prime_list)
        self.logger.info("Factor base size: %d", len(fb))
        return fb

    def new_poly_a(self, factor_base, N, M, poly_a_list):
        small_B = 1024
        lower_polypool_index = 2
        upper_polypool_index = small_B - 1
        poly_low_found = False
        for i in range(small_B):
            if factor_base[i] > LOWER_BOUND_SIQS and not poly_low_found:
                lower_polypool_index = i
                poly_low_found = True
            if factor_base[i] > UPPER_BOUND_SIQS:
                upper_polypool_index = i - 1
                break
        # Compute target_a and bit threshold
        target_a = int(math.sqrt(2 * N) / M)
        target_mul = 0.9
        target_bits = int(target_a.bit_length() * target_mul)
        too_close = 10
        close_range = 5
        min_ratio = LOWER_BOUND_SIQS

        while True:
            poly_a = 1
            afact = []
            qli = []
            while True:
                found_a_factor = False
                while(found_a_factor == False):
                    randindex = random.randint(lower_polypool_index, upper_polypool_index)
                    potential_a_factor = factor_base[randindex]
                    found_a_factor = True
                    if potential_a_factor in afact:
                        found_a_factor = False

                poly_a = poly_a * potential_a_factor
                afact.append(potential_a_factor)
                qli.append(randindex)

                j = target_a.bit_length() - poly_a.bit_length()
                if j < too_close:
                    poly_a = 1
                    s = 0
                    afact = []
                    qli = []
                    continue
                elif j < (too_close + close_range):
                    break

            a1 = target_a // poly_a
            if a1 < min_ratio:
                continue

            mindiff = 100000000000000000
            randindex = 0

            for i in range(small_B):
                if abs(a1 - factor_base[i]) < mindiff:
                    mindiff = abs(a1 - factor_base[i])
                    randindex = i

            found_a_factor = False
            while not found_a_factor:
                potential_a_factor = factor_base[randindex]
                found_a_factor = True
                if potential_a_factor in afact:
                    found_a_factor = False
                if not found_a_factor:
                    randindex += 1

            if randindex > small_B:
                continue

            poly_a = poly_a * factor_base[randindex]
            afact.append(factor_base[randindex])
            qli.append(randindex)

            diff_bits = (target_a - poly_a).bit_length()

            if diff_bits < target_bits:
                if poly_a in poly_a_list:
                    if target_bits > 1000:
                        print("SOMETHING WENT WRONG")
                        sys.exit()
                    target_bits += 1
                    continue
                else:
                    break

        poly_a_list.append(poly_a)
        return poly_a, sorted(qli), set(afact)

    def generate_first_polynomial(self, factor_base, N, M, poly_a_list):
        a, qli, factors_a = self.new_poly_a(factor_base, N, M, poly_a_list)
        s = len(qli)
        B = []
        for l in range(s):
            p = factor_base[qli[l]]
            r1 = self.root_map[p][0]
            aq = a // p
            invaq = self.modinv(aq, p)
            gamma = r1 * invaq % p
            if gamma > p // 2:
                gamma = p - gamma
            B.append(aq * gamma)
        b = sum(B) % a
        c = (b * b - N) // a
        soln_map = {}
        Bainv = {}
        for p in factor_base:
            Bainv[p] = []
            if a % p == 0 or p == 2:
                continue

            ainv = self.modinv(a, p)
            # store bainv
            for j in range(s):
                Bainv[p].append((2 * B[j] * ainv) % p)

            # store roots
            r1, r2 = self.root_map[p]
            r1 = ((r1 - b) * ainv) % p
            r2 = ((r2 - b) * ainv) % p
            soln_map[p] = [r1, r2]

        return a, b, c, B, Bainv, soln_map, s, factors_a


    def sieve(self, N, B, factor_base, M):
        # ------------------------------------------------
        # 1) TIMING
        # ------------------------------------------------
        start = time.time()

        # ------------------------------------------------
        # 2) FACTOR BASE & RELATED
        # ------------------------------------------------
        fb_len = len(factor_base)
        fb_map = {val: i for i, val in enumerate(factor_base)}
        target_relations = fb_len + self.T
        large_prime_bound = B * self.lp_multiplier

        # ------------------------------------------------
        # 3) THRESHOLD & MISC
        # ------------------------------------------------
        threshold = int(math.log2(M * math.sqrt(N)) - self.eps)
        lp_found = 0
        ind = 1
        matrix = [0] * fb_len
        relations = []
        roots = []
        partials = {}
        num_poly = 0
        interval_size = 2 * M + 1
        grays = get_gray_code(20)
        poly_a_list = []
        poly_ind = 0
        sieve_values = [0] * interval_size
        r1 = 0
        r2 = 0
        def process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a):
            nonlocal ind
            val = sieve_values[x]
            sieve_values[x] = 0
            lpf = 0
            if val > threshold:
                xval = x - M
                relation = a * xval + b
                poly_val = a * xval * xval + 2 * b * xval + c

                local_factors, value = self.factorise_fast(poly_val, factor_base)
                local_factors ^= factors_a


                if value != 1:
                    if value < large_prime_bound:
                        if value in partials:
                            rel, lf, pv = partials[value]
                            relation *= rel
                            local_factors ^= lf
                            poly_val *= pv
                            lpf = 1
                        else:
                            partials[value] = (relation, local_factors, poly_val * a)
                            return 0
                    else:
                        return 0

                for fac in local_factors:
                    idx = fb_map[fac]
                    matrix[idx] |= ind
                ind = ind + ind
                relations.append(relation)
                roots.append(poly_val * a)
            return lpf

        with Live(console=self.console) as live:
            while len(relations) < target_relations:
                if num_poly % 10 == 0:
                    live.update(create_table(relations, target_relations, num_poly, start))

                if poly_ind == 0:
                    a, b, c, B, Bainv, soln_map, s, factors_a = self.generate_first_polynomial(factor_base, N, M, poly_a_list)
                    end = 1 << (s - 1)
                    poly_ind += 1
                else:
                    v, e = grays[poly_ind]
                    b = (b + 2 * e * B[v])
                    c = (b * b - N) // a
                    poly_ind += 1
                    if poly_ind == end:
                        poly_ind = 0
                v, e = grays[poly_ind] # v, e for next iteration
                for p in factor_base:
                    if p < self.prime_limit or a % p == 0:
                        continue
                    log_p = self.prime_log_map[p]
                    r1, r2 = soln_map[p]
                    soln_map[p][0] = (r1 - e * Bainv[p][v]) % p
                    soln_map[p][1] = (r2 - e * Bainv[p][v]) % p

                    amx = r1 + M
                    bmx = r2 + M

                    apx = amx - p
                    bpx = bmx - p
                    k = p

                    while k < M:
                        sieve_values[apx + k] += log_p
                        sieve_values[bpx + k] += log_p
                        sieve_values[amx - k] += log_p
                        sieve_values[bmx - k] += log_p
                        k += p
                num_poly += 1

                x = 0
                while x < 2 * M - 6:
                    # for some reason need to do all this for max performance gain in PyPy3
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1

        print(f"\n{num_poly} polynomials sieved")
        print(f"{lp_found} relations from partials")
        print(f"{target_relations - lp_found} normal smooth relations")
        print(f"{target_relations} total relations\n")
        return matrix, relations, roots

    def solve_bits(self, matrix, n):
        self.logger.info("Solving linear system in GF(2).")
        lsmap = {lsb: 1 << lsb for lsb in range(n)}

        # GAUSSIAN ELIMINATION
        m = len(matrix)
        marks = []
        cur = -1
        # m -> number of primes in factor base
        # n -> number of smooth relations
        mark_mask = 0
        for row in matrix:
            if cur % 100 == 0:
                print("", end=f"{cur, m}\r")
            cur += 1
            lsb = (row & -row).bit_length() - 1
            if lsb == -1:
                continue
            marks.append(n - lsb - 1)
            mark_mask |= 1 << lsb
            for i in range(m):
                if matrix[i] & lsmap[lsb] and i != cur:
                    matrix[i] ^= row
        marks.sort()
        # NULL SPACE EXTRACTION
        nulls = []
        free_cols = [col for col in range(n) if col not in marks]
        k = 0
        for col in free_cols:
            shift = n - col - 1
            val = 1 << shift
            fin = val
            for v in matrix:
                if v & val:
                    fin |= v & mark_mask
            nulls.append(fin)
            k += 1
            if k == self.T:
                break
        return nulls

    def extract_factors(self, N, relations, roots, null_space):
        n = len(relations)
        for vector in null_space:
            prod_left = 1
            prod_right = 1
            for idx in range(len(relations)):
                bit = vector & 1
                vector = vector >> 1
                if bit == 1:
                    prod_left *= relations[idx]
                    prod_right *= roots[idx]
                idx += 1

            sqrt_right = isqrt(prod_right)
            prod_left = prod_left % N
            sqrt_right = sqrt_right % N
            factor_candidate = self.gcd(N, prod_left - sqrt_right)

            if factor_candidate not in (1, N):
                other_factor = N // factor_candidate
                self.logger.info("Found factors: %d, %d", factor_candidate, other_factor)
                return factor_candidate, other_factor

        return 0, 0

    def factor(self, N, B=None):
        overall_start = time.time()
        self.logger.info("========== Quadratic Sieve V4 Start ==========")
        self.logger.info("Factoring N = %d", N)

        step_start = time.time()
        B = self.decide_bound(N, self.B)
        step_end = time.time()
        self.logger.info("Step 1 (Decide Bound) took %.3f seconds", step_end - step_start)

        step_start = time.time()
        prime_list = self.choose_multiplier(N, self.B)
        step_end = time.time()
        self.logger.info("Step 2 (Choose Multiplier) took %.3f seconds", step_end - step_start)
        kN = self.multiplier * N
        if kN.bit_length() < 140:
            LOWER_BOUND_SIQS = 3

        step_start = time.time()
        factor_base = self.build_factor_base(kN, B, prime_list)
        step_end = time.time()
        self.logger.info("Step 3 (Build Factor Base) took %.3f seconds", step_end - step_start)

        step_start = time.time()
        matrix, relations, roots = self.sieve(kN, B, factor_base, self.M)
        step_end = time.time()
        self.logger.info("Step 4 (Sieve Interval) took %.3f seconds", step_end - step_start)

        n = len(relations)
        step_start = time.time()
        null_space = self.solve_bits(matrix, n)
        step_end = time.time()
        self.logger.info("Step 5 (Solve Dependencies) took %.3f seconds", step_end - step_start)

        step_start = time.time()
        f1, f2 = self.extract_factors(N, relations, roots, null_space)
        step_end = time.time()
        self.logger.info("Step 6 (Extract Factors) took %.3f seconds", step_end - step_start)

        if f1 and f2:
            self.logger.info("Quadratic Sieve successful: %d * %d = %d", f1, f2, N)
        else:
            self.logger.warning("No non-trivial factors found with the current settings.")

        overall_end = time.time()
        self.logger.info("Total time for Quadratic Sieve: %.10f seconds", overall_end - overall_start)
        self.logger.info("========== Quadratic Sieve End ==========")

        return f1, f2

if __name__ == '__main__':

    ## 60 digit number
    #N = 373784758862055327503642974151754627650123768832847679663987
    #qs = QuadraticSieve(B=111000, M=400000, T=10, prime_limit=45, eps=34, lp_multiplier=20000)

    ### 70 digit number
    N = 3605578192695572467817617873284285677017674222302051846902171336604399
    qs = QuadraticSieve(B=300000, M=350000, prime_limit=47, eps=40, T=10, lp_multiplier=256)

    ## 80 digit number
    #N = 4591381393475831156766592648455462734389 * 1678540564209846881735567157366106310351
    #qs = QuadraticSieve(B=700_000, M=600_000, prime_limit=52, eps=45, T=10, lp_multiplier=256)

    factor1, factor2 = qs.factor(N)

Now, the main running time in the comes from the following section which is where basically a giant sieving process:

    def sieve(self, N, B, factor_base, M):
        start = time.time()

        fb_len = len(factor_base)
        fb_map = {val: i for i, val in enumerate(factor_base)}
        target_relations = fb_len + self.T
        large_prime_bound = B * self.lp_multiplier


        threshold = int(math.log2(M * math.sqrt(N)) - self.eps)
        lp_found = 0
        ind = 1

        matrix = [0] * fb_len
        relations = []
        roots = []
        partials = {}

        num_poly = 0
        interval_size = 2 * M + 1
        grays = get_gray_code(20)
        poly_a_list = []
        poly_ind = 0

        sieve_values = [0] * interval_size
        r1 = 0
        r2 = 0

        def process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a):
            nonlocal ind
            val = sieve_values[x]
            sieve_values[x] = 0
            lpf = 0
            if val > threshold:
                xval = x - M
                relation = a * xval + b
                poly_val = a * xval * xval + 2 * b * xval + c

                local_factors, value = self.factorise_fast(poly_val, factor_base)
                local_factors ^= factors_a


                if value != 1:
                    if value < large_prime_bound:
                        if value in partials:
                            rel, lf, pv = partials[value]
                            relation *= rel
                            local_factors ^= lf
                            poly_val *= pv
                            lpf = 1
                        else:
                            partials[value] = (relation, local_factors, poly_val * a)
                            return 0
                    else:
                        return 0

                for fac in local_factors:
                    idx = fb_map[fac]
                    matrix[idx] |= ind
                ind = ind + ind
                relations.append(relation)
                roots.append(poly_val * a)
            return lpf

        with Live(console=self.console) as live:
            while len(relations) < target_relations:
                if num_poly % 10 == 0:
                    live.update(create_table(relations, target_relations, num_poly, start))

                if poly_ind == 0:
                    a, b, c, B, Bainv, soln_map, s, factors_a = self.generate_first_polynomial(factor_base, N, M, poly_a_list)
                    end = 1 << (s - 1)
                    poly_ind += 1
                else:
                    v, e = grays[poly_ind]
                    b = (b + 2 * e * B[v])
                    c = (b * b - N) // a
                    poly_ind += 1
                    if poly_ind == end:
                        poly_ind = 0
                v, e = grays[poly_ind] # v, e for next iteration
                for p in factor_base:
                    if p < self.prime_limit or a % p == 0:
                        continue
                    log_p = self.prime_log_map[p]
                    r1, r2 = soln_map[p]
                    soln_map[p][0] = (r1 - e * Bainv[p][v]) % p
                    soln_map[p][1] = (r2 - e * Bainv[p][v]) % p

                    amx = r1 + M
                    bmx = r2 + M

                    apx = amx - p
                    bpx = bmx - p
                    k = p

                    while k < M:
                        sieve_values[apx + k] += log_p
                        sieve_values[bpx + k] += log_p
                        sieve_values[amx - k] += log_p
                        sieve_values[bmx - k] += log_p
                        k += p
                num_poly += 1

                x = 0
                while x < 2 * M - 6:
                    # for some reason need to do all this for max performance gain in PyPy3
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1
                    lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
                    x += 1

        print(f"\n{num_poly} polynomials sieved")
        print(f"{lp_found} relations from partials")
        print(f"{target_relations - lp_found} normal smooth relations")
        print(f"{target_relations} total relations\n")
        return matrix, relations, roots

I have "optimized" the code as much as I can so far, and it runs pretty fast in PyPy, but I am wondering if there is anything I am missing that can tweak out more performance gain. I haven't really been able to get anything meaningful out of profiling the code because of the way that PyPy works, but through a bit of testing have made various improvements that have cut down the time a lot like some loop unrolling, precision reduction, and the way I work with the arrays.

Unfortunately, this sieving code is still not fast enough to be feasible in factoring the size of numbers which I'm targeting(100 to 115 digit semiprimes). By my initial estimates with semi-decent parameter selection and without multiprocessing, the sieving code itself would taken around 70 hours when compiled using PyPy3 on my device.

Is there anything I can do to better optimize this code for the PyPy JIT?

Solution

This answer provides a way to significantly speed up the code by:

transformation of a fraction of the Python code into a C++ code
low-level optimization of the most expensive parts
parallelization of one very-expensive part

The overall algorithm is left unchanged and the logic of the Python code too. That being said many non-trivial optimizations are performed and most of the changes applied are nearly impossible to do purely in PyPy (this why I choose to move the code to C++), starting from the (efficient) use of multiple threads. In the end, this makes the code 7 times faster for 80-digit numbers (4 times faster for 70-digit ones).

Note that the answer mostly focus only on "Sieve Interval" step though few optimization tips are provided at the end.

Details of the optimizations performed

First of all, I profiled the code and found out that two parts of the code was slow:

the while loop calling many times process_sieve_value
the while loop filling sieve_values

The former tends to be the bottleneck on numbers with 60~70 digits, while the later is clearly the bottleneck for numbers having 80~90 digits. Let us focus on the former first.

Make the code SIMD-friendly: To optimize the process_sieve_value-based loop, we can search the next value where sieve_values[i] > threshold and only then call process_sieve_value with the value found rather than calling the function many times. This enable further optimizations. Indeed, searching for sieve_values[i] > threshold can be done in a SIMD-friendly way while the initial solution is clearly not SIMD-friendly. SIMD units of modern CPU can operate on many items simultaneously (e.g. 8 x 32-bit integers, or even 16 x 16-bit integers) with a cost close to scalar operation (for most operations except very-expensive ones like integer divisions). This means codes compiled to used SIMD instructions can be much faster than ones using scalar instructions. PyPy is generally unable to do such optimization. I think this is mainly because vectorizing codes is a very expensive operation (not great for JITs like PyPy) and Python code tends to be really hard to vectorize efficiently when this is even possible. This searching operation is done in C++.

Converting lists to Numpy arrays: One issue with this optimization (like the others) is that sieve_values is a list. Lists are not very efficient (because they contain reference to dynamically-typed garbage-collected dynamically-allocated Python objects) and we cannot (easily) operate on them with modules like CFFI, Cython, etc. Thus, lists should be converted to native-friendly data-structures like plain arrays. Numpy is the perfect module for that. However, converting lists to Numpy arrays is painfully slow in PyPy. Getting/Setting items is insanely slow too (10 times slower than CPython which is already >200 times slower than C++ for that). The key to make this fast is unfortunately to write a C++ function to load/store values in Numpy arrays. This is clearly sub-optimal (i.e. one function call per access) but surprisingly fast in practice compared to all other alternatives. It is actually fast enough to be competitive with list accesses!

Converting dictionaries to Numpy arrays: Dictionaries are pretty expensive, even in C++. Moreover, like lists, Python dictionaries does not fit well native codes. Statements like soln_map[p][0] = (r1 - e * Bainv[p][v]) % p are clearly a problem. Indeed, it is not only pretty slow (not a bottleneck except once the rest of the code has been optimized), but it also prevents the transformation of the loop containing the statement in a native code at least neither easily nor efficiently). The thing is such dictionary are read in a p-based loop (so they are read in order). Thus, we can just replace such dictionaries by basic 2D Numpy arrays. There is a catch though: the array needs to be transposed to be read in a cache-friendly way. Indeed, v is a constant in the loop so the contiguous axis has to be the one indexing p instead of v.

Converting big numbers: Unfortunately, >64-bit numbers cannot directly be provided to native codes (not supported yet). Thus, one way to pass large integers to native codes is simply to split it in 64-bit parts. It is a bit tedious but this is fine for <192-bit numbers since it only results in 3 parts (+1 for the sign).

Optimizing the factorization: PyPy turns out to be relatively efficient for computing modulo on large numbers since it uses different algorithms regarding the size of the integers (because small integers are internally a different type). That being said, we can write a bit faster code by checking if the high bits of the target large number are set to zero in order to reduce the number of (very expensive) native 64-bit modulo operations. Having large number stored in parts turns out to be good for convenience and performance in this case. Indeed, we can first check the first 64-bit part trivially and natively. Besides, PyPy needs to follows the semantic of Python (a modulo must be always positive) while C++ code does not (we generally know that the operands are positive so there is no need to guarantee that with additional instructions).

Optimizing the update of sieve_values:

This is clearly the hardest part of the algorithm to optimize. Indeed, the operation is memory bound and also hard to parallelize due to the memory access pattern (huge stride for relatively small p and pseudo-random memory accesses for large p). Each memory access tends to result in a full cache-line fetch (slow). For large N numbers (requiring a rather large M for the code to be fast), the whole array tends to be so big that it does not fit in the L1/L2 cache of most mainstream CPUs.

The key to make this operation cache-friendly is to store sieve_values in a very-compact way. I choose to store items in 8-bit integers. Since this is not enough to store all value contained in the array, the idea is to accumulate the log_p value in 8-bit items as much as possible and fallback on a 16-bit array when an item of the 8-bit array overflows (the item is then reset to 0 so to mitigate the number of accesses to the 16-bit array). Accesses to the 16-bit array are unlikely since log_p values are generally tiny (they are all less than 20 in practice).

This part can be parallelized (though it clearly does not scale). Indeed, we can allocate multiple temporary 8-bit array for accumulating the log_p values and finally sum the 8-bit arrays in sieve_values. This means the quite-expensive divisions are fully executed in parallel. Unfortunately, the 8-bit array access does not scale well: only 3-4 threads are enough to saturate the memory hierarchy on my machine (i5-9600KF CPU). Even worse: more threads results in random-like memory accesses on a wider memory area for the CPU and the L3 cache may not be large enough to contain all arrays. If this happens, the DRAM is used instead and it makes things significantly slower due to its much higher latency and significantly lower throughput.

Unfortunately, I did not find any additional (significant) optimization for this expensive part, and it is still a bottleneck on large 80~90 bit numbers in the optimized code... Note that the optimized code might better scale on some recent CPUs having significantly bigger caches than my CPU (especially the L3 cache on AMD X3D-like CPUs, and the L2 of all recent x86-64 CPU). Besides, note that tuning the B and M parameters of the algorithm can help to reduce the bottleneck a bit.

Actual optimized code

This section provide the optimized C++ code, parts of the modified Python code (too big to fit in this answer anyway) and additional resources (e.g. how to build the libraries). I used CFFI to call it from PyPy (pretty-fast in PyPy and relatively simple to use).

Here is the section of the optimized Python code:

# Include-like section
import numpy as np
from kernel_cffi import lib as kernel
import cffi
ffi = cffi.FFI()

# Convenient function to extract the pointer of Numpy arrays 
def get_pointer(numpy_array):
    return ffi.cast(f'{numpy_array.dtype}_t*', ffi.from_buffer(numpy_array))

# Modified function so to build and return Numpy arrays instead of dicts/lists
def generate_first_polynomial(self, factor_base, N, M, poly_a_list):
    a, qli, factors_a = self.new_poly_a(factor_base, N, M, poly_a_list)
    s = len(qli)
    B = []
    for l in range(s):
        p = factor_base[qli[l]]
        r1 = self.root_map[p][0]
        aq = a // p
        invaq = self.modinv(aq, p)
        gamma = r1 * invaq % p
        if gamma > p // 2:
            gamma = p - gamma
        B.append(aq * gamma)
    b = sum(B) % a
    c = (b * b - N) // a
    factor_base_size = len(factor_base)
    np_Bainv = np.zeros((s,factor_base_size), dtype=np.int32)
    np_Bainv_ptr = get_pointer(np_Bainv)
    np_soln_map = np.zeros((2,factor_base_size), dtype=np.int32)
    np_soln_map_ptr = get_pointer(np_soln_map)
    for i,p in enumerate(factor_base):
        if a % p == 0 or p == 2:
            continue

        ainv = self.modinv(a, p)
        # store bainv
        for j in range(s):
            kernel.set_i32_2D(np_Bainv_ptr, factor_base_size, j, i, (2 * B[j] * ainv) % p)

        # store roots
        r1, r2 = self.root_map[p]
        r1 = ((r1 - b) * ainv) % p
        r2 = ((r2 - b) * ainv) % p
        kernel.set_i32_2D(np_soln_map_ptr, factor_base_size, 0, i, r1)
        kernel.set_i32_2D(np_soln_map_ptr, factor_base_size, 1, i, r2)

    return a, b, c, B, np_Bainv, np_soln_map, s, factors_a

# Main modified function (without the same old initialization code)
def sieve(self, N, B, factor_base, M):
    # [...] same as before (initialization in 3 steps)

    # ------------------------------------------------
    # 4) CONVERSIONS
    # ------------------------------------------------

    np_factor_base = np.array(factor_base, dtype=np.int32)
    np_factor_base_ptr = get_pointer(np_factor_base)
    np_factor_base_size = np_factor_base.size
    np_prime_log_map = np.zeros(np_factor_base_size, dtype=np.int32)
    np_prime_log_map_ptr = get_pointer(np_prime_log_map)
    for i,p in enumerate(factor_base[1:]):
        kernel.set_i32_1D(np_prime_log_map_ptr, i+1, self.prime_log_map[p])
    np_sieve_values = np.zeros(interval_size, dtype=np.int16)
    np_sieve_values_ptr = get_pointer(np_sieve_values)
    np_sieve_values_size = np_sieve_values.size

    def process_sieve_value_new(x, partials, relations, roots, a, b, c, factors_a):
        nonlocal ind
        lpf = 0
        xval = x - M
        relation = a * xval + b
        poly_val = a * xval * xval + 2 * b * xval + c

        assert poly_val < (1 << 190)
        poly_val_sign = 1 if poly_val >= 0 else -1
        poly_val_hi = (abs(poly_val) >> 128) & 0xFFFFFFFF_FFFFFFFF
        poly_val_mi = (abs(poly_val) >> 64) & 0xFFFFFFFF_FFFFFFFF
        poly_val_lo = abs(poly_val) & 0xFFFFFFFF_FFFFFFFF
        factor_buff = np.zeros(192, dtype=np.int32)
        factor_buff_ptr = get_pointer(factor_buff)
        value = kernel.factorise(factor_buff_ptr, poly_val_sign, poly_val_hi, poly_val_mi, poly_val_lo, np_factor_base_ptr, np_factor_base_size)
        local_factors = set(factor_buff[factor_buff > 0].tolist())
        local_factors ^= factors_a

        if value != 1:
            if value < large_prime_bound:
                if value in partials:
                    rel, lf, pv = partials[value]
                    relation *= rel
                    local_factors ^= lf
                    poly_val *= pv
                    lpf = 1
                else:
                    partials[value] = (relation, local_factors, poly_val * a)
                    return 0
            else:
                return 0

        for fac in local_factors:
            idx = fb_map[fac]
            matrix[idx] |= ind
        ind = ind + ind
        relations.append(relation)
        roots.append(poly_val * a)
        return lpf

    with Live(console=self.console) as live:
        while len(relations) < target_relations:
            if num_poly % 10 == 0:
                live.update(create_table(relations, target_relations, num_poly, start))

            if poly_ind == 0:
                a, b, c, B, np_Bainv, np_soln_map, s, factors_a = self.generate_first_polynomial(factor_base, N, M, poly_a_list)
                np_soln_map_ptr = get_pointer(np_soln_map)
                np_Bainv_ptr = get_pointer(np_Bainv)
                end = 1 << (s - 1)
                poly_ind += 1
            else:
                v, e = grays[poly_ind]
                b = (b + 2 * e * B[v])
                c = (b * b - N) // a
                poly_ind += 1
                if poly_ind == end:
                    poly_ind = 0
            v, e = grays[poly_ind] # v, e for next iteration

            assert 0 <= a < (1 << 190)
            a_abs = abs(a)
            a_sign = 1 if a >= 0 else -1
            a_hi = (a_abs >> 128) & 0xFFFFFFFF_FFFFFFFF
            a_mi = (a_abs >> 64) & 0xFFFFFFFF_FFFFFFFF
            a_lo = a_abs & 0xFFFFFFFF_FFFFFFFF

            kernel.compute_big_loop(np_factor_base_ptr, np_factor_base_size, 
                                    np_sieve_values_ptr, np_sieve_values_size, 
                                    np_Bainv_ptr,
                                    np_soln_map_ptr,
                                    np_prime_log_map_ptr, 
                                    a_sign, a_hi, a_mi, a_lo,
                                    self.prime_limit, M, v, e)

            num_poly += 1

            # Sanity checks
            assert 0 <= threshold < 16384 # Check overflows
            assert np_sieve_values_size == 2 * M - 6 + 7

            x = 0
            xmax = np_sieve_values_size #2 * M - 6 + 7 # Correct?
            while True:
                x = kernel.search_next(np_sieve_values_ptr, x, xmax, threshold)
                if x >= xmax:
                    break
                lp_found += process_sieve_value_new(x, partials, relations, roots, a, b, c, factors_a)
                x += 1

    print(f"\n{num_poly} polynomials sieved")
    print(f"{lp_found} relations from partials")
    print(f"{target_relations - lp_found} normal smooth relations")
    print(f"{target_relations} total relations\n")
    return matrix, relations, roots

Here is the C++ code:

#include <cstdio>
#include <cstdlib>
#include <cstdint>
#include <cassert>
#include <algorithm>
#include <unordered_set>
#include <vector>
#include <omp.h>

// Faster than Clang types (mainly for 256-bit integers)
#include <boost/multiprecision/cpp_int.hpp>
using namespace boost::multiprecision;


// Unsafe 32-bit contiguous array single load
extern "C" int32_t get_i32_1D(int32_t* sieve_values, int32_t pos)
{
    return sieve_values[pos];
}


// Unsafe 32-bit contiguous array single store
extern "C" void set_i32_1D(int32_t* sieve_values, int32_t pos, int32_t val)
{
    sieve_values[pos] = val;
}


// Unsafe 32-bit contiguous array single load
extern "C" int32_t get_i32_2D(int32_t* sieve_values, int32_t ld, int32_t n, int32_t m)
{
    return sieve_values[n*ld+m];
}


// Unsafe 32-bit contiguous array single store
extern "C" void set_i32_2D(int32_t* sieve_values, int32_t ld, int32_t n, int32_t m, int32_t val)
{
    sieve_values[n*ld+m] = val;
}


static bool is_factor(uint64_t poly_val_hi, uint64_t poly_val_mi, 
                             uint64_t poly_val_lo, int32_t factor)
{
    //assert(factor > 1);

    uint64_t value = 0;

    // Speed up the computation for small numbers
    if(poly_val_hi == 0)
    {
        // Speed up the computation for tiny numbers
        if(poly_val_mi == 0)
            return (poly_val_lo % factor) == 0;

        value = poly_val_mi % factor;
    }
    else
    {
        value = poly_val_hi % factor;

        value = (value << 32) | (poly_val_mi >> 32);
        value %= factor;

        value = (value << 32) | (poly_val_mi & 0xFFFFFFFF);
        value %= factor;
    }

    value = (value << 32) | (poly_val_lo >> 32);
    value %= factor;

    value = (value << 32) | (poly_val_lo & 0xFFFFFFFF);
    value %= factor;

    return value == 0;
}

static void compute_loop(uint8_t* worker_sieve_values_ptr, 
                                int16_t* sieve_values, 
                                int32_t p, int32_t r1, int32_t r2, 
                                int32_t M, int32_t log_p)
{
    const int32_t amx = r1 + M;
    const int32_t bmx = r2 + M;
    const int32_t apx = amx - p;
    const int32_t bpx = bmx - p;

    auto safe_inc = [=] (int32_t idx, int32_t inc){
        uint8_t& value = worker_sieve_values_ptr[idx];

        if (int32_t(value) + inc < 256) [[likely]]
        {
            value += inc;
        }
        else
        {
            #pragma omp atomic
            sieve_values[idx] += value;
            value = 0;
        }
    };

    for(int32_t k = p; k < M; k += p)
    {
        safe_inc(apx + k, log_p);
        safe_inc(bpx + k, log_p);
        safe_inc(amx - k, log_p);
        safe_inc(bmx - k, log_p);
    }
}


int32_t py_mod(int32_t n, int32_t d)
{
    // assert(d > 0);
    const int32_t r = n % d;
    return r + (r < 0 ? d : 0);
}


struct SavedValues { int32_t idx, r1, r2; };

extern "C" void compute_big_loop(int32_t* factor_base, int32_t factor_base_size, 
                                 int16_t* sieve_values, int32_t sieve_values_size, 
                                 int32_t* Bainv, 
                                 int32_t* soln_map, 
                                 int32_t* prime_log_map, 
                                 int32_t a_sign, uint64_t a_hi, uint64_t a_mi, uint64_t a_lo, 
                                 int32_t prime_limit, int32_t M, int32_t v, int32_t e)
{
    const int num_threads = 4;

    std::vector<uint8_t*> workers_sieve_values_ptrs;
    workers_sieve_values_ptrs.reserve(num_threads);

    // Spawn a thread-pool with only few threads since the computation
    // does not scale well (seems rather memory-bound).
    // Using too many threads cause data not to fit in the L3 cache anymore and 
    // DRAM accesses are significantly slower, not to mention the DRAM can 
    // quickly be saturated, so the additional threads are not really useful.
    // Note that this is strongly dependent of the B parameter of the algorithm 
    // and the size of the cache on the target platform.
    #pragma omp parallel num_threads(num_threads)
    {
        // Make the items as compact as possible to fit in the CPU cache (critical for performance)
        std::vector<uint8_t> worker_sieve_values(sieve_values_size);
        uint8_t* worker_sieve_values_ptr = worker_sieve_values.data();

        // Record the thread-local arrays of each worker so to sum them up later
        #pragma omp critical
        workers_sieve_values_ptrs.push_back(worker_sieve_values_ptr);

        // Execute many `compute_loop` calls in parallel on thread-local arrays
        #pragma omp for schedule(static,1)
        for (size_t i = 0; i < factor_base_size; ++i)
        {
            const int32_t p = factor_base[i];

            assert(p > 0 || p == -1);

            if(p < prime_limit or p == -1 or is_factor(a_hi, a_mi, a_lo, p))
                continue;

            assert(p > 0);

            const int32_t r1 = get_i32_2D(soln_map, factor_base_size, 0, i);
            const int32_t r2 = get_i32_2D(soln_map, factor_base_size, 1, i);
            const int32_t Bainv_val = get_i32_2D(Bainv, factor_base_size, v, i);
            assert(prime_log_map[i] < 32);
            const uint8_t log_p = prime_log_map[i];
            set_i32_2D(soln_map, factor_base_size, 0, i, py_mod(r1 - e * Bainv_val, p));
            set_i32_2D(soln_map, factor_base_size, 1, i, py_mod(r2 - e * Bainv_val, p));

            // This part is the bottleneck on big numbers because of random 
            // accesses on large arrays (typically stored in DRAM)
            compute_loop(worker_sieve_values_ptr, sieve_values, p, r1, r2, M, log_p);
        }

        // Sum up all the thread-local arrays in parallel
        #pragma omp for schedule(static)
        for (int32_t ib = 0; ib < sieve_values_size; ib+=128)
        {
            int16_t tmp[128] = {0};

            for (int32_t w = 0; w < workers_sieve_values_ptrs.size(); ++w)
                for (int32_t i = 0; i < std::min(128, sieve_values_size-ib); ++i)
                    tmp[i] += workers_sieve_values_ptrs[w][ib+i];

            for (int32_t i = 0; i < std::min(128, sieve_values_size-ib); ++i)
                sieve_values[ib+i] = tmp[i];
        }
    }
}


extern "C" uint64_t factorise(int32_t* final_factors, 
                              int32_t poly_val_sign, uint64_t poly_val_hi, uint64_t poly_val_mi, uint64_t poly_val_lo, 
                              int32_t* factor_base, int32_t factor_base_size)
{
    std::unordered_set<int32_t> factors;

    if(poly_val_sign < 0)
        factors.insert(-1);

    uint256_t value = (uint256_t(poly_val_hi) << 128u) | (uint256_t(poly_val_mi) << 64u) | uint256_t(poly_val_lo);

    int32_t i;

    for (int32_t i = 1; i < factor_base_size; ++i)
    {
        const int32_t factor = factor_base[i];

        while(is_factor(poly_val_hi, poly_val_mi, poly_val_lo, factor))
        {
            // Rarely executed

            if(factors.erase(factor) == 0)
                factors.insert(factor);

            value /= factor;
            poly_val_hi = uint64_t(value >> 128u);
            poly_val_mi = uint64_t(value >> 64u);
            poly_val_lo = uint64_t(value);
        }
    }

    assert(factors.size() <= 192);
    std::copy(factors.begin(), factors.end(), final_factors);
    std::sort(final_factors, final_factors + factors.size());

    assert(value < (uint256_t(1) << 63u));
    return uint64_t(value);
}


extern "C" int32_t search_next(int16_t* sieve_values, int32_t x, int32_t xmax, int16_t threshold)
{
    while(x < xmax)
    {
        // Fast path (SIMD-friendly)
        while (x + 32 < xmax)
        {
            bool found = false;

            for (int i = 0; i < 32; ++i)
                found |= sieve_values[x+i] > threshold;

            if(found)
                break;

            memset(sieve_values+x, 0, 32*sizeof(int16_t));
            x += 32;
        }

        const int16_t val = sieve_values[x];
        sieve_values[x] = 0;

        if(val > threshold) [[unlikely]]
            return x;

        x++;
    }

    return xmax;
}

Here is the CFFI wrapper (just the prototypes of the C++ extern "C" function):

from cffi import FFI

ffibuilder = FFI()

ffibuilder.cdef("""
    int32_t search_next(int16_t* sieve_values, int32_t x, int32_t xmax, int16_t threshold);
    uint64_t factorise(int32_t* factors, 
                       int32_t poly_val_sign, uint64_t poly_val_hi, uint64_t poly_val_mi, uint64_t poly_val_lo, 
                       int32_t* factor_base_ptr, int32_t factor_base_size);
    int32_t get_i32_1D(int32_t* sieve_values, int32_t pos);
    void set_i32_1D(int32_t* sieve_values, int32_t pos, int32_t val);
    int32_t get_i32_2D(int32_t* sieve_values, int32_t ld, int32_t n, int32_t m);
    void set_i32_2D(int32_t* sieve_values, int32_t ld, int32_t n, int32_t m, int32_t val);
    void compute_big_loop(int32_t* factor_base, int32_t factor_base_size, 
                          int16_t* sieve_values, int32_t sieve_values_size, 
                          int32_t* Bainv, 
                          int32_t* soln_map, 
                          int32_t* prime_log_map, 
                          int32_t a_sign, uint64_t a_hi, uint64_t a_mi, uint64_t a_lo, 
                          int32_t prime_limit, int32_t M, int32_t v, int32_t e);
""")

# There is probably a cleaner way than copy-pasting the cdef code and just prefix the lines with "extern"
ffibuilder.set_source(
    'kernel_cffi', 
    '''
extern int32_t search_next(int16_t* sieve_values, int32_t x, int32_t xmax, int16_t threshold);
extern uint64_t factorise(int32_t* final_factors, 
                          int32_t poly_val_sign, uint64_t poly_val_hi, uint64_t poly_val_mi, uint64_t poly_val_lo, 
                          int32_t* factor_base_ptr, int32_t factor_base_size);
extern int32_t get_i32_1D(int32_t* sieve_values, int32_t pos);
extern void set_i32_1D(int32_t* sieve_values, int32_t pos, int32_t val);
extern int32_t get_i32_2D(int32_t* sieve_values, int32_t ld, int32_t n, int32_t m);
extern void set_i32_2D(int32_t* sieve_values, int32_t ld, int32_t n, int32_t m, int32_t val);
extern void compute_big_loop(int32_t* factor_base, int32_t factor_base_size, 
                                 int16_t* sieve_values, int32_t sieve_values_size, 
                                 int32_t* Bainv, 
                                 int32_t* soln_map, 
                                 int32_t* prime_log_map, 
                                 int32_t a_sign, uint64_t a_hi, uint64_t a_mi, uint64_t a_lo, 
                                 int32_t prime_limit, int32_t M, int32_t v, int32_t e);
''', 
    libraries=['kernel'], 
    include_dirs=['build'], 
    library_dirs=['.', 'build']
)

ffibuilder.compile(tmpdir='build', verbose=False)

Here is the command-line to compile the libraries:

mkdir -p build
clang++ -std=c++17 -O3 -mavx2 -fPIC -c kernel.cpp -o build/kernel.o -fopenmp
clang++ -shared -fPIC build/kernel.o -o build/libkernel.so -fopenmp
./pypy/bin/pypy build_cffi_module.py

And the one to run the code:

PYTHONPATH=build LD_LIBRARY_PATH=build ./pypy/bin/pypy main.py

Further optimizations

The factorization can be parallelized so to only take a fraction of the overall execution. However, this requires to transform the Python while loop calling search_next and process_sieve_value_new mostly into C++. This means more variables must be converted to Numpy arrays like at least xval, relation and poly_val which might be large integers (tedious to pass to C++ code). All the call to factorise can be done in parallel in C++. The rest of the process_sieve_value_new function does not have to be transformed to C++ code since you can store all the resulting value and then run the remaining code of process_sieve_value_new. This reduce the amount of work needed to do the transformation (and still most of the code is Python one). This optimization should significantly reduce the time to factorize small numbers but not so much for big numbers (especially beyond 90-digit numbers).

The "Solving linear system in GF(2)" step can be massively optimized. However this first require to convert the huge numbers stored in matrix to native-integer-based arrays. You can then convert the code into C++ like I did for the other part of the code. Then 3 main optimizations can be applied:

make the code SIMD-friendly so the compiled can auto-vectorize it (the current PyPy code waste most of its time performing inefficient scalar moves in this part);
perform the Gaussian elimination block by block (so to make the operation less memory-bound);
try to parallelize the Gaussian elimination

Besides, AFAIK there are more efficient algorithm for this part (linear algebra in GF(2)). On this part, Wikipedia mentions the Block Wiedemann algorithm. On top of that, using GPUs for this specific part might also help (especially since the matrix seems a dense one).