I've implemented the Self-Initializing Quadratic Sieve (SIQS) in Python, written to be as fast as possible under PyPy (not CPython).
Here is the complete code:
import logging
import time
import math
from math import sqrt, ceil, floor, exp, log2, log, isqrt
from rich.live import Live
from rich.table import Table
from rich.console import Console
import random
import sys
LOWER_BOUND_SIQS = 1000
UPPER_BOUND_SIQS = 4000
logging.basicConfig(
format='[%(levelname)s] %(asctime)s - %(message)s',
level=logging.INFO
)
def get_gray_code(n):
gray = [0] * (1 << (n - 1))
gray[0] = (0, 0)
for i in range(1, 1 << (n - 1)):
v = 1
j = i
while (j & 1) == 0:
v += 1
j >>= 1
tmp = i + ((1 << v) - 1)
tmp >>= v
if (tmp & 1) == 1:
gray[i] = (v - 1, -1)
else:
gray[i] = (v - 1, 1)
return gray
MULT_LIST = [
1, 2, 3, 5, 7, 9, 10, 11, 13, 14,
15, 17, 19, 21, 23, 25, 29, 31,
33, 35, 37, 39, 41, 43, 45,
47, 49, 51, 53, 55, 57, 59, 61, 63,
65, 67, 69, 71, 73, 75, 77, 79, 83,
85, 87, 89, 91, 93, 95, 97, 101, 103, 105, 107,
109, 111, 113, 115, 119, 121, 123, 127, 129, 131, 133,
137, 139, 141, 143, 145, 147, 149, 151, 155, 157, 159,
161, 163, 165, 167, 173, 177, 179, 181, 183, 185, 187,
191, 193, 195, 197, 199, 201, 203, 205, 209, 211, 213,
215, 217, 219, 223, 227, 229, 231, 233, 235, 237, 239,
241, 249, 251, 253, 255
]
def create_table(relations, target_relations, num_poly, start_time):
end = time.time()
elapsed = end - start_time
relations_per_second = len(relations) / elapsed if elapsed > 0 else 0
poly_per_second = num_poly / elapsed if elapsed > 0 else 0
percent = (len(relations) / target_relations) * 100 if target_relations > 0 else 0
percent_per_second = percent / elapsed if elapsed > 0 else 0
remaining_percent = 100.0 - percent
seconds = int(remaining_percent / percent_per_second) if percent_per_second > 0 else 0
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
table = Table(title="Processing Status")
table.add_column("Metric", style="cyan", no_wrap=True)
table.add_column("Value", style="magenta")
table.add_row("Relations per second", f"{relations_per_second:,.2f}")
table.add_row("Poly per second", f"{poly_per_second:,.2f}")
table.add_row("Percent", f"{percent:,.2f}%")
table.add_row("Percent per second", f"{percent_per_second:,.4f}%")
table.add_row("Estimated Time", f"{h:d}:{m:02d}:{s:02d}")
return table
class QuadraticSieve:
def __init__(self, M, B=None, T=2, prime_limit=20, eps=30, lp_multiplier=20, multiplier=None):
self.logger = logging.getLogger(__name__)
self.prime_log_map = {}
self.root_map = {}
self.M = M
self.B = B
self.T = T
self.prime_limit = prime_limit
self.eps = eps
self.lp_multiplier = lp_multiplier
self.multiplier = multiplier
self.console = Console()
print(f"B: {B}")
print(f"M: {M}")
print(f"prime_limit: {prime_limit}")
print(f"eps: {eps}")
print(f"lp_multiplier: {lp_multiplier}")
@staticmethod
def gcd(a, b):
a, b = abs(a), abs(b)
while a:
a, b = b % a, a
return b
@staticmethod
def legendre(n, p):
val = pow(n, (p - 1) // 2, p)
return val - p if val > 1 else val
@staticmethod
def jacobi(a, m):
a = a % m
t = 1
while a != 0:
while a % 2 == 0:
a //= 2
if m % 8 in [3, 5]:
t = -t
a, m = m, a
if a % 4 == 3 and m % 4 == 3:
t = -t
a %= m
return t if m == 1 else 0
@staticmethod
def modinv(n, p):
n = n % p
x, u = 0, 1
while n:
x, u = u, x - (p // n) * u
p, n = n, p % n
return x
def factorise_fast(self, value, factor_base):
factors = set()
if value < 0:
factors ^= {-1}
value = -value
for factor in factor_base[1:]:
while value % factor == 0:
factors ^= {factor}
value //= factor
return factors, value
@staticmethod
def tonelli_shanks(a, p):
a %= p
if p % 8 in [3, 7]:
x = pow(a, (p + 1) // 4, p)
return x, p - x
if p % 8 == 5:
x = pow(a, (p + 3) // 8, p)
if pow(x, 2, p) != a % p:
x = (x * pow(2, (p - 1) // 4, p)) % p
return x, p - x
d = 2
symb = 0
while symb != -1:
symb = QuadraticSieve.jacobi(d, p)
d += 1
d -= 1
n = p - 1
s = 0
while n % 2 == 0:
n //= 2
s += 1
t = n
A = pow(a, t, p)
D = pow(d, t, p)
m = 0
for i in range(s):
i1 = pow(2, s - 1 - i)
i2 = (A * pow(D, m, p)) % p
i3 = pow(i2, i1, p)
if i3 == p - 1:
m += pow(2, i)
x = (pow(a, (t + 1) // 2, p) * pow(D, m // 2, p)) % p
return x, p - x
@staticmethod
def prime_sieve(n):
sieve = [True] * (n + 1)
sieve[0], sieve[1] = False, False
for i in range(2, int(n**0.5) + 1):
if sieve[i]:
for j in range(i * 2, n + 1, i):
sieve[j] = False
return [i for i, is_prime in enumerate(sieve) if is_prime]
def find_b(self, N):
x = ceil(exp(0.5 * sqrt(log(N) * log(log(N)))))
return x
def choose_multiplier(self, N, B):
prime_list = self.prime_sieve(B)
if self.multiplier is not None:
self.logger.info("Using multiplier k = %d", self.multiplier)
return prime_list
NUM_TEST_PRIMES = 300
LN2 = math.log(2)
num_primes = min(len(prime_list), NUM_TEST_PRIMES)
log2n = math.log(N)
scores = [0.0 for _ in MULT_LIST]
num_multipliers = 0
for i, curr_mult in enumerate(MULT_LIST):
knmod8 = (curr_mult * (N % 8)) % 8
logmult = math.log(curr_mult)
scores[i] = 0.5 * logmult
if knmod8 == 1:
scores[i] -= 2 * LN2
elif knmod8 == 5:
scores[i] -= LN2
elif knmod8 in (3, 7):
scores[i] -= 0.5 * LN2
num_multipliers += 1
for i in range(1, num_primes):
prime = prime_list[i]
contrib = math.log(prime) / (prime - 1)
modp = N % prime
for j in range(num_multipliers):
curr_mult = MULT_LIST[j]
knmodp = (modp * curr_mult) % prime
if knmodp == 0 or self.legendre(knmodp, prime) == 1:
if knmodp == 0:
scores[j] -= contrib
else:
scores[j] -= 2 * contrib
best_score = float('inf')
best_mult = 1
for i in range(num_multipliers):
if scores[i] < best_score:
best_score = scores[i]
best_mult = MULT_LIST[i]
self.multiplier = best_mult
self.logger.info("Using multiplier k = %d", best_mult)
return prime_list
def get_smooth_b(self, N, B, prime_list):
factor_base = [-1, 2]
self.prime_log_map[2] = 1
for p in prime_list[1:]:
if self.legendre(N, p) == 1:
factor_base.append(p)
self.prime_log_map[p] = round(log2(p))
self.root_map[p] = self.tonelli_shanks(N, p)
return factor_base
def decide_bound(self, N, B=None):
if B is None:
B = self.find_b(N)
self.B = B
self.logger.info("Using B = %d", B)
return B
def build_factor_base(self, N, B, prime_list):
fb = self.get_smooth_b(N, B, prime_list)
self.logger.info("Factor base size: %d", len(fb))
return fb
def new_poly_a(self, factor_base, N, M, poly_a_list):
small_B = 1024
lower_polypool_index = 2
upper_polypool_index = small_B - 1
poly_low_found = False
for i in range(small_B):
if factor_base[i] > LOWER_BOUND_SIQS and not poly_low_found:
lower_polypool_index = i
poly_low_found = True
if factor_base[i] > UPPER_BOUND_SIQS:
upper_polypool_index = i - 1
break
# Compute target_a and bit threshold
target_a = int(math.sqrt(2 * N) / M)
target_mul = 0.9
target_bits = int(target_a.bit_length() * target_mul)
too_close = 10
close_range = 5
min_ratio = LOWER_BOUND_SIQS
while True:
poly_a = 1
afact = []
qli = []
while True:
found_a_factor = False
while(found_a_factor == False):
randindex = random.randint(lower_polypool_index, upper_polypool_index)
potential_a_factor = factor_base[randindex]
found_a_factor = True
if potential_a_factor in afact:
found_a_factor = False
poly_a = poly_a * potential_a_factor
afact.append(potential_a_factor)
qli.append(randindex)
j = target_a.bit_length() - poly_a.bit_length()
if j < too_close:
poly_a = 1
s = 0
afact = []
qli = []
continue
elif j < (too_close + close_range):
break
a1 = target_a // poly_a
if a1 < min_ratio:
continue
mindiff = 100000000000000000
randindex = 0
for i in range(small_B):
if abs(a1 - factor_base[i]) < mindiff:
mindiff = abs(a1 - factor_base[i])
randindex = i
found_a_factor = False
while not found_a_factor:
potential_a_factor = factor_base[randindex]
found_a_factor = True
if potential_a_factor in afact:
found_a_factor = False
if not found_a_factor:
randindex += 1
if randindex > small_B:
continue
poly_a = poly_a * factor_base[randindex]
afact.append(factor_base[randindex])
qli.append(randindex)
diff_bits = (target_a - poly_a).bit_length()
if diff_bits < target_bits:
if poly_a in poly_a_list:
if target_bits > 1000:
print("SOMETHING WENT WRONG")
sys.exit()
target_bits += 1
continue
else:
break
poly_a_list.append(poly_a)
return poly_a, sorted(qli), set(afact)
def generate_first_polynomial(self, factor_base, N, M, poly_a_list):
a, qli, factors_a = self.new_poly_a(factor_base, N, M, poly_a_list)
s = len(qli)
B = []
for l in range(s):
p = factor_base[qli[l]]
r1 = self.root_map[p][0]
aq = a // p
invaq = self.modinv(aq, p)
gamma = r1 * invaq % p
if gamma > p // 2:
gamma = p - gamma
B.append(aq * gamma)
b = sum(B) % a
c = (b * b - N) // a
soln_map = {}
Bainv = {}
for p in factor_base:
Bainv[p] = []
if a % p == 0 or p == 2:
continue
ainv = self.modinv(a, p)
# store bainv
for j in range(s):
Bainv[p].append((2 * B[j] * ainv) % p)
# store roots
r1, r2 = self.root_map[p]
r1 = ((r1 - b) * ainv) % p
r2 = ((r2 - b) * ainv) % p
soln_map[p] = [r1, r2]
return a, b, c, B, Bainv, soln_map, s, factors_a
def sieve(self, N, B, factor_base, M):
# ------------------------------------------------
# 1) TIMING
# ------------------------------------------------
start = time.time()
# ------------------------------------------------
# 2) FACTOR BASE & RELATED
# ------------------------------------------------
fb_len = len(factor_base)
fb_map = {val: i for i, val in enumerate(factor_base)}
target_relations = fb_len + self.T
large_prime_bound = B * self.lp_multiplier
# ------------------------------------------------
# 3) THRESHOLD & MISC
# ------------------------------------------------
threshold = int(math.log2(M * math.sqrt(N)) - self.eps)
lp_found = 0
ind = 1
matrix = [0] * fb_len
relations = []
roots = []
partials = {}
num_poly = 0
interval_size = 2 * M + 1
grays = get_gray_code(20)
poly_a_list = []
poly_ind = 0
sieve_values = [0] * interval_size
r1 = 0
r2 = 0
def process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a):
nonlocal ind
val = sieve_values[x]
sieve_values[x] = 0
lpf = 0
if val > threshold:
xval = x - M
relation = a * xval + b
poly_val = a * xval * xval + 2 * b * xval + c
local_factors, value = self.factorise_fast(poly_val, factor_base)
local_factors ^= factors_a
if value != 1:
if value < large_prime_bound:
if value in partials:
rel, lf, pv = partials[value]
relation *= rel
local_factors ^= lf
poly_val *= pv
lpf = 1
else:
partials[value] = (relation, local_factors, poly_val * a)
return 0
else:
return 0
for fac in local_factors:
idx = fb_map[fac]
matrix[idx] |= ind
ind = ind + ind
relations.append(relation)
roots.append(poly_val * a)
return lpf
with Live(console=self.console) as live:
while len(relations) < target_relations:
if num_poly % 10 == 0:
live.update(create_table(relations, target_relations, num_poly, start))
if poly_ind == 0:
a, b, c, B, Bainv, soln_map, s, factors_a = self.generate_first_polynomial(factor_base, N, M, poly_a_list)
end = 1 << (s - 1)
poly_ind += 1
else:
v, e = grays[poly_ind]
b = (b + 2 * e * B[v])
c = (b * b - N) // a
poly_ind += 1
if poly_ind == end:
poly_ind = 0
v, e = grays[poly_ind] # v, e for next iteration
for p in factor_base:
if p < self.prime_limit or a % p == 0:
continue
log_p = self.prime_log_map[p]
r1, r2 = soln_map[p]
soln_map[p][0] = (r1 - e * Bainv[p][v]) % p
soln_map[p][1] = (r2 - e * Bainv[p][v]) % p
amx = r1 + M
bmx = r2 + M
apx = amx - p
bpx = bmx - p
k = p
while k < M:
sieve_values[apx + k] += log_p
sieve_values[bpx + k] += log_p
sieve_values[amx - k] += log_p
sieve_values[bmx - k] += log_p
k += p
num_poly += 1
x = 0
while x < 2 * M - 6:
# for some reason need to do all this for max performance gain in PyPy3
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
print(f"\n{num_poly} polynomials sieved")
print(f"{lp_found} relations from partials")
print(f"{target_relations - lp_found} normal smooth relations")
print(f"{target_relations} total relations\n")
return matrix, relations, roots
def solve_bits(self, matrix, n):
self.logger.info("Solving linear system in GF(2).")
lsmap = {lsb: 1 << lsb for lsb in range(n)}
# GAUSSIAN ELIMINATION
m = len(matrix)
marks = []
cur = -1
# m -> number of primes in factor base
# n -> number of smooth relations
mark_mask = 0
for row in matrix:
if cur % 100 == 0:
print("", end=f"{cur, m}\r")
cur += 1
lsb = (row & -row).bit_length() - 1
if lsb == -1:
continue
marks.append(n - lsb - 1)
mark_mask |= 1 << lsb
for i in range(m):
if matrix[i] & lsmap[lsb] and i != cur:
matrix[i] ^= row
marks.sort()
# NULL SPACE EXTRACTION
nulls = []
free_cols = [col for col in range(n) if col not in marks]
k = 0
for col in free_cols:
shift = n - col - 1
val = 1 << shift
fin = val
for v in matrix:
if v & val:
fin |= v & mark_mask
nulls.append(fin)
k += 1
if k == self.T:
break
return nulls
def extract_factors(self, N, relations, roots, null_space):
n = len(relations)
for vector in null_space:
prod_left = 1
prod_right = 1
for idx in range(len(relations)):
bit = vector & 1
vector = vector >> 1
if bit == 1:
prod_left *= relations[idx]
prod_right *= roots[idx]
idx += 1
sqrt_right = isqrt(prod_right)
prod_left = prod_left % N
sqrt_right = sqrt_right % N
factor_candidate = self.gcd(N, prod_left - sqrt_right)
if factor_candidate not in (1, N):
other_factor = N // factor_candidate
self.logger.info("Found factors: %d, %d", factor_candidate, other_factor)
return factor_candidate, other_factor
return 0, 0
def factor(self, N, B=None):
overall_start = time.time()
self.logger.info("========== Quadratic Sieve V4 Start ==========")
self.logger.info("Factoring N = %d", N)
step_start = time.time()
B = self.decide_bound(N, self.B)
step_end = time.time()
self.logger.info("Step 1 (Decide Bound) took %.3f seconds", step_end - step_start)
step_start = time.time()
prime_list = self.choose_multiplier(N, self.B)
step_end = time.time()
self.logger.info("Step 2 (Choose Multiplier) took %.3f seconds", step_end - step_start)
kN = self.multiplier * N
if kN.bit_length() < 140:
LOWER_BOUND_SIQS = 3
step_start = time.time()
factor_base = self.build_factor_base(kN, B, prime_list)
step_end = time.time()
self.logger.info("Step 3 (Build Factor Base) took %.3f seconds", step_end - step_start)
step_start = time.time()
matrix, relations, roots = self.sieve(kN, B, factor_base, self.M)
step_end = time.time()
self.logger.info("Step 4 (Sieve Interval) took %.3f seconds", step_end - step_start)
n = len(relations)
step_start = time.time()
null_space = self.solve_bits(matrix, n)
step_end = time.time()
self.logger.info("Step 5 (Solve Dependencies) took %.3f seconds", step_end - step_start)
step_start = time.time()
f1, f2 = self.extract_factors(N, relations, roots, null_space)
step_end = time.time()
self.logger.info("Step 6 (Extract Factors) took %.3f seconds", step_end - step_start)
if f1 and f2:
self.logger.info("Quadratic Sieve successful: %d * %d = %d", f1, f2, N)
else:
self.logger.warning("No non-trivial factors found with the current settings.")
overall_end = time.time()
self.logger.info("Total time for Quadratic Sieve: %.10f seconds", overall_end - overall_start)
self.logger.info("========== Quadratic Sieve End ==========")
return f1, f2
if __name__ == '__main__':
## 60 digit number
#N = 373784758862055327503642974151754627650123768832847679663987
#qs = QuadraticSieve(B=111000, M=400000, T=10, prime_limit=45, eps=34, lp_multiplier=20000)
### 70 digit number
N = 3605578192695572467817617873284285677017674222302051846902171336604399
qs = QuadraticSieve(B=300000, M=350000, prime_limit=47, eps=40, T=10, lp_multiplier=256)
## 80 digit number
#N = 4591381393475831156766592648455462734389 * 1678540564209846881735567157366106310351
#qs = QuadraticSieve(B=700_000, M=600_000, prime_limit=52, eps=45, T=10, lp_multiplier=256)
factor1, factor2 = qs.factor(N)
Now, the main running time comes from the following section, which is basically one giant sieving process:
def sieve(self, N, B, factor_base, M):
start = time.time()
fb_len = len(factor_base)
fb_map = {val: i for i, val in enumerate(factor_base)}
target_relations = fb_len + self.T
large_prime_bound = B * self.lp_multiplier
threshold = int(math.log2(M * math.sqrt(N)) - self.eps)
lp_found = 0
ind = 1
matrix = [0] * fb_len
relations = []
roots = []
partials = {}
num_poly = 0
interval_size = 2 * M + 1
grays = get_gray_code(20)
poly_a_list = []
poly_ind = 0
sieve_values = [0] * interval_size
r1 = 0
r2 = 0
def process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a):
nonlocal ind
val = sieve_values[x]
sieve_values[x] = 0
lpf = 0
if val > threshold:
xval = x - M
relation = a * xval + b
poly_val = a * xval * xval + 2 * b * xval + c
local_factors, value = self.factorise_fast(poly_val, factor_base)
local_factors ^= factors_a
if value != 1:
if value < large_prime_bound:
if value in partials:
rel, lf, pv = partials[value]
relation *= rel
local_factors ^= lf
poly_val *= pv
lpf = 1
else:
partials[value] = (relation, local_factors, poly_val * a)
return 0
else:
return 0
for fac in local_factors:
idx = fb_map[fac]
matrix[idx] |= ind
ind = ind + ind
relations.append(relation)
roots.append(poly_val * a)
return lpf
with Live(console=self.console) as live:
while len(relations) < target_relations:
if num_poly % 10 == 0:
live.update(create_table(relations, target_relations, num_poly, start))
if poly_ind == 0:
a, b, c, B, Bainv, soln_map, s, factors_a = self.generate_first_polynomial(factor_base, N, M, poly_a_list)
end = 1 << (s - 1)
poly_ind += 1
else:
v, e = grays[poly_ind]
b = (b + 2 * e * B[v])
c = (b * b - N) // a
poly_ind += 1
if poly_ind == end:
poly_ind = 0
v, e = grays[poly_ind] # v, e for next iteration
for p in factor_base:
if p < self.prime_limit or a % p == 0:
continue
log_p = self.prime_log_map[p]
r1, r2 = soln_map[p]
soln_map[p][0] = (r1 - e * Bainv[p][v]) % p
soln_map[p][1] = (r2 - e * Bainv[p][v]) % p
amx = r1 + M
bmx = r2 + M
apx = amx - p
bpx = bmx - p
k = p
while k < M:
sieve_values[apx + k] += log_p
sieve_values[bpx + k] += log_p
sieve_values[amx - k] += log_p
sieve_values[bmx - k] += log_p
k += p
num_poly += 1
x = 0
while x < 2 * M - 6:
# for some reason need to do all this for max performance gain in PyPy3
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
lp_found += process_sieve_value(x, sieve_values, partials, relations, roots, a, b, c, factors_a)
x += 1
print(f"\n{num_poly} polynomials sieved")
print(f"{lp_found} relations from partials")
print(f"{target_relations - lp_found} normal smooth relations")
print(f"{target_relations} total relations\n")
return matrix, relations, roots
I have "optimized" the code as much as I can so far, and it runs pretty fast in PyPy, but I am wondering if there is anything I am missing that can tweak out more performance gain. I haven't really been able to get anything meaningful out of profiling the code because of the way that PyPy works, but through a bit of testing have made various improvements that have cut down the time a lot like some loop unrolling, precision reduction, and the way I work with the arrays.
Unfortunately, this sieving code is still not fast enough to be feasible in factoring the size of numbers which I'm targeting(100 to 115 digit semiprimes). By my initial estimates with semi-decent parameter selection and without multiprocessing, the sieving code itself would taken around 70 hours when compiled using PyPy3 on my device.
Is there anything I can do to better optimize this code for the PyPy JIT?
This answer provides a way to significantly speed up the code.
The overall algorithm is left unchanged, and so is the logic of the Python code. That being said, many non-trivial optimizations are performed, and most of the changes are nearly impossible to apply purely in PyPy (which is why I chose to move the hot code to C++), starting with the (efficient) use of multiple threads. In the end, this makes the code 7 times faster for 80-digit numbers (and 4 times faster for 70-digit ones).
Note that the answer focuses mostly on the "Sieve Interval" step, though a few optimization tips are provided at the end.
First of all, I profiled the code and found that two parts of it were slow:

- the while loop calling process_sieve_value many times;
- the while loop filling sieve_values.

The former tends to be the bottleneck on numbers with 60~70 digits, while the latter is clearly the bottleneck for numbers with 80~90 digits. Let us focus on the former first.
Make the code SIMD-friendly:
To optimize the process_sieve_value-based loop, we can search for the next position where sieve_values[i] > threshold and only then call process_sieve_value with the position found, rather than calling the function for every index. This enables further optimizations: searching for sieve_values[i] > threshold can be done in a SIMD-friendly way, while the initial solution clearly cannot. SIMD units of modern CPUs can operate on many items simultaneously (e.g. 8 x 32-bit integers, or even 16 x 16-bit integers) at a cost close to that of a scalar operation (for most operations, except very expensive ones like integer division). This means code compiled to use SIMD instructions can be much faster than code using scalar instructions. PyPy is generally unable to perform this optimization, mainly because vectorizing code is very expensive (not great for a JIT like PyPy) and Python code tends to be really hard to vectorize efficiently, when it is possible at all. The search is therefore done in C++.
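For illustration, here is the shape of the restructured driver loop on the Python side (a minimal sketch, not standalone: search_next is the native scan function provided in the C++ code later in this answer, and the surrounding names match the modified sieve function below):

x = 0
while True:
    # Natively scan (SIMD-friendly) for the next index whose accumulated
    # log sum exceeds the threshold; entries are zeroed along the way.
    x = kernel.search_next(np_sieve_values_ptr, x, xmax, threshold)
    if x >= xmax:
        break
    lp_found += process_sieve_value_new(x, partials, relations, roots, a, b, c, factors_a)
    x += 1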
Converting lists to Numpy arrays:
One issue with this optimization (like the others) is that sieve_values is a list. Lists are not very efficient (they hold references to dynamically-typed, garbage-collected, dynamically-allocated Python objects) and we cannot (easily) operate on them with modules like CFFI, Cython, etc. Thus, lists should be converted to native-friendly data structures, i.e. plain arrays, and Numpy is the perfect module for that. However, converting lists to Numpy arrays is painfully slow in PyPy, and getting/setting single items is insanely slow too (10 times slower than CPython, which is itself already >200 times slower than C++ for that). The key to making this fast is, unfortunately, to write a C++ function that loads/stores values in Numpy arrays. This is clearly sub-optimal (i.e. one function call per access) but surprisingly fast in practice compared to all the alternatives; it is actually fast enough to be competitive with list accesses!
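As a tiny usage sketch (get_pointer and the get_i32_1D/set_i32_1D accessors are defined later in this answer; the point is that each access becomes one cheap native call instead of a slow PyPy Numpy indexing operation):

import numpy as np

arr = np.zeros(16, dtype=np.int32)
ptr = get_pointer(arr)                   # ffi.cast('int32_t*', ffi.from_buffer(arr))
kernel.set_i32_1D(ptr, 3, 42)            # native single store
assert kernel.get_i32_1D(ptr, 3) == 42   # native single load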
Converting dictionaries to Numpy arrays:
Dictionaries are pretty expensive, even in C++. Moreover, like lists, Python dictionaries do not map well to native code. Statements like soln_map[p][0] = (r1 - e * Bainv[p][v]) % p are clearly a problem: not only are they pretty slow (not a bottleneck until the rest of the code has been optimized), but they also prevent the loop containing the statement from being turned into native code, at least not easily or efficiently. The saving grace is that these dictionaries are read in a p-based loop, so they are read in order. Thus, we can simply replace them with basic 2D Numpy arrays. There is a catch, though: the arrays need to be transposed to be read in a cache-friendly way. Indeed, v is a constant in the loop, so the contiguous axis has to be the one indexing p rather than v.
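Concretely, the layout used later in generate_first_polynomial puts the prime index on the contiguous axis (a sketch with example sizes; the names s and factor_base_size match the code below):

import numpy as np

s, factor_base_size = 10, 6000   # example sizes
# Bainv[p][v] becomes np_Bainv[v, i]: v is fixed inside the sieving loop,
# so walking i (the index of the prime p) is a contiguous memory scan.
np_Bainv = np.zeros((s, factor_base_size), dtype=np.int32)
# soln_map[p] = [r1, r2] becomes two rows, one per root.
np_soln_map = np.zeros((2, factor_base_size), dtype=np.int32)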
Converting big numbers: Unfortunately, numbers wider than 64 bits cannot be passed directly to native code (not supported yet). One way to hand a large integer to native code is simply to split it into 64-bit parts. This is a bit tedious, but it is fine for numbers below 192 bits since it only results in 3 parts (+1 for the sign).
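A minimal sketch of the split and its inverse, mirroring the poly_val_hi/poly_val_mi/poly_val_lo code below (assuming the magnitude fits in 192 bits):

MASK64 = (1 << 64) - 1

def split_u192(n):
    # Split a non-negative int < 2**192 into three 64-bit limbs (hi, mi, lo).
    return (n >> 128) & MASK64, (n >> 64) & MASK64, n & MASK64

def join_u192(hi, mi, lo):
    return (hi << 128) | (mi << 64) | lo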
Optimizing the factorization: PyPy turns out to be relatively efficient at computing modulos on large numbers, since it uses different algorithms depending on the size of the integers (small integers are internally a different type). That being said, we can write slightly faster code by checking whether the high bits of the target large number are zero, so as to reduce the number of (very expensive) native 64-bit modulo operations. Having large numbers stored in parts turns out to be good for both convenience and performance here: we can check the first 64-bit part trivially and natively. Besides, PyPy needs to follow the semantics of Python (the result of a modulo is always non-negative) while C++ code does not (we generally know the operands are positive, so there is no need to guarantee that with additional instructions).
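In Python terms, the trick looks like this (a sketch of what the C++ is_factor function below does; each 64-bit limb is folded in two 32-bit halves so that every intermediate fits in a native 64-bit integer):

def limbs_mod(hi, mi, lo, f):
    # (hi * 2**128 + mi * 2**64 + lo) mod f, for a small positive f.
    if hi == 0 and mi == 0:
        return lo % f            # fast path: the number fits in one limb
    v = hi % f
    for limb in (mi, lo):
        v = ((v << 32) | (limb >> 32)) % f
        v = ((v << 32) | (limb & 0xFFFFFFFF)) % f
    return v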
Optimizing the update of sieve_values:
This is clearly the hardest part of the algorithm to optimize. The operation is memory-bound and also hard to parallelize because of the memory access pattern (huge strides for relatively small p, and pseudo-random accesses for large p). Each memory access tends to result in a full cache-line fetch (slow). For large N (requiring a rather large M for the code to be fast), the whole array tends to be so big that it does not fit in the L1/L2 cache of most mainstream CPUs.
The key to making this operation cache-friendly is to store sieve_values in a very compact way. I chose to store items as 8-bit integers. Since 8 bits are not enough to hold every value in the array, the idea is to accumulate the log_p values in the 8-bit items as much as possible and fall back on a 16-bit array when an item of the 8-bit array overflows (the item is then reset so as to mitigate the number of accesses to the 16-bit array). Accesses to the 16-bit array are unlikely since log_p values are generally tiny (they are all less than 20 in practice).
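The overflow handling amounts to this (a Python sketch of the safe_inc helper in the C++ compute_loop below; u8 is the compact per-thread accumulator, u16 the fallback array):

def safe_inc(u8, u16, idx, inc):
    if u8[idx] + inc < 256:      # common case: stays in the 8-bit array
        u8[idx] += inc
    else:                        # rare: spill the cell into the 16-bit array
        u16[idx] += u8[idx]
        u8[idx] = inc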
This part can be parallelized, though it clearly does not scale. We can allocate multiple temporary 8-bit arrays for accumulating the log_p values and finally sum the 8-bit arrays into sieve_values. This way, the quite-expensive divisions are fully executed in parallel. Unfortunately, the 8-bit array accesses do not scale well: only 3-4 threads are enough to saturate the memory hierarchy on my machine (i5-9600KF CPU). Even worse, more threads result in random-like memory accesses over a wider memory area, and the L3 cache may then not be large enough to contain all the arrays. If that happens, the DRAM is used instead, and everything gets significantly slower due to DRAM's much higher latency and significantly lower throughput.
Unfortunately, I did not find any additional (significant) optimization for this expensive part, and it is still a bottleneck on large 80~90 digit numbers in the optimized code... Note that the optimized code might scale better on recent CPUs with significantly bigger caches than mine (especially the L3 cache on AMD X3D-like CPUs, and the L2 cache of all recent x86-64 CPUs). Besides, note that tuning the B and M parameters of the algorithm can help to reduce the bottleneck a bit.
This section provides the optimized C++ code, parts of the modified Python code (the full version is too big to fit in this answer anyway), and additional resources (e.g. how to build the libraries). I used CFFI to call the C++ code from PyPy (it is pretty fast in PyPy and relatively simple to use).
Here is the section of the optimized Python code:
# Include-like section
import numpy as np
from kernel_cffi import lib as kernel
import cffi
ffi = cffi.FFI()
# Convenient function to extract the pointer of Numpy arrays
def get_pointer(numpy_array):
return ffi.cast(f'{numpy_array.dtype}_t*', ffi.from_buffer(numpy_array))
# Modified function to build and return Numpy arrays instead of dicts/lists
def generate_first_polynomial(self, factor_base, N, M, poly_a_list):
a, qli, factors_a = self.new_poly_a(factor_base, N, M, poly_a_list)
s = len(qli)
B = []
for l in range(s):
p = factor_base[qli[l]]
r1 = self.root_map[p][0]
aq = a // p
invaq = self.modinv(aq, p)
gamma = r1 * invaq % p
if gamma > p // 2:
gamma = p - gamma
B.append(aq * gamma)
b = sum(B) % a
c = (b * b - N) // a
factor_base_size = len(factor_base)
np_Bainv = np.zeros((s,factor_base_size), dtype=np.int32)
np_Bainv_ptr = get_pointer(np_Bainv)
np_soln_map = np.zeros((2,factor_base_size), dtype=np.int32)
np_soln_map_ptr = get_pointer(np_soln_map)
for i,p in enumerate(factor_base):
if a % p == 0 or p == 2:
continue
ainv = self.modinv(a, p)
# store bainv
for j in range(s):
kernel.set_i32_2D(np_Bainv_ptr, factor_base_size, j, i, (2 * B[j] * ainv) % p)
# store roots
r1, r2 = self.root_map[p]
r1 = ((r1 - b) * ainv) % p
r2 = ((r2 - b) * ainv) % p
kernel.set_i32_2D(np_soln_map_ptr, factor_base_size, 0, i, r1)
kernel.set_i32_2D(np_soln_map_ptr, factor_base_size, 1, i, r2)
return a, b, c, B, np_Bainv, np_soln_map, s, factors_a
# Main modified function (without the same old initialization code)
def sieve(self, N, B, factor_base, M):
# [...] same as before (initialization in 3 steps)
# ------------------------------------------------
# 4) CONVERSIONS
# ------------------------------------------------
np_factor_base = np.array(factor_base, dtype=np.int32)
np_factor_base_ptr = get_pointer(np_factor_base)
np_factor_base_size = np_factor_base.size
np_prime_log_map = np.zeros(np_factor_base_size, dtype=np.int32)
np_prime_log_map_ptr = get_pointer(np_prime_log_map)
for i,p in enumerate(factor_base[1:]):
kernel.set_i32_1D(np_prime_log_map_ptr, i+1, self.prime_log_map[p])
np_sieve_values = np.zeros(interval_size, dtype=np.int16)
np_sieve_values_ptr = get_pointer(np_sieve_values)
np_sieve_values_size = np_sieve_values.size
def process_sieve_value_new(x, partials, relations, roots, a, b, c, factors_a):
nonlocal ind
lpf = 0
xval = x - M
relation = a * xval + b
poly_val = a * xval * xval + 2 * b * xval + c
assert poly_val < (1 << 190)
poly_val_sign = 1 if poly_val >= 0 else -1
poly_val_hi = (abs(poly_val) >> 128) & 0xFFFFFFFF_FFFFFFFF
poly_val_mi = (abs(poly_val) >> 64) & 0xFFFFFFFF_FFFFFFFF
poly_val_lo = abs(poly_val) & 0xFFFFFFFF_FFFFFFFF
factor_buff = np.zeros(192, dtype=np.int32)
factor_buff_ptr = get_pointer(factor_buff)
value = kernel.factorise(factor_buff_ptr, poly_val_sign, poly_val_hi, poly_val_mi, poly_val_lo, np_factor_base_ptr, np_factor_base_size)
        # Keep -1 (the sign) as well as the prime factors; only 0 is padding.
        local_factors = set(factor_buff[factor_buff != 0].tolist())
local_factors ^= factors_a
if value != 1:
if value < large_prime_bound:
if value in partials:
rel, lf, pv = partials[value]
relation *= rel
local_factors ^= lf
poly_val *= pv
lpf = 1
else:
partials[value] = (relation, local_factors, poly_val * a)
return 0
else:
return 0
for fac in local_factors:
idx = fb_map[fac]
matrix[idx] |= ind
ind = ind + ind
relations.append(relation)
roots.append(poly_val * a)
return lpf
with Live(console=self.console) as live:
while len(relations) < target_relations:
if num_poly % 10 == 0:
live.update(create_table(relations, target_relations, num_poly, start))
if poly_ind == 0:
a, b, c, B, np_Bainv, np_soln_map, s, factors_a = self.generate_first_polynomial(factor_base, N, M, poly_a_list)
np_soln_map_ptr = get_pointer(np_soln_map)
np_Bainv_ptr = get_pointer(np_Bainv)
end = 1 << (s - 1)
poly_ind += 1
else:
v, e = grays[poly_ind]
b = (b + 2 * e * B[v])
c = (b * b - N) // a
poly_ind += 1
if poly_ind == end:
poly_ind = 0
v, e = grays[poly_ind] # v, e for next iteration
assert 0 <= a < (1 << 190)
a_abs = abs(a)
a_sign = 1 if a >= 0 else -1
a_hi = (a_abs >> 128) & 0xFFFFFFFF_FFFFFFFF
a_mi = (a_abs >> 64) & 0xFFFFFFFF_FFFFFFFF
a_lo = a_abs & 0xFFFFFFFF_FFFFFFFF
kernel.compute_big_loop(np_factor_base_ptr, np_factor_base_size,
np_sieve_values_ptr, np_sieve_values_size,
np_Bainv_ptr,
np_soln_map_ptr,
np_prime_log_map_ptr,
a_sign, a_hi, a_mi, a_lo,
self.prime_limit, M, v, e)
num_poly += 1
# Sanity checks
assert 0 <= threshold < 16384 # Check overflows
assert np_sieve_values_size == 2 * M - 6 + 7
x = 0
xmax = np_sieve_values_size #2 * M - 6 + 7 # Correct?
while True:
x = kernel.search_next(np_sieve_values_ptr, x, xmax, threshold)
if x >= xmax:
break
lp_found += process_sieve_value_new(x, partials, relations, roots, a, b, c, factors_a)
x += 1
print(f"\n{num_poly} polynomials sieved")
print(f"{lp_found} relations from partials")
print(f"{target_relations - lp_found} normal smooth relations")
print(f"{target_relations} total relations\n")
return matrix, relations, roots
Here is the C++ code:
#include <cstdio>
#include <cstdlib>
#include <cstdint>
#include <cassert>
#include <cstring> // memset (used in search_next)
#include <algorithm>
#include <unordered_set>
#include <vector>
#include <omp.h>
// Faster than Clang types (mainly for 256-bit integers)
#include <boost/multiprecision/cpp_int.hpp>
using namespace boost::multiprecision;
// Unsafe 32-bit contiguous array single load
extern "C" int32_t get_i32_1D(int32_t* sieve_values, int32_t pos)
{
return sieve_values[pos];
}
// Unsafe 32-bit contiguous array single store
extern "C" void set_i32_1D(int32_t* sieve_values, int32_t pos, int32_t val)
{
sieve_values[pos] = val;
}
// Unsafe 32-bit contiguous array single load
extern "C" int32_t get_i32_2D(int32_t* sieve_values, int32_t ld, int32_t n, int32_t m)
{
return sieve_values[n*ld+m];
}
// Unsafe 32-bit contiguous array single store
extern "C" void set_i32_2D(int32_t* sieve_values, int32_t ld, int32_t n, int32_t m, int32_t val)
{
sieve_values[n*ld+m] = val;
}
static bool is_factor(uint64_t poly_val_hi, uint64_t poly_val_mi,
uint64_t poly_val_lo, int32_t factor)
{
//assert(factor > 1);
uint64_t value = 0;
// Speed up the computation for small numbers
if(poly_val_hi == 0)
{
// Speed up the computation for tiny numbers
if(poly_val_mi == 0)
return (poly_val_lo % factor) == 0;
value = poly_val_mi % factor;
}
else
{
value = poly_val_hi % factor;
value = (value << 32) | (poly_val_mi >> 32);
value %= factor;
value = (value << 32) | (poly_val_mi & 0xFFFFFFFF);
value %= factor;
}
value = (value << 32) | (poly_val_lo >> 32);
value %= factor;
value = (value << 32) | (poly_val_lo & 0xFFFFFFFF);
value %= factor;
return value == 0;
}
static void compute_loop(uint8_t* worker_sieve_values_ptr,
int16_t* sieve_values,
int32_t p, int32_t r1, int32_t r2,
int32_t M, int32_t log_p)
{
const int32_t amx = r1 + M;
const int32_t bmx = r2 + M;
const int32_t apx = amx - p;
const int32_t bpx = bmx - p;
auto safe_inc = [=] (int32_t idx, int32_t inc){
uint8_t& value = worker_sieve_values_ptr[idx];
if (int32_t(value) + inc < 256) [[likely]]
{
value += inc;
}
        else
        {
            // Spill the saturated 8-bit cell into the 16-bit array and keep
            // the current increment (rather than dropping it).
            #pragma omp atomic
            sieve_values[idx] += value;
            value = inc;
        }
};
for(int32_t k = p; k < M; k += p)
{
safe_inc(apx + k, log_p);
safe_inc(bpx + k, log_p);
safe_inc(amx - k, log_p);
safe_inc(bmx - k, log_p);
}
}
int32_t py_mod(int32_t n, int32_t d)
{
// assert(d > 0);
const int32_t r = n % d;
return r + (r < 0 ? d : 0);
}
struct SavedValues { int32_t idx, r1, r2; };
extern "C" void compute_big_loop(int32_t* factor_base, int32_t factor_base_size,
int16_t* sieve_values, int32_t sieve_values_size,
int32_t* Bainv,
int32_t* soln_map,
int32_t* prime_log_map,
int32_t a_sign, uint64_t a_hi, uint64_t a_mi, uint64_t a_lo,
int32_t prime_limit, int32_t M, int32_t v, int32_t e)
{
const int num_threads = 4;
std::vector<uint8_t*> workers_sieve_values_ptrs;
workers_sieve_values_ptrs.reserve(num_threads);
// Spawn a thread-pool with only few threads since the computation
// does not scale well (seems rather memory-bound).
// Using too many threads cause data not to fit in the L3 cache anymore and
// DRAM accesses are significantly slower, not to mention the DRAM can
// quickly be saturated, so the additional threads are not really useful.
// Note that this is strongly dependent of the B parameter of the algorithm
// and the size of the cache on the target platform.
#pragma omp parallel num_threads(num_threads)
{
// Make the items as compact as possible to fit in the CPU cache (critical for performance)
std::vector<uint8_t> worker_sieve_values(sieve_values_size);
uint8_t* worker_sieve_values_ptr = worker_sieve_values.data();
// Record the thread-local arrays of each worker so to sum them up later
#pragma omp critical
workers_sieve_values_ptrs.push_back(worker_sieve_values_ptr);
// Execute many `compute_loop` calls in parallel on thread-local arrays
#pragma omp for schedule(static,1)
for (size_t i = 0; i < factor_base_size; ++i)
{
const int32_t p = factor_base[i];
assert(p > 0 || p == -1);
if(p < prime_limit or p == -1 or is_factor(a_hi, a_mi, a_lo, p))
continue;
assert(p > 0);
const int32_t r1 = get_i32_2D(soln_map, factor_base_size, 0, i);
const int32_t r2 = get_i32_2D(soln_map, factor_base_size, 1, i);
const int32_t Bainv_val = get_i32_2D(Bainv, factor_base_size, v, i);
assert(prime_log_map[i] < 32);
const uint8_t log_p = prime_log_map[i];
set_i32_2D(soln_map, factor_base_size, 0, i, py_mod(r1 - e * Bainv_val, p));
set_i32_2D(soln_map, factor_base_size, 1, i, py_mod(r2 - e * Bainv_val, p));
// This part is the bottleneck on big numbers because of random
// accesses on large arrays (typically stored in DRAM)
compute_loop(worker_sieve_values_ptr, sieve_values, p, r1, r2, M, log_p);
}
// Sum up all the thread-local arrays in parallel
#pragma omp for schedule(static)
for (int32_t ib = 0; ib < sieve_values_size; ib+=128)
{
int16_t tmp[128] = {0};
for (int32_t w = 0; w < workers_sieve_values_ptrs.size(); ++w)
for (int32_t i = 0; i < std::min(128, sieve_values_size-ib); ++i)
tmp[i] += workers_sieve_values_ptrs[w][ib+i];
for (int32_t i = 0; i < std::min(128, sieve_values_size-ib); ++i)
                sieve_values[ib+i] += tmp[i]; // += keeps the values spilled by safe_inc
}
}
}
extern "C" uint64_t factorise(int32_t* final_factors,
int32_t poly_val_sign, uint64_t poly_val_hi, uint64_t poly_val_mi, uint64_t poly_val_lo,
int32_t* factor_base, int32_t factor_base_size)
{
std::unordered_set<int32_t> factors;
if(poly_val_sign < 0)
factors.insert(-1);
uint256_t value = (uint256_t(poly_val_hi) << 128u) | (uint256_t(poly_val_mi) << 64u) | uint256_t(poly_val_lo);
    for (int32_t i = 1; i < factor_base_size; ++i)
{
const int32_t factor = factor_base[i];
while(is_factor(poly_val_hi, poly_val_mi, poly_val_lo, factor))
{
// Rarely executed
if(factors.erase(factor) == 0)
factors.insert(factor);
value /= factor;
poly_val_hi = uint64_t(value >> 128u);
poly_val_mi = uint64_t(value >> 64u);
poly_val_lo = uint64_t(value);
}
}
assert(factors.size() <= 192);
std::copy(factors.begin(), factors.end(), final_factors);
std::sort(final_factors, final_factors + factors.size());
assert(value < (uint256_t(1) << 63u));
return uint64_t(value);
}
extern "C" int32_t search_next(int16_t* sieve_values, int32_t x, int32_t xmax, int16_t threshold)
{
while(x < xmax)
{
// Fast path (SIMD-friendly)
while (x + 32 < xmax)
{
bool found = false;
for (int i = 0; i < 32; ++i)
found |= sieve_values[x+i] > threshold;
if(found)
break;
memset(sieve_values+x, 0, 32*sizeof(int16_t));
x += 32;
}
const int16_t val = sieve_values[x];
sieve_values[x] = 0;
if(val > threshold) [[unlikely]]
return x;
x++;
}
return xmax;
}
Here is the CFFI wrapper (just the prototypes of the C++ extern "C" functions):
from cffi import FFI
ffibuilder = FFI()
ffibuilder.cdef("""
int32_t search_next(int16_t* sieve_values, int32_t x, int32_t xmax, int16_t threshold);
uint64_t factorise(int32_t* factors,
int32_t poly_val_sign, uint64_t poly_val_hi, uint64_t poly_val_mi, uint64_t poly_val_lo,
int32_t* factor_base_ptr, int32_t factor_base_size);
int32_t get_i32_1D(int32_t* sieve_values, int32_t pos);
void set_i32_1D(int32_t* sieve_values, int32_t pos, int32_t val);
int32_t get_i32_2D(int32_t* sieve_values, int32_t ld, int32_t n, int32_t m);
void set_i32_2D(int32_t* sieve_values, int32_t ld, int32_t n, int32_t m, int32_t val);
void compute_big_loop(int32_t* factor_base, int32_t factor_base_size,
int16_t* sieve_values, int32_t sieve_values_size,
int32_t* Bainv,
int32_t* soln_map,
int32_t* prime_log_map,
int32_t a_sign, uint64_t a_hi, uint64_t a_mi, uint64_t a_lo,
int32_t prime_limit, int32_t M, int32_t v, int32_t e);
""")
# There is probably a cleaner way than copy-pasting the cdef code and just prefixing the lines with "extern"
ffibuilder.set_source(
'kernel_cffi',
'''
extern int32_t search_next(int16_t* sieve_values, int32_t x, int32_t xmax, int16_t threshold);
extern uint64_t factorise(int32_t* final_factors,
int32_t poly_val_sign, uint64_t poly_val_hi, uint64_t poly_val_mi, uint64_t poly_val_lo,
int32_t* factor_base_ptr, int32_t factor_base_size);
extern int32_t get_i32_1D(int32_t* sieve_values, int32_t pos);
extern void set_i32_1D(int32_t* sieve_values, int32_t pos, int32_t val);
extern int32_t get_i32_2D(int32_t* sieve_values, int32_t ld, int32_t n, int32_t m);
extern void set_i32_2D(int32_t* sieve_values, int32_t ld, int32_t n, int32_t m, int32_t val);
extern void compute_big_loop(int32_t* factor_base, int32_t factor_base_size,
int16_t* sieve_values, int32_t sieve_values_size,
int32_t* Bainv,
int32_t* soln_map,
int32_t* prime_log_map,
int32_t a_sign, uint64_t a_hi, uint64_t a_mi, uint64_t a_lo,
int32_t prime_limit, int32_t M, int32_t v, int32_t e);
''',
libraries=['kernel'],
include_dirs=['build'],
library_dirs=['.', 'build']
)
ffibuilder.compile(tmpdir='build', verbose=False)
Here is the command-line to compile the libraries:
mkdir -p build
clang++ -std=c++17 -O3 -mavx2 -fPIC -c kernel.cpp -o build/kernel.o -fopenmp
clang++ -shared -fPIC build/kernel.o -o build/libkernel.so -fopenmp
./pypy/bin/pypy build_cffi_module.py
And the one to run the code:
PYTHONPATH=build LD_LIBRARY_PATH=build ./pypy/bin/pypy main.py
The factorization can be parallelized so that it only takes a fraction of the overall execution time. However, this requires transforming the Python while loop calling search_next and process_sieve_value_new mostly into C++. It means more variables must be converted to Numpy arrays, at least xval, relation and poly_val, which can be large integers (tedious to pass to C++ code). All the calls to factorise can then be done in parallel in C++. The rest of the process_sieve_value_new function does not have to be translated to C++: you can store all the resulting values and then run the remaining code of process_sieve_value_new on them. This reduces the amount of work the transformation needs (and most of the code stays in Python). This optimization should significantly reduce the time to factorize small numbers, but not so much for big ones (especially beyond 90-digit numbers).
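A sketch of what the restructured driver could look like (factorise_batch is hypothetical, i.e. not implemented in this answer; everything else matches the code above):

candidates = []
x = 0
while True:
    x = kernel.search_next(np_sieve_values_ptr, x, xmax, threshold)
    if x >= xmax:
        break
    candidates.append(x)
    x += 1
# results = kernel.factorise_batch(candidates, ...)  # hypothetical parallel C++ entry point
# ...then run the (unchanged) Python bookkeeping of process_sieve_value_new
# on each returned result.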
The "Solving linear system in GF(2)" step can be massively optimized. However this first require to convert the huge numbers stored in matrix
to native-integer-based arrays. You can then convert the code into C++ like I did for the other part of the code. Then 3 main optimizations can be applied:
Besides, AFAIK there are more efficient algorithm for this part (linear algebra in GF(2)
). On this part, Wikipedia mentions the Block Wiedemann algorithm. On top of that, using GPUs for this specific part might also help (especially since the matrix seems a dense one).
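As a first step, here is one way the big-int rows could be turned into native-integer arrays (a sketch; a native GF(2) solver can then XOR whole rows of 64-bit words, which is SIMD-friendly):

import numpy as np

def rows_to_limbs(matrix, n_cols):
    # Each row of `matrix` is a Python big int whose bits mark relations.
    n_words = (n_cols + 63) // 64
    out = np.zeros((len(matrix), n_words), dtype=np.uint64)
    for r, row in enumerate(matrix):
        for w in range(n_words):
            out[r, w] = (row >> (64 * w)) & 0xFFFFFFFFFFFFFFFF
    return out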