My task is to:
For example, I need to generate a list of combinations where each combination has a length of 3, has unique sorted values inside, each value is from the range(256) and each value appears in all of the combinations generated so far as close to 100 times as possible.
My problem is, how efficiently detect there are no more unique combinations of the values that can be created to stop the loop.
The problem appears when the script is going to an end and there still are available values left and len(available_values) > com_len but it isn't possible to create any new unique combinations that haven't appeared yet.
The code created so far:
import numpy as np
import random
com_len = 3
length = 256
counter = np.zeros(length)
values = range(length)
exclude_values = []
counter_expected = 100
done = []
mask = np.ones(len(np.array(values)), np.bool)
mask[exclude_values] = False
available_values = set(values) - set(exclude_values)
available_values = list(available_values)
ii = 0
while True:
"""print progress"""
ii = ii + 1
if not ii % 1000: print('.', end='')
#POSSIBLE CONDITION HERE
ex = random.sample(available_values, k=com_len)
ex.sort()
if ex in done: continue
done.append(ex)
counter[ex] = counter[ex] + 1
for l_ in ex:
if counter[l_] == counter_expected:
del available_values[available_values.index(l_)]
if len(available_values) < com_len:break
if all(counter[mask] == counter_expected): break
#OR HERE
NOTE: The script usually ends successfully because either len(available_values) < com_len or all(counter[mask] == counter_expected) condition breaks the While loop. Try running the script several times and at some point, you'll observe the script going into an infinite loop as len(available_values) >= com_len but there are no more new unique conditions available to be created so the counter is not increasing.
I need an efficient condition to stop the script. Using itertools.combinations here is not an option because available_values list may be long, i.e. 10k elements, at the beginning.
The brute-force would be using itertools.combinations when len(available_values) reaches a certain level and checking if there are any combinations that haven't been created yet but it's an ugly solution.
There might be a better way which doesn't come up to me but may to you. I'll be grateful for your help.
UPDATED FINAL ANSWER:
I've come up with the following code that matches my needs. NOTES:
The code:
import numpy as np
import random
import itertools
from decimal import Decimal
def get_random_combinations(values, exclude, length, mode, limit, min_ = 0, max_ = 0):
done = set()
try:
"""Creating counter"""
counter = np.zeros(len(values), np.uint)
"""Create a mask for excluded values"""
"""https://stackoverflow.com/questions/25330959/how-to-select-inverse-of-indexes-of-a-numpy-array"""
mask = np.ones(len(np.array(values)), np.bool)
mask[exclude] = False
"""available values to create combinations"""
values_a = set(values) - set(exclude)
values_a = list(values_a)
if length == 1:
if mode == 'total':
"""generate just data_number of examples"""
for ii in range(limit):
comb = random.sample(values_a, 1)[0]
del values_a[values_a.index(comb)]
done.add(tuple([comb]))
else:
"""generate one example for each comb"""
for comb in values_a: done.add(tuple([comb]))
else:
"""total number of combinations"""
if isinstance(length, str): rr = np.mean([min_, max_])
else: rr = length
nn = len(values_a)
comb_max = int(Decimal(np.math.factorial(nn)) / Decimal(np.math.factorial(rr) * np.math.factorial(nn-rr)))
err_limit = int(comb_max * 0.01)
if err_limit > 10000: err_limit = 10000
"""initiate variables"""
#should itertools be used to generate the rest of combinations
gen_comb = False
#has all combinations generated by itertools ended
comb_left_0 = False
#has the limit of errors been reached to generate itertools combinations
err_limit_reached = False
#previous combination
ll_prev = 0
dd = 0 #done counter
comb_left = set() #itertools combinations
err = 0 #errors counter
"""options variables for statistics"""
err_t = 0 #total number of errors
gen = 0 #total number of generations of itertools.combinations
ii = 0 #total number of iterations
print('GENERATING LIST OF COMBINATIONS')
while True:
"""print progress"""
ii = ii + 1
if not dd % 1000: print('.', end='')
"""check if length of combs is random or not"""
if isinstance(length, str):
"""change max_ length of combinations to
\the length of available values"""
if len(values_a) < max_: max_ = len(values_a)
ll = random.randint(min_, max_)
else: ll = length
if ll != ll_prev: gen_comb = True
"""generate combinations only when err limit is reached or
the length of combinations has changed"""
if err_limit_reached and gen_comb:
gen = gen + 1
"""after reaching the max number of consecutive errors, start generating combinations via itertools"""
"""generation is done at this point to prevent generation for a very long list"""
"""generation is also done when when length of a combination changes"""
comb_left = set(itertools.combinations(values_a, ll)) - done
"""break if there are no elements left"""
if not len(comb_left): break
"""if combinations has already been generated, used them"""
if comb_left:
"""take random sample from the set"""
comb = random.sample(comb_left, 1)[0]
"""remove it from the set"""
comb_left.remove(comb)
"""check if it was the last combination to break the loop at the end"""
if not len(comb_left): comb_left_0 = True
else:
"""generate random combination"""
comb = tuple(sorted(random.sample(values_a, ll)))
"""set previous length"""
ll_prev = ll
"""reset gen_comb"""
gen_comb = False
"""check if combination is new"""
if comb not in done: found = True
else:
"""otherwise, iterate errors"""
err = err + 1
err_t = err_t + 1
found = False
if err > err_limit: err_limit_reached = gen_comb = True
if found:
"""reset err"""
err = 0
dd = dd + 1
"""add combination to done"""
done.add(comb)
"""increase counter for the combs"""
counter[list(comb)] = counter[list(comb)] + 1
"""check if seekeing the max number of combinations or min"""
if mode == 'max':
"""for max, we must remove the elements which reached the limit"""
for l_ in list(comb):
if counter[l_] == limit:
del values_a[values_a.index(l_)]
"""if length of available elements is smaller than the possible length of the combinations"""
if isinstance(length, str):
"""for random length, choose the minimal length"""
if len(values_a) < min_: break
else:
if len(values_a) < ll: break
"""if all elements reached the limit"""
if mode == 'total':
if len(done) >= limit: break
else: #min, max
if all(counter[mask] >= limit): break
"""if the number of consecutive errors reached
the total number of combinations, break as you may never
draw a valid combination"""
if err > comb_max: break
"""if it was the last combination left"""
if comb_left_0: break
except Exception as e: print(e)
print('')
print('Combinations generated: ' + str(dd))
print('Total number of iterations: ' + str(ii))
print('Final value of err: ' + str(err))
print('Total number of errors: ' + str(err_t))
print('How many times itertools.combinations was used: ' + str(gen))
return done
"""range of values to create combinations"""
values = range(256)
"""values excluded from the combinations"""
exclude = [0,255]
"""length of combinations, if is string, the number of layers
is generated randomly withing (min_, max_) range """
length = 'r'
"""mode of how the combinations are generated:
min: minimal number of times the value appears across all combinations
(limited down by the limit value)
max: max number of times the value appears across all combinations (limited
max by the limit value)
total: total number of combinations (limited the limit value)"""
mode = 'max'
"""limit used for the mode combinations' generation"""
limit = 1000
"""min_ and max_ are used when length is string,
length is generated randomly within (min_, max_) range"""
min_ = 4
max_ = 12
done = get_random_combinations(values, exclude, length, mode, limit, min_, max_)