I have the following pandas dataframe which represents the consumption of 7 days (day_0
is today, day_-1
is yesterday etc) of 10 people (ids
):
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randint(8, 15, size=(10, 7)))
df.columns = ['day_0', 'day_-1', 'day_-2', 'day_-3', 'day_-4', 'day_-5', 'day_-6']
df.index.name = 'id'
print(df.reset_index())
id day_0 day_-1 day_-2 day_-3 day_-4 day_-5 day_-6
0 0 10 10 14 8 14 14 14
1 1 10 13 11 11 8 10 10
2 2 10 12 9 12 9 10 10
3 3 12 12 9 11 9 12 13
4 4 12 13 8 12 8 11 9
5 5 13 9 8 13 9 12 10
6 6 8 9 8 14 8 13 14
7 7 13 10 14 12 8 9 11
8 8 8 8 10 12 11 14 14
9 9 14 13 13 9 11 14 13
I would like to find daily weights (so in total 7 weights: w_0, w_-1, w_-2, w_-3, w_-4, w_-5, w_-6
) which need to have the following properties:
w_0
> w_-1
> w_-2
> ... > w_-6
> 0w_0
+ w_-1
+ w_-2
+ ... + w_-6
= 7ids
to be below a threshold (e.g. 11)I can achieve prerequisites 1 & 2 by using the exponential decay function and later normalizing:
import numpy as np
n = 7
_lambda = 0.5
# Calculate the weights using exponential decay
weights = np.exp(-_lambda * np.arange(n))
# Normalize the weights so that their sum is equal to the length of the time series
weights *= n / np.sum(weights)
But I don't know how I could apply also prerequisite 3.
Is that possible? How can I do that in python?
This does not use exponential decay, because that doesn't seem particularly useful to meet your requirements. Define an ILP with a disjunction constraint that at exactly one combination of n out of m IDs has a weighted mean less than or equal to a threshold:
import io
import numpy as np
import pandas as pd
import scipy.sparse
from scipy.optimize import milp, Bounds, LinearConstraint
with io.StringIO(
'''id, 0, -1, -2, -3, -4, -5, -6
0, 10, 10, 14, 8, 14, 14, 14
1, 10, 13, 11, 11, 8, 10, 10
2, 10, 12, 9, 12, 9, 10, 10
3, 12, 12, 9, 11, 9, 12, 13
4, 12, 13, 8, 12, 8, 11, 9
5, 13, 9, 8, 13, 9, 12, 10
6, 8, 9, 8, 14, 8, 13, 14
7, 13, 10, 14, 12, 8, 9, 11
8, 8, 8, 10, 12, 11, 14, 14
9, 14, 13, 13, 9, 11, 14, 13
''') as f:
df = pd.read_csv(f, skipinitialspace=True, index_col=0)
df.columns = pd.Index(name='day', data=df.columns.astype(int))
m, n = df.shape # number of IDs, days
'''
LP variables:
n weights
m weighted mean threshold binary predicates
'''
# The weight sum must be equal to n
sum_constraint = LinearConstraint(
A=np.concatenate((
np.ones(n), np.zeros(m),
)),
lb=n, ub=n,
)
# The weights must be strictly decreasing by this amount
min_decrease = 1e-2 # chosen fully arbitrarily
antimonotonic_constraint = LinearConstraint(
A=scipy.sparse.diags_array(
(
np.ones(shape=n - 1),
np.full(shape=n - 1, fill_value=-1),
),
offsets=(0, 1), shape=(n - 1, n + m), format='csc',
),
lb=min_decrease,
)
'''
For each binary threshold predicate,
pred = 1 iff weights.df_values/n <= threshold
lower bound:
pred >= 1 - (weights.values)/n/threshold
weights.values/threshold + pred*n >= n
upper bound:
pred <= 2 - (weights.values)/n/threshold
weights.values/threshold + pred*n <= 2*n
'''
threshold = 11
mean_constraint = LinearConstraint(
A=scipy.sparse.hstack(
(
df.values/threshold,
scipy.sparse.diags_array(
np.full(shape=m, fill_value=n),
),
),
format='csc',
),
lb=n, ub=2*n,
)
# Exactly this many out of m IDs must be at the threshold or lower
n_up_to_threshold = 6
disjunction_constraint = LinearConstraint(
A=np.concatenate((np.zeros(n), np.ones(m))),
lb=n_up_to_threshold, ub=n_up_to_threshold,
)
result = milp(
c=np.zeros(n + m), # no optimisation objective
integrality=np.concatenate((
np.zeros(shape=n, dtype=np.uint8), # weights are continuous
np.ones(shape=m, dtype=np.uint8), # predicates are binary
)),
bounds=Bounds(
lb=np.concatenate((
np.full(shape=n, fill_value=1e-2), # minimum weight, arbitrary
np.zeros(shape=m), # binary predicate
)),
ub=np.concatenate((
np.full(shape=n, fill_value=np.inf),
np.ones(shape=m), # binary predicate
)),
),
constraints=(
sum_constraint,
antimonotonic_constraint,
mean_constraint,
disjunction_constraint,
),
)
if not result.success:
raise ValueError(result.message)
weights, threshold_preds = np.split(result.x, (n,))
means = df @ (weights/n)
print('weights =')
print(weights)
print('threshold predicates =')
print(threshold_preds)
print('means =')
print(means)
weights =
[2.96714286 1.01571429 1.00571429 0.99571429 0.98571429 0.02
0.01 ]
threshold predicates =
[1. 1. 1. 0. 1. 0. 1. 0. 1. 0.]
means =
id
0 10.870612
1 10.439592
2 10.290204
3 11.005714
4 11.000000
5 11.130816
6 9.021429
7 11.847755
8 9.304490
9 12.576122
dtype: float64
To turn the hard threshold constraint into a soft constraint where the threshold moves to accommodate the input system,
import io
import numpy as np
import pandas as pd
import scipy.sparse
from scipy.optimize import milp, Bounds, LinearConstraint
with io.StringIO(
'''id, 0, -1, -2, -3, -4, -5, -6
0, 10, 10, 14, 8, 14, 14, 14
1, 10, 13, 11, 11, 8, 10, 10
2, 10, 12, 9, 12, 9, 10, 10
3, 12, 12, 9, 11, 9, 12, 13
4, 12, 13, 8, 12, 8, 11, 9
5, 13, 9, 8, 13, 9, 12, 10
6, 8, 9, 8, 14, 8, 13, 14
7, 13, 10, 14, 12, 8, 9, 11
8, 8, 8, 10, 12, 11, 14, 14
9, 14, 13, 13, 9, 11, 14, 13
''') as f:
df = pd.read_csv(f, skipinitialspace=True, index_col=0)
df.columns = pd.Index(name='day', data=df.columns.astype(int))
m, n = df.shape # number of IDs, days
'''
LP variables:
n weights
m weighted mean threshold binary predicates
1 threshold
1 threshold error
'''
# The weight sum must be equal to n
sum_constraint = LinearConstraint(
A=np.concatenate((
np.ones(n), # weights
np.zeros(m + 1 + 1), # preds, threshold, error
)),
lb=n, ub=n,
)
# The weights must be strictly decreasing by this amount
min_decrease = 1e-3 # chosen fully arbitrarily
antimonotonic_constraint = LinearConstraint(
A=scipy.sparse.diags_array(
(
np.ones(shape=n - 1), # current weight
np.full(shape=n - 1, fill_value=-1), # next weight
),
offsets=(0, 1), # current and next weight index
shape=(n - 1, n + m + 1 + 1), # after the weight positions, everything is zero
format='csc',
),
lb=min_decrease,
)
'''
For each binary threshold predicate,
pred = 1 iff weights.df_values/n <= threshold
Upper bound (forces predicate to 0):
weights.df_values + pM - n*threshold <= M
Lower bound (forces predicate to 1):
weights.df_values + pM - n*threshold >= 0
'''
M = 2*df.sum(axis=1).max()
threshold_constraint = LinearConstraint(
A=scipy.sparse.hstack(
(
df.values, # weights
scipy.sparse.diags_array( # predicates
np.full(shape=m, fill_value=M),
),
np.full(shape=(m, 1), fill_value=-n), # threshold
scipy.sparse.csc_array((m, 1)), # error (0)
),
format='csc',
),
lb=0, ub=M,
)
# Exactly this many out of m IDs must be at the threshold or lower
n_up_to_threshold = 5
count_constraint = LinearConstraint(
A=np.concatenate((
np.zeros(n), # weights
np.ones(m), # predicates
np.zeros(1 + 1), # threshold, error
)),
lb=n_up_to_threshold, ub=n_up_to_threshold,
)
'''
The absolute error must be relative to the threshold
error >= target - threshold threshold + error >= target
error >= threshold - target threshold - error <= target
'''
target_threshold = 12
error_constraint = LinearConstraint(
A=scipy.sparse.hstack(
(
scipy.sparse.csc_array((2, n + m)), # weights, predicates
np.array((
(1, +1), # threshold, error
(1, -1),
)),
),
format='csc',
),
lb=(target_threshold, -np.inf),
ub=(np.inf, target_threshold),
)
result = milp(
c=np.concatenate((
np.zeros(n + m + 1), # weights, preds, threshold are non-optimised
np.ones(1), # threshold error is minimised
)),
integrality=np.concatenate((
np.zeros(shape=n, dtype=np.uint8), # weights are continuous
np.ones(shape=m, dtype=np.uint8), # predicates are binary
(0, 0), # threshold, error are continuous
)),
bounds=Bounds(
lb=np.concatenate((
np.full(shape=n, fill_value=1e-3), # minimum weight, arbitrary
np.zeros(shape=m), # binary predicate
(-np.inf, -np.inf), # threshold, error unbounded
)),
ub=np.concatenate((
np.full(shape=n, fill_value=np.inf), # weight unbounded
np.ones(shape=m), # binary predicate
(np.inf, np.inf), # threshold, error unbounded
)),
),
constraints=(
sum_constraint,
antimonotonic_constraint,
threshold_constraint,
count_constraint,
error_constraint,
),
)
if not result.success:
raise ValueError(result.message)
weights, preds, (threshold,), (error,) = np.split(result.x, (n, n+m, n+m+1))
means = df @ (weights/n)
print(f'threshold = {threshold:.3f}')
print(f'error from target of {target_threshold} = {error:.6f}')
print('weights =')
print(weights)
print('threshold predicates =')
print(preds)
print('means =')
print(means.values)
threshold = 11.996
error from target of 12 = 0.003857
weights =
[6.975e+00 1.000e-02 5.000e-03 4.000e-03 3.000e-03 2.000e-03 1.000e-03]
threshold predicates =
[1. 1. 1. 0. 0. 0. 1. 0. 1. 0.]
means =
[10.00514286 10.00471429 10.00285714 11.99614286 11.99614286 12.98828571
8.00714286 12.99228571 8.00757143 13.99357143]