I have the following dataframe and would like to group the data by cluster
number to generate 5 new dataframes (the clusters go from 0-4), and then further split them up into training and test sets based on the Date
column and run each train/test dataframe through a Random Forest regressor.
I have been able to achieve this in a heavily hard-coded way via:
cluster_1 = whatever_df[whatever_df['cluster'] == 0]
cluster_2 = whatever_df[whatever_df['cluster'] == 1]
cluster_3 = whatever_df[whatever_df['cluster'] == 2]
cluster_4 = whatever_df[whatever_df['cluster'] == 3]
cluster_5 = whatever_df[whatever_df['cluster'] == 4]
train_1 = cluster_1[cluster_1['Date'] <= max(cluster_1['Date']) - relativedelta(months = 3)]
test_1 = cluster_1[cluster_1['Date'] > max(cluster_1['Date']) - relativedelta(months = 3)]
train_2 = cluster_2[cluster_2['Date'] <= max(cluster_2['Date']) - relativedelta(months = 3)]
test_2 = cluster_2[cluster_2['Date'] > max(cluster_2['Date']) - relativedelta(months = 3)]
train_3 = cluster_3[cluster_3['Date'] <= max(cluster_3['Date']) - relativedelta(months = 3)]
test_3 = cluster_3[cluster_3['Date'] > max(cluster_3['Date']) - relativedelta(months = 3)]
train_4 = cluster_4[cluster_4['Date'] <= max(cluster_4['Date']) - relativedelta(months = 3)]
test_4 = cluster_4[cluster_4['Date'] > max(cluster_4['Date']) - relativedelta(months = 3)]
train_5 = cluster_5[cluster_5['Date'] <= max(cluster_5['Date']) - relativedelta(months = 3)]
test_5 = cluster_5[cluster_5['Date'] > max(cluster_5['Date']) - relativedelta(months = 3)]
columns = whatever_df.columns.tolist()
cols = [c for c in columns if c not in ['Date', 'CPR']]
X_train_1 = train_1[cols]
y_train_1 = train_1['CPR']
X_test_1 = test_1[cols]
y_test_1 = test_1['CPR']
X_train_2 = train_2[cols]
y_train_2 = train_2['CPR']
X_test_2 = test_2[cols]
y_test_2 = test_2['CPR']
X_train_3 = train_3[cols]
y_train_3 = train_3['CPR']
X_test_3 = test_3[cols]
y_test_3 = test_3['CPR']
X_train_4 = train_4[cols]
y_train_4 = train_4['CPR']
X_test_4 = test_4[cols]
y_test_4 = test_4['CPR']
X_train_5 = train_5[cols]
y_train_5 = train_5['CPR']
X_test_5 = test_5[cols]
y_test_5 = test_5['CPR']
Followed by:
from sklearn.ensemble import RandomForestRegressor
rf_1 = RandomForestRegressor(max_depth=5)
rf_1.fit(X_train_1, y_train_1)
y_pred_1 = rf_1.predict(X_test_1)
print('MAE 1: ', metrics.mean_absolute_error(y_test_1, y_pred_1))
print('MSE 1: ', metrics.mean_squared_error(y_test_1, y_pred_1))
print('\n')
rf_2 = RandomForestRegressor(max_depth=5)
rf_2.fit(X_train_2, y_train_2)
print('\n')
y_pred_2 = rf.predict(X_test_2)
print('MAE 2: ', metrics.mean_absolute_error(y_test_2, y_pred_2))
print('MSE 2: ', metrics.mean_squared_error(y_test_2, y_pred_2))
print('\n')
rf_3 = RandomForestRegressor(max_depth=5)
rf_3.fit(X_train_3, y_train_3)
y_pred_3 = rf.predict(X_test_3)
print('MAE 3: ', metrics.mean_absolute_error(y_test_3, y_pred_3))
print('MSE 3: ', metrics.mean_squared_error(y_test_3, y_pred_3))
print('\n')
rf_4 = RandomForestRegressor(max_depth=5)
rf_4.fit(X_train_4, y_train_4)
print('\n')
y_pred_4 = rf.predict(X_test_4)
print('MAE 4: ', metrics.mean_absolute_error(y_test_4, y_pred_4))
print('MSE 4: ', metrics.mean_squared_error(y_test_4, y_pred_4))
rf_5 = RandomForestRegressor(max_depth=5)
rf_5.fit(X_train_5, y_train_5)
print('\n')
y_pred_5 = rf.predict(X_test_5)
print('MAE 5: ', metrics.mean_absolute_error(y_test_5, y_pred_5))
print('MSE 5: ', metrics.mean_squared_error(y_test_5, y_pred_5))
But I would like to accomplish the same thing without so much copy/paste. Any tips?
A for loop is perfect here!
columns = whatever_df.columns.tolist()
cols = [c for c in columns if c not in ['Date', 'CPR']]
from sklearn.ensemble import RandomForestRegressor
for i in range(5):
cluster = whatever_df[whatever_df['cluster'] == i]
train = cluster[cluster['Date'] <= max(cluster['Date']) - relativedelta(months = 3)]
test = cluster[cluster['Date'] > max(cluster['Date']) - relativedelta(months = 3)]
X_train = train[cols]
y_train = train['CPR']
X_test = test[cols]
y_test = test['CPR']
rf = RandomForestRegressor(max_depth=5)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(f'MAE {i}: ', metrics.mean_absolute_error(y_test, y_pred))
print(f'MSE {i}: ', metrics.mean_squared_error(y_test, y_pred))
print('\n')