[SOLVED] Leveraging a for loop to run slices of a dataframe through a supervised model based on one column's value

Leveraging a for loop to run slices of a dataframe through a supervised model based on one column's value

I have the following dataframe and would like to group the data by cluster number to generate 5 new dataframes (the clusters go from 0 to 4). I then want to split each of those dataframes into training and test sets based on the Date column, and finally run each train/test pair through a Random Forest regressor.

I have been able to achieve this in a heavily hard-coded way via:

# One sub-frame per cluster label. The labels run 0-4 while the variable
# suffixes run 1-5, so cluster_N holds the rows labelled N - 1.
cluster_1 = whatever_df.loc[whatever_df['cluster'] == 0]
cluster_2 = whatever_df.loc[whatever_df['cluster'] == 1]
cluster_3 = whatever_df.loc[whatever_df['cluster'] == 2]
cluster_4 = whatever_df.loc[whatever_df['cluster'] == 3]
cluster_5 = whatever_df.loc[whatever_df['cluster'] == 4]

# Per cluster: the cutoff sits three months before that cluster's latest Date.
# Rows on or before the cutoff are training data; later rows are test data.
cutoff_1 = max(cluster_1['Date']) - relativedelta(months=3)
train_1 = cluster_1.loc[cluster_1['Date'] <= cutoff_1]
test_1 = cluster_1.loc[cluster_1['Date'] > cutoff_1]

cutoff_2 = max(cluster_2['Date']) - relativedelta(months=3)
train_2 = cluster_2.loc[cluster_2['Date'] <= cutoff_2]
test_2 = cluster_2.loc[cluster_2['Date'] > cutoff_2]

cutoff_3 = max(cluster_3['Date']) - relativedelta(months=3)
train_3 = cluster_3.loc[cluster_3['Date'] <= cutoff_3]
test_3 = cluster_3.loc[cluster_3['Date'] > cutoff_3]

cutoff_4 = max(cluster_4['Date']) - relativedelta(months=3)
train_4 = cluster_4.loc[cluster_4['Date'] <= cutoff_4]
test_4 = cluster_4.loc[cluster_4['Date'] > cutoff_4]

cutoff_5 = max(cluster_5['Date']) - relativedelta(months=3)
train_5 = cluster_5.loc[cluster_5['Date'] <= cutoff_5]
test_5 = cluster_5.loc[cluster_5['Date'] > cutoff_5]

# Feature columns: everything except the split key ('Date') and the
# regression target ('CPR').
columns = whatever_df.columns.tolist()
cols = [name for name in columns if name not in ('Date', 'CPR')]

# Feature matrix / target vector pairs for each cluster's train and test split.
X_train_1, y_train_1 = train_1[cols], train_1['CPR']
X_test_1, y_test_1 = test_1[cols], test_1['CPR']

X_train_2, y_train_2 = train_2[cols], train_2['CPR']
X_test_2, y_test_2 = test_2[cols], test_2['CPR']

X_train_3, y_train_3 = train_3[cols], train_3['CPR']
X_test_3, y_test_3 = test_3[cols], test_3['CPR']

X_train_4, y_train_4 = train_4[cols], train_4['CPR']
X_test_4, y_test_4 = test_4[cols], test_4['CPR']

X_train_5, y_train_5 = train_5[cols], train_5['CPR']
X_test_5, y_test_5 = test_5[cols], test_5['CPR']

Followed by:

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

# Fit one depth-limited random forest per cluster on that cluster's training
# split, then report mean absolute error and mean squared error on its test
# split. Each prediction must come from the model fitted for the same cluster
# (rf_N), otherwise the scores describe the wrong model.
rf_1 = RandomForestRegressor(max_depth=5)
rf_1.fit(X_train_1, y_train_1)

y_pred_1 = rf_1.predict(X_test_1)
print('MAE 1: ', metrics.mean_absolute_error(y_test_1, y_pred_1))
print('MSE 1: ', metrics.mean_squared_error(y_test_1, y_pred_1))
print('\n')

rf_2 = RandomForestRegressor(max_depth=5)
rf_2.fit(X_train_2, y_train_2)

y_pred_2 = rf_2.predict(X_test_2)
print('MAE 2: ', metrics.mean_absolute_error(y_test_2, y_pred_2))
print('MSE 2: ', metrics.mean_squared_error(y_test_2, y_pred_2))
print('\n')

rf_3 = RandomForestRegressor(max_depth=5)
rf_3.fit(X_train_3, y_train_3)

y_pred_3 = rf_3.predict(X_test_3)
print('MAE 3: ', metrics.mean_absolute_error(y_test_3, y_pred_3))
print('MSE 3: ', metrics.mean_squared_error(y_test_3, y_pred_3))
print('\n')

rf_4 = RandomForestRegressor(max_depth=5)
rf_4.fit(X_train_4, y_train_4)

y_pred_4 = rf_4.predict(X_test_4)
print('MAE 4: ', metrics.mean_absolute_error(y_test_4, y_pred_4))
print('MSE 4: ', metrics.mean_squared_error(y_test_4, y_pred_4))
print('\n')

rf_5 = RandomForestRegressor(max_depth=5)
rf_5.fit(X_train_5, y_train_5)

y_pred_5 = rf_5.predict(X_test_5)
print('MAE 5: ', metrics.mean_absolute_error(y_test_5, y_pred_5))
print('MSE 5: ', metrics.mean_squared_error(y_test_5, y_pred_5))
print('\n')

But I would like to accomplish the same thing without so much copy/paste. Any tips?

Solution

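A for loop is perfect here! Loop over the cluster ids and reuse a single set of variable names on each pass instead of keeping five numbered copies: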

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

# Feature columns: everything except the split key ('Date') and the
# regression target ('CPR').
columns = whatever_df.columns.tolist()
cols = [c for c in columns if c not in ('Date', 'CPR')]

# groupby yields one (label, sub-frame) pair for every cluster label actually
# present, in sorted label order, so the number of clusters is not hard-coded.
for cluster_id, cluster in whatever_df.groupby('cluster'):
    # The cutoff sits three months before this cluster's latest Date; compute
    # it once and reuse it for both sides of the split.
    # NOTE(review): relativedelta is used exactly as in the question
    # (presumably dateutil.relativedelta.relativedelta, imported elsewhere).
    cutoff = cluster['Date'].max() - relativedelta(months=3)
    train = cluster[cluster['Date'] <= cutoff]
    test = cluster[cluster['Date'] > cutoff]

    if train.empty or test.empty:
        # E.g. a cluster whose dates span under three months has no training
        # rows; fitting or scoring on an empty frame would raise.
        print(f'Cluster {cluster_id}: empty train or test split, skipped')
        print('\n')
        continue

    X_train = train[cols]
    y_train = train['CPR']
    X_test = test[cols]
    y_test = test['CPR']

    # A fresh model per cluster, so nothing leaks between iterations.
    rf = RandomForestRegressor(max_depth=5)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    print(f'MAE {cluster_id}: ', metrics.mean_absolute_error(y_test, y_pred))
    print(f'MSE {cluster_id}: ', metrics.mean_squared_error(y_test, y_pred))
    print('\n')