Tags: python, pandas, random-forest, train-test-split, hardcode

Leveraging a for loop to run slices of a dataframe through a supervised model based on one column's value


I have a dataframe (`whatever_df` in the code below) and would like to group the data by cluster number to generate 5 new dataframes (the clusters go from 0 to 4). Each of those should then be split into training and test sets based on the Date column, and each train/test pair run through a Random Forest regressor.

I have been able to achieve this in a heavily hard-coded way via:

# One dataframe per cluster label: cluster_N holds the rows of whatever_df
# whose 'cluster' value equals N - 1 (labels run 0-4, names run 1-5).
cluster_1 = whatever_df.loc[whatever_df['cluster'] == 0]
cluster_2 = whatever_df.loc[whatever_df['cluster'] == 1]
cluster_3 = whatever_df.loc[whatever_df['cluster'] == 2]
cluster_4 = whatever_df.loc[whatever_df['cluster'] == 3]
cluster_5 = whatever_df.loc[whatever_df['cluster'] == 4]

# Split every cluster on its own cutoff: the cluster's latest 'Date' minus
# three months. Rows on or before the cutoff form the training set; rows
# strictly after it form the test set.
# NOTE(review): `relativedelta` is not imported in this snippet; presumably
# dateutil.relativedelta.relativedelta — confirm.
cutoff_1 = max(cluster_1['Date']) - relativedelta(months=3)
train_1 = cluster_1.loc[cluster_1['Date'] <= cutoff_1]
test_1 = cluster_1.loc[cluster_1['Date'] > cutoff_1]

cutoff_2 = max(cluster_2['Date']) - relativedelta(months=3)
train_2 = cluster_2.loc[cluster_2['Date'] <= cutoff_2]
test_2 = cluster_2.loc[cluster_2['Date'] > cutoff_2]

cutoff_3 = max(cluster_3['Date']) - relativedelta(months=3)
train_3 = cluster_3.loc[cluster_3['Date'] <= cutoff_3]
test_3 = cluster_3.loc[cluster_3['Date'] > cutoff_3]

cutoff_4 = max(cluster_4['Date']) - relativedelta(months=3)
train_4 = cluster_4.loc[cluster_4['Date'] <= cutoff_4]
test_4 = cluster_4.loc[cluster_4['Date'] > cutoff_4]

cutoff_5 = max(cluster_5['Date']) - relativedelta(months=3)
train_5 = cluster_5.loc[cluster_5['Date'] <= cutoff_5]
test_5 = cluster_5.loc[cluster_5['Date'] > cutoff_5]

# Feature columns are every column of the frame except 'Date' and the
# 'CPR' target. Note that this keeps the 'cluster' column as a feature.
columns = whatever_df.columns.tolist()
cols = [name for name in columns if name != 'Date' and name != 'CPR']

# For each cluster: feature matrix X and target vector y ('CPR') for both
# the training split and the test split.
X_train_1, y_train_1 = train_1[cols], train_1['CPR']
X_test_1, y_test_1 = test_1[cols], test_1['CPR']

X_train_2, y_train_2 = train_2[cols], train_2['CPR']
X_test_2, y_test_2 = test_2[cols], test_2['CPR']

X_train_3, y_train_3 = train_3[cols], train_3['CPR']
X_test_3, y_test_3 = test_3[cols], test_3['CPR']

X_train_4, y_train_4 = train_4[cols], train_4['CPR']
X_test_4, y_test_4 = test_4[cols], test_4['CPR']

X_train_5, y_train_5 = train_5[cols], train_5['CPR']
X_test_5, y_test_5 = test_5[cols], test_5['CPR']

Followed by:

from sklearn.ensemble import RandomForestRegressor

# One regressor per cluster: fit on that cluster's training split, predict on
# the same cluster's test split, then print the mean absolute error and mean
# squared error of those predictions.
# Each prediction must come from the model fitted for that cluster (rf_N);
# calling a bare `rf` either raises NameError or scores an unrelated model.
# NOTE(review): `metrics` is not imported in this snippet; presumably
# sklearn.metrics — confirm.
rf_1 = RandomForestRegressor(max_depth=5)
rf_1.fit(X_train_1, y_train_1)

y_pred_1 = rf_1.predict(X_test_1)
print('MAE 1: ', metrics.mean_absolute_error(y_test_1, y_pred_1))
print('MSE 1: ', metrics.mean_squared_error(y_test_1, y_pred_1))
print('\n')

rf_2 = RandomForestRegressor(max_depth=5)
rf_2.fit(X_train_2, y_train_2)
print('\n')

y_pred_2 = rf_2.predict(X_test_2)
print('MAE 2: ', metrics.mean_absolute_error(y_test_2, y_pred_2))
print('MSE 2: ', metrics.mean_squared_error(y_test_2, y_pred_2))
print('\n')

rf_3 = RandomForestRegressor(max_depth=5)
rf_3.fit(X_train_3, y_train_3)

y_pred_3 = rf_3.predict(X_test_3)
print('MAE 3: ', metrics.mean_absolute_error(y_test_3, y_pred_3))
print('MSE 3: ', metrics.mean_squared_error(y_test_3, y_pred_3))
print('\n')

rf_4 = RandomForestRegressor(max_depth=5)
rf_4.fit(X_train_4, y_train_4)
print('\n')
y_pred_4 = rf_4.predict(X_test_4)
print('MAE 4: ', metrics.mean_absolute_error(y_test_4, y_pred_4))
print('MSE 4: ', metrics.mean_squared_error(y_test_4, y_pred_4))

rf_5 = RandomForestRegressor(max_depth=5)
rf_5.fit(X_train_5, y_train_5)
print('\n')
y_pred_5 = rf_5.predict(X_test_5)
print('MAE 5: ', metrics.mean_absolute_error(y_test_5, y_pred_5))
print('MSE 5: ', metrics.mean_squared_error(y_test_5, y_pred_5))

But I would like to accomplish the same thing without so much copy/paste. Any tips?


Solution

  • A for loop is perfect here!

    from sklearn.ensemble import RandomForestRegressor

    # Feature columns: every column except 'Date' and the 'CPR' target.
    columns = whatever_df.columns.tolist()
    cols = [c for c in columns if c not in ['Date', 'CPR']]

    # groupby yields one (label, sub-frame) pair per distinct 'cluster' value,
    # in sorted label order, so the number of clusters is taken from the data
    # instead of being hard-coded as range(5).
    # NOTE(review): `relativedelta` and `metrics` are not imported in this
    # snippet; presumably dateutil's relativedelta and sklearn.metrics — confirm.
    for i, cluster in whatever_df.groupby('cluster'):
        # Per-cluster cutoff, computed once: latest date minus three months.
        cutoff = cluster['Date'].max() - relativedelta(months=3)

        # Rows on or before the cutoff train the model; later rows test it.
        train = cluster[cluster['Date'] <= cutoff]
        test = cluster[cluster['Date'] > cutoff]

        X_train = train[cols]
        y_train = train['CPR']
        X_test = test[cols]
        y_test = test['CPR']

        # A fresh model per cluster, so nothing carries over between clusters.
        rf = RandomForestRegressor(max_depth=5)
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_test)
        print(f'MAE {i}: ', metrics.mean_absolute_error(y_test, y_pred))
        print(f'MSE {i}: ', metrics.mean_squared_error(y_test, y_pred))
        print('\n')