r, machine-learning, random-forest, r-ranger

Training, Tuning, Cross-Validating, and Testing Ranger (Random Forest) Quantile Regression Model?


Could someone share how to train, tune (hyperparameters), cross-validate, and test a ranger quantile regression model, along with error evaluation, using the iris or Boston housing dataset?

The reason I ask is because I have not been able to find many examples or walkthroughs using quantile regression on Kaggle, random blogs, Youtube. Most problems I encountered are classification problems.

I am currently using a quantile regression model, but I am hoping to see other examples — in particular, ones with hyperparameter tuning.


Solution

  • There are a lot of parameters for this function. Since this isn't a forum for explaining what they all mean, I really suggest that you hit up Cross Validated with questions on the how and why. (Or look for questions that may already be answered.)

    library(tidyverse)
    library(ranger)
    library(caret)
    library(funModeling)
    
    data(iris)
    
    #----------- setup data -----------
    # this doesn't include exploration or cleaning which are both necessary
    summary(iris)
    df_status(iris)
    
    #----------------- create training sample ----------------
    set.seed(395280469) # for replicability
    
    # create training sample partition (80/20 split; p = .8 keeps 80% for training)
    # stratified on Species so both partitions keep the same class balance
    tr <- createDataPartition(iris$Species, 
                              p = .8, 
                              list = FALSE) # FALSE (not the reassignable F): return a matrix of row indices
    

    There are a lot of ways to split the data, but I tend to prefer caret, because it works to even out the factor levels if that's what you feed it.

    #--------- First model ---------
    # quantreg = TRUE stores what ranger needs to predict quantiles later
    # (see predict(..., type = 'quantiles') below); write.forest/keep.inbag
    # retain the fitted trees and in-bag counts; replace = FALSE samples
    # without replacement
    fit.r <- ranger(Sepal.Length ~ ., 
                    data = iris[tr, ],
                    write.forest = TRUE,
                    importance = 'permutation',
                    quantreg = TRUE,
                    keep.inbag = TRUE,
                    replace = FALSE)
    fit.r
    # Ranger result
    # 
    # Call:
    #  ranger(Sepal.Length ~ ., data = iris[tr, ], write.forest = TRUE,
    #     importance = "permutation", quantreg = TRUE, keep.inbag = TRUE, 
    #     replace = FALSE) 
    # 
    # Type:                             Regression 
    # Number of trees:                  500 
    # Sample size:                      120 
    # Number of independent variables:  4 
    # Mtry:                             2 
    # Target node size:                 5 
    # Variable importance mode:         permutation 
    # Splitrule:                        variance 
    # OOB prediction error (MSE):       0.1199364 
    # R squared (OOB):                  0.8336928  
    
    # predict quantiles for the held-out rows; -1 drops the response column
    p.r <- predict(fit.r, iris[-tr, -1],
                   type = 'quantiles')
    

    It defaults to .1, .5, and .9:

    # score each predicted quantile against the held-out Sepal.Length
    # (column 1); postResample reports RMSE, R-squared, and MAE
    postResample(p.r$predictions[, 1], iris[-tr, 1]) # quantile .1
    #      RMSE  Rsquared       MAE 
    # 0.5165946 0.7659124 0.4036667  
    
    postResample(p.r$predictions[, 2], iris[-tr, 1]) # quantile .5 (median)
    #      RMSE  Rsquared       MAE 
    # 0.3750556 0.7587326 0.3133333  
    
    postResample(p.r$predictions[, 3], iris[-tr, 1]) # quantile .9
    #      RMSE  Rsquared       MAE 
    # 0.6488991 0.7461830 0.5703333  
    

    To see what this looks like in practice:

    # this performance is the best so far, let's see what it looks like visually
    # one point layer per predicted quantile, plus the identity line (a perfect
    # prediction would sit on it)
    plot_df <- data.frame(p.Q1 = p.r$predictions[, 1],
                          p.Q5 = p.r$predictions[, 2],
                          p.Q9 = p.r$predictions[, 3],
                          Actual = iris[-tr, 1])
    ggplot(plot_df) +
      geom_point(aes(x = Actual, y = p.Q1, color = "P.Q1")) +
      geom_point(aes(x = Actual, y = p.Q5, color = "P.Q5")) +
      geom_point(aes(x = Actual, y = p.Q9, color = "P.Q9")) +
      geom_line(aes(x = Actual, y = Actual, color = "Actual")) +
      scale_color_viridis_d("Error", end = 0.8, direction = -1) +
      theme_bw()
    

    enter image description here

    # zoom in on one quantile (here .9, predictions[, 3]); the vertical
    # segments draw each observation's error relative to the identity line
    # NOTE(review): the original comment claimed quantile .1 "performed the
    # best", but by RMSE/MAE above the median (.5) did, and this plot shows .9
    ggplot(data.frame(p.Q9 = p.r$predictions[, 3],
                      Actual = iris[-tr, 1])) +
      geom_point(aes(x = Actual, y = p.Q9, color = "P.Q9")) +
      geom_segment(aes(x = Actual, xend = Actual, 
                       y = Actual, yend = p.Q9)) +
      geom_line(aes(Actual, Actual, color = "Actual")) +
      scale_color_viridis_d(end = .8, "Error",
                            direction = -1)+
      theme_bw()
    

    enter image description here

    #------------ ranger model with options --------------
    # last call used default 
    #    splitrule: variance, use "extratrees" (only 2 for this one)
    #    mtry = 2, use 3 this time
    #    min.node.size = 5, using 6 this time
    #    using num.threads = 15 ** this is the number of cores on YOUR device
    #        change accordingly --- if you don't know, drop this one
    
    set.seed(326) # reseed so this fit is reproducible on its own
    fit.r2 <- ranger(Sepal.Length ~ ., 
                    data = iris[tr, ],
                    write.forest = TRUE,
                    importance = 'permutation',
                    quantreg = TRUE,
                    keep.inbag = TRUE,
                    replace = FALSE,
                    splitrule = "extratrees",
                    mtry = 3,
                    min.node.size = 6,
                    num.threads = 15)
    fit.r2
    # Ranger result
    # Type:                             Regression 
    # Number of trees:                  500 
    # Sample size:                      120 
    # Number of independent variables:  4 
    # Mtry:                             3 
    # Target node size:                 6 
    # Variable importance mode:         permutation 
    # Splitrule:                        extratrees 
    # Number of random splits:          1 
    # OOB prediction error (MSE):       0.1107299 
    # R squared (OOB):                  0.8464588  
    

    This model performed similarly.

    # same held-out quantile predictions as before, now from the tuned model
    p.r2 <- predict(fit.r2, iris[-tr, -1],
                   type = 'quantiles')
    
    postResample(p.r2$predictions[, 1], iris[-tr, 1]) # quantile .1
    #      RMSE  Rsquared       MAE 
    # 0.4932883 0.8144309 0.4000000  
     
    postResample(p.r2$predictions[, 2], iris[-tr, 1]) # quantile .5 (median)
    #      RMSE  Rsquared       MAE 
    # 0.3610171 0.7643744 0.3100000  
    
    postResample(p.r2$predictions[, 3], iris[-tr, 1]) # quantile .9
    #      RMSE  Rsquared       MAE 
    # 0.6555939 0.8141144 0.5603333 
    

    The prediction was pretty similar overall, as well. This isn't a very large set of data, with few predictors. How much do they contribute?

    # permutation importance (requested in the fit via importance = 'permutation');
    # larger values mean the predictor matters more
    importance(fit.r2)
    #  Sepal.Width Petal.Length  Petal.Width      Species 
    #   0.06138883   0.71052453   0.22956522   0.18082998  
    
    #------------ ranger model with options --------------
    # drop a predictor, lower mtry, min.node.size
    set.seed(326)
    fit.r3 <- ranger(Sepal.Length ~ ., 
                     data = iris[tr, -4], # dropped Petal.Width (iris column 4),
                                          # the 2nd most important predictor above;
                                          # the original comment said Sepal.Width,
                                          # but the importance output below confirms
                                          # Sepal.Width was kept
                     write.forest = TRUE,
                     importance = 'permutation',
                     quantreg = TRUE,
                     keep.inbag = TRUE,
                     replace = FALSE,
                     splitrule = "extratrees",
                     mtry = 2,            # has to change (var count lower)
                     min.node.size = 4,   # lowered
                     num.threads = 15)
    fit.r3
    # Ranger result
    # Type:                             Regression 
    # Number of trees:                  500 
    # Sample size:                      120 
    # Number of independent variables:  3 
    # Mtry:                             2 
    # Target node size:                 6 
    # NOTE(review): "Target node size: 6" looks stale -- the call sets
    # min.node.size = 4, so a fresh run should report 4 here; re-run to confirm
    # Variable importance mode:         permutation 
    # Splitrule:                        extratrees 
    # Number of random splits:          1 
    # OOB prediction error (MSE):       0.1050143 
    # R squared (OOB):                  0.8543842  
    

    The second most important predictor was removed and it improved.

    # -c(1, 4) drops the response (Sepal.Length, column 1) and Petal.Width
    # (column 4), matching the columns removed when fitting fit.r3
    p.r3 <- predict(fit.r3, iris[-tr, -c(1, 4)],
                    type = 'quantiles')
    
    postResample(p.r3$predictions[, 1], iris[-tr, 1]) # quantile .1
    #      RMSE  Rsquared       MAE 
    # 0.4760952 0.8089810 0.3800000  
    
    postResample(p.r3$predictions[, 2], iris[-tr, 1]) # quantile .5 (median)
    #      RMSE  Rsquared       MAE 
    # 0.3738315 0.7769388 0.3250000  
    
    postResample(p.r3$predictions[, 3], iris[-tr, 1]) # quantile .9
    #      RMSE  Rsquared       MAE 
    # 0.6085584 0.8032592 0.5170000   
    
    importance(fit.r3)
    # almost everything relies on Petal.Length
    #  Sepal.Width Petal.Length      Species 
    #   0.08008264   0.95440333   0.32570147