rtidymodelspls

Predictor importance for PLS model trained with tidymodels


I'm using tidymodels to fit a PLS model but I'm struggling to find the PLS variable importance scores or coefficients.

This is what I've tried so far; the example data is from AppliedPredictiveModeling package.

Modeling fitting

data(ChemicalManufacturingProcess) 
split <- ChemicalManufacturingProcess %>% initial_split(prop = 0.7)
train <- training(split)
test <- testing(split)

tidy_rec <- recipe(Yield ~ ., data = train) %>% 
  step_knnimpute(all_predictors()) %>% 
  step_BoxCox(all_predictors()) %>% 
  step_normalize(all_predictors()) %>% 
  step_nzv(all_predictors()) %>% 
  step_corr(all_predictors())

boots <- bootstraps(time = 25, data = train)

tidy_model <- plsmod::pls(num_comp = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("mixOmics")

tidy_grid <- expand.grid(num_comp = seq(from = 1, to = 48, by = 5))

tidy_tune <- tidy_model %>% tune_grid(
  preprocessor = tidy_rec,
  grid = tidy_grid,
  resamples = boots,
  metrics = metric_set(mae, rmse, rsq)
)

tidy_best <- tidy_tune %>% select_best("rsq")
Final_model <- tidy_model %>% finalize_model(tidy_best)

tidy_wf <- workflow() %>% 
  add_model(Final_model) %>% 
  add_recipe(tidy_rec) 

Fit_PLS <- tidy_wf %>% fit(data = train)

# check the most important predictors
tidy_info <- Fit_PLS %>% pull_workflow_fit()
loadings <- tidy_info$fit$loadings$X

PLS variable importance

tidy_load <- loadings %>% as.data.frame() %>% rownames_to_column() %>% 
  select(rowname, comp1, comp2, comp3) %>% 
  pivot_longer(-rowname) %>% 
  rename(predictors = rowname)

tidy_load %>% mutate(Sing = if_else(value < 0, "neg", "pos")) %>% 
  mutate(absvalue = abs(value)) %>% group_by(predictors) %>% summarise(Importance = sum(absvalue)) %>% 
  mutate(predictors = fct_reorder(predictors, Importance)) %>% 
  slice_head(n = 15) %>% 
  ggplot(aes(Importance, predictors, fill = predictors)) + geom_col(show.legend = F) 

Thanks! The vi() function from the vip package is not available for this model.


Solution

  • You can directly tidy() the output of the PLS model to get the coefficients:

    library(tidymodels)
    library(tidyverse)
    library(plsmod)
    
    data(ChemicalManufacturingProcess, package = "AppliedPredictiveModeling") 
    split <- initial_split(ChemicalManufacturingProcess, prop = 0.7)
    train <- training(split)
    test <- testing(split)
    
    chem_rec <- recipe(Yield ~ ., data = train) %>% 
      step_knnimpute(all_predictors()) %>% 
      step_BoxCox(all_predictors()) %>% 
      step_normalize(all_predictors()) %>% 
      step_nzv(all_predictors()) %>% 
      step_corr(all_predictors())
    
    pls_spec <- pls(num_comp = 4) %>%    ## can tune instead to find the optimal number
      set_mode("regression") %>% 
      set_engine("mixOmics")
    
    wf <- workflow() %>%
      add_recipe(chem_rec) %>%
      add_model(pls_spec)
    
    
    pls_fit <- fit(wf, train)
    
    ## tidy the fitted model
    tidy_pls <- pls_fit %>%
      pull_workflow_fit()
      tidy()
    
    tidy_pls
    #> # A tibble: 192 x 4
    #>    term                    value type       component
    #>    <chr>                   <dbl> <chr>          <dbl>
    #>  1 BiologicalMaterial01  0.193   predictors         1
    #>  2 BiologicalMaterial01 -0.247   predictors         2
    #>  3 BiologicalMaterial01  0.00969 predictors         3
    #>  4 BiologicalMaterial01  0.0228  predictors         4
    #>  5 BiologicalMaterial03  0.249   predictors         1
    #>  6 BiologicalMaterial03 -0.00118 predictors         2
    #>  7 BiologicalMaterial03  0.0780  predictors         3
    #>  8 BiologicalMaterial03 -0.0866  predictors         4
    #>  9 BiologicalMaterial04  0.217   predictors         1
    #> 10 BiologicalMaterial04 -0.192   predictors         2
    #> # … with 182 more rows
    
    tidy_pls %>%
      filter(term != "Y") %>%
      group_by(component) %>%
      slice_max(abs(value), n = 10) %>%
      ungroup() %>%
      ggplot(aes(value, fct_reorder(term, value), fill = factor(component))) +
      geom_col(show.legend = FALSE) +
      facet_wrap(~component, scales = "free_y") +
      labs(y = NULL)
    

    Created on 2020-10-19 by the reprex package (v0.3.0.9001)

    I showed this without tuning the number of components, but it works about the same with tuning.