r machine-learning r-recipes

How to deal with a column with only one value?


How to add a step to remove a column with constant value?

I am facing a related problem, so I am referencing the earlier question above. I used step_zv() in my recipe, but I still get the following error: Error in bake(), Only one factor in Column 'X33': "TRUE"

library(tidymodels)  
library(readr)       
library(broom.mixed) 
library(dotwhisker)  
library(skimr)           
library(rpart.plot)  
library(vip)    
library(glmnet)
library(naniar) 
library(tidyr)
library(dplyr)
library(textrecipes)

# Data cleaning

# Load the raw school records.
# NOTE(review): absolute, user-specific path -- as written the script only
# runs where this file exists.
skool <- read_csv("/Users/riddhimaagupta/Desktop/log1.csv")

# Drop the columns that are excluded from modelling.
skool_v1 <- skool %>%
  select(
    -c(
      ...1, id, npsn, public, cert_est, cert_ops, name_clean, name,
      muh1, muh2, muh, chr1, chr2, chr3, chr, hindu,
      nu1, nu2, nu_klaten, nu_sby, nu, it1, it, other_swas_international
    )
  )

# Remove rows whose outcome is coded 99. Rows with a missing `afiliasi` are
# dropped as well, because filter() keeps only rows where the test is TRUE.
skool_v2 <- skool_v1 %>%
  filter(afiliasi != 99)

# Recode placeholder values as NA, one column per stage.
skool_v2.1 <- skool_v2 %>%
  replace_with_na(replace = list(village = "-"))

skool_v2.2 <- skool_v2.1 %>%
  replace_with_na(replace = list(area = "0"))

# After the last recode, parse both date columns and strip everything up to
# the opening "[" and from the closing "]" onwards in `latlon`.
skool_v2.3 <- skool_v2.2 %>%
  replace_with_na(replace = list(date_est = "-")) %>%
  mutate(
    date_est = as.Date(date_est, format = "%Y-%m-%d"),
    date_ops = as.Date(date_ops, format = "%Y-%m-%d"),
    latlon = gsub(".*\\[", "", latlon),
    latlon = gsub("\\].*", "", latlon)
  )

# Split the remaining "<lat>,<lon>" text into two numeric columns.
skool_v2.4 <- skool_v2.3 %>%
  separate(latlon, into = c("latitude", "longitude"), sep = ",") %>%
  mutate(
    latitude = as.numeric(latitude),
    longitude = as.numeric(longitude)
  )

# Lower-case every character column and turn it into a factor.
skool_v3 <- skool_v2.4 %>%
  mutate(across(where(is.character), function(x) as.factor(tolower(x))))

# Logical columns and the outcome become factors as well.
# NOTE: a logical column that holds a single value becomes a one-level factor
# here.
skool_v4 <- skool_v3 %>%
  mutate(
    across(where(is.logical), as.factor),
    afiliasi = as.factor(afiliasi)
  )

glimpse(skool_v4)


# Data splitting 

# Hold out a test set, stratified on the outcome.
set.seed(123)
splits <- initial_split(skool_v4, strata = afiliasi)
school_train <- training(splits)
school_test <- testing(splits)

# Carve the validation set out of the training rows only. Splitting the full
# data here would let rows from `school_test` take part in tuning, so the
# final test-set estimate would no longer be an honest one.
set.seed(234)
val_set <- validation_split(
  school_train,
  strata = afiliasi,
  prop = 0.80
)

# Penalised multinomial regression

# Model specification: glmnet-backed logistic regression with `mixture` fixed
# at 0.5 and `penalty` left as a tune() placeholder to be filled from the grid.
# NOTE(review): the heading says "multinomial" but logistic_reg() is used; if
# afiliasi has more than two classes, multinom_reg() is presumably what is
# intended -- confirm against the data.
lr_mod <- 
  logistic_reg(penalty = tune(), mixture = 0.5) %>% 
  set_engine("glmnet")

# Preprocessing recipe declared against school_train, with afiliasi as the
# outcome and every other column as a predictor.
# NOTE: step_dummy() runs before step_zv() here, so a nominal predictor with a
# single level is still present when the dummy variables are built; this is
# the ordering that produces the bake() error reported for column X33.
lr_recipe <- 
  recipe(afiliasi ~ ., data = school_train) %>%  
  # derive date features, then drop the raw date columns
  step_date(date_est, date_ops) %>% 
  step_rm(date_est, date_ops) %>%
  # clean the level names of village before they become column names
  textrecipes::step_clean_levels(village) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>% 
  step_normalize(all_predictors()) 


# Bundle the model and the recipe so they are fitted together.
lr_workflow <- 
  workflow() %>% 
  add_model(lr_mod) %>% 
  add_recipe(lr_recipe)


# Candidate penalties: 30 values evenly spaced on the log10 scale from 1e-4
# to 1e-1.
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
# Show the five smallest and the five largest candidates.
lr_reg_grid %>% top_n(-5)
lr_reg_grid %>% top_n(5)

# Evaluate every penalty in the grid against val_set, keeping the held-out
# predictions and scoring with ROC AUC only.
lr_res <- 
  lr_workflow %>% 
  tune_grid(val_set,
            grid = lr_reg_grid,
            control = control_grid(save_pred = TRUE,  verbose = TRUE),
            metrics = metric_set(roc_auc))

The console says

x validation: preprocessor 1/1: Error in `bake()`:
! Only one factor...
Warning message:
All models failed. See the `.notes` column. 

Solution

  • This error comes from step_dummy() because the variable X33 has only one factor level, "TRUE". The easiest way to deal with this in your problem is to apply step_zv() to the nominal predictors before step_dummy(), so that single-valued columns are removed before dummy variables are created.

    This would make your recipe look like

    # Same recipe as in the question, with one extra step_zv() placed ahead of
    # step_dummy().
    lr_recipe <- 
      recipe(afiliasi ~ ., data = school_train) %>%  
      step_date(date_est, date_ops) %>% 
      step_rm(date_est, date_ops) %>%
      textrecipes::step_clean_levels(village) %>%
      # drop single-valued nominal predictors first, so step_dummy() never
      # sees a factor with only one level
      step_zv(all_nominal_predictors()) %>%
      step_dummy(all_nominal_predictors()) %>%
      # second filter: drop any predictor, including a newly created dummy
      # column, that is constant
      step_zv(all_predictors()) %>% 
      step_normalize(all_predictors()) 
    

    Reprex showing what is happening:

    library(recipes)
    
    # fac1 takes the same value in every row; fac2 alternates between two
    # values.
    mtcars$fac1 <- "h"
    mtcars$fac2 <- rep(c("a", "b"), length.out = nrow(mtcars))
    
    # Without a zero-variance filter, step_dummy() fails on the single-valued
    # fac1.
    recipe(mpg ~ ., data = mtcars) %>%
      step_dummy(all_nominal_predictors()) %>%
      prep()
    #> Error in `bake()`:
    #> ! Only one factor level in fac1: h
    
    # With step_zv() ahead of step_dummy(), fac1 is removed first and only
    # fac2 is turned into dummy variables.
    recipe(mpg ~ ., data = mtcars) %>%
      step_zv(all_nominal_predictors()) %>%
      step_dummy(all_nominal_predictors()) %>%
      prep()
    #> Recipe
    #> 
    #> Inputs:
    #> 
    #>       role #variables
    #>    outcome          1
    #>  predictor         12
    #> 
    #> Training data contained 32 data points and no missing data.
    #> 
    #> Operations:
    #> 
    #> Zero variance filter removed fac1 [trained]
    #> Dummy variables from fac2 [trained]