Tags: r, databricks, apache-spark-mllib, sparklyr

How to map coefficient values to names from a {sparklyr} fitted pipeline?


After fitting a generalized linear model using {sparklyr} in an Azure Databricks notebook, how might one map the model coefficient values to predictor names?

Here is an example of fitting a model and then extracting the coefficients. I would like to determine the names associated with each coefficient.

library(sparklyr)

# connect to the cluster-attached Spark session in Databricks
sc <- spark_connect(method = "databricks")

# copy mtcars into Spark
data <- copy_to(sc, mtcars, "mtcars", overwrite = TRUE)

# pipeline: R formula transformer followed by a binomial GLM
pipeline <- ml_pipeline(sc) %>%
  ft_r_formula(vs ~ cyl + carb) %>%
  ml_generalized_linear_regression(family = "binomial")

partitioned_data <- sdf_random_split(data, train = 0.80, test = 0.20, seed = 42)

fitted_pipeline <- ml_fit(pipeline, partitioned_data$train)

# the fitted GLM is the last stage of the fitted pipeline
glrm_transformer <- ml_stage(fitted_pipeline, length(fitted_pipeline$stages))

# returns an unnamed numeric vector
with(glrm_transformer, c(intercept, coefficients))
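
The output is an unnamed numeric vector, roughly the following (values taken from the fitted model shown in the solution below), with nothing to indicate which value belongs to cyl and which to carb:

[1]  8.80 -1.31 -0.387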

This question is quite similar to this one, but in R instead of Python.


Solution

  • I took the approach of creating an S3 child class, 'ml_generalized_linear_regression_model_enhanced', which extends the parent class 'ml_generalized_linear_regression_model'.

    This new child class adds four functions to the model's summary object:

    1. print()
      • prints the regression table using the toString() method of the underlying Java object
    2. feature_names(intercept = FALSE)
      • returns a vector of feature names in the order of the coefficients
    3. feature_count(intercept = FALSE)
      • returns a count of the number of features
    4. coefficients(intercept = FALSE)
      • returns the coefficient values in the order of the feature names

    The implementation:

    as.ml_generalized_linear_regression_model_enhanced <- function(x, ...) {
      UseMethod("as.ml_generalized_linear_regression_model_enhanced", x)
    }
    
    as.ml_generalized_linear_regression_model_enhanced.ml_generalized_linear_regression_model <- function(x, ...) {
      # 1. print()
      x$summary$print <- function() {
        x$summary$.jobj |>
          sparklyr::invoke("toString") |>
          cat()
      }
      
      # 2. feature_names()
      x$summary$feature_names <- function(intercept = FALSE) {
        feature_names <- x$summary$.jobj |>
          sparklyr::invoke("featureNames") |>
          unlist()
        
        if (intercept)
          feature_names <- c("(Intercept)", feature_names)
        
        return(feature_names)
      }
    
      # 3. feature_count()
      x$summary$feature_count <- function(intercept = FALSE) {
        length(x$summary$feature_names(intercept))
      }
    
      # 4. coefficients()
      x$summary$coefficients <- function(intercept = FALSE) {
        coefficients <- x$coefficients
    
    if (intercept)
          coefficients <- c(x$intercept, coefficients)
    
        return(coefficients)
      }
    
      # amend class
      class(x) <- c("ml_generalized_linear_regression_model_enhanced", class(x))
    
      # return object
      return(x)
    }
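
    If the reusable class is more than you need, the same Java call can be invoked directly on the model's summary object for a one-off lookup. A minimal sketch, assuming the fitted glrm_transformer from the question and that the summary's Java object exposes featureNames() as it does in the class above:

    # pair each coefficient with its feature name, intercept first
    feature_names <- glrm_transformer$summary$.jobj %>%
      invoke("featureNames") %>%
      unlist()

    setNames(
      c(glrm_transformer$intercept, glrm_transformer$coefficients),
      c("(Intercept)", feature_names)
    )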
    

    With the class defined, one can then do something like the following:

    library(sparklyr)
    
    sc <- spark_connect(method = "databricks")
    
    data <- copy_to(sc, mtcars, "mtcars", overwrite = TRUE)
    
    pipeline <- ml_pipeline(sc) %>%
      ft_r_formula(vs ~ cyl + carb) %>%
      ml_generalized_linear_regression(family = "binomial")
    
    partitioned_data <- sdf_random_split(data, train = 0.80, test = 0.20, seed = 42)
    
    fitted_pipeline <- ml_fit(pipeline, partitioned_data$train)
    
    glrm_transformer <- ml_stage(fitted_pipeline, length(fitted_pipeline$stages))
    
    # wrap the fitted GLM stage in the enhanced class
    glrm_transformer_enhanced <- as.ml_generalized_linear_regression_model_enhanced(glrm_transformer)
    
    # assemble the regression table; coefficient_standard_errors(), t_values(),
    # and p_values() are already provided on the sparklyr summary object
    with(
      glrm_transformer_enhanced$summary, 
      data.frame(
        Feature = feature_names(intercept = TRUE), 
        Estimate = coefficients(intercept = TRUE),
        'Std Error' = coefficient_standard_errors(),
        'T Value' = t_values(),
        'P Value' = p_values()
      )
    ) %>% 
    copy_to(sc, ., "model_summary", overwrite = TRUE)
    

    Which results in:

    # Source: spark<model_summary> [?? x 5]
      Feature     Estimate Std_Error T_Value P_Value
      <chr>          <dbl>     <dbl>   <dbl>   <dbl>
    1 (Intercept)    8.80      3.27    2.69  0.00712
    2 cyl           -1.31      0.537  -2.44  0.0148 
    3 carb          -0.387     0.488  -0.792 0.428
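
    The print() helper from step 1 is also a quick way to see Spark's own formatted regression table (the exact layout depends on the Spark version):

    glrm_transformer_enhanced$summary$print()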