After fitting a generalized linear model using {sparklyr} in an Azure Databricks notebook, how might one map the model coefficient values to predictor names?
Here is an example of fitting a model and then extracting the coefficients; I would like to determine the names associated with each coefficient.
library(sparklyr)

sc <- spark_connect(method = "databricks")
data <- copy_to(sc, mtcars, "mtcars", overwrite = TRUE)

pipeline <- ml_pipeline(sc) %>%
  ft_r_formula(vs ~ cyl + carb) %>%
  ml_generalized_linear_regression(family = "binomial")

partitioned_data <- sdf_random_split(data, train = 0.80, test = 0.20, seed = 42)
fitted_pipeline <- ml_fit(pipeline, partitioned_data$train)

glrm_transformer <- ml_stage(fitted_pipeline, length(fitted_pipeline$stages))

with(glrm_transformer, c(intercept, coefficients))
This question is quite similar to this one, but in R instead of Python.
I took the approach of creating an S3 child class 'ml_generalized_linear_regression_model_enhanced' that extends the parent class 'ml_generalized_linear_regression_model'.
The child class adds four functions to the parent's summary object:
as.ml_generalized_linear_regression_model_enhanced <- function(x, ...) {
  UseMethod("as.ml_generalized_linear_regression_model_enhanced", x)
}

as.ml_generalized_linear_regression_model_enhanced.ml_generalized_linear_regression_model <- function(x, ...) {

  # 1. print(): show Spark's full text summary of the fitted model
  x$summary$print <- function() {
    x$summary$.jobj |>
      sparklyr::invoke("toString") |>
      cat()
  }

  # 2. feature_names(): pull the feature names from the underlying Java summary object
  x$summary$feature_names <- function(intercept = FALSE) {
    feature_names <- x$summary$.jobj |>
      sparklyr::invoke("featureNames") |>
      unlist()
    if (intercept)
      feature_names <- c("(Intercept)", feature_names)
    return(feature_names)
  }

  # 3. feature_count(): number of features, optionally counting the intercept
  x$summary$feature_count <- function(intercept = FALSE) {
    length(x$summary$feature_names(intercept))
  }

  # 4. coefficients(): model coefficients, optionally prepended with the intercept
  x$summary$coefficients <- function(intercept = FALSE) {
    coefficients <- x$coefficients
    if (intercept)
      coefficients <- c(x$intercept, coefficients)
    return(coefficients)
  }

  # amend class
  class(x) <- c("ml_generalized_linear_regression_model_enhanced", class(x))

  # return object
  return(x)
}
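The key piece is the featureNames call on the summary's underlying Java object; everything else is convenience wrapping. As a quick standalone check (a minimal sketch, assuming the same fitted glrm_transformer as in the question), it can be invoked directly:

# Minimal sketch: pull feature names straight from the Java summary object
# (assumes glrm_transformer was fitted as in the question)
feature_names <- glrm_transformer$summary$.jobj |>
  sparklyr::invoke("featureNames") |>
  unlist()

# pair the intercept and coefficients with their names
setNames(
  c(glrm_transformer$intercept, glrm_transformer$coefficients),
  c("(Intercept)", feature_names)
)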
Then one can do something like the following:
library(sparklyr)

sc <- spark_connect(method = "databricks")
data <- copy_to(sc, mtcars, "mtcars", overwrite = TRUE)

pipeline <- ml_pipeline(sc) %>%
  ft_r_formula(vs ~ cyl + carb) %>%
  ml_generalized_linear_regression(family = "binomial")

partitioned_data <- sdf_random_split(data, train = 0.80, test = 0.20, seed = 42)
fitted_pipeline <- ml_fit(pipeline, partitioned_data$train)

glrm_transformer <- ml_stage(fitted_pipeline, length(fitted_pipeline$stages))
glrm_transformer_enhanced <- as.ml_generalized_linear_regression_model_enhanced(glrm_transformer)

with(
  glrm_transformer_enhanced$summary,
  data.frame(
    Feature = feature_names(intercept = TRUE),
    Estimate = coefficients(intercept = TRUE),
    'Std Error' = coefficient_standard_errors(),
    'T Value' = t_values(),
    'P Value' = p_values()
  )
) %>%
  copy_to(sc, ., "model_summary", overwrite = TRUE)
This results in:
# Source: spark<model_summary> [?? x 5]
  Feature     Estimate Std_Error T_Value P_Value
  <chr>          <dbl>     <dbl>   <dbl>   <dbl>
1 (Intercept)    8.80      3.27    2.69  0.00712
2 cyl           -1.31      0.537  -2.44  0.0148
3 carb          -0.387     0.488  -0.792 0.428
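The print() function added above can also be called directly to dump Spark's full text summary of the fitted model (the exact output depends on the data split and Spark version):

# print the full Spark summary string via the added print() function
glrm_transformer_enhanced$summary$print()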