rregressionlinear-regressiongamlss

Predicting binary response probabilities from gamlss R object


I want to predict binary class probabilities/class labels from gamlss R function, how can the predict function be used to get them?

I have the following sample code

library(gamlss)
X1 <- rnorm(500)
X2 <- sample(c("A","C","D","E"),500, replace = TRUE)
Y <- ifelse(X1>0.2& X2=="A",1,0)

n <- 500
training <- sample(1:n, 400)
testing <- (1:n)[-training]

fit <- gamlss(Y[training]~pcat(X2[training],Lp=1)+ri(X1[training],Lp=1),family=BI())

pred <- predict(fit,newdata = data.frame(X1,X2)[testing,],type = "response")

Error in predict.gamlss(fit, newdata = data.frame(X1, X2)[testing, ], : define the original data using the option data

Any idea?


Solution

  • You need to define the original data using the data option of gamlss:

    library(gamlss)
    set.seed(1)
    n <- 500
    X1 <- rnorm(n)
    X2 <- sample(c("A","C","D","E"), n, replace = TRUE)
    Y <- ifelse(X1>0.2 & X2=="A", 1, 0)
    dtset <- data.frame(X1, X2, Y)
    
    training <- sample(1:n, 400)
    XYtrain <- dtset[training,]
    XYtest  <- dtset[-training,]
    
    fit <- gamlss(Y ~ pcat(X2, Lp=1) + ri(X1, Lp=1), family=BI(), data=XYtrain)
    pred <- predict(fit, type="response", newdata=XYtest)
    

    Unfortunately, predict now generates a new error message:

    Error in if (p != ap) stop("the dimensions of the penalty matrix and of the design matrix are incompatible") : argument is of length zero

    This problem can be solved modifying the gamlss.ri function used by predict.gamlss:

    gamlss.ri <- function (x, y, w, xeval = NULL, ...) 
    {
        regpen <- function(sm, D, P0, lambda) {
            for (it in 1:iter) {
                RD <- rbind(R, sqrt(lambda) * sqrt(omega.) * D)
                svdRD <- svd(RD)
                rank <- sum(svdRD$d > max(svdRD$d) * .Machine$double.eps^0.8)
                np <- min(p, N)
                U1 <- svdRD$u[1:np, 1:rank]
                y1 <- t(U1) %*% Qy
                beta <- svdRD$v[, 1:rank] %*% (y1/svdRD$d[1:rank])
                dm <- max(abs(sm - beta))
                sm <- beta
                omega. <- c(1/(abs(sm)^(2 - Lp) + kappa^2))
                if (dm < c.crit) 
                    break
            }
            HH <- (svdRD$u)[1:p, 1:rank] %*% t(svdRD$u[1:p, 1:rank])
            edf <- sum(diag(HH))
            fv <- X %*% beta
            row.names(beta) <- namesX
            out <- list(fv = fv, beta = beta, edf = edf, omega = omega.)
        }
        fnGAIC <- function(lambda, k) {
            fit <- regpen(sm, D, P0, lambda)
            fv <- fit$fv
            GAIC <- sum(w * (y - fv)^2) + k * fit$edf
            GAIC
        }
        X <- if (is.null(xeval)) 
            as.matrix(attr(x, "X"))
        else as.matrix(attr(x, "X"))[seq(1, length(y)), , drop=FALSE] # Added drop=FALSE
        namesX <- as.character(attr(x, "namesX"))
        D <- as.matrix(attr(x, "D"))
        order <- as.vector(attr(x, "order"))
        lambda <- as.vector(attr(x, "lambda"))
        df <- as.vector(attr(x, "df"))
        Lp <- as.vector(attr(x, "Lp"))
        kappa <- as.vector(attr(x, "kappa"))
        iter <- as.vector(attr(x, "iter"))
        k <- as.vector(attr(x, "k"))
        c.crit <- as.vector(attr(x, "c.crit"))
        method <- as.character(attr(x, "method"))
        gamlss.env <- as.environment(attr(x, "gamlss.env"))
        startLambdaName <- as.character(attr(x, "NameForLambda"))
        N <- sum(w != 0)
        n <- nrow(X)
        p <- ncol(X)
        aN <- nrow(D)
        ap <- ncol(D)
        qrX <- qr(sqrt(w) * X, tol = .Machine$double.eps^0.8)
        R <- qr.R(qrX)
        Q <- qr.Q(qrX)
        Qy <- t(Q) %*% (sqrt(w) * y)
        if (p != ap) 
            stop("the dimensions of the penalty matrix and of the design matrix are incompatible")
        P0 <- diag(p) * 1e-06
        sm <- rep(0, p)
        omega. <- rep(1, p)
        tau2 <- sig2 <- NULL
        lambdaS <- get(startLambdaName, envir = gamlss.env)
        if (lambdaS >= 1e+07) 
            lambda <- 1e+07
        if (lambdaS <= 1e-07) 
            lambda <- 1e-07
        if (is.null(df) && !is.null(lambda) || !is.null(df) && !is.null(lambda)) {
            fit <- regpen(sm, D, P0, lambda)
            fv <- fit$fv
        }
        else if (is.null(df) && is.null(lambda)) {
            lambda <- lambdaS
            switch(method, ML = {
                for (it in 1:20) {
                    fit <- regpen(sm, D, P0, lambda)
                    gamma. <- D %*% as.vector(fit$beta) * sqrt(fit$omega)
                    fv <- X %*% fit$beta
                    sig2 <- sum(w * (y - fv)^2)/(N - fit$edf)
                    tau2 <- sum(gamma.^2)/(fit$edf - order)
                    lambda.old <- lambda
                    lambda <- sig2/tau2
                    if (abs(lambda - lambda.old) < 1e-04 || lambda > 
                      1e+05) break
                }
            }, GAIC = {
                lambda <- nlminb(lambda, fnGAIC, lower = 1e-07, upper = 1e+07, 
                    k = k)$par
                fit <- regpen(sm, D, P0, lambda)
                fv <- fit$fv
                assign(startLambdaName, lambda, envir = gamlss.env)
            }, )
        }
        else {
            edf1_df <- function(lambda) {
                edf <- sum(1/(1 + lambda * UDU$values))
                (edf - df)
            }
            Rinv <- solve(R)
            S <- t(D) %*% D
            UDU <- eigen(t(Rinv) %*% S %*% Rinv)
            lambda <- if (sign(edf1_df(0)) == sign(edf1_df(1e+05))) 
                1e+05
            else uniroot(edf1_df, c(0, 1e+05))$root
            fit <- regpen(sm, D, P0, lambda)
            fv <- fit$fv
        }
        waug <- as.vector(c(w, rep(1, nrow(D))))
        xaug <- as.matrix(rbind(X, sqrt(lambda) * D))
        lev <- hat(sqrt(waug) * xaug, intercept = FALSE)[1:n]
        var <- lev/w
        coefSmo <- list(coef = fit$beta, lambda = lambda, edf = fit$edf, 
            sigb2 = tau2, sige2 = sig2, sigb = if (is.null(tau2)) NA else sqrt(tau2), 
            sige = if (is.null(sig2)) NA else sqrt(sig2), fv = as.vector(fv), 
            se = sqrt(var), Lp = Lp)
        class(coefSmo) <- "ri"
        if (is.null(xeval)) {
            list(fitted.values = as.vector(fv), residuals = y - fv, 
                var = var, nl.df = fit$edf - 1, lambda = lambda, 
                coefSmo = coefSmo)
        }
        else {
            ll <- dim(as.matrix(attr(x, "X")))[1]
            nx <- as.matrix(attr(x, "X"))[seq(length(y) + 1, ll), 
                ]
            pred <- drop(nx %*% fit$beta)
            pred
        }
    }
    # Replace "gamlss.ri" in the package "gamlss"
    assignInNamespace("gamlss.ri", gamlss.ri, pos="package:gamlss")
    
    pred <- predict(fit, type="response", newdata=XYtest)
    
    print(head(pred))
    # [1] 2.220446e-16 2.220446e-16 2.220446e-16 4.142198e-12 2.220446e-16 2.220446e-16