rregressionlmpiecewise

How to make segmented regression first segment start at origin and last segment finish at a desired value?


I am trying to fit a piecewise regression for this dataset. I know we do not have a linear relation between the dependent and independent variable but my real world application requires me to model the data as a lm segmented regression.

Here is my code with description of the steps

bond_data <- data.frame(
yield_change = c(-1.2,-0.9,-1.8,-1.4,-1.8,-2.1,-2.3,-2.1,-2.5,-2.2,-2.4,-2.5,-2.4,-2.4,
                 -3.0,-2.6,-5.1,-4.8,-4.9,-5.0,-5.0,-6.2,-6.1,-6.3,-5.0,-5.0), 
maturity =c(10.2795,10.8603,11.7753,12.3562,12.5205,13.3589,13.8630,14.2822,14.3589,15.3589,
            15.8630,16.778,17.3616,17.8658,18.3616,21.8685,22.5288,23.8685,24.3644,25.3671,
            26.8712,27.8712,28.8712,29.8740,44.3781,49.3836))

#Defining lm model & segmented model

model <- lm(yield_change~maturity, data = bond_data)

segmented.model <- segmented(model,seg.Z=~maturity,psi = list(maturity = c(15,20,30)),fixed.psi = c(15,20,30),control = seg.control(it.max = 0, n.boot = 50))


# Getting the correct plot using regular plot function as plot.segmented gave me the error message "Error in Allpsi[[i]] : subscript out of bounds"

xp <- c(min(bond_data$maturity), segmented.model$psi[,"Est."], max(bond_data$maturity))
new_data <- data.frame(xp)
colnames(new_data) <- "maturity"
o <- segmented.model

new_data$dummy1 <- pmax(new_data$maturity - o$psi[1,2], 0)
new_data$dummy2 <- pmax(new_data$maturity - o$psi[2,2], 0)
new_data$dummy3 <- pmax(new_data$maturity - o$psi[3,2], 0)
new_data$dummy4 <-I(new_data$maturity > o$psi[1,2]) * coef(o)[3]
new_data$dummy5 <-I(new_data$maturity > o$psi[2,2]) * coef(o)[4]
new_data$dummy6 <-I(new_data$maturity > o$psi[3,2]) * coef(o)[5]
names(new_data)[-1] <- names(model.frame(o))[-c(1,2)]

yp <- predict(segmented.model,new_data)
plot(bond_data$maturity,bond_data$yield_change, pch=16, col="blue",ylim = c(-8,0))
lines(xp,yp)

I get the following image

Plot of actual values in blue points and pred line

I am trying to get the first segment start at the origin (I have tried adding "+0" to my predictor variable in my formula "maturity+0" but the prediction line does not start at 0)...

My guess is that I am setting all intercepts to zero which causes this error. But when I try to look at my intercepts of different segments I get this error (the same as when I tried to use plot.segmented):

> intercept(segmented.model)
Error in Allpsi[[i]] : subscript out of bounds

One thing to note is that all my breakpoints have fixed x positions and no estimates are made so when I run segmented.model$psi my initial values are the same as my estimates (15,20 and 30) and all my st.err are zero.

How would I go about making my prediction line start at zero (making ONLY my first segment have no intercept) and extend the last segment (from 30 to 50) to 50?


Solution

  • By removing the intercept from the original linear model that is used as an input to the segmented() function, you can force the first segment through the origin:

    library(segmented)
    #> Loading required package: MASS
    #> Loading required package: nlme
    dat <- data.frame(
    yield_change = c(-1.2,-0.9,-1.8,-1.4,-1.8,-2.1,-2.3,-2.1,-2.5,-2.2,-2.4,-2.5,-2.4,-2.4,
                     -3.0,-2.6,-5.1,-4.8,-4.9,-5.0,-5.0,-6.2,-6.1,-6.3,-5.0,-5.0), 
    maturity =c(10.2795,10.8603,11.7753,12.3562,12.5205,13.3589,13.8630,14.2822,14.3589,15.3589,
                15.8630,16.778,17.3616,17.8658,18.3616,21.8685,22.5288,23.8685,24.3644,25.3671,
                26.8712,27.8712,28.8712,29.8740,44.3781,49.3836))
    
    
    model <- lm(yield_change~maturity-1, data = dat)
    segmented.model <- segmented(model,seg.Z=~maturity,psi = list(maturity = c(15,20,30)),fixed.psi = c(15,20,30),control = seg.control(it.max = 0, n.boot = 50))
    
    xp <- c(0, segmented.model$psi[,"Est."], 50)
    new_data <- data.frame(xp)
    colnames(new_data) <- "maturity"
    o <- segmented.model
    
    new_data$dummy1 <- pmax(new_data$maturity - o$psi[1,2], 0)
    new_data$dummy2 <- pmax(new_data$maturity - o$psi[2,2], 0)
    new_data$dummy3 <- pmax(new_data$maturity - o$psi[3,2], 0)
    new_data$dummy4 <-I(new_data$maturity > o$psi[1,2]) * coef(o)[3]
    new_data$dummy5 <-I(new_data$maturity > o$psi[2,2]) * coef(o)[4]
    new_data$dummy6 <-I(new_data$maturity > o$psi[3,2]) * coef(o)[5]
    names(new_data)[-1] <- names(model.frame(o))[-c(1,2)]
    
    yp <- predict(segmented.model,new_data)
    plot(dat$maturity,dat$yield_change, pch=16, col="blue",ylim = c(-8,0), xlim=c(0,50))
    lines(xp,yp)
    

    Created on 2022-11-17 by the reprex package (v2.0.1)

    Since the last segment already goes to 50, I'm not sure exactly what other constraints you are trying to impose on the model. You could also do this just in the lm() framework, by writing a little function that generates basis functions for a piecewise linear spline.

    library(ggplot2)
    dat <- data.frame(
      yield_change = c(-1.2,-0.9,-1.8,-1.4,-1.8,-2.1,-2.3,-2.1,-2.5,-2.2,-2.4,-2.5,-2.4,-2.4,
                       -3.0,-2.6,-5.1,-4.8,-4.9,-5.0,-5.0,-6.2,-6.1,-6.3,-5.0,-5.0), 
      maturity =c(10.2795,10.8603,11.7753,12.3562,12.5205,13.3589,13.8630,14.2822,14.3589,15.3589,
                  15.8630,16.778,17.3616,17.8658,18.3616,21.8685,22.5288,23.8685,24.3644,25.3671,
                  26.8712,27.8712,28.8712,29.8740,44.3781,49.3836))
    pwl <- function(x, k)ifelse(x >= k, x-k, 0)
    mod <- lm(yield_change ~ maturity + pwl(maturity, 15) + pwl(maturity, 20) + pwl(maturity, 30) -1, data=dat)
    hyp <- data.frame(maturity = seq(0,50, length=250))
    fit <- predict(mod, newdata=hyp)
    ggplot() + 
      geom_point(data = dat, aes(x=maturity, y=yield_change)) + 
      geom_line(aes(x=hyp$maturity, y=fit), inherit.aes = FALSE) + 
      theme_classic()
    

    Created on 2022-11-17 by the reprex package (v2.0.1)


    Edit: Constrained to go through (10,0)

    If I read your question in the comments correctly, you want to constrain the model to go through the point (10,0). If you know the slope of the a line and the x-y coordinates of a point on the line, you can calculate its intercept.

    So, you would have to optimize a function that calculates the intercept given the estimated slope coefficient for the first piece of the function. You could do that as follows. First, you need to specify the likelihood function:

    llfun <- function(par, form, data, ...){
      b <- par
      X <- model.matrix(form, data)
      a <- 0-b[1]*10
      yhat <- a + X %*% b
      y <- model.response(model.frame(form, data))
      e <- y-yhat
      sigma <- sqrt(sum(e^2)/(nrow(X)-ncol(X)))
      sum(-dnorm(e, 0, sigma, log=TRUE))
    }
    

    Then you could maximize the function:

    frm <- yield_change ~ maturity + pwl(maturity, 15) + 
             pwl(maturity, 20) + pwl(maturity, 30) -1
    o <- optim(rep(0, 4), fn = llfun, form=frm, data=dat)
    

    That estimates the slopes, you could then calculate the intercept:

    b <- o$par
    a <- 0-10*b[1]
    b <- c(a, b)
    

    Then, you can make predictions with a sequence of maturity values that go from 10 to 50.

    hyp <- data.frame(maturity = seq(10,50, length=250), yield_change=0)
    X <- model.matrix(frm, hyp)
    X <- cbind(1, X)
    yhat <- X %*%b
    

    Finally, you could plot the function:

    ggplot() + 
      geom_point(data = dat, aes(x=maturity, y=yield_change)) + 
      geom_line(aes(x=hyp$maturity, y=yhat), inherit.aes = FALSE) + 
      geom_hline(yintercept=0, linetype=3) + 
      scale_x_continuous(limits = c(0,50)) + 
      theme_classic()
    

    enter image description here