Sample selection model

I want to duplicate the resuls of sampleSelection package. However, my code does not work.

Data Set

library(mvtnorm)
set.seed(123)
n = 1000
sigmau <- 1; sigmav <- 1; rho <- 0.5
Sigma <- matrix(c(sigmau^2,rho*sigmau*sigmav,rho*sigmau*sigmav,sigmav^2),2,2)

x1 <- rnorm(n,0,1); x2 <- rnorm(n,0,1); z <- rnorm(n,0,1)
u = rmvnorm(n,c(0,0),Sigma)[,1]; v = rmvnorm(n,c(0,0),Sigma)[,2]

selection_latent <- 1 + x1 + z + v
selection <- as.numeric(selection_latent >= 0)
y <- ifelse(selection == 1, 1 + x1 + x2 + u, 0)

# Create the data frame 'df'
df_sel <- data.frame(id = 1:n,x1,x2,z,u = u,v,selection_latent,selection,y)

## Estimation
library(sampleSelection)
library(texreg)
sample_selection1 <- selection(selection = selection ~ 1 + x1 + z,outcome = y ~ 1 + x1 + x2, data = df)
screenreg(sample_selection1)

I wrote below code, but I receive the error message "Error in optim(par = parameters, fn = f_selection, selection = selection, : function cannot be evaluated at initial parameters".

I would appriciate it if you find my error and modify the code. Thank you in advance.

## Sample Selection ##
Selection_Reg <- function(y_in, x_in, z_in, selection,df) {
  x <- cbind(1,as.matrix(x_in))
  z <- cbind(1,as.matrix(z_in))
  lm1 <- lm(y~1+x1 +x2,data = df)
  lm2 <- lm(y~1+x1 + z,data = df)
  parameters <- c(1,1,as.numeric(lm1$coef),as.numeric(lm2$coef))
  
  f_selection <- function(parameters,selection, y_in, x_in,z_in) {
    sigma_u <- parameters[1]
    rho <- parameters[2]
    sigma_v <- 1
    beta <- parameters[3:(ncol(x)+2)]
    theta <- parameters[(ncol(x)+3):(length(parameters))]
    xb <- x_in %*% beta
    zt <- z_in %*% theta
    z_adj <- (y_in-xb)/sigma_u
    inside <- (zt + (rho*sigma_u/sigma_v)*(y_in-xb))/(((1-rho^2)*sigma_u^2)^(1/2))
    
    log_lik <- (-log(sigma_u)+log(dnorm(z_adj))+log(pnorm(inside)))*selection +log(1 - pnorm(zt/sigma_v))*(1-selection)
    return(-sum(log_lik))
  }
  
  result <- optim(par = parameters, fn = f_selection,selection = selection, y_in = y_in, x_in = x,z_in = z)
  return(result)
}
Selection_Reg(y_in = df_sel$y,x_in = df_sel[,c("x1","x2")], z_in = df_sel[,c("x1","z")],selection = df_sel$selection,df = df_sel)

Solution

rho=1 in the initial values of the parameters passed to optim() is the main issue, since it makes the variable inside undefined. Changing rho=0.5 in the function Selection_Reg() (same initialization as in the first case) makes the code work:

parameters <- c(1,0.5,as.numeric(lm1$coef), as.numeric(lm2$coef))

Now, running optim() converges to the following (MLE) values:

Selection_Reg(y_in = df_sel$y, 
              x_in = df_sel[,c("x1","x2")], 
              z_in = df_sel[,c("x1","z")],
              selection = df_sel$selection, df = df_sel)

# $par
# [1] 0.9460186 0.2447177 0.8799579 1.0583746 0.9466844 0.9414059 0.9505922 0.8971660

# $value
# [1] 1351.807

# $counts
# function gradient 
# 501       NA 

# $convergence
# [1] 1

Where in the first case the output is the following:

screenreg(sample_selection1)
#
#============================
#                maximum     
#----------------------------
#S: (Intercept)      0.93 ***
#                   (0.06)   
#S: x1               1.04 ***
#                   (0.07)   
#S: z                0.98 ***
#                   (0.07)   
#O: (Intercept)      0.99 ***
#                   (0.07)   
#O: x1               0.99 ***
#                   (0.05)   
#O: x2               0.96 ***
#                   (0.04)   
#sigma               0.99 ***
#                   (0.03)   
#rho                -0.13    
#                   (0.16)   
#----------------------------
#AIC              2718.13    
#BIC              2757.40    
#Log Likelihood  -1351.07    
#Num. obs.        1000       
#Censored          293       
#Observed          707       
#============================
#*** p < 0.001; ** p < 0.01; * p < 0.05

Now, compare MLE values of sigma, rho, beta, theta parameters, obtained with the above two approaches.