I want to duplicate the resuls of sampleSelection package. However, my code does not work.
library(mvtnorm)
set.seed(123)
n = 1000
sigmau <- 1; sigmav <- 1; rho <- 0.5
Sigma <- matrix(c(sigmau^2,rho*sigmau*sigmav,rho*sigmau*sigmav,sigmav^2),2,2)
x1 <- rnorm(n,0,1); x2 <- rnorm(n,0,1); z <- rnorm(n,0,1)
u = rmvnorm(n,c(0,0),Sigma)[,1]; v = rmvnorm(n,c(0,0),Sigma)[,2]
selection_latent <- 1 + x1 + z + v
selection <- as.numeric(selection_latent >= 0)
y <- ifelse(selection == 1, 1 + x1 + x2 + u, 0)
# Create the data frame 'df'
df_sel <- data.frame(id = 1:n,x1,x2,z,u = u,v,selection_latent,selection,y)
## Estimation
library(sampleSelection)
library(texreg)
sample_selection1 <- selection(selection = selection ~ 1 + x1 + z,outcome = y ~ 1 + x1 + x2, data = df)
screenreg(sample_selection1)
I wrote below code, but I receive the error message "Error in optim(par = parameters, fn = f_selection, selection = selection, : function cannot be evaluated at initial parameters".
I would appriciate it if you find my error and modify the code. Thank you in advance.
## Sample Selection ##
Selection_Reg <- function(y_in, x_in, z_in, selection,df) {
x <- cbind(1,as.matrix(x_in))
z <- cbind(1,as.matrix(z_in))
lm1 <- lm(y~1+x1 +x2,data = df)
lm2 <- lm(y~1+x1 + z,data = df)
parameters <- c(1,1,as.numeric(lm1$coef),as.numeric(lm2$coef))
f_selection <- function(parameters,selection, y_in, x_in,z_in) {
sigma_u <- parameters[1]
rho <- parameters[2]
sigma_v <- 1
beta <- parameters[3:(ncol(x)+2)]
theta <- parameters[(ncol(x)+3):(length(parameters))]
xb <- x_in %*% beta
zt <- z_in %*% theta
z_adj <- (y_in-xb)/sigma_u
inside <- (zt + (rho*sigma_u/sigma_v)*(y_in-xb))/(((1-rho^2)*sigma_u^2)^(1/2))
log_lik <- (-log(sigma_u)+log(dnorm(z_adj))+log(pnorm(inside)))*selection +log(1 - pnorm(zt/sigma_v))*(1-selection)
return(-sum(log_lik))
}
result <- optim(par = parameters, fn = f_selection,selection = selection, y_in = y_in, x_in = x,z_in = z)
return(result)
}
Selection_Reg(y_in = df_sel$y,x_in = df_sel[,c("x1","x2")], z_in = df_sel[,c("x1","z")],selection = df_sel$selection,df = df_sel)
rho=1
in the initial values of the parameters passed to optim()
is the main issue, since it makes the variable inside
undefined. Changing rho=0.5
in the function Selection_Reg()
(same initialization as in the first case) makes the code work:
parameters <- c(1,0.5,as.numeric(lm1$coef), as.numeric(lm2$coef))
Now, running optim()
converges to the following (MLE) values:
Selection_Reg(y_in = df_sel$y,
x_in = df_sel[,c("x1","x2")],
z_in = df_sel[,c("x1","z")],
selection = df_sel$selection, df = df_sel)
# $par
# [1] 0.9460186 0.2447177 0.8799579 1.0583746 0.9466844 0.9414059 0.9505922 0.8971660
# $value
# [1] 1351.807
# $counts
# function gradient
# 501 NA
# $convergence
# [1] 1
Where in the first case the output is the following:
screenreg(sample_selection1)
#
#============================
# maximum
#----------------------------
#S: (Intercept) 0.93 ***
# (0.06)
#S: x1 1.04 ***
# (0.07)
#S: z 0.98 ***
# (0.07)
#O: (Intercept) 0.99 ***
# (0.07)
#O: x1 0.99 ***
# (0.05)
#O: x2 0.96 ***
# (0.04)
#sigma 0.99 ***
# (0.03)
#rho -0.13
# (0.16)
#----------------------------
#AIC 2718.13
#BIC 2757.40
#Log Likelihood -1351.07
#Num. obs. 1000
#Censored 293
#Observed 707
#============================
#*** p < 0.001; ** p < 0.01; * p < 0.05
Now, compare MLE values of sigma, rho, beta, theta parameters, obtained with the above two approaches.