I have some data in R:
set.seed(123)
n <- 100
xt <- rnorm(n)
yt <- rnorm(n)
df <- data.frame(
  time = 1:n,
  y = yt,
  y_lag = c(NA, yt[-n]),
  x = xt
)
df_clean <- na.omit(df)
I want to fit this model (https://stats.stackexchange.com/a/660345/430997):

$$y_t = c + \phi_1 y_{t-1} + \beta x_t + \varepsilon_t, \qquad \beta = e^{\alpha}, \quad \varepsilon_t \overset{\text{IID}}{\sim} \text{N}(0, \sigma^2).$$
I read about the optim() function in R and tried to write my own estimation procedure for this model. First, I define the negative log-likelihood:
neg_loglik <- function(params, data) {
  c     <- as.numeric(params[1])
  phi1  <- as.numeric(params[2])
  alpha <- as.numeric(params[3])
  sigma <- exp(as.numeric(params[4]))
  y     <- as.numeric(data$y)
  y_lag <- as.numeric(data$y_lag)
  x     <- as.numeric(data$x)
  mu <- c + phi1*y_lag + exp(alpha)*x
  if (any(is.na(mu)) || any(is.infinite(mu))) return(Inf)
  neg_ll <- -sum(dnorm(y, mean = mu, sd = sigma, log = TRUE), na.rm = TRUE)
  if (is.na(neg_ll) || is.infinite(neg_ll)) return(Inf)
  return(neg_ll)
}
From here, I defined starting parameters. I am not sure if there is a principled way to do this, so I set some manually:
start_params <- c(0, 0.5, 0, 0)
Then I ran the optim() function to estimate the parameters and their variances (I know from my statistics classes that the variance of the estimators is related to the Hessian):
fit <- optim(par = start_params,
             fn = neg_loglik,
             data = df_clean,
             method = "BFGS",
             hessian = TRUE)
params_est <- fit$par
final_params <- c(
  c = params_est[1],
  phi1 = params_est[2],
  beta = exp(params_est[3]),
  sigma = exp(params_est[4])
)
se <- sqrt(diag(solve(fit$hessian)))
Is this the correct way to do this in R?
Rather than giving you a fish, I'm going to try to teach you to fish. When you have code like this that estimates the parameters of a statistical model, a good way to test the code is to simulate that model, generate estimates from your code, and see if the estimates are anywhere near the true parameters used in the simulation.
Here is some sample code to generate N = 1000 simulated datasets from your model:
#Set model parameters
n     <- 1000
c     <- 1.2
phi1  <- 0.2
alpha <- -1
sigma <- 2
sigx  <- 4

#Set number of simulations
N <- 1000

#Simulate data from model
set.seed(1)
SIMS <- vector(mode = 'list', length = N)
for (i in 1:N) {
  EE <- rnorm(n, mean = 0, sd = sigma)
  XX <- rnorm(n, mean = 0, sd = sigx)
  YY <- rep(0, n)
  YY[1] <- c + phi1*rnorm(1) + exp(alpha)*XX[1] + EE[1]
  for (t in 2:n) { YY[t] <- c + phi1*YY[t-1] + exp(alpha)*XX[t] + EE[t] }
  SIMS[[i]] <- data.frame(y = YY[2:n], y_lag = YY[1:(n-1)], x = XX[2:n]) }
Now we test your code to see if it estimates reasonably well:
#Generate estimates from simulations
EST <- data.frame(c = rep(0, N), phi1 = rep(0, N),
                  alpha = rep(0, N), sigma = rep(0, N))
for (i in 1:N) {
  SIMDATA <- SIMS[[i]]
  FIT <- optim(par = start_params, fn = neg_loglik, data = SIMDATA,
               method = "BFGS", hessian = TRUE)
  EST[i, ] <- FIT$par }
#Compare estimates to true parameters
mean(EST$c) - c
[1] 0.0004657322
mean(EST$phi1) - phi1
[1] -0.0007644282
mean(EST$alpha) - alpha
[1] 0.0009737879
mean(exp(EST$sigma)) - sigma
[1] -0.00233892
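You can push the same simulation check further to validate the Hessian-based standard errors: averaged over simulations, those standard errors should be close to the Monte Carlo standard deviations of the estimates. Here is a minimal sketch of that check (SES is a new name introduced here; it reruns the loop above for clarity, but you could equally store the standard errors inside the original loop):

#Collect Hessian-based standard errors from the simulations
SES <- data.frame(c = rep(0, N), phi1 = rep(0, N),
                  alpha = rep(0, N), sigma = rep(0, N))
for (i in 1:N) {
  SIMDATA <- SIMS[[i]]
  FIT <- optim(par = start_params, fn = neg_loglik, data = SIMDATA,
               method = "BFGS", hessian = TRUE)
  SES[i, ] <- sqrt(diag(solve(FIT$hessian))) }

#Average Hessian-based standard error vs Monte Carlo standard deviation
colMeans(SES)
apply(EST, 2, sd)

If the two sets of numbers are close, the asymptotic standard errors from your code are behaving well at this sample size.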
Finally, some advice to slightly improve your code: (1) it is better to set sigma to be the error standard deviation rather than its logarithm, or to define a new variable logsigma, to avoid confusion; (2) rather than using arbitrary starting values, it is better to use starting values that come from optimisation of a closely related problem with an analytic solution (in this case, OLS estimation with adjustment of the beta parameter gives a close approximation to the optima, so it would make a good starting value for the algorithm); and (3) you can tie your code together into a single fitting function that takes the input data, defines the log-likelihood, and then optimises and gives useful output.
Here is an example where I have implemented these improvements in the function fit.model. This function takes in a data frame with a y and an x variable and fits it to your model. It outputs various useful quantities including the MLEs, asymptotic standard errors, asymptotic variance matrix, maximised log-likelihood, the maximised likelihood-per-data-point, and the convergence/message outputs from the optim function. (Bear in mind that a slight problem here is that there is a reasonable chance of getting an estimator on the boundary of the beta parameter, in which case the asymptotic variance may be invalid.)
#Set estimation function
fit.model <- function(data, ...) {

  #Check input data and extract variables
  if (!is.data.frame(data)) stop('Error: Input data should be a data frame')
  nn <- nrow(data)
  if (nn < 2) stop('Error: Input data must have at least two data points')
  xx <- data$x
  yy <- data$y
  if (is.null(yy)) stop('Error: Input data should contain a y variable')
  if (is.null(xx)) stop('Error: Input data should contain an x variable')
  if (!is.numeric(yy)) stop('Error: Variable y in input data should be numeric')
  if (!is.numeric(xx)) stop('Error: Variable x in input data should be numeric')
  if (sum(is.na(yy)) > 0) stop('Error: Variable y in input data cannot contain NA values')
  if (sum(is.na(xx)) > 0) stop('Error: Variable x in input data cannot contain NA values')
  if (max(abs(yy)) == Inf) stop('Error: Variable y in input data must be finite')
  if (max(abs(xx)) == Inf) stop('Error: Variable x in input data must be finite')
  y     <- yy[2:nn]
  y_lag <- yy[1:(nn-1)]
  x     <- xx[2:nn]
  n     <- nn-1

  #Set new data frame
  DATA <- data.frame(y = y, y_lag = y_lag, x = x)

  #Set negative log-likelihood function
  NEGLOGLIKE <- function(pars) {
    c        <- pars[1]
    phi      <- pars[2]
    logbeta  <- pars[3]
    logsigma <- pars[4]
    mu <- c + phi*y_lag + exp(logbeta)*x
    -sum(dnorm(y, mean = mu, sd = exp(logsigma), log = TRUE)) }

  #Set starting parameters from OLS
  #(clamp the beta coefficient away from zero before taking logs)
  REG <- lm(y ~ y_lag + x, data = DATA)
  c0   <- unname(REG$coef[1])
  phi0 <- unname(REG$coef[2])
  logbeta0  <- log(max(unname(REG$coef[3]), exp(-100)))
  logsigma0 <- 0.5*log(sum(REG$residuals^2)/REG$df.residual)
  PAR0 <- c(c0, phi0, logbeta0, logsigma0)

  #Perform optimisation
  PAR.NAMES <- c('(Intercept)', 'phi', 'logbeta', 'logsigma')
  FIT <- optim(par = PAR0, fn = NEGLOGLIKE, method = "BFGS", hessian = TRUE, ...)
  CONVERGENCE <- FIT$convergence
  MESSAGE     <- FIT$message
  MAX.LOGLIKE <- -FIT$value
  MAX.LIKEPP  <- exp(MAX.LOGLIKE/n)
  MLE.VAR <- solve(FIT$hessian)
  MLE.SE  <- sqrt(diag(MLE.VAR))
  MLE <- FIT$par
  MLE.MAT <- matrix(c(MLE, MLE.SE), nrow = 2, byrow = TRUE)
  rownames(MLE.MAT) <- c('MLE', 'Std Err')
  colnames(MLE.MAT) <- PAR.NAMES
  rownames(MLE.VAR) <- PAR.NAMES
  colnames(MLE.VAR) <- PAR.NAMES

  #Set output
  OUT <- list(n = n,
              MLE = MLE.MAT, MLE.variance = MLE.VAR,
              max.loglike = MAX.LOGLIKE, max.likeperpoint = MAX.LIKEPP,
              optim.convergence = CONVERGENCE, optim.message = MESSAGE)

  #Give output
  OUT }
Here is what you get when you use this function on the data in your question.
FIT.MODEL <- fit.model(df)
FIT.MODEL

$n
[1] 99

$MLE
        (Intercept)        phi   logbeta    logsigma
MLE     -0.11400973 -0.1291760 -6.661433 -0.04390831
Std Err  0.09668425  0.1001072 12.300076  0.07106554

$MLE.variance
              (Intercept)           phi       logbeta      logsigma
(Intercept)  9.347844e-03  9.690700e-04  -0.019090086 -6.909620e-07
phi          9.690700e-04  1.002144e-02  -0.003317831 -4.928774e-07
logbeta     -1.909009e-02 -3.317831e-03 151.291859518  9.875562e-03
logsigma    -6.909620e-07 -4.928774e-07   0.009875562  5.050310e-03

$max.loglike
[1] -136.1362

$max.likeperpoint
[1] 0.2528111

$optim.convergence
[1] 0

$optim.message
NULL
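One last usage note: the MLE output above is on the optimisation scale, so logbeta and logsigma need to be exponentiated if you want beta and sigma themselves. A minimal sketch of that back-transformation is below, using the standard first-order delta method approximation SE(exp(theta)) ≈ exp(theta)·SE(theta); as flagged earlier, when an estimate sits near the boundary (as logbeta does here, with its enormous standard error) this approximation should not be trusted.

#Back-transform the MLEs of logbeta and logsigma to the natural scale
MLE.ROW <- FIT.MODEL$MLE['MLE', ]
SE.ROW  <- FIT.MODEL$MLE['Std Err', ]
beta.hat  <- unname(exp(MLE.ROW['logbeta']))
sigma.hat <- unname(exp(MLE.ROW['logsigma']))

#Delta method standard errors on the natural scale
beta.se  <- unname(beta.hat*SE.ROW['logbeta'])
sigma.se <- unname(sigma.hat*SE.ROW['logsigma'])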