rfor-loopsimulationnested-loops

R For loops and Simulations



set.seed(2024)
#conditions

b1<-c(-0.3,0.3)
b2<-c(-0.3,0.3)


conditions <- expand.grid(b1 = b1, b2 = b2)

# some other conditions 
sample_size <- 200
nsim<- 10 #number of simulations 

#empty holders 

xcoef<-numeric()
zcoef<-numeric()
int_pvalue<-numeric()

results <- data.frame(
  condition_id = integer(),
  xcoef = numeric(),
  zcoef = numeric(),
  t1er = numeric()
)


for (i in 1:nrow(conditions)){
  
  xcoef <- numeric(nsim) # Reinitialize storage for x coefficients
  zcoef <- numeric(nsim) # Reinitialize storage for z coefficients
  int_pvalue <- numeric(nsim) # Reinitialize storage for p-values
  
  for( j in 1:nsim){
  
  #true data generation
   sigma<-matrix(c(1,0,
                  0,1),2,2)
   XZ<-as.data.frame(MASS::mvrnorm(n=sample_size,mu=c(0,0),Sigma=sigma))
   colnames(XZ)<-c("x","z")
   XZ$err<-rnorm(n=sample_size,mean=0,sd=1)
 
   
  XZ$True_y <- 
               conditions$b1[i] * XZ$x +
               conditions$b2[i] * XZ$z +
               XZ$err
  
  lmodel<-summary(lm(True_y ~ x * z, data=XZ))
   xcoef[[j]]<-lmodel$coefficients[["x","Estimate"]]
   zcoef[[j]]<-lmodel$coefficients[["z","Estimate"]]
   
   
  # Calculate aggregated results
  results$condition_id<-i
  results$xcoef[i] <- mean(xcoef) # Mean x coefficient estimate
  results$zcoef[i] <- mean(zcoef) # Mean z coefficient estimate
 
  
  }
  


  }

the error message pops up.

Further, I've noticed that

Calculate aggregated results

this part, it is not calculating aggrgated results, rather, it only put the last nsim (nested loop)'s results.

I wanted to aggregate the results across the number of simulations to get mean of each estimates and type 1 error rate.


Solution

  • Something like this?
    Rewrite the loops as functions.

    The result below does not include the condition numbers 1 to 4 but that is just a matter of cbind(condition = 1:4, result).

    sim_one <- function(condition, n, mu, sigma) {
      # true data generation
      XZ <- MASS::mvrnorm(n = n, mu = mu, Sigma = sigma)
      colnames(XZ) <- c("x", "z")
      err <- rnorm(n = sample_size, mean = 0, sd = 1)
      True_y <- c(condition %*% t(XZ)) + err
      XZ <- cbind.data.frame(XZ, True_y)
      
      # fit one regression
      lm(True_y ~ x*z, data = XZ)
    }
    sim_many <- function(condition, R, n, mu, sigma) {
      # run nsim regressions
      fit_list <- replicate(R, sim_one(condition, n, mu, sigma), simplify = FALSE)
      
      # extract the coefficients and compute their mean values
      mean_xz <- sapply(fit_list, \(fit) coef(fit)[c("x", "z")]) |> rowMeans()
      mean_xz <- unname(mean_xz)
      
      # get the F statistics mean value
      ff <- sapply(fit_list, \(fit) summary(fit)[["fstatistic"]][1L]) |> mean()
      
      # the degrees of freedom are always the same, extract them
      # from any of the fits, in this case from the 1st
      df12 <- summary(fit_list[[1L]])[["fstatistic"]][2:3]
      
      # p-value of the average F stat
      t1er <- pf(ff, df1 = df12[1L], df2 = df12[2L], lower.tail = FALSE)
      
      # return a named vector
      c(xcoef = mean_xz[1L], zcoef = mean_xz[2L], t1er = t1er)
    }
    
    # conditions
    b1 <- c(-0.3, 0.3)
    b2 <- c(-0.3, 0.3)
    conditions <- expand.grid(b1 = b1, b2 = b2) |> as.matrix()
    ncond <- nrow(conditions)
    
    # some other conditions 
    sample_size <- 200
    nsim <- 10 # number of simulations 
    
    mu <- c(0, 0)
    sigma <- matrix(c(1, 0, 0, 1), 2, 2)
    
    set.seed(2024)
    t(apply(conditions, 1L, sim_many, R = nsim, n = sample_size, mu = mu, sigma = sigma))
    #>           xcoef      zcoef         t1er
    #> [1,] -0.3316541 -0.2840518 4.240253e-08
    #> [2,]  0.2629164 -0.2967959 7.393880e-07
    #> [3,] -0.3222810  0.2615112 8.463187e-08
    #> [4,]  0.2823582  0.3164789 2.251141e-08
    

    Created on 2024-12-01 with reprex v2.1.1