rfor-loopregressionlapplysurvey

Why is this looping function returning one element of both for loop and lapply list?


The Goal

I'm trying to estimate proportions and CIs for each of four levels of my variable sexual_orientation (straight, gay/lesbian, bisexual, other) for each county in California using the survey package, to account for complex survey design. What I expect to get is this (thanks to @jpsmith for looping code using svyby):

    county        mean         ci_l       ci_u sex_orient
0.A      A 0.911193691 0.8520671385 0.94812694          0
1.A      A 0.036149402 0.0149796639 0.08466537          1
2.A      A 0.042363554 0.0196451241 0.08897020          2
3.A      A 0.010293353 0.0018801543 0.05430515          3
0.B      B 0.923758035 0.8636429959 0.95863968          0
1.B      B 0.036042649 0.0141337843 0.08885182          1
2.B      B 0.038706656 0.0164219903 0.08851038          2
3.B      B 0.001492660 0.0002056946 0.01074521          3

The Problem

When I try to wrap the loop code in a function, though, I get only the proportions for the last level ("other") of my sexual_orientation variable:

0.A      A 0.010293353 0.0018801543 0.05430515                  0
1.A      A 0.010293353 0.0018801543 0.05430515                  1
2.A      A 0.010293353 0.0018801543 0.05430515                  2
3.A      A 0.010293353 0.0018801543 0.05430515                  3
0.B      B 0.001492660 0.0002056946 0.01074521                  0
1.B      B 0.001492660 0.0002056946 0.01074521                  1
2.B      B 0.001492660 0.0002056946 0.01074521                  2
3.B      B 0.001492660 0.0002056946 0.01074521                  3

Reprex

library(survey)
library(dplyr)

#Create df
# Set the number of trials and the probabilities for each outcome
set.seed(130)
probs <- c(.94, .03, .02, .01)
n <- 2000

sogi <- data.frame(t(rmultinom(n, 1, prob = probs)))
names(sogi) <- c("straight","gay","bisexual","other")
df_sogi <- sogi %>% mutate(sexual_orientation =
                             as.factor(case_when(
                               straight %in% 1 ~ 0,
                               gay %in% 1 ~ 1,
                               bisexual %in% 1 ~ 2,
                               other %in% 1 ~ 3))) %>%
  mutate(pw = runif(n, 0.2, 3.5)) %>%
  mutate(county = as.factor(rep(LETTERS[1:10], each = n/length(LETTERS[1:10])))) %>%
  mutate(ind = row_number()) %>%
  select(ind, county, sexual_orientation, pw)

table(df_sogi$county, df_sogi$sexual_orientation)

#Make survey design and survey design replicate objects
df_svy_z1 <- svydesign(id=~ind, data = df_sogi, weights = ~pw, strata = ~county)

Attempt 1

###In function taking df arg
get_table <- function(df) {
  svCounty <- list()
  for(i in sort(unique(df[ , "sexual_orientation"]))) {
    svCounty[[i]]  <- svyby(~ I(sexual_orientation == i),
                           design = df_svy_z1,
                           by = ~ county,
                           FUN = svyciprop,
                           vartype = "ci",
                           method = "xlogit") 
    
    svCounty[[i]]$sexual_orientation <- i
    names(svCounty[[i]])[2] <- "mean"
  }
  return(svCounty)
}

do.call(rbind, get_table(df_sogi))[order(do.call(rbind, get_table(df_sogi))$county), ]

Attempt 2

###In function looping through values of sexual_orientation
svCounty <- list()
get_table <- function(i) {
  svCounty[[i]] <- svyby(~ I(sexual_orientation == i),
                         design = df_svy_z1,
                         by = ~ county,
                         FUN = svyciprop,
                         vartype = "ci",
                         method = "xlogit")
  svCounty[[i]]$sex_orient <- i
  names(svCounty[[i]])[2] <- "mean"
  
  svCounty[[i]]
}

rbindlist(lapply(sort(unique(df_sogi$sexual_orientation)), get_table))

How can I make these functions work, please?


Solution

  • Using sprintf and as.formula. Since we have a factor variable we can exploit levels. I'd rather use the "survey.design" instead of the raw data to avoid confusion.

    > library(survey)
    > 
    > lapply(levels(df_svy_z1$variables$sexual_orientation), \(x) {
    +   fo <- sprintf('~ I(sexual_orientation == %s)', x) |> as.formula()
    +   fit <- svyby(formula=fo, design=df_svy_z1, by=~county, 
    +         FUN=svyciprop, vartype="ci", method="xlogit")
    +   names(fit)[2] <- 'mean'
    +   fit$sexual_orientation <- x
    +   fit
    + }) |> do.call(what='rbind')
       county        mean         ci_l       ci_u sexual_orientation
    A       A 0.911193691 0.8520671385 0.94812694                  0
    B       B 0.923758035 0.8636429959 0.95863968                  0
    C       C 0.929524740 0.8762428563 0.96089073                  0
    D       D 0.914172405 0.8548171915 0.95066197                  0
    E       E 0.927501303 0.8739160975 0.95937173                  0
    F       F 0.959591984 0.9139878698 0.98150581                  0
    G       G 0.916107681 0.8583546261 0.95163978                  0
    H       H 0.968962680 0.9313564497 0.98627013                  0
    I       I 0.919874178 0.8655967205 0.95341196                  0
    J       J 0.919018480 0.8629143921 0.95340118                  0
    A1      A 0.036149402 0.0149796639 0.08466537                  1
    B1      B 0.036042649 0.0141337843 0.08885182                  1
    C1      C 0.025837059 0.0097931213 0.06640293                  1
    D1      D 0.053382252 0.0263936541 0.10499180                  1
    E1      E 0.046687261 0.0229193724 0.09276293                  1
    F1      F 0.028968138 0.0119900253 0.06832504                  1
    G1      G 0.028234286 0.0117621310 0.06622873                  1
    H1      H 0.017505999 0.0067078219 0.04490118                  1
    I1      I 0.058574701 0.0296281887 0.11252238                  1
    J1      J 0.036362103 0.0164375895 0.07850995                  1
    A2      A 0.042363554 0.0196451241 0.08897020                  2
    B2      B 0.038706656 0.0164219903 0.08851038                  2
    C2      C 0.036774521 0.0157330277 0.08356782                  2
    D2      D 0.011188560 0.0021662472 0.05569108                  2
    E2      E 0.017724015 0.0053455758 0.05712038                  2
    F2      F 0.011439877 0.0022008972 0.05723769                  2
    G2      G 0.040455477 0.0175027776 0.09072823                  2
    H2      H 0.005907451 0.0008172062 0.04139059                  2
    I2      I 0.013016323 0.0047080705 0.03546360                  2
    J2      J 0.037334343 0.0157769021 0.08578046                  2
    A3      A 0.010293353 0.0018801543 0.05430515                  3
    B3      B 0.001492660 0.0002056946 0.01074521                  3
    C3      C 0.007863680 0.0015179030 0.03968435                  3
    D3      D 0.021256783 0.0067288906 0.06509525                  3
    E3      E 0.008087422 0.0011207690 0.05593360                  3
    F3      F 0.000000000          NaN        NaN                  3
    G3      G 0.015202557 0.0036968435 0.06034853                  3
    H3      H 0.007623870 0.0010562198 0.05286832                  3
    I3      I 0.008534798 0.0020115015 0.03546143                  3
    J3      J 0.007285074 0.0010090080 0.05062042                  3
    

    You can wrap this in a function like so:

    get_table <- \(dsg) {
      lapply(levels(dsg$variables$sexual_orientation), \(x) {
        fo <- sprintf('~ I(sexual_orientation == %s)', x) |> as.formula()
        fit <- svyby(formula=fo, design=dsg, by=~county, 
                     FUN=svyciprop, vartype="ci", method="xlogit")
        names(fit)[2] <- 'mean'
        fit$sexual_orientation <- x
        fit
      }) |> do.call(what='rbind')      
    }
    

    then

    get_table(df_svy_z1)
    

    Data:

    n <- 2000
    set.seed(130)
    sogi <- data.frame(t(rmultinom(n, 1, prob=c(.94, .03, .02, .01)))) |>
      setNames(c("straight", "gay", "bisexual", "other"))
    df_sogi <- data.frame(ind=seq_len(n),
                county=as.factor(rep(LETTERS[1:10], each=n/10)),
                sexual_orientation=as.factor((0:3)[apply(sogi == 1, 1, which)]),
                pw=runif(n, 0.2, 3.5))
    df_svy_z1 <- survey::svydesign(id=~ind, data=df_sogi, weights=~pw, strata=~county)