I'm trying to estimate proportions and CIs for each of four levels of my variable sexual_orientation
(straight, gay/lesbian, bisexual, other) for each county in California using the survey
package, to account for complex survey design. What I expect to get is this (thanks to @jpsmith for looping code using svyby
):
county mean ci_l ci_u sex_orient
0.A A 0.911193691 0.8520671385 0.94812694 0
1.A A 0.036149402 0.0149796639 0.08466537 1
2.A A 0.042363554 0.0196451241 0.08897020 2
3.A A 0.010293353 0.0018801543 0.05430515 3
0.B B 0.923758035 0.8636429959 0.95863968 0
1.B B 0.036042649 0.0141337843 0.08885182 1
2.B B 0.038706656 0.0164219903 0.08851038 2
3.B B 0.001492660 0.0002056946 0.01074521 3
When I try to wrap the loop code in a function, though, I get only the proportions for the last level ("other") of my sexual_orientation
variable:
0.A A 0.010293353 0.0018801543 0.05430515 0
1.A A 0.010293353 0.0018801543 0.05430515 1
2.A A 0.010293353 0.0018801543 0.05430515 2
3.A A 0.010293353 0.0018801543 0.05430515 3
0.B B 0.001492660 0.0002056946 0.01074521 0
1.B B 0.001492660 0.0002056946 0.01074521 1
2.B B 0.001492660 0.0002056946 0.01074521 2
3.B B 0.001492660 0.0002056946 0.01074521 3
library(survey)
library(dplyr)
#Create df
# Set the number of trials and the probabilities for each outcome
set.seed(130)
probs <- c(.94, .03, .02, .01)
n <- 2000
sogi <- data.frame(t(rmultinom(n, 1, prob = probs)))
names(sogi) <- c("straight","gay","bisexual","other")
df_sogi <- sogi %>% mutate(sexual_orientation =
as.factor(case_when(
straight %in% 1 ~ 0,
gay %in% 1 ~ 1,
bisexual %in% 1 ~ 2,
other %in% 1 ~ 3))) %>%
mutate(pw = runif(n, 0.2, 3.5)) %>%
mutate(county = as.factor(rep(LETTERS[1:10], each = n/length(LETTERS[1:10])))) %>%
mutate(ind = row_number()) %>%
select(ind, county, sexual_orientation, pw)
table(df_sogi$county, df_sogi$sexual_orientation)
#Make survey design and survey design replicate objects
df_svy_z1 <- svydesign(id=~ind, data = df_sogi, weights = ~pw, strata = ~county)
###In function taking df arg
get_table <- function(df) {
svCounty <- list()
for(i in sort(unique(df[ , "sexual_orientation"]))) {
svCounty[[i]] <- svyby(~ I(sexual_orientation == i),
design = df_svy_z1,
by = ~ county,
FUN = svyciprop,
vartype = "ci",
method = "xlogit")
svCounty[[i]]$sexual_orientation <- i
names(svCounty[[i]])[2] <- "mean"
}
return(svCounty)
}
do.call(rbind, get_table(df_sogi))[order(do.call(rbind, get_table(df_sogi))$county), ]
###In function looping through values of sexual_orientation
svCounty <- list()
get_table <- function(i) {
svCounty[[i]] <- svyby(~ I(sexual_orientation == i),
design = df_svy_z1,
by = ~ county,
FUN = svyciprop,
vartype = "ci",
method = "xlogit")
svCounty[[i]]$sex_orient <- i
names(svCounty[[i]])[2] <- "mean"
svCounty[[i]]
}
rbindlist(lapply(sort(unique(df_sogi$sexual_orientation)), get_table))
How can I make these functions work, please?
Using sprintf
and as.formula
. Since we have a factor variable we can exploit levels
. I'd rather use the "survey.design"
instead of the raw data to avoid confusion.
> library(survey)
>
> lapply(levels(df_svy_z1$variables$sexual_orientation), \(x) {
+ fo <- sprintf('~ I(sexual_orientation == %s)', x) |> as.formula()
+ fit <- svyby(formula=fo, design=df_svy_z1, by=~county,
+ FUN=svyciprop, vartype="ci", method="xlogit")
+ names(fit)[2] <- 'mean'
+ fit$sexual_orientation <- x
+ fit
+ }) |> do.call(what='rbind')
county mean ci_l ci_u sexual_orientation
A A 0.911193691 0.8520671385 0.94812694 0
B B 0.923758035 0.8636429959 0.95863968 0
C C 0.929524740 0.8762428563 0.96089073 0
D D 0.914172405 0.8548171915 0.95066197 0
E E 0.927501303 0.8739160975 0.95937173 0
F F 0.959591984 0.9139878698 0.98150581 0
G G 0.916107681 0.8583546261 0.95163978 0
H H 0.968962680 0.9313564497 0.98627013 0
I I 0.919874178 0.8655967205 0.95341196 0
J J 0.919018480 0.8629143921 0.95340118 0
A1 A 0.036149402 0.0149796639 0.08466537 1
B1 B 0.036042649 0.0141337843 0.08885182 1
C1 C 0.025837059 0.0097931213 0.06640293 1
D1 D 0.053382252 0.0263936541 0.10499180 1
E1 E 0.046687261 0.0229193724 0.09276293 1
F1 F 0.028968138 0.0119900253 0.06832504 1
G1 G 0.028234286 0.0117621310 0.06622873 1
H1 H 0.017505999 0.0067078219 0.04490118 1
I1 I 0.058574701 0.0296281887 0.11252238 1
J1 J 0.036362103 0.0164375895 0.07850995 1
A2 A 0.042363554 0.0196451241 0.08897020 2
B2 B 0.038706656 0.0164219903 0.08851038 2
C2 C 0.036774521 0.0157330277 0.08356782 2
D2 D 0.011188560 0.0021662472 0.05569108 2
E2 E 0.017724015 0.0053455758 0.05712038 2
F2 F 0.011439877 0.0022008972 0.05723769 2
G2 G 0.040455477 0.0175027776 0.09072823 2
H2 H 0.005907451 0.0008172062 0.04139059 2
I2 I 0.013016323 0.0047080705 0.03546360 2
J2 J 0.037334343 0.0157769021 0.08578046 2
A3 A 0.010293353 0.0018801543 0.05430515 3
B3 B 0.001492660 0.0002056946 0.01074521 3
C3 C 0.007863680 0.0015179030 0.03968435 3
D3 D 0.021256783 0.0067288906 0.06509525 3
E3 E 0.008087422 0.0011207690 0.05593360 3
F3 F 0.000000000 NaN NaN 3
G3 G 0.015202557 0.0036968435 0.06034853 3
H3 H 0.007623870 0.0010562198 0.05286832 3
I3 I 0.008534798 0.0020115015 0.03546143 3
J3 J 0.007285074 0.0010090080 0.05062042 3
You can wrap this in a function like so:
get_table <- \(dsg) {
lapply(levels(dsg$variables$sexual_orientation), \(x) {
fo <- sprintf('~ I(sexual_orientation == %s)', x) |> as.formula()
fit <- svyby(formula=fo, design=dsg, by=~county,
FUN=svyciprop, vartype="ci", method="xlogit")
names(fit)[2] <- 'mean'
fit$sexual_orientation <- x
fit
}) |> do.call(what='rbind')
}
then
get_table(df_svy_z1)
Data:
n <- 2000
set.seed(130)
sogi <- data.frame(t(rmultinom(n, 1, prob=c(.94, .03, .02, .01)))) |>
setNames(c("straight", "gay", "bisexual", "other"))
df_sogi <- data.frame(ind=seq_len(n),
county=as.factor(rep(LETTERS[1:10], each=n/10)),
sexual_orientation=as.factor((0:3)[apply(sogi == 1, 1, which)]),
pw=runif(n, 0.2, 3.5))
df_svy_z1 <- survey::svydesign(id=~ind, data=df_sogi, weights=~pw, strata=~county)