I have survey data I'm creating contingency tables for. I have 3 specific variables I want to both loop over and subset my survey object with the factor levels of those 3 specific variables to get the proportions I am seeking.
I have read several posts on SO, but the two posts which get me extremely close to my intended goal are here (rafa pereira's reply) and here.
My problem is in the combining of both solutions for my purposes, which encounters various errors I haven't been able to resolve.
here's a reprex with my initial solution:
library(dplyr)
library(survey)
df<-structure(list(USOC_Wave = c(2, 6, 9, 5, 11, 11, 2, 2, 4, 3,
10, 8, 11, 11, 11, 5, 6, 10, 6, 1, 9, 4, 9, 4, 11, 12, 5, 2,
10, 11, 7, 5, 4, 11, 6, 10, 9, 13, 3, 7, 5, 10, 8, 7, 6, 12,
12, 1, 12, 5), ethnicity = c("White", "White", "White", "Asian",
"Asian", "White", "White", "White", "White", "White", "White",
"Asian", "White", "White", "White", "White", "White", "Asian",
"White", "White", NA, "Asian", "Asian", "White", "White", "White",
"White", "White", "White", "White", "White", "White", "Asian",
"White", "White", "Asian", "White", "White", "White", "White",
"White", "White", "White", "White", "White", "White", "Asian",
"White", "White", "Mixed"), sex = c("Men", "Men", "Men", "Men",
"Men", "Women", "Men", "Men", "Women", "Men", "Women", "Women",
"Women", "Women", "Women", "Women", "Women", "Men", "Women",
"Women", "Men", "Men", "Men", "Men", "Women", "Women", "Women",
"Men", "Women", "Women", "Women", "Men", "Women", "Women", "Women",
"Men", "Women", "Women", "Women", "Men", "Women", "Women", "Women",
"Women", "Women", "Men", "Men", "Men", "Women", "Women"), age = c("16-29",
"30-64", "30-64", "30-64", "30-64", "30-64", "16-29", "65+",
"30-64", "30-64", "30-64", "30-64", "30-64", "65+", "30-64",
"30-64", "30-64", "30-64", "65+", "30-64", "16-29", "65+", "16-29",
"16-29", "16-29", "30-64", "30-64", "30-64", "65+", "16-29",
"30-64", "30-64", "65+", "30-64", "30-64", "30-64", "16-29",
"16-29", "30-64", "30-64", "30-64", "30-64", "30-64", "30-64",
"16-29", "30-64", "65+", "65+", "30-64", "30-64"), strata = c(2902,
3165, 3069, 2108, 3943, 2683, 2521, 3175, 3232, 3256, 42, 3401,
2326, 2108, 701, 2074, 1, 5122, 12, 2721, 5122, 3991, 3717, 3157,
2311, 101, 2717, 118, 2425, 2584, 2523, 2222, 2400, 2729, 2199,
3361, 10, 2427, 2151, 2584, 2327, 2, 2750, 3297, 2363, 114, 2750,
2574, 2843, 4121), psu = c(3804, 4330, 4138, 2215, 38089, 3365,
3041, 4350, 4464, 4512, 156, 11187, 2651, 2216, 1672, 2147, 3,
52063, 47, 3441, 52086, 40537, 26666, 4314, 2621, 403, 3433,
458, 2849, 3168, 3045, 2443, 2800, 3458, 2397, 9013, 31, 2854,
2302, 3168, 2653, 6, 3500, 4593, 2725, 447, 3499, 3148, 3686,
46785), weight_cs = c(2.80231904983521, 0, 0.950280964374542,
0.28423735499382, 0.300251632928848, 0.766829490661621, 2.18452429771423,
0.680638015270233, 0.224062830209732, 2.74595475196838, 0.718028843402863,
0.340109616518021, 2.88688373565674, 1.17885708808899, 0.620745718479156,
1.20946884155273, 0.57785838842392, 0.305908054113388, 0.727640688419342,
1.17930126190186, 0, 0.623862087726593, 0.372526079416275, 0,
1.3677384853363, 2.87374138832092, 1.31425619125366, 0.462548196315765,
1.18157768249512, 0.814507722854614, 1.21053576469421, 2.14700984954834,
0.449016481637955, 1.151535987854, 0.790829658508301, 0.359708696603775,
3.43058443069458, 0.309507787227631, 1.17791354656219, 1.70297181606293,
0.741691768169403, 1.52170836925507, 0, 0.989463746547699, 1.34024882316589,
0.842447340488434, 0.869455099105835, 0.846965670585632, 3.40495872497559,
0.730816066265106), unpaid = c(2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1)), row.names = c(NA,
-50L), class = c("tbl_df", "tbl", "data.frame"))
uos_design<-survey::svydesign(id= ~psu, strata= ~strata, survey.lonely.psu="adjust",
weights= ~weight_cs, data=df)
Here is the an initial solution that allows me to subset dynamically and then bind into a dataframe, let's call it #1:
groups <- unique(df$ethnicity) #get unique levels
tablefun <- function(i){svytable(~USOC_Wave+unpaid+ethnicity,
design = subset(uos_design, ethnicity == i)) %>%
prop.table(margin = 1)
}
results <- do.call(rbind, lapply(groups, tablefun)) #into df
and here is #2 which allows me to iterate object x in the formula of svytable:
lapply(names(df[c("ethnicity","sex","age")]), function(x){
svytable(bquote(~.(as.name(x)) +USOC_Wave + unpaid),
design = uos_design) %>%
prop.table(margin = 1)
} )
I am aware of possible dplyr and srvyr resolutions to my challenge but srvyr::summarise is taking forever to run (possibly due to this issue) whereas survey is painlessly quick - ergo tidyverse solutions are unlikely to work for my use case, which pains me.
#1 is my perfect solution, it just lacks automation. I can just manually type #1 for my other variables (sex
and age
), but surely there must be a way to combine both?
I have tried adapting the two solutions by replacing ethnicity
with y
and calling y
in both a function and a for loop that preceded the existing function(i)
, and tried using mapply instead of lapply too but am getting stuck.
Help is gratefully received!
Try the following, which makes use of nesting your functions. The results are not ideally suited for rbind
ing into a data frame, so I left it as a named list.
vars <- c('ethnicity', 'sex', 'age')
f <- function(var) {
form <- as.formula(paste0("~USOC_Wave+unpaid"))
groups <- na.omit(unique(df[,var, drop=TRUE])) # get unique levels
tablefun <- function(grp, var){
design <- subset(uos_design, get(var)==grp)
svytable(form, design) |> prop.table(margin=1)
}
setNames(lapply(groups, tablefun, var=var), groups)
}
result <- setNames(lapply(vars, f), vars); result
$ethnicity
$ethnicity$White
unpaid
USOC_Wave 1 2
1 0.41799314 0.58200686
2 0.07545611 0.92454389
3 0.00000000 1.00000000
4 0.00000000 1.00000000
5 0.39668156 0.60331844
6 0.00000000 1.00000000
7 0.56367299 0.43632701
8
9 0.00000000 1.00000000
10 0.00000000 1.00000000
11 0.00000000 1.00000000
12 0.00000000 1.00000000
13 0.00000000 1.00000000
$ethnicity$Asian
unpaid
USOC_Wave 2
4 1
5 1
8 1
9 1
10 1
11 1
12 1
$ethnicity$Mixed
unpaid
USOC_Wave 1
5 1
$sex
$sex$Men
unpaid
USOC_Wave 1 2
1 1.00000000 0.00000000
2 0.07545611 0.92454389
3 0.00000000 1.00000000
4 0.00000000 1.00000000
5 0.88308990 0.11691010
6
7 0.00000000 1.00000000
9 0.00000000 1.00000000
10 0.00000000 1.00000000
11 0.00000000 1.00000000
12 0.00000000 1.00000000
$sex$Women
unpaid
USOC_Wave 1 2
1 0.0000000 1.0000000
3 0.0000000 1.0000000
4 0.0000000 1.0000000
5 0.1828762 0.8171238
6 0.0000000 1.0000000
7 1.0000000 0.0000000
8 0.0000000 1.0000000
9 0.0000000 1.0000000
10 0.0000000 1.0000000
11 0.0000000 1.0000000
12 0.0000000 1.0000000
13 0.0000000 1.0000000
$age
$age$`16-29`
unpaid
USOC_Wave 2
2 1
4
6 1
9 1
11 1
13 1
$age$`30-64`
unpaid
USOC_Wave 1 2
1 0.0000000 1.0000000
2 1.0000000 0.0000000
3 0.0000000 1.0000000
4 0.0000000 1.0000000
5 0.4477378 0.5522622
6 0.0000000 1.0000000
7 0.5636730 0.4363270
8 0.0000000 1.0000000
9 0.0000000 1.0000000
10 0.0000000 1.0000000
11 0.0000000 1.0000000
12 0.0000000 1.0000000
$age$`65+`
unpaid
USOC_Wave 1 2
1 1 0
2 0 1
4 0 1
6 0 1
10 0 1
11 0 1
12 0 1