structure(list(sex = c("Male", "Female", "Female", "Female",
"Male", "Female", "Female", "Male", "Female"), cigarettes_smoking_status = c("Non-smoker",
"Non-smoker", "Non-smoker", "Non-smoker", "Non-smoker", "Non-smoker",
"Non-smoker", "Regular Smoker", "Non-smoker")), row.names = 2:10, class = "data.frame")
smoking_status_by_per <- smoking_dataset %>%
group_by(cigarettes_smoking_status, sex) %>%
dplyr::summarise(count1=n()) %>%
mutate(percentage=(count1/sum(count1))*100) %>%
pivot_wider(names_from = sex, values_from = percentage) %>%
group_by(cigarettes_smoking_status)
I am having difficulty producing a percentage table in R that is condensed to 4 rows (Occasional smokers, Non-smokers, regular smokers and Prefer not to say) that clearly shows the percentage in each category by sex. Ideally, I am looking to produce a table in R that looks like this How I want the table to look:
I have been attempting to use janitor::tabyl
and pivot_wider
to condense the rows, so there are just 4 rows. One row for Regular smokers. One row for occasional smokers etc. This is what my current output looks like.
Current dodgy output:
We can use proportions
and some binding to get what you have in the example.
Starting with enough data to fill out the matrix,
set.seed(42)
quux <- data.frame(response = sample(c("Non-smoker", "Occasional smoker", "Prefer not to say", "Regular smoker"), size=5000, replace=TRUE), gender = sample(c("Male", "Female", "Prefer not to say", "Unknown"), size=5000, replace=TRUE))
head(quux)
# response gender
# 1 Non-smoker Unknown
# 2 Non-smoker Prefer not to say
# 3 Non-smoker Female
# 4 Non-smoker Unknown
# 5 Occasional smoker Female
# 6 Regular smoker Unknown
We can look at a simple table with:
table(quux)
# gender
# response Female Male Prefer not to say Unknown
# Non-smoker 330 294 323 312
# Occasional smoker 308 344 287 325
# Prefer not to say 292 337 310 304
# Regular smoker 309 308 311 306
For future verification, the sum of the first column (Female
) is 1239, and the expected column-wise percentages for that are
c(330, 308, 292, 309) / 1239
# [1] 0.2663438 0.2485876 0.2356739 0.2493947
We can get the percentages with
round(100 * proportions(table(quux), margin = 2), 2)
# gender
# response Female Male Prefer not to say Unknown
# Non-smoker 26.63 22.92 26.24 25.02
# Occasional smoker 24.86 26.81 23.31 26.06
# Prefer not to say 23.57 26.27 25.18 24.38
# Regular smoker 24.94 24.01 25.26 24.54
Do get the right-most (Total) and bottom summary, we'll need to bind things.
tbl1 <- do.call(table, quux)
tbl2 <- 100 * proportions(tbl1, margin = 2)
tbl3 <- rbind(tbl2, `Number of Respondents` = colSums(tbl1))
tbl3
# Female Male Prefer not to say Unknown
# Non-smoker 26.63438 22.91504 26.23883 25.02005
# Occasional smoker 24.85876 26.81216 23.31438 26.06255
# Prefer not to say 23.56739 26.26656 25.18278 24.37851
# Regular smoker 24.93947 24.00624 25.26401 24.53889
# Number of Respondents 1239.00000 1283.00000 1231.00000 1247.00000
tbl4 <- cbind(tbl3, `Total %` = c(100 * proportions(rowSums(tbl1)), sum(tbl1)))
tbl4
# Female Male Prefer not to say Unknown Total %
# Non-smoker 26.63438 22.91504 26.23883 25.02005 25.18
# Occasional smoker 24.85876 26.81216 23.31438 26.06255 25.28
# Prefer not to say 23.56739 26.26656 25.18278 24.37851 24.86
# Regular smoker 24.93947 24.00624 25.26401 24.53889 24.68
# Number of Respondents 1239.00000 1283.00000 1231.00000 1247.00000 5000.00
And we can round the numbers:
round(tbl4, 1)
# Female Male Prefer not to say Unknown Total %
# Non-smoker 26.6 22.9 26.2 25.0 25.2
# Occasional smoker 24.9 26.8 23.3 26.1 25.3
# Prefer not to say 23.6 26.3 25.2 24.4 24.9
# Regular smoker 24.9 24.0 25.3 24.5 24.7
# Number of Respondents 1239.0 1283.0 1231.0 1247.0 5000.0
library(dplyr)
library(tidyr) # pivot_wider
tbl1 <- tibble(quux) %>%
count(response, gender) %>%
pivot_wider(response, names_from = gender, values_from = n)
tbl1
# # A tibble: 4 × 5
# response Female Male `Prefer not to say` Unknown
# <chr> <int> <int> <int> <int>
# 1 Non-smoker 330 294 323 312
# 2 Occasional smoker 308 344 287 325
# 3 Prefer not to say 292 337 310 304
# 4 Regular smoker 309 308 311 306
tbl2 <- tbl1 %>%
summarize(
response = "Number of Respondents",
across(-response, ~ sum(.)),
`Total %` = sum(tbl1[,-1])
)
tbl2
# # A tibble: 1 × 6
# response Female Male `Prefer not to say` Unknown `Total %`
# <chr> <int> <int> <int> <int> <int>
# 1 Number of Respondents 1239 1283 1231 1247 5000
tbl1 %>%
mutate(
across(Female:Unknown, ~ 100 * . / sum(.)),
`Total %` = rowSums(tbl1[,-1]),
`Total %` = 100 * `Total %` / sum(`Total %`)
) %>%
bind_rows(tbl2)
# # A tibble: 5 × 6
# response Female Male `Prefer not to say` Unknown `Total %`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Non-smoker 26.6 22.9 26.2 25.0 25.2
# 2 Occasional smoker 24.9 26.8 23.3 26.1 25.3
# 3 Prefer not to say 23.6 26.3 25.2 24.4 24.9
# 4 Regular smoker 24.9 24.0 25.3 24.5 24.7
# 5 Number of Respondents 1239 1283 1231 1247 5000