Having difficulty creating a percentage table

Example Dataframe

structure(list(sex = c("Male", "Female", "Female", "Female", 
"Male", "Female", "Female", "Male", "Female"), cigarettes_smoking_status = c("Non-smoker", 
"Non-smoker", "Non-smoker", "Non-smoker", "Non-smoker", "Non-smoker", 
"Non-smoker", "Regular Smoker", "Non-smoker")), row.names = 2:10, class = "data.frame")

Code

smoking_status_by_per <- smoking_dataset %>% 
  group_by(cigarettes_smoking_status, sex) %>%  
  dplyr::summarise(count1=n()) %>% 
  mutate(percentage=(count1/sum(count1))*100) %>%
  pivot_wider(names_from = sex, values_from = percentage) %>%  
  group_by(cigarettes_smoking_status)

The problem

I am having difficulty producing a percentage table in R that is condensed to 4 rows (Occasional smokers, Non-smokers, regular smokers and Prefer not to say) that clearly shows the percentage in each category by sex. Ideally, I am looking to produce a table in R that looks like this How I want the table to look:

I have been attempting to use janitor::tabyl and pivot_wider to condense the rows, so there are just 4 rows. One row for Regular smokers. One row for occasional smokers etc. This is what my current output looks like. Current dodgy output:

Current dodgy output

Solution

We can use proportions and some binding to get what you have in the example.

Starting with enough data to fill out the matrix,

set.seed(42)
quux <- data.frame(response = sample(c("Non-smoker", "Occasional smoker", "Prefer not to say", "Regular smoker"), size=5000, replace=TRUE), gender = sample(c("Male", "Female", "Prefer not to say", "Unknown"), size=5000, replace=TRUE))
head(quux)
#            response            gender
# 1        Non-smoker           Unknown
# 2        Non-smoker Prefer not to say
# 3        Non-smoker            Female
# 4        Non-smoker           Unknown
# 5 Occasional smoker            Female
# 6    Regular smoker           Unknown

base R

We can look at a simple table with:

table(quux)
#                    gender
# response            Female Male Prefer not to say Unknown
#   Non-smoker           330  294               323     312
#   Occasional smoker    308  344               287     325
#   Prefer not to say    292  337               310     304
#   Regular smoker       309  308               311     306

For future verification, the sum of the first column (Female) is 1239, and the expected column-wise percentages for that are

c(330, 308, 292, 309) / 1239
# [1] 0.2663438 0.2485876 0.2356739 0.2493947

We can get the percentages with

round(100 * proportions(table(quux), margin = 2), 2)
#                    gender
# response            Female  Male Prefer not to say Unknown
#   Non-smoker         26.63 22.92             26.24   25.02
#   Occasional smoker  24.86 26.81             23.31   26.06
#   Prefer not to say  23.57 26.27             25.18   24.38
#   Regular smoker     24.94 24.01             25.26   24.54

Do get the right-most (Total) and bottom summary, we'll need to bind things.

tbl1 <- do.call(table, quux)
tbl2 <- 100 * proportions(tbl1, margin = 2)
tbl3 <- rbind(tbl2, `Number of Respondents` = colSums(tbl1))
tbl3
#                           Female       Male Prefer not to say    Unknown
# Non-smoker              26.63438   22.91504          26.23883   25.02005
# Occasional smoker       24.85876   26.81216          23.31438   26.06255
# Prefer not to say       23.56739   26.26656          25.18278   24.37851
# Regular smoker          24.93947   24.00624          25.26401   24.53889
# Number of Respondents 1239.00000 1283.00000        1231.00000 1247.00000

tbl4 <- cbind(tbl3, `Total %` = c(100 * proportions(rowSums(tbl1)), sum(tbl1)))
tbl4
#                           Female       Male Prefer not to say    Unknown Total %
# Non-smoker              26.63438   22.91504          26.23883   25.02005   25.18
# Occasional smoker       24.85876   26.81216          23.31438   26.06255   25.28
# Prefer not to say       23.56739   26.26656          25.18278   24.37851   24.86
# Regular smoker          24.93947   24.00624          25.26401   24.53889   24.68
# Number of Respondents 1239.00000 1283.00000        1231.00000 1247.00000 5000.00

And we can round the numbers:

round(tbl4, 1)
#                       Female   Male Prefer not to say Unknown Total %
# Non-smoker              26.6   22.9              26.2    25.0    25.2
# Occasional smoker       24.9   26.8              23.3    26.1    25.3
# Prefer not to say       23.6   26.3              25.2    24.4    24.9
# Regular smoker          24.9   24.0              25.3    24.5    24.7
# Number of Respondents 1239.0 1283.0            1231.0  1247.0  5000.0

dplyr

library(dplyr)
library(tidyr) # pivot_wider
tbl1 <- tibble(quux) %>%
  count(response, gender) %>%
  pivot_wider(response, names_from = gender, values_from = n)
tbl1
# # A tibble: 4 × 5
#   response          Female  Male `Prefer not to say` Unknown
#   <chr>              <int> <int>               <int>   <int>
# 1 Non-smoker           330   294                 323     312
# 2 Occasional smoker    308   344                 287     325
# 3 Prefer not to say    292   337                 310     304
# 4 Regular smoker       309   308                 311     306

tbl2 <- tbl1 %>%
  summarize(
    response = "Number of Respondents",
    across(-response, ~ sum(.)),
    `Total %` = sum(tbl1[,-1])
  )
tbl2
# # A tibble: 1 × 6
#   response              Female  Male `Prefer not to say` Unknown `Total %`
#   <chr>                  <int> <int>               <int>   <int>     <int>
# 1 Number of Respondents   1239  1283                1231    1247      5000

tbl1 %>%
  mutate(
    across(Female:Unknown, ~ 100 * . / sum(.)),
    `Total %` = rowSums(tbl1[,-1]),
    `Total %` = 100 * `Total %` / sum(`Total %`)
  ) %>%
  bind_rows(tbl2)
# # A tibble: 5 × 6
#   response              Female   Male `Prefer not to say` Unknown `Total %`
#   <chr>                  <dbl>  <dbl>               <dbl>   <dbl>     <dbl>
# 1 Non-smoker              26.6   22.9                26.2    25.0      25.2
# 2 Occasional smoker       24.9   26.8                23.3    26.1      25.3
# 3 Prefer not to say       23.6   26.3                25.2    24.4      24.9
# 4 Regular smoker          24.9   24.0                25.3    24.5      24.7
# 5 Number of Respondents 1239   1283                1231    1247      5000