rdplyrpivotpercentagejanitor

Having difficulty creating a percentage table


Example Dataframe

structure(list(sex = c("Male", "Female", "Female", "Female", 
"Male", "Female", "Female", "Male", "Female"), cigarettes_smoking_status = c("Non-smoker", 
"Non-smoker", "Non-smoker", "Non-smoker", "Non-smoker", "Non-smoker", 
"Non-smoker", "Regular Smoker", "Non-smoker")), row.names = 2:10, class = "data.frame")

Code

smoking_status_by_per <- smoking_dataset %>% 
  group_by(cigarettes_smoking_status, sex) %>%  
  dplyr::summarise(count1=n()) %>% 
  mutate(percentage=(count1/sum(count1))*100) %>%
  pivot_wider(names_from = sex, values_from = percentage) %>%  
  group_by(cigarettes_smoking_status) 
 

The problem

I am having difficulty producing a percentage table in R that is condensed to 4 rows (Occasional smokers, Non-smokers, regular smokers and Prefer not to say) that clearly shows the percentage in each category by sex. Ideally, I am looking to produce a table in R that looks like this How I want the table to look: How I want the table to look

I have been attempting to use janitor::tabyl and pivot_wider to condense the rows, so there are just 4 rows. One row for Regular smokers. One row for occasional smokers etc. This is what my current output looks like. Current dodgy output:

Current dodgy output


Solution

  • We can use proportions and some binding to get what you have in the example.

    Starting with enough data to fill out the matrix,

    set.seed(42)
    quux <- data.frame(response = sample(c("Non-smoker", "Occasional smoker", "Prefer not to say", "Regular smoker"), size=5000, replace=TRUE), gender = sample(c("Male", "Female", "Prefer not to say", "Unknown"), size=5000, replace=TRUE))
    head(quux)
    #            response            gender
    # 1        Non-smoker           Unknown
    # 2        Non-smoker Prefer not to say
    # 3        Non-smoker            Female
    # 4        Non-smoker           Unknown
    # 5 Occasional smoker            Female
    # 6    Regular smoker           Unknown
    

    base R

    We can look at a simple table with:

    table(quux)
    #                    gender
    # response            Female Male Prefer not to say Unknown
    #   Non-smoker           330  294               323     312
    #   Occasional smoker    308  344               287     325
    #   Prefer not to say    292  337               310     304
    #   Regular smoker       309  308               311     306
    

    For future verification, the sum of the first column (Female) is 1239, and the expected column-wise percentages for that are

    c(330, 308, 292, 309) / 1239
    # [1] 0.2663438 0.2485876 0.2356739 0.2493947
    

    We can get the percentages with

    round(100 * proportions(table(quux), margin = 2), 2)
    #                    gender
    # response            Female  Male Prefer not to say Unknown
    #   Non-smoker         26.63 22.92             26.24   25.02
    #   Occasional smoker  24.86 26.81             23.31   26.06
    #   Prefer not to say  23.57 26.27             25.18   24.38
    #   Regular smoker     24.94 24.01             25.26   24.54
    

    Do get the right-most (Total) and bottom summary, we'll need to bind things.

    tbl1 <- do.call(table, quux)
    tbl2 <- 100 * proportions(tbl1, margin = 2)
    tbl3 <- rbind(tbl2, `Number of Respondents` = colSums(tbl1))
    tbl3
    #                           Female       Male Prefer not to say    Unknown
    # Non-smoker              26.63438   22.91504          26.23883   25.02005
    # Occasional smoker       24.85876   26.81216          23.31438   26.06255
    # Prefer not to say       23.56739   26.26656          25.18278   24.37851
    # Regular smoker          24.93947   24.00624          25.26401   24.53889
    # Number of Respondents 1239.00000 1283.00000        1231.00000 1247.00000
    
    tbl4 <- cbind(tbl3, `Total %` = c(100 * proportions(rowSums(tbl1)), sum(tbl1)))
    tbl4
    #                           Female       Male Prefer not to say    Unknown Total %
    # Non-smoker              26.63438   22.91504          26.23883   25.02005   25.18
    # Occasional smoker       24.85876   26.81216          23.31438   26.06255   25.28
    # Prefer not to say       23.56739   26.26656          25.18278   24.37851   24.86
    # Regular smoker          24.93947   24.00624          25.26401   24.53889   24.68
    # Number of Respondents 1239.00000 1283.00000        1231.00000 1247.00000 5000.00
    

    And we can round the numbers:

    round(tbl4, 1)
    #                       Female   Male Prefer not to say Unknown Total %
    # Non-smoker              26.6   22.9              26.2    25.0    25.2
    # Occasional smoker       24.9   26.8              23.3    26.1    25.3
    # Prefer not to say       23.6   26.3              25.2    24.4    24.9
    # Regular smoker          24.9   24.0              25.3    24.5    24.7
    # Number of Respondents 1239.0 1283.0            1231.0  1247.0  5000.0
    

    dplyr

    library(dplyr)
    library(tidyr) # pivot_wider
    tbl1 <- tibble(quux) %>%
      count(response, gender) %>%
      pivot_wider(response, names_from = gender, values_from = n)
    tbl1
    # # A tibble: 4 × 5
    #   response          Female  Male `Prefer not to say` Unknown
    #   <chr>              <int> <int>               <int>   <int>
    # 1 Non-smoker           330   294                 323     312
    # 2 Occasional smoker    308   344                 287     325
    # 3 Prefer not to say    292   337                 310     304
    # 4 Regular smoker       309   308                 311     306
    
    tbl2 <- tbl1 %>%
      summarize(
        response = "Number of Respondents",
        across(-response, ~ sum(.)),
        `Total %` = sum(tbl1[,-1])
      )
    tbl2
    # # A tibble: 1 × 6
    #   response              Female  Male `Prefer not to say` Unknown `Total %`
    #   <chr>                  <int> <int>               <int>   <int>     <int>
    # 1 Number of Respondents   1239  1283                1231    1247      5000
    
    tbl1 %>%
      mutate(
        across(Female:Unknown, ~ 100 * . / sum(.)),
        `Total %` = rowSums(tbl1[,-1]),
        `Total %` = 100 * `Total %` / sum(`Total %`)
      ) %>%
      bind_rows(tbl2)
    # # A tibble: 5 × 6
    #   response              Female   Male `Prefer not to say` Unknown `Total %`
    #   <chr>                  <dbl>  <dbl>               <dbl>   <dbl>     <dbl>
    # 1 Non-smoker              26.6   22.9                26.2    25.0      25.2
    # 2 Occasional smoker       24.9   26.8                23.3    26.1      25.3
    # 3 Prefer not to say       23.6   26.3                25.2    24.4      24.9
    # 4 Regular smoker          24.9   24.0                25.3    24.5      24.7
    # 5 Number of Respondents 1239   1283                1231    1247      5000