> df_1
# A tibble: 47 x 3
# Groups: therapy_class [9]
therapy_class Year_month count
<ord> <yearmon> <int>
1 ALK Inhibitors Dec 2019 16
2 ALK Inhibitors Jan 2020 14
3 ALK Inhibitors Feb 2020 14
4 ALK Inhibitors Mar 2020 22
5 ALK Inhibitors Apr 2020 13
6 ALK Inhibitors May 2020 17
7 Anti-VEGF-based therapies Dec 2019 33
8 Anti-VEGF-based therapies Jan 2020 35
9 Anti-VEGF-based therapies Feb 2020 36
10 Anti-VEGF-based therapies Mar 2020 20
# … with 37 more rows
A tibble: 10 x 7
therapy_class `Dec 2019` `Jan 2020` `Feb 2020` `Mar 2020` `Apr 2020` `May 2020`
<ord> <chr> <chr> <chr> <chr> <chr> <chr>
1 ALK Inhibitors "16 <br>[2.7%]" "14 <br>[2.0%]" "14 <br>[2.2%]" "22 <br>[3.3%]" "13 <br>[2.1%]" "17 <br>[3.4%]"
2 Anti-VEGF-based therapies "33 <br>[5.6%]" "35 <br>[4.9%]" "36 <br>[5.7%]" "20 <br>[3.0%]" "21 <br>[3.4%]" "20 <br>[4.0%]"
3 EGFR TKIs "52 <br>[8.8%]" "57 <br>[8.0%]" "60 <br>[9.5%]" "52 <br>[7.8%]" "56 <br>[9.2%]" "49 <br>[9.8%]"
4 EGFR-antibody based therapies "" "" "" "" "" ""
5 Non-platinum-based chemotherapy… "1 <br>[0.2%]" "4 <br>[0.6%]" "4 <br>[0.6%]" "" "1 <br>[0.2%]" ""
6 IO-based therapies "308 <br>[52.0%]" "385 <br>[54.0%]" "330 <br>[52.3%]" "379 <br>[56.7%]" "345 <br>[56.4%]" "265 <br>[52.9%]"
7 Platinum-based chemotherapy com… "123 <br>[20.8%]" "147 <br>[20.6%]" "128 <br>[20.3%]" "134 <br>[20.1%]" "120 <br>[19.6%]" "107 <br>[21.4%]"
8 Single agent chemotherapies "29 <br>[4.9%]" "33 <br>[4.6%]" "17 <br>[2.7%]" "28 <br>[4.2%]" "25 <br>[4.1%]" "22 <br>[4.4%]"
9 Other "30 <br>[5.1%]" "38 <br>[5.3%]" "42 <br>[6.7%]" "33 <br>[4.9%]" "31 <br>[5.1%]" "21 <br>[4.2%]"
10 <strong>Total</strong> "<strong>592</str… "<strong>713</str… "<strong>631</st… "<strong>668</st… "<strong>612</st… "<strong>501</st…
> df_2
# A tibble: 46 x 3
# Groups: therapy_class [9]
therapy_class Year_month count
<ord> <yearmon> <int>
1 ALK Inhibitors Dec 2019 16
2 ALK Inhibitors Feb 2020 14
3 ALK Inhibitors Mar 2020 22
4 ALK Inhibitors Apr 2020 13
5 ALK Inhibitors May 2020 17
6 Anti-VEGF-based therapies Dec 2019 33
7 Anti-VEGF-based therapies Jan 2020 35
8 Anti-VEGF-based therapies Feb 2020 36
9 Anti-VEGF-based therapies Mar 2020 20
10 Anti-VEGF-based therapies Apr 2020 21
# … with 36 more rows
> t2
# A tibble: 10 x 7
therapy_class `Dec 2019` `Feb 2020` `Mar 2020` `Apr 2020` `May 2020` `Jan 2020`
<ord> <chr> <chr> <chr> <chr> <chr> <chr>
1 ALK Inhibitors "16 <br>[2.7%]" "14 <br>[2.2%]" "22 <br>[3.3%]" "13 <br>[2.1%]" "17 <br>[3.4%]" ""
2 Anti-VEGF-based therapies "33 <br>[5.6%]" "36 <br>[5.7%]" "20 <br>[3.0%]" "21 <br>[3.4%]" "20 <br>[4.0%]" "35 <br>[5.0%]"
3 EGFR TKIs "52 <br>[8.8%]" "60 <br>[9.5%]" "52 <br>[7.8%]" "56 <br>[9.2%]" "49 <br>[9.8%]" "57 <br>[8.2%]"
4 EGFR-antibody based therapies "" "" "" "" "" ""
5 Non-platinum-based chemotherapy… "1 <br>[0.2%]" "4 <br>[0.6%]" "" "1 <br>[0.2%]" "" "4 <br>[0.6%]"
6 IO-based therapies "308 <br>[52.0%]" "330 <br>[52.3%]" "379 <br>[56.7%]" "345 <br>[56.4%]" "265 <br>[52.9%]" "385 <br>[55.1%]"
7 Platinum-based chemotherapy com… "123 <br>[20.8%]" "128 <br>[20.3%]" "134 <br>[20.1%]" "120 <br>[19.6%]" "107 <br>[21.4%]" "147 <br>[21.0%]"
8 Single agent chemotherapies "29 <br>[4.9%]" "17 <br>[2.7%]" "28 <br>[4.2%]" "25 <br>[4.1%]" "22 <br>[4.4%]" "33 <br>[4.7%]"
9 Other "30 <br>[5.1%]" "42 <br>[6.7%]" "33 <br>[4.9%]" "31 <br>[5.1%]" "21 <br>[4.2%]" "38 <br>[5.4%]"
10 <strong>Total</strong> "<strong>592</str… "<strong>631</str… "<strong>668</st… "<strong>612</st… "<strong>501</st… "<strong>699</st…
>
I am trying to create a wide table with counts and percentages from long table. The columns are 'Month Year' which needs to be in order. My issue is when there are rows missing for certain 'Month Year' for the first group (ALK Inhibitors) then the order of the column is disrupted. The missing 'Month Year' is place at the end. Also the long table is not a fixed table. It is generated from function where user gets to choose the month year range. So Year_month column could have any range.
In this example I used Dec 2019 to May 2020 6 month range. "df_1" has all 6 month so the resulting wide table is as expected. "df_2" has Jan 2020 missing for ALK Inhibitors. So the resulting table has 'Jan 2020' at the end.
This is the code I am using the generate the wide table:
df_2 %>%
pivot_wider(names_from = Year_month, values_from = count) %>%
ungroup() %>%
mutate_at(.vars = vars(contains("20")), list(
~ ifelse(is.na(.), "", paste(., sprintf("<br>[%1.1f%%]", 100 * (. / sum(., na.rm = TRUE)))))
))
Here is the sample data df_2
structure(list(therapy_class = structure(c(1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L,
8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("ALK Inhibitors",
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies",
"Non-platinum-based chemotherapy combinations", "IO-based therapies",
"Platinum-based chemotherapy combinations", "Single agent chemotherapies",
"Other"), class = c("ordered", "factor")), Year_month = structure(c(2019.91666666667,
2020.08333333333, 2020.16666666667, 2020.25, 2020.33333333333,
2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 2020.25,
2020.33333333333, 2019.91666666667, 2020, 2020.08333333333, 2020.16666666667,
2020.25, 2020.33333333333, NA, 2019.91666666667, 2020, 2020.08333333333,
2020.25, 2019.91666666667, 2020, 2020.08333333333, 2020.16666666667,
2020.25, 2020.33333333333, 2019.91666666667, 2020, 2020.08333333333,
2020.16666666667, 2020.25, 2020.33333333333, 2019.91666666667,
2020, 2020.08333333333, 2020.16666666667, 2020.25, 2020.33333333333,
2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 2020.25,
2020.33333333333), class = "yearmon"), count = c(16L, 14L, 22L,
13L, 17L, 33L, 35L, 36L, 20L, 21L, 20L, 52L, 57L, 60L, 52L, 56L,
49L, NA, 1L, 4L, 4L, 1L, 308L, 385L, 330L, 379L, 345L, 265L,
123L, 147L, 128L, 134L, 120L, 107L, 29L, 33L, 17L, 28L, 25L,
22L, 30L, 38L, 42L, 33L, 31L, 21L)), row.names = c(NA, -46L), groups = structure(list(
therapy_class = structure(1:9, .Label = c("ALK Inhibitors",
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies",
"Non-platinum-based chemotherapy combinations", "IO-based therapies",
"Platinum-based chemotherapy combinations", "Single agent chemotherapies",
"Other"), class = c("ordered", "factor")), .rows = structure(list(
1:5, 6:11, 12:17, 18L, 19:22, 23:28, 29:34, 35:40, 41:46), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Here is new dataset with month-quarter data
df <- structure(list(therapy_class = structure(c(1L, 1L, 1L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 4L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L,
7L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L), .Label = c("ALK Inhibitors",
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies",
"Non-platinum-based chemotherapy combinations", "IO-based therapies",
"Platinum-based chemotherapy combinations", "Single agent chemotherapies",
"Other"), class = c("ordered", "factor")), quarter = structure(c(2020.75,
2021, 2021.25, 2020.5, 2020.75, 2021.25, 2020.5, 2020.75, 2021,
2021.25, NA, NA, 2020.5, 2020.75, 2021, 2021.25, 2021.5, 2020.5,
2020.75, 2021, 2021.25, 2021.5, 2020.5, 2020.75, 2021, 2021.25,
2021.5, 2020.5, 2020.75, 2021, 2021.25, 2021.5), class = "yearqtr"),
count = c(4L, 2L, 2L, 3L, 2L, 3L, 1L, 1L, 1L, 1L, NA, NA,
252L, 184L, 190L, 180L, 120L, 59L, 46L, 63L, 47L, 25L, 4L,
4L, 1L, 2L, 1L, 16L, 12L, 21L, 21L, 11L)), row.names = c(NA,
-32L), groups = structure(list(therapy_class = structure(1:9, .Label = c("ALK Inhibitors",
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies",
"Non-platinum-based chemotherapy combinations", "IO-based therapies",
"Platinum-based chemotherapy combinations", "Single agent chemotherapies",
"Other"), class = c("ordered", "factor")), .rows = structure(list(
1:3, 4:6, 7:10, 11L, 12L, 13:17, 18:22, 23:27, 28:32), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
-- The Quarter dates are out of order due to missing data
therapy_class `2020 Q4` `2021 Q1` `2021 Q2` `2020 Q3` `NA` `2021 Q3`
<ord> <chr> <chr> <chr> <chr> <int> <chr>
1 ALK Inhibitors "4 [1.58%]" "2 [0.72%]" "2 [0.78%]" "" NA ""
2 Anti-VEGF-based therapies "2 [0.79%]" "" "3 [1.17%]" "3 [0.90%]" NA ""
3 EGFR TKIs "1 [0.40%]" "1 [0.36%]" "1 [0.39%]" "1 [0.30%]" NA ""
4 EGFR-antibody based therapies "" "" "" "" NA ""
5 Non-platinum-based chemotherapy combinations "" "" "" "" NA ""
6 IO-based therapies "184 [72.73%]" "190 [68.35%]" "180 [70.31%]" "252 [75.22%]" NA "120 [76.43%]"
7 Platinum-based chemotherapy combinations "46 [18.18%]" "63 [22.66%]" "47 [18.36%]" "59 [17.61%]" NA "25 [15.92%]"
8 Single agent chemotherapies "4 [1.58%]" "1 [0.36%]" "2 [0.78%]" "4 [1.19%]" NA "1 [0.64%]"
9 Other "12 [4.74%]" "21 [7.55%]" "21 [8.20%]" "16 [4.78%]" NA "11 [7.01%]"
10 Total "253" "278" "256" "335" 0 "157"
An option would be to create the missing year months with complete
before doing the pivot_wider
. With pivot_wider
, the default order is based on the unique value appearance in the order
library(dplyr)
library(tidyr)
library(zoo)
df_2 %>%
ungroup %>%
mutate(Year_month = as.Date(Year_month)) %>%
complete(therapy_class, Year_month = seq(from = min(Year_month,
na.rm = TRUE), to = max(Year_month, na.rm = TRUE),
by = '1 month')) %>%
mutate(Year_month = as.yearmon(Year_month)) %>%
pivot_wider(names_from = Year_month, values_from = count) %>%
ungroup() %>%
mutate_at(.vars = vars(contains("20")),
list(
~ ifelse(is.na(.), "", paste(., sprintf("<br>[%1.1f%%]",
100 * (. / sum(., na.rm = TRUE)))))
))
-output
# A tibble: 9 × 8
therapy_class `Dec 2019` `Jan 2020` `Feb 2020` `Mar 2020` `Apr 2020` `May 2020` `NA`
<ord> <chr> <chr> <chr> <chr> <chr> <chr> <int>
1 ALK Inhibitors "16 <br>[2.7%]" "" "14 <br>[2.2%]" "22 <br>[3.3%]" "13 <br>[2.1%… "17 <br>[3.4… NA
2 Anti-VEGF-based therapies "33 <br>[5.6%]" "35 <br>[5.0%]" "36 <br>[5.7%]" "20 <br>[3.0%]" "21 <br>[3.4%… "20 <br>[4.0… NA
3 EGFR TKIs "52 <br>[8.8%]" "57 <br>[8.2%]" "60 <br>[9.5%]" "52 <br>[7.8%]" "56 <br>[9.2%… "49 <br>[9.8… NA
4 EGFR-antibody based therapies "" "" "" "" "" "" NA
5 Non-platinum-based chemotherapy combinations "1 <br>[0.2%]" "4 <br>[0.6%]" "4 <br>[0.6%]" "" "1 <br>[0.2%]" "" NA
6 IO-based therapies "308 <br>[52.0%]" "385 <br>[55.1%]" "330 <br>[52.3%]" "379 <br>[56.7%]" "345 <br>[56.… "265 <br>[52… NA
7 Platinum-based chemotherapy combinations "123 <br>[20.8%]" "147 <br>[21.0%]" "128 <br>[20.3%]" "134 <br>[20.1%]" "120 <br>[19.… "107 <br>[21… NA
8 Single agent chemotherapies "29 <br>[4.9%]" "33 <br>[4.7%]" "17 <br>[2.7%]" "28 <br>[4.2%]" "25 <br>[4.1%… "22 <br>[4.4… NA
9 Other "30 <br>[5.1%]" "38 <br>[5.4%]" "42 <br>[6.7%]" "33 <br>[4.9%]" "31 <br>[5.1%… "21 <br>[4.2… NA
With the new data
df %>%
ungroup %>%
mutate(Year_month = as.Date(quarter)) %>%
complete(therapy_class,
Year_month = seq(from = min(Year_month,na.rm = TRUE ),
to = max(Year_month, na.rm = TRUE), by = '1 quarter')) %>%
mutate(Year_month = as.yearqtr(Year_month)) %>%
pivot_wider(names_from = Year_month, values_from = count) %>%
ungroup() %>%
mutate_at(.vars = vars(contains("20")),
list( ~ ifelse(is.na(.), "",
paste(., sprintf("<br>[%1.1f%%]",
100 * (. / sum(., na.rm = TRUE))))) ))
-output
# A tibble: 35 × 8
therapy_class quarter `2020 Q3` `2020 Q4` `2021 Q1` `2021 Q2` `2021 Q3` `NA`
<ord> <yearqtr> <chr> <chr> <chr> <chr> <chr> <int>
1 ALK Inhibitors <NA> "" "" "" "" "" NA
2 ALK Inhibitors 2020 Q4 "" "4 <br>[1.6%]" "" "" "" NA
3 ALK Inhibitors 2021 Q1 "" "" "2 <br>[0.7%]" "" "" NA
4 ALK Inhibitors 2021 Q2 "" "" "" "2 <br>[0.8%]" "" NA
5 Anti-VEGF-based therapies 2020 Q3 "3 <br>[0.9%]" "" "" "" "" NA
6 Anti-VEGF-based therapies 2020 Q4 "" "2 <br>[0.8%]" "" "" "" NA
7 Anti-VEGF-based therapies <NA> "" "" "" "" "" NA
8 Anti-VEGF-based therapies 2021 Q2 "" "" "" "3 <br>[1.2%]" "" NA
9 EGFR TKIs 2020 Q3 "1 <br>[0.3%]" "" "" "" "" NA
10 EGFR TKIs 2020 Q4 "" "1 <br>[0.4%]" "" "" "" NA
# … with 25 more rows