I am trying to create a new column, first_attack, with several conditions based on 3 columns. I am trying to achieve this with tidyverse only. Here are my conditions:
Yes
in one column, regardless of other categories (No/Unknown/NA
) in the same id across columns: previous_cabg, previous_pci, previous_ami
then assign Yes
in test variableNo
in all columns for the same id then assign NO for the test variableNO
for one column and NA/Unknown
in the other columns for the same id then assign with No
in the test variableYes
in all column for the same id then assign Yes
in the test variableYes in one column and
NA/Unknownfor the same id in each column then assign
Yes`in test variableThis is the type of dataset I have:
structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L,
66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L,
17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L,
59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L,
2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L,
46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L,
32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L,
23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L,
1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L,
1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L,
1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L,
2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No",
"Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L,
2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L,
1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L,
2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes",
"No", "Unknown"), class = "factor")), row.names = c(NA, -60L), problems = structure(list(
row = c(34136L, 121773L, 121779L), col = c("1.01 Hospital identifier",
"1.01 Hospital identifier", "1.01 Hospital identifier"),
expected = c("value in level set", "value in level set",
"value in level set"), actual = c("CMH", "CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'",
"'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
)), class = c("tbl_df", "tbl", "data.frame"))
I thought the best is to use case_when. Thus, I oddly I proceeded with it. This is the code.
test_first_attack <- sample_n %>%
dplyr::mutate(first_attack =
dplyr::case_when(
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes',
# deal with the unknown category
previous_cabg == 'Unknown' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes'| previous_ami == 'No' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'No' ~ 'No',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Yes' ~'Yes'
))
This is the output.
test_first_attack <- structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L,
66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L,
17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L,
59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L,
2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L,
46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L,
32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L,
23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L,
1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L,
1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L,
1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L,
2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No",
"Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L,
2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L,
1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L,
2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes",
"No", "Unknown"), class = "factor"), first_attack = c("Yes",
"Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes",
"Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes",
NA, "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes", NA, "Yes",
"Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes",
"Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes",
"Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes"
)), row.names = c(NA, -60L), problems = structure(list(row = c(34136L,
121773L, 121779L), col = c("1.01 Hospital identifier", "1.01 Hospital identifier",
"1.01 Hospital identifier"), expected = c("value in level set",
"value in level set", "value in level set"), actual = c("CMH",
"CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'",
"'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
)), class = c("tbl_df", "tbl", "data.frame"))
As you can see, I get 'Yes' as output for the columns with id rows with NO across all the columns. Let alone taking into consideration more complex conditions like 'Unknown\NA\Yes' or other conditions.
Why I cannot get my desired output? Is there another way to achieve it with tidyverse only?
I think all you need is:
case_when(
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'No' ~ 'No',
TRUE ~ "Unknown/NA"
)
The first line will result in a "Yes" for every row where at least one of the three test columns is "Yes". The rows that satisfy the first line of the case_when
will not be tested for any subsequent conditions.
Of the remaining rows, the second line will result in a "No" where any of the three test columns is "No". The rows that meet the second line of the case_when
will not be tested for any subsequent conditions.
The last line assigns "Unknown/NA" to all remaining rows, which should be only those rows that don't have any "Yes" or "No" values in the three test columns.
For completeness, here's a full reproducible example using your sample data:
library(tidyverse)
test_first_attack = structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L,
66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L,
17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L,
59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L,
2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L,
46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L,
32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L,
23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L,
1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L,
1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L,
1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L,
2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No",
"Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L,
2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L,
1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L,
2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes",
"No", "Unknown"), class = "factor")), row.names = c(NA, -60L), problems = structure(list(
row = c(34136L, 121773L, 121779L), col = c("1.01 Hospital identifier",
"1.01 Hospital identifier", "1.01 Hospital identifier"),
expected = c("value in level set", "value in level set",
"value in level set"), actual = c("CMH", "CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'",
"'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
)), class = c("tbl_df", "tbl", "data.frame"))
test_first_attack = test_first_attack %>%
mutate(first_attack = case_when(
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'No' ~ 'No',
TRUE ~ "Unknown/NA"
))
test_first_attack
#> # A tibble: 60 x 5
#> id previous_cabg previous_pci previous_ami first_attack
#> <int> <fct> <fct> <fct> <chr>
#> 1 112139 No No No No
#> 2 43919 No No No No
#> 3 92430 No Yes Yes Yes
#> 4 87137 <NA> <NA> No No
#> 5 95417 No No No No
#> 6 66955 <NA> <NA> <NA> Unknown/NA
#> 7 16293 <NA> <NA> No No
#> 8 61396 No Yes Yes Yes
#> 9 25379 No Yes No Yes
#> 10 79229 No No No No
#> # … with 50 more rows