I have the following data:
aa <- data.frame(
group = c("asdf", "asdf", "asdf", "asdf", "asdf", "asdf", "asdf", "asdf", "asdf", "asdf", "asdf", "asdf", "asdf", "asdf", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa"),
id = c(1, 2, 2, 2, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10),
Fr_1 = c(NA, "Apple", "Apple", "Apple", "Blueberry", "Blueberry", "Blueberry", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Mango", "mango", "Mango", "Mango", "Mango", "Mango", "mango", "Mango", "Mango", "Mango", "Watermelon", "Watermelon", "Watermelon", "Watermelon", "Watermelon", "Watermelon", "Watermelon", NA),
Fr_2 = c("\bBanana\b", "Apple", "Apple", "Apple", "Blueberry", "Blueberry", "Blueberry", "Orange", "Strawberry", "Grapes", "Pineapple", "Mango", "Watermelon", "Kiwi", "\bBanana\b", "Apple", "Blueberry", "Orange", "Strawberry", "Grapes", "Pineapple", "Mango", "Mango", "Mango", "Mango", "Mango", "Mango", "Mango", "Mango", "Mango", "Mango", "Watermelon", "Watermelon", "Watermelon", "Watermelon", "Watermelon", "Watermelon", "Watermelon", "Kiwi")
)
I want to create a new column called cnt
that shows the number of instances of the elements in the variable Fr_1
. If it has same word but starting with capital letter, the column should add the number of instances and and keep them both. The expected result of group fdsa
should be:
bb <- data.frame(
group = c("asdf", "asdf", "asdf", "asdf", "asdf", "asdf", "asdf", "asdf", "asdf", "asdf", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa", "fdsa"),
id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10),
Fr_1 = c(NA, "Apple", "Blueberry", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Mango", "mango", "Watermelon", NA),
Fr_2 = c("\bBanana\b", "Apple", "Blueberry", "Orange", "Strawberry", "Grapes", "Pineapple", "Mango", "Watermelon", "Kiwi", "\bBanana\b", "Apple", "Blueberry", "Orange", "Strawberry", "Grapes", "Pineapple", "Mango", "Mango", "Watermelon", "Kiwi"),
cnt = c(0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 7, 0)
)
This is what I am trying:
cc <- aa |>
dplyr::group_by(group, Fr_1, Fr_2, id) |>
dplyr::count(Fr_1, group, name = "cnt", sort = F)
The main issue is that id
should have a natural order (1, 2, 3, ...). Secondly, NAs are counted as 1, it should be 0. Thirdly, Mango and mango both should be 10.
If I understood the condition correctly, then it is enough to simply check the equality of two columns converted to lower case (in my solution, this is the new column fr_equals). Then we group using the new column instead of the two original ones.
Using dplyr:
cc <- aa |>
mutate(fr_equals = tolower(Fr_1) == tolower(Fr_2)) |>
group_by(group, id, fr_equals) |>
mutate(cnt = n()) |>
ungroup() |>
mutate(cnt = if_else(is.na(fr_equals), 0L, cnt)) |>
select(-fr_equals) |>
distinct()
The same using data.table:
cc <- aa |>
as.data.table() |>
_[, fr_equals := tolower(Fr_1) == tolower(Fr_2)] |>
_[, cnt := .N, by = .(group, id, fr_equals)] |>
_[is.na(fr_equals), cnt := 0] |>
_[, -c("fr_equals")] |>
unique()