rrenameanti-join

Anti_join between df1 and df2 but how to change all mismatch in df2 to NA


Below are my two dataframes, df1 and df2

df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),response=c("y","y","y","n","n","y","y","n","n","y"))

id     text response
1  632592651     asdf        y
2  633322173      cat        y
3  634703802      dog        y
4  634927873    mouse        n
5  635812953 elephant        n
6  636004739    goose        y
7  636101211      rat        y
8  636157799     mice        n
9  636263106    kitty        n
10 636752420   kitten        y

df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
                  text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
                  volume=c("1234","432","324","333","2223","412346","7456","3456","2345","2345","6","345","23","2","4778","234","8675","3459","8","9"))

 id           text volume
1   632592651       asdf_xyz   1234
2   633322173            cat    432
3   634703802            dog    324
4   634927873          mouse    333
5   635812953       elephant   2223
6   636004739          goose 412346
7   636101211            rat   7456
8   636157799           mice   3456
9   636263106          kitty   2345
10  636752420         kitten   2345
11  636809222      tiger_xyz      6
12 2004722036           lion    345
13 2004894388        leopard     23
14 2005045755        ostrich      2
15 2005535472       kangaroo   4778
16 2005630542       platypus    234
17 2005788781           fish   8675
18 2005809679        reptile   3459
19 2005838317        mammals      8
20 2005866692 amphibians_xyz      9

How do I change the non-matching items from row id1:20 of df2 to NA (i.e. all of them as no matching with df1) and the column 'text' (i.e. asdf_xyz) of id1 to NA?

I have tried

library(dplyr)

df3 <- df2 %>%
  anti_join(df1, by=c("id"))

id           text volume
1   636809222      tiger_xyz      6
2  2004722036           lion    345
3  2004894388        leopard     23
4  2005045755        ostrich      2
5  2005535472       kangaroo   4778
6  2005630542       platypus    234
7  2005788781           fish   8675
8  2005809679        reptile   3459
9  2005838317        mammals      8
10 2005866692 amphibians_xyz      9

df3$id[df3$id != 0] <- NA
df3$text[df3$text != 0] <- NA
df3$volume[df3$volume != 0] <- NA

(Doing this one by one because I couldn't find solution how to change the entire value of the dataframe to NA)

id text volume
1  <NA> <NA>   <NA>
2  <NA> <NA>   <NA>
3  <NA> <NA>   <NA>
4  <NA> <NA>   <NA>
5  <NA> <NA>   <NA>
6  <NA> <NA>   <NA>
7  <NA> <NA>   <NA>
8  <NA> <NA>   <NA>
9  <NA> <NA>   <NA>
10 <NA> <NA>   <NA>

and df4 (solution from How to return row values that match column 'id' in both df1 and df2 but not column 'text' and return NA to the mismatch in column 'text'?)

inner_join(x = df1, 
           y = df2, 
           by = "id") %>%
  mutate_if(is.factor, as.character) %>%
  mutate(text = ifelse(test = text.x != text.y, 
                       yes = NA, 
                       no = text.x)) %>%
  select(id, text, response, volume)

id     text response volume
1  632592651     <NA>        y   1234
2  633322173      cat        y    432
3  634703802      dog        y    324
4  634927873    mouse        n    333
5  635812953 elephant        n   2223
6  636004739    goose        y 412346
7  636101211      rat        y   7456
8  636157799     mice        n   3456
9  636263106    kitty        n   2345
10 636752420   kitten        y   2345

but not sure how to replace df2 with df3 and df4. The desired output is shown below:

id           text volume
1   632592651       NA   1234
2   633322173            cat    432
3   634703802            dog    324
4   634927873          mouse    333
5   635812953       elephant   2223
6   636004739          goose 412346
7   636101211            rat   7456
8   636157799           mice   3456
9   636263106          kitty   2345
10  636752420         kitten   2345
11  NA               NA      NA
12  NA               NA      NA
13  NA               NA      NA
14  NA               NA      NA
15  NA               NA      NA
16  NA               NA      NA
17  NA               NA      NA
18  NA               NA      NA
19  NA               NA      NA
20  NA               NA      NA

Can someone help please? If possible, may I also know if there's a manual approach to select subset of df2 based on df3$id and change all values to NA?

Part 2:

For the second part of my request, I would like to create another dataframes from joined_df which appears only in df1 (call it found_in_df1). Example of output:

found_in_df1:

#           id     text volume
# 1: 632592651     <NA>   1234
# 2: 633322173      cat    432
# 3: 634703802      dog    324
# 4: 634927873    mouse    333
# 5: 635812953 elephant   2223
# 6: 636004739    goose 412346
# 7: 636101211      rat   7456
# 8: 636157799     mice   3456
# 9: 636263106    kitty   2345
#10: 636752420   kitten   2345

The solution is given in How to return row values that match column 'id' in both df1 and df2 but not column 'text' and return NA to the mismatch in column 'text'? but I'm looking for an alternative approach, i.e., is it possible to write a script to say retrieve from joined_df using df1 to give found_in_df1 since we have df1 and joined_df?


Solution

  • One potential solution for dealing with conflicts is to use the powerjoin package, e.g.

    library(dplyr)
    
    df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),
                      text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),
                      response=c("y","y","y","n","n","y","y","n","n","y"))
    
    df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
                      text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
                      volume=c(1234,432,324,333,2223,412346,7456,3456,2345,2345,6,345,23,2,4778,234,8675,3459,8,9))
    
    expected_outcome <- data.frame(id = c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420",
                                          NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
                                   text = c(NA, "cat", "dog", "mouse", "elephant", "goose", 
                                            "rat", "mice", "kitty", "kitten", 
                                            NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
                                   volume = c(1234, 432, 324, 333, 2223, 412346, 7456, 
                                              3456, 2345, 2345, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA))
    
    library(powerjoin)
    joined_df <- power_full_join(df1, df2, by = c("id"),
                                 conflict = rw ~ ifelse(.x != .y,
                                                        NA_integer_, 
                                                        .x))
    
    final_df <- joined_df %>%
      mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
      select(id, text, volume)
    final_df
    #>           id     text volume
    #> 1  632592651     <NA>   1234
    #> 2  633322173      cat    432
    #> 3  634703802      dog    324
    #> 4  634927873    mouse    333
    #> 5  635812953 elephant   2223
    #> 6  636004739    goose 412346
    #> 7  636101211      rat   7456
    #> 8  636157799     mice   3456
    #> 9  636263106    kitty   2345
    #> 10 636752420   kitten   2345
    #> 11      <NA>     <NA>     NA
    #> 12      <NA>     <NA>     NA
    #> 13      <NA>     <NA>     NA
    #> 14      <NA>     <NA>     NA
    #> 15      <NA>     <NA>     NA
    #> 16      <NA>     <NA>     NA
    #> 17      <NA>     <NA>     NA
    #> 18      <NA>     <NA>     NA
    #> 19      <NA>     <NA>     NA
    #> 20      <NA>     <NA>     NA
    
    all_equal(final_df, expected_outcome)
    #> [1] TRUE
    
    # Part 2
    found_in_df1 <- power_left_join(df1, df2, by = c("id"),
                                    conflict = rw ~ ifelse(.x != .y,
                                                           NA_integer_, 
                                                           .x)) %>%
      select(id, text, volume)
    found_in_df1
    #>           id     text volume
    #> 1  632592651     <NA>   1234
    #> 2  633322173      cat    432
    #> 3  634703802      dog    324
    #> 4  634927873    mouse    333
    #> 5  635812953 elephant   2223
    #> 6  636004739    goose 412346
    #> 7  636101211      rat   7456
    #> 8  636157799     mice   3456
    #> 9  636263106    kitty   2345
    #> 10 636752420   kitten   2345
    

    Created on 2022-07-02 by the reprex package (v2.0.1)

    Edit

    Per the comment below from the creator of the powerjoin package (Mr. Mudskipper): these operations are vectorised, so you don't need to perform the command 'rowwise', i.e. you can remove "rw" to simplify and gain performance. There is no practical difference between including and excluding "rw" with df1 and df2, but if we use larger dataframes you can see a clear increase in performance, e.g.

    library(dplyr)
    library(powerjoin)
    
    # define functions
    power_full_join_func_rowwise <- function(df1, df2) {
      joined_df <- power_full_join(df1, df2, by = c("id"),
                                   conflict = rw ~ ifelse(.x != .y,
                                                          NA_integer_, 
                                                          .x))
      
      final_df <- joined_df %>%
        mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
        select(id, text, volume)
      return(final_df)
    }
    
    power_full_join_func_not_rowwise <- function(df1, df2) {
      joined_df <- power_full_join(df1, df2, by = c("id"),
                                   conflict = ~ifelse(.x != .y,
                                                          NA_integer_, 
                                                          .x))
      
      final_df <- joined_df %>%
        mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
        select(id, text, volume)
      return(final_df)
    }
    
    library(microbenchmark)
    library(purrr)
    library(ggplot2)
    
    # make larger dfs (copy df1 and df2 X100)
    df3 <- map_dfr(seq_len(100), ~ df1)
    df4 <- map_dfr(seq_len(100), ~ df2)
    
    # benchmark performance on the larger dataframes
    res <- microbenchmark(power_full_join_func_rowwise(df3, df4),
                          power_full_join_func_not_rowwise(df3, df4))
    res
    #> Unit: milliseconds
    #>                                        expr       min        lq      mean
    #>      power_full_join_func_rowwise(df3, df4) 397.32661 426.08117 449.88787
    #>  power_full_join_func_not_rowwise(df3, df4)  71.85757  77.25344  90.36191
    #>     median        uq      max neval cld
    #>  446.41715 472.47817 587.3301   100   b
    #>   81.18239  93.95103 191.1248   100  a
    autoplot(res)
    #> Coordinate system already present. Adding new coordinate system, which will replace the existing one.
    

    # Is the result the same?
    all_equal(power_full_join_func_rowwise(df3, df4),
              power_full_join_func_not_rowwise(df3, df4))
    #> [1] TRUE
    

    Created on 2022-11-24 by the reprex package (v2.0.1)