rdplyrstringrstrsplit

R str_split_i inside ifelse with two conditions


I need to replace names of students inside longer file using a shorter mapping file. Longer file contains only first names and exam file names. The mapping file has the full names and exam file name. I need to get full names from the mapping file. Two conditions have to be met, a student's first name has to match in both files, and because multiple students have the same first name, the exam file name has to match too.

student_file_mapping <- data.frame(student=c("paul_johns","mary_bash","paul_simons"), file=c("johns bash hunter 2022_exam1.paul_johns.csv","johns bash hunter 2022_exam1.mary_bash.csv","nichols simons smith 2022_exam1.paul_simonsh.csv"))

> student_file_mapping
      student                                             file
1  paul_johns      johns bash hunter 2022_exam1.paul_johns.csv
2   mary_bash       johns bash hunter 2022_exam1.mary_bash.csv
3 paul_simons nichols simons smith 2022_exam1.paul_simonsh.csv
data_full <- data.frame(student_file=c("paul_johns bash hunter 2022_exam1","paul_johns bash hunter 2022_exam1","paul_johns bash hunter 2022_exam1","mary_johns bash hunter 2022_exam1","mary_johns bash hunter 2022_exam1","mary_johns bash hunter 2022_exam1","paul_nichols simons smith 2022_exam1","paul_nichols simons smith 2022_exam1","paul_nichols simons smith 2022_exam1","paul_johns bash hunter 2022_exam1","mary_johns bash hunter 2022_exam1","paul_nichols simons smith 2022_exam1"), File=c("johns bash hunter 2022_exam1.csv", "johns bash hunter 2022_exam1.csv","johns bash hunter 2022_exam1.csv","johns bash hunter 2022_exam1.csv","johns bash hunter 2022_exam1.csv","johns bash hunter 2022_exam1.csv","nichols simons smith 2022_exam1.csv","nichols simons smith 2022_exam1.csv","nichols simons smith 2022_exam1.csv","jons bash hunter 2022_exam1.csv","johns bash hunter 2022_exam1.csv","nichols simons smith 2022_exam1.csv"))

> data_full
                           student_file                                File
1     paul_johns bash hunter 2022_exam1    johns bash hunter 2022_exam1.csv
2     paul_johns bash hunter 2022_exam1    johns bash hunter 2022_exam1.csv
3     paul_johns bash hunter 2022_exam1    johns bash hunter 2022_exam1.csv
4     mary_johns bash hunter 2022_exam1    johns bash hunter 2022_exam1.csv
5     mary_johns bash hunter 2022_exam1    johns bash hunter 2022_exam1.csv
6     mary_johns bash hunter 2022_exam1    johns bash hunter 2022_exam1.csv
7  paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv
8  paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv
9  paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv
10    paul_johns bash hunter 2022_exam1     jons bash hunter 2022_exam1.csv
11    mary_johns bash hunter 2022_exam1    johns bash hunter 2022_exam1.csv
12 paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv

I can get students' first names from both files

str_split_i(student_file_mapping$student,"_",1)[1] -> paul
str_split_i(data_full$student_file,"_",1)[1] -> paul

Exam file names

str_split_i(student_file_mapping$file,"\\.",1)[1] -> johns bash hunter 2022_exam1
str_split_i(data_full$File,".csv",1)[1] -> johns bash hunter 2022_exam1

If I embed these in ifelse, I get correct output only for one row per student in the long file

data_full$student <- ifelse( str_split_i(student_file_mapping$file,"\\.",1) == str_split_i(data_full$File,".csv",1) & str_split_i(student_file_mapping$student,"_",1) == str_split_i(data_full$student_file,"_",1) , paste0(student_file_mapping$student),"NOT FOUND" )
> data_full
                           student_file                                File     student
1     paul_johns bash hunter 2022_exam1    johns bash hunter 2022_exam1.csv  paul_johns
2     paul_johns bash hunter 2022_exam1    johns bash hunter 2022_exam1.csv   NOT FOUND
3     paul_johns bash hunter 2022_exam1    johns bash hunter 2022_exam1.csv   NOT FOUND
4     mary_johns bash hunter 2022_exam1    johns bash hunter 2022_exam1.csv   NOT FOUND
5     mary_johns bash hunter 2022_exam1    johns bash hunter 2022_exam1.csv   mary_bash
6     mary_johns bash hunter 2022_exam1    johns bash hunter 2022_exam1.csv   NOT FOUND
7  paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv   NOT FOUND
8  paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv   NOT FOUND
9  paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv paul_simons
10    paul_johns bash hunter 2022_exam1     jons bash hunter 2022_exam1.csv   NOT FOUND
11    mary_johns bash hunter 2022_exam1    johns bash hunter 2022_exam1.csv   mary_bash
12 paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv paul_simons

In my actual dataset it does partial replacement and also gives an error longer object length is not a multiple of shorter object length, but no error for the example data.


Solution

  • Had to create columns for the first names and file names, then use merge with fuzzy matching, which takes care of the misspelling!

    library(stringr)
    library(fuzzyjoin)
    library(dplyr)
    
    student_file_mapping$first_name <- str_split_i(student_file_mapping$student,"_",1)
    student_file_mapping$file_name <- str_split_i(student_file_mapping$file,"\\.",1)
    
    data_full$first_name <- str_split_i(data_full$student_file,"_",1)
    data_full$file_name <- str_split_i(data_full$File,".csv",1) 
    
    
    data_merged <- stringdist_join(student_file_mapping,data_full, 
                    by=c("first_name", "file_name"),
                    mode='left',
                    method = "lv")
    
    > data_merged
           student                                             file first_name.x
    1   paul_johns      johns bash hunter 2022_exam1.paul_johns.csv         paul
    2   paul_johns      johns bash hunter 2022_exam1.paul_johns.csv         paul
    3   paul_johns      johns bash hunter 2022_exam1.paul_johns.csv         paul
    4   paul_johns      johns bash hunter 2022_exam1.paul_johns.csv         paul
    5    mary_bash       johns bash hunter 2022_exam1.mary_bash.csv         mary
    6    mary_bash       johns bash hunter 2022_exam1.mary_bash.csv         mary
    7    mary_bash       johns bash hunter 2022_exam1.mary_bash.csv         mary
    8    mary_bash       johns bash hunter 2022_exam1.mary_bash.csv         mary
    9  paul_simons nichols simons smith 2022_exam1.paul_simonsh.csv         paul
    10 paul_simons nichols simons smith 2022_exam1.paul_simonsh.csv         paul
    11 paul_simons nichols simons smith 2022_exam1.paul_simonsh.csv         paul
    12 paul_simons nichols simons smith 2022_exam1.paul_simonsh.csv         paul
                           file_name.x                         student_file
    1     johns bash hunter 2022_exam1    paul_johns bash hunter 2022_exam1
    2     johns bash hunter 2022_exam1    paul_johns bash hunter 2022_exam1
    3     johns bash hunter 2022_exam1    paul_johns bash hunter 2022_exam1
    4     johns bash hunter 2022_exam1    paul_johns bash hunter 2022_exam1
    5     johns bash hunter 2022_exam1    mary_johns bash hunter 2022_exam1
    6     johns bash hunter 2022_exam1    mary_johns bash hunter 2022_exam1
    7     johns bash hunter 2022_exam1    mary_johns bash hunter 2022_exam1
    8     johns bash hunter 2022_exam1    mary_johns bash hunter 2022_exam1
    9  nichols simons smith 2022_exam1 paul_nichols simons smith 2022_exam1
    10 nichols simons smith 2022_exam1 paul_nichols simons smith 2022_exam1
    11 nichols simons smith 2022_exam1 paul_nichols simons smith 2022_exam1
    12 nichols simons smith 2022_exam1 paul_nichols simons smith 2022_exam1
                                      File first_name.y                     file_name.y
    1     johns bash hunter 2022_exam1.csv         paul    johns bash hunter 2022_exam1
    2     johns bash hunter 2022_exam1.csv         paul    johns bash hunter 2022_exam1
    3     johns bash hunter 2022_exam1.csv         paul    johns bash hunter 2022_exam1
    4      jons bash hunter 2022_exam1.csv         paul     jons bash hunter 2022_exam1
    5     johns bash hunter 2022_exam1.csv         mary    johns bash hunter 2022_exam1
    6     johns bash hunter 2022_exam1.csv         mary    johns bash hunter 2022_exam1
    7     johns bash hunter 2022_exam1.csv         mary    johns bash hunter 2022_exam1
    8     johns bash hunter 2022_exam1.csv         mary    johns bash hunter 2022_exam1
    9  nichols simons smith 2022_exam1.csv         paul nichols simons smith 2022_exam1
    10 nichols simons smith 2022_exam1.csv         paul nichols simons smith 2022_exam1
    11 nichols simons smith 2022_exam1.csv         paul nichols simons smith 2022_exam1
    12 nichols simons smith 2022_exam1.csv         paul nichols simons smith 2022_exam1