rdata.tabletidyverse

Concatenate strings over all rows in a single column of a data frame


I have a data frame with a column 'eligible_pmids'. Each row is of this column contains a comma separated string of unique identifiers.

My goal is to 1) concatenate all rows of 'eligible_pmids' into a single string, then 2) convert the long string into a character vector and finally 3) remove duplicate elements.

The data frame below is a small subset.

test <- structure(list(eligible_pmids = c("17898458, 12207834, 17381964, 15159244, 15277146, 11110851, 12031264, 14577658, 16289507, 19923377, 12821971, 15354322, 21697535, 17623486, 19473822, 22261578, 21310304, 16087973, 17119522, 17313718, 15795434, 17449580, 20140250, 18641189, 12700622, 15321800, 16924272, 16339498, 11592727, 15674307", 
"12184147, 11124748, 8988911, 7733026, 9286464, 1415008, 8614308, 1901440, 3989378, 17440525, 2012009, 1506599, 11773507, 15081561, 12923649, 9082038, 22071711, 19215678, 16825681, 22030224, 9082037, 3362176, 9778135, 17224066", 
"15047684, 15533268, 16015270, 22030228, 24622806, 21591986, 2621294, 8780967, 8363165, 12428175, 567008", 
"19515739, 11303493, 23182924, 7665996, 21346711, 7355782", "20843978, 16306540", 
"10817122, 16018792, 9415002, 15735093, 23958266, 24597664, 8688759, 12122556, 8606322, 8324774, 2983212, 9149659, 7442730, 17023718, 25256234, 15166397, 8387811, 15234935, 17925631, 3988448, 15781956, 2043019, 23404536, 21636104, 9417007, 14525873, 23186748, 9151243, 17053177, 22383309, 23920478, 23553155, 11253967, 24522444, 22059639, 8094827, 21173413, 23407305, 25411288"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))

Solution

  • Change the order to 1) convert each row to a character vector (with strsplit), 2) combine the character vectors (unlist), and 3) remove duplicates (unique).

    unique(unlist(strsplit(test$eligible_pmids, ", ")))
    #>   [1] "17898458" "12207834" "17381964" "15159244" "15277146" "11110851"
    #>   [7] "12031264" "14577658" "16289507" "19923377" "12821971" "15354322"
    #>  [13] "21697535" "17623486" "19473822" "22261578" "21310304" "16087973"
    #>  [19] "17119522" "17313718" "15795434" "17449580" "20140250" "18641189"
    #>  [25] "12700622" "15321800" "16924272" "16339498" "11592727" "15674307"
    #>  [31] "12184147" "11124748" "8988911"  "7733026"  "9286464"  "1415008" 
    #>  [37] "8614308"  "1901440"  "3989378"  "17440525" "2012009"  "1506599" 
    #>  [43] "11773507" "15081561" "12923649" "9082038"  "22071711" "19215678"
    #>  [49] "16825681" "22030224" "9082037"  "3362176"  "9778135"  "17224066"
    #>  [55] "15047684" "15533268" "16015270" "22030228" "24622806" "21591986"
    #>  [61] "2621294"  "8780967"  "8363165"  "12428175" "567008"   "19515739"
    #>  [67] "11303493" "23182924" "7665996"  "21346711" "7355782"  "20843978"
    #>  [73] "16306540" "10817122" "16018792" "9415002"  "15735093" "23958266"
    #>  [79] "24597664" "8688759"  "12122556" "8606322"  "8324774"  "2983212" 
    #>  [85] "9149659"  "7442730"  "17023718" "25256234" "15166397" "8387811" 
    #>  [91] "15234935" "17925631" "3988448"  "15781956" "2043019"  "23404536"
    #>  [97] "21636104" "9417007"  "14525873" "23186748" "9151243"  "17053177"
    #> [103] "22383309" "23920478" "23553155" "11253967" "24522444" "22059639"
    #> [109] "8094827"  "21173413" "23407305" "25411288"