rdata.tablestrsplit

Split row based off of row value in a previous column (2 patterns) - incorrect values using tstrsplit


I am attempting to move the values of the Structural Notation column to a new column named either SMILES or InChI based off of the value in the Structural Notation Type column. Essentially, I want all SMILES notations in the Structural Notation column to be in the SMILES column and all InCHI notations in the Structural Notation column in the InCHI column.

In order to reach that conclusion, I had created the Combine column where I combined the contents of the Structural Notation Type column and the Structural Notation column and add either a " && " or " %% " for the SMILES and INCHI row values respectively.


library(data.table)

structures <- structure(list(CAS = c("70024-85-0", "80934-44-7", "356-86-5",
"70024-86-1", "79004-87-8", "63335-88-6", "70024-79-2", "28994-41-4",
"63021-86-3", "63041-90-7"), `Structural Notation Type` = c("SMILES",
"SMILES", "SMILES", "SMILES", "SMILES", "INCHI", "SMILES", "INCHI",
"INCHI", "INCHI"), `Structural Notation` = c("", "", "O=C(OCC(F)(F)C(F)(F)F)C=C",
"", "", "1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3", "",
"1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2",
"1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H",
"1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H"
), Combine = c("SMILES && ", "SMILES && ", "SMILES && O=C(OCC(F)(F)C(F)(F)F)C=C",
"SMILES && ", "SMILES && ", "INCHI %% 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3",
"SMILES && ", "INCHI %% 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2",
"INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H",
"INCHI %% 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H"
)), row.names = c(NA, -10L), class = c("data.table", "data.frame"
))

Rendered

         CAS Structural Notation Type
 1: 70024-85-0                   SMILES
 2: 80934-44-7                   SMILES
 3:   356-86-5                   SMILES
 4: 70024-86-1                   SMILES
 5: 79004-87-8                   SMILES
 6: 63335-88-6                    INCHI
 7: 70024-79-2                   SMILES
 8: 28994-41-4                    INCHI
 9: 63021-86-3                    INCHI
10: 63041-90-7                    INCHI
                                                                                Structural Notation
 1:                                                                                                
 2:                                                                                                
 3:                                                                       O=C(OCC(F)(F)C(F)(F)F)C=C
 4:                                                                                                
 5:                                                                                                
 6:                                                1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:                                                                                                
 8:                                1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                  1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
                                                                                                     Combine
 1:                                                                                               SMILES && 
 2:                                                                                               SMILES && 
 3:                                                                      SMILES && O=C(OCC(F)(F)C(F)(F)F)C=C
 4:                                                                                               SMILES && 
 5:                                                                                               SMILES && 
 6:                                                INCHI %% 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:                                                                                               SMILES && 
 8:                                INCHI %% 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                  INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: INCHI %% 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H

In the second step, I used the tstrsplit split function from data.table. You will see that the end result is not correct. For example, you'll see in row 3 which is a SMILES row and should be in the SMILES column is actually in the InCHI column.

structures[, c("SMILES", "InChI") := tstrsplit(Combine, c("&&", "%%"), fixed = TRUE)][]

structures <- structure(list(CAS = c("70024-85-0", "80934-44-7", "356-86-5",
"70024-86-1", "79004-87-8", "63335-88-6", "70024-79-2", "28994-41-4",
"63021-86-3", "63041-90-7"), `Structural Notation Type` = c("SMILES",
"SMILES", "SMILES", "SMILES", "SMILES", "INCHI", "SMILES", "INCHI",
"INCHI", "INCHI"), `Structural Notation` = c("", "", "O=C(OCC(F)(F)C(F)(F)F)C=C",
"", "", "1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3", "",
"1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2",
"1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H",
"1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H"
), Combine = c("SMILES && ", "SMILES && ", "SMILES && O=C(OCC(F)(F)C(F)(F)F)C=C",
"SMILES && ", "SMILES && ", "INCHI %% 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3",
"SMILES && ", "INCHI %% 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2",
"INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H",
"INCHI %% 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H"
), SMILES = c("SMILES ", "SMILES && ", "SMILES ", "SMILES && ",
"SMILES ", "INCHI ", "SMILES ", "INCHI ", "INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H",
"INCHI "), InChI = c(" ", NA, " O=C(OCC(F)(F)C(F)(F)F)C=C", NA,
" ", " 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3", " ",
" 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2",
NA, " 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H"
)), row.names = c(NA, -10L), class = c("data.table", "data.frame"
))


Rendered

          CAS Structural Notation Type
 1: 70024-85-0                   SMILES
 2: 80934-44-7                   SMILES
 3:   356-86-5                   SMILES
 4: 70024-86-1                   SMILES
 5: 79004-87-8                   SMILES
 6: 63335-88-6                    INCHI
 7: 70024-79-2                   SMILES
 8: 28994-41-4                    INCHI
 9: 63021-86-3                    INCHI
10: 63041-90-7                    INCHI
                                                                                Structural Notation
 1:                                                                                                
 2:                                                                                                
 3:                                                                       O=C(OCC(F)(F)C(F)(F)F)C=C
 4:                                                                                                
 5:                                                                                                
 6:                                                1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:                                                                                                
 8:                                1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                  1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
                                                                                                     Combine
 1:                                                                                               SMILES && 
 2:                                                                                               SMILES && 
 3:                                                                      SMILES && O=C(OCC(F)(F)C(F)(F)F)C=C
 4:                                                                                               SMILES && 
 5:                                                                                               SMILES && 
 6:                                                INCHI %% 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:                                                                                               SMILES && 
 8:                                INCHI %% 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                  INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: INCHI %% 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
                                                                                     SMILES
 1:                                                                                 SMILES 
 2:                                                                              SMILES && 
 3:                                                                                 SMILES
 4:                                                                              SMILES &&
 5:                                                                                 SMILES
 6:                                                                                  INCHI
 7:                                                                                 SMILES
 8:                                                                                  INCHI
 9: INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10:                                                                                  INCHI
                                                                                               InChI
 1:
 2:                                                                                             <NA>
 3:                                                                        O=C(OCC(F)(F)C(F)(F)F)C=C
 4:                                                                                             <NA>
 5:
 6:                                                 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:
 8:                                 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                                                                                             <NA>
10:  1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H

Is there a way to avoid creating the Combine column and move each row matching SMILES into the SMILES column and move each row matching the INCHI column into the InCHI column directly?

If not, is there a way to avoid the errors that are appearing with the use of tstrsplit?

@Wimpel, thank you for your answer. I have included both a Preferred and an Ideal Solution below.

This is the Preferred Solution

           CAS Structural Notation Type
 1: 70024-85-0                   SMILES
 2: 80934-44-7                   SMILES
 3:   356-86-5                   SMILES
 4: 70024-86-1                   SMILES
 5: 79004-87-8                   SMILES
 6: 63335-88-6                    INCHI
 7: 70024-79-2                   SMILES
 8: 28994-41-4                    INCHI
 9: 63021-86-3                    INCHI
10: 63041-90-7                    INCHI
                                                                                Structural Notation
 1:
 2:
 3:                                                                       O=C(OCC(F)(F)C(F)(F)F)C=C
 4:
 5:
 6:                                                1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:
 8:                                1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                  1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
                                                                                                     Combine
 1:                                                                                               SMILES &&
 2:                                                                                               SMILES &&
 3:                                                                      SMILES && O=C(OCC(F)(F)C(F)(F)F)C=C
 4:                                                                                               SMILES &&
 5:                                                                                               SMILES &&
 6:                                                INCHI %% 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:                                                                                               SMILES &&
 8:                                INCHI %% 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                  INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: INCHI %% 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
     SMILES                                                                                
1: 
2: 
3: O=C(OCC(F)(F)C(F)(F)F)C=C
4: 
5: 
6: 
7: 
8: 
9: 
10: 

                                                                                            
InChI
1: 
2: 
3: 
4: 
5: 
6: 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
7: 
8: 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
9: 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H

This is the Ideal Solution (where the Combined column is not needed to be created in the first place)


           CAS Structural Notation Type
 1: 70024-85-0                   SMILES
 2: 80934-44-7                   SMILES
 3:   356-86-5                   SMILES
 4: 70024-86-1                   SMILES
 5: 79004-87-8                   SMILES
 6: 63335-88-6                    INCHI
 7: 70024-79-2                   SMILES
 8: 28994-41-4                    INCHI
 9: 63021-86-3                    INCHI
10: 63041-90-7                    INCHI
                                                                                Structural Notation
 1:
 2:
 3:                                                                       O=C(OCC(F)(F)C(F)(F)F)C=C
 4:
 5:
 6:                                                1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:
 8:                                1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                  1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H

     SMILES                                                                                
1: 
2: 
3: O=C(OCC(F)(F)C(F)(F)F)C=C
4: 
5: 
6: 
7: 
8: 
9: 
10: 

                                                                                            
InChI
1: 
2: 
3: 
4: 
5: 
6: 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
7: 
8: 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
9: 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H

UPDATE:

This is the correct solution offered by @Wimpel.

> dcast(structures, CAS ~ `Structural Notation Type`, 
+       value.var = "Structural Notation", fill = "")
           CAS
 1: 28994-41-4
 2:   356-86-5
 3: 63021-86-3
 4: 63041-90-7
 5: 63335-88-6
 6: 70024-79-2
 7: 70024-85-0
 8: 70024-86-1
 9: 79004-87-8
10: 80934-44-7
                                                                                              INCHI
 1:                                1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 2:
 3:                  1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
 4: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
 5:                                                1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 6:
 7:
 8:
 9:
10:
                       SMILES
 1:
 2: O=C(OCC(F)(F)C(F)(F)F)C=C
 3:
 4:
 5:
 6:
 7:
 8:
 9:
10:

Solution

  • Looks like casting to wide format should work... Since your sample data is lacking desired output, it is hard to check if this solution works for you..

    dcast(structures, CAS ~ `Structural Notation Type`, 
          value.var = "Structural Notation", fill = "")