rdplyr

Detect dropout (using R)


I want to make an script that detects if a person dropped.

This is my database and the script to count the number of dropouts.

library(data.table)
datos <- data.frame(
  name = c("LAVAYEN LAVAYEN JOSE ANTONIO", "LAVAYEN LAVAYEN JHON JAIRO", 
              "LEON LEON LUIS ALEJANDRO", "LIMA MARTINEZ AURORA YSABELL", 
              "MONTALVAN MEJIA JOSE NICOLAS", "RODRIGUEZ ROMERO DIGNA LISSETH", 
              "ULCUANGO CONLAGO PEDRO DAVID"),
  Id = c("aa", "bb", "cc", "dd", "ee", "ff", "gg"),
  `2017` = c(1, NA, 1, NA, NA, 1, NA),
  `2018` = c(1, NA, 1, NA, 1, NA, 1),
  `2019` = c(NA, 1, NA, NA, 1, 1, 1),
  `2020` = c(NA, 1, NA, NA, NA, NA, NA),
  `2021` = c(NA, 1, NA, NA, 1, 1, 1),
  `2022` = c(NA, 1, NA, NA, 1, NA, NA),
  `2023` = c(NA, 1, NA, 1, 1, 1, 1),
  stringsAsFactors = FALSE
) %>% as.data.table()

 helper <- function(x) {
   i <- Position(\(x) !is.na(x), x) # i is the position of first non-NA
   if (is.na(i)) return(0L)         # If not found return 0 
   j <- length(x)                   # Otherwise sum number of NAs from i to the end
   sum(is.na(x[i:j]))
 }
 
 datos[, n_dropout := helper(unlist(.SD)), .SDcols = `X2017`:`X2023`, by = .(name,Id)]
 datos

The expected output is to mark the next record as a dropout when a person has marked with n_dropout:

output <- data.table(
  name = c(
    "LAVAYEN LAVAYEN JOSE ANTONIO",
    "LAVAYEN LAVAYEN JHON JAIRO",
    "LEON LEON LUIS ALEJANDRO",
    "LIMA MARTINEZ AURORA YSABELL",
    "MONTALVAN MEJIA JOSE NICOLAS",
    "RODRIGUEZ ROMERO DIGNA LISSETH",
    "ULCUANGO CONLAGO PEDRO DAVID"
  ),
  Id = c("aa", "bb", "cc", "dd", "ee", "ff", "ee"),
  `2017` = c(NA, NA, NA, NA, NA, NA, NA),
  `2018` = c(NA, NA, NA, NA, NA, 1, NA),
  `2019` = c(1, NA, 1, NA, NA, NA, NA),
  `2020` = c(NA, NA, NA, NA, 1, 1, 1),
  `2021` = c(NA, NA, NA, NA, NA, NA, NA),
  `2022` = c(NA, NA, NA, NA, NA, 1, 1),
  `2023` = c(NA, NA, NA, NA, NA, NA, NA),
  n_dropout = c(5, 0, 5, 0, 1, 3, 2)
)

                             name     Id   2017  2018  2019  2020   2021  2022   2023 n_dropout
                           <char> <char> <lgcl> <num> <num> <num> <lgcl> <num> <lgcl>     <num>
1:   LAVAYEN LAVAYEN JOSE ANTONIO     aa     NA    NA     1    NA     NA    NA     NA         5
2:     LAVAYEN LAVAYEN JHON JAIRO     bb     NA    NA    NA    NA     NA    NA     NA         0
3:       LEON LEON LUIS ALEJANDRO     cc     NA    NA     1    NA     NA    NA     NA         5
4:   LIMA MARTINEZ AURORA YSABELL     dd     NA    NA    NA    NA     NA    NA     NA         0
5:   MONTALVAN MEJIA JOSE NICOLAS     ee     NA    NA    NA     1     NA    NA     NA         1
6: RODRIGUEZ ROMERO DIGNA LISSETH     ff     NA     1    NA     1     NA     1     NA         3
7:   ULCUANGO CONLAGO PEDRO DAVID     ee     NA    NA    NA     1     NA     1     NA         2

Solution

  • I think this is what you mean:

    output <- copy(datos)
    
    # Get year names
    
    year_names <- names(output)[grepl("X", colnames(output))]
    
    # Initialize with NA
    output[, (year_names) := lapply(.SD, function(x) NA), .SDcols = year_names]
    
    # Process each row
    for(i in 1:nrow(output)) {
      row_data <- datos[i, ..year_names]
      for (j in 2:length(year_names)) {
        if(is.na(row_data[[j]]) & !is.na(row_data[[j-1]])) {
          output[i, (year_names[j]) := 1]
        }
      }
    }
    colnames(output) <- gsub("X","", colnames(output))
    

    Result:

    name Id 2017 2018 2019 2020 2021 2022 2023 n_dropout
    LAVAYEN LAVAYEN JOSE ANTONIO aa NA NA 1 NA NA NA NA 5
    LAVAYEN LAVAYEN JHON JAIRO bb NA NA NA NA NA NA NA 0
    LEON LEON LUIS ALEJANDRO cc NA NA 1 NA NA NA NA 5
    LIMA MARTINEZ AURORA YSABELL dd NA NA NA NA NA NA NA 0
    MONTALVAN MEJIA JOSE NICOLAS ee NA NA NA 1 NA NA NA 1
    RODRIGUEZ ROMERO DIGNA LISSETH ff NA 1 NA 1 NA 1 NA 3
    ULCUANGO CONLAGO PEDRO DAVID gg NA NA NA 1 NA 1 NA 2