I want to make an script that detects if a person dropped.
This is my database and the script to count the number of dropouts.
library(data.table)
datos <- data.frame(
name = c("LAVAYEN LAVAYEN JOSE ANTONIO", "LAVAYEN LAVAYEN JHON JAIRO",
"LEON LEON LUIS ALEJANDRO", "LIMA MARTINEZ AURORA YSABELL",
"MONTALVAN MEJIA JOSE NICOLAS", "RODRIGUEZ ROMERO DIGNA LISSETH",
"ULCUANGO CONLAGO PEDRO DAVID"),
Id = c("aa", "bb", "cc", "dd", "ee", "ff", "gg"),
`2017` = c(1, NA, 1, NA, NA, 1, NA),
`2018` = c(1, NA, 1, NA, 1, NA, 1),
`2019` = c(NA, 1, NA, NA, 1, 1, 1),
`2020` = c(NA, 1, NA, NA, NA, NA, NA),
`2021` = c(NA, 1, NA, NA, 1, 1, 1),
`2022` = c(NA, 1, NA, NA, 1, NA, NA),
`2023` = c(NA, 1, NA, 1, 1, 1, 1),
stringsAsFactors = FALSE
) %>% as.data.table()
helper <- function(x) {
i <- Position(\(x) !is.na(x), x) # i is the position of first non-NA
if (is.na(i)) return(0L) # If not found return 0
j <- length(x) # Otherwise sum number of NAs from i to the end
sum(is.na(x[i:j]))
}
datos[, n_dropout := helper(unlist(.SD)), .SDcols = `X2017`:`X2023`, by = .(name,Id)]
datos
The expected output is to mark the next record as a dropout when a person has marked with n_dropout:
output <- data.table(
name = c(
"LAVAYEN LAVAYEN JOSE ANTONIO",
"LAVAYEN LAVAYEN JHON JAIRO",
"LEON LEON LUIS ALEJANDRO",
"LIMA MARTINEZ AURORA YSABELL",
"MONTALVAN MEJIA JOSE NICOLAS",
"RODRIGUEZ ROMERO DIGNA LISSETH",
"ULCUANGO CONLAGO PEDRO DAVID"
),
Id = c("aa", "bb", "cc", "dd", "ee", "ff", "ee"),
`2017` = c(NA, NA, NA, NA, NA, NA, NA),
`2018` = c(NA, NA, NA, NA, NA, 1, NA),
`2019` = c(1, NA, 1, NA, NA, NA, NA),
`2020` = c(NA, NA, NA, NA, 1, 1, 1),
`2021` = c(NA, NA, NA, NA, NA, NA, NA),
`2022` = c(NA, NA, NA, NA, NA, 1, 1),
`2023` = c(NA, NA, NA, NA, NA, NA, NA),
n_dropout = c(5, 0, 5, 0, 1, 3, 2)
)
name Id 2017 2018 2019 2020 2021 2022 2023 n_dropout
<char> <char> <lgcl> <num> <num> <num> <lgcl> <num> <lgcl> <num>
1: LAVAYEN LAVAYEN JOSE ANTONIO aa NA NA 1 NA NA NA NA 5
2: LAVAYEN LAVAYEN JHON JAIRO bb NA NA NA NA NA NA NA 0
3: LEON LEON LUIS ALEJANDRO cc NA NA 1 NA NA NA NA 5
4: LIMA MARTINEZ AURORA YSABELL dd NA NA NA NA NA NA NA 0
5: MONTALVAN MEJIA JOSE NICOLAS ee NA NA NA 1 NA NA NA 1
6: RODRIGUEZ ROMERO DIGNA LISSETH ff NA 1 NA 1 NA 1 NA 3
7: ULCUANGO CONLAGO PEDRO DAVID ee NA NA NA 1 NA 1 NA 2
I think this is what you mean:
output <- copy(datos)
# Get year names
year_names <- names(output)[grepl("X", colnames(output))]
# Initialize with NA
output[, (year_names) := lapply(.SD, function(x) NA), .SDcols = year_names]
# Process each row
for(i in 1:nrow(output)) {
row_data <- datos[i, ..year_names]
for (j in 2:length(year_names)) {
if(is.na(row_data[[j]]) & !is.na(row_data[[j-1]])) {
output[i, (year_names[j]) := 1]
}
}
}
colnames(output) <- gsub("X","", colnames(output))
Result:
name | Id | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 | 2023 | n_dropout |
---|---|---|---|---|---|---|---|---|---|
LAVAYEN LAVAYEN JOSE ANTONIO | aa | NA | NA | 1 | NA | NA | NA | NA | 5 |
LAVAYEN LAVAYEN JHON JAIRO | bb | NA | NA | NA | NA | NA | NA | NA | 0 |
LEON LEON LUIS ALEJANDRO | cc | NA | NA | 1 | NA | NA | NA | NA | 5 |
LIMA MARTINEZ AURORA YSABELL | dd | NA | NA | NA | NA | NA | NA | NA | 0 |
MONTALVAN MEJIA JOSE NICOLAS | ee | NA | NA | NA | 1 | NA | NA | NA | 1 |
RODRIGUEZ ROMERO DIGNA LISSETH | ff | NA | 1 | NA | 1 | NA | 1 | NA | 3 |
ULCUANGO CONLAGO PEDRO DAVID | gg | NA | NA | NA | 1 | NA | 1 | NA | 2 |