rdplyr

How to replace with NAN no more than 30% of the values with the specified flag in R


I have such dataset (example)

dat=structure(list(Y = c(2282L, 2565L, 2242L, 2109L, 2704L, 2352L, 
2492L), is_red_ndvi_v_down = c("yes", "yes", "no", "yes", "yes", 
"yes", "no"), ndvi_v_down = c(0.032460447, 0.028369653, 0.017094017, 
0.016972906, 0.015228979, 0.020649285, 0.028151986), is_red_mtci_vi_max = c("yes", 
"yes", "yes", "yes", "yes", "yes", "no"), mtci_vi_max = c(0.459463725, 
0.624581753, 0.573445082, 0.478436429, 0.299561108, 0.446878491, 
0.818814539), is_red_ndvi_d85down = c("yes", "yes", "no", "yes", 
"yes", "yes", "no"), ndvi_d85down = c(159L, 157L, 150L, 150L, 
172L, 178L, 153L), is_red_rgvis_vi_min2 = c("yes", "yes", "no", 
"yes", "yes", "yes", "yes"), rgvis_vi_min2 = c(0.606465651, 0.6024961, 
0.665637406, 0.657639244, 0.630343941, 0.641025641, 0.644292757
)), class = "data.frame", row.names = c(NA, -7L))

It contains a Y variable and several predictors. data structure

  $ is_red_ndvi_v_down : chr "yes" "yes" "yes" "yes" ...
  $ ndvi_v_down : chr "0.032460447" "0.028369653" "0.017094017" "0.016972906" ...
  $ is_red_mtci_vi_max : chr "yes" "yes" "yes" "yes" ...
  $ mtci_vi_max : chr "0.459463725" "0.624581753" "0.573445082" "0.478436429"

is_red text prefix , means a value has a yes or no flag, followed by a variable that contains the values. For example is_red_ndvi_v_down here are text flags, and next ndvi_v_down is already the values for the flags

I am doing the following algorithm: If FOR ALL ROWs of a particular observation, the value “no” is found in more than 3 variables, then replace all values opposite no in this row with NA. For example, relatively speaking, in row number 10 more than two predictors have the value no, then all values opposite no in this row are marked as NA.

i try do so

no_count <- apply(dat, 1, function(row) sum(grepl("no", row)))

rows_to_replace <- which(no_count > 3)

num_rows_to_replace <- length(rows_to_replace)

if (num_rows_to_replace > round(0.3 * nrow(dat))) {
  dat[rows_to_replace, grepl("no", names(dat))] <- NA
}

but as a result, in a number of variables 104 row are replaced instead of 80

> paste("Number of rows replaced:", num_rows_to_replace)
[1] "Number of lines replaced: 104"
> paste("Acceptable number of NAs:", round(0.3 * nrow(dat)))
[1] "Acceptable number of NA: 80"

This means that in 104 rows the flag no occurs in more than two variables In my case, i need 104-80=24 rows not changed by NA.

In simple terms, after the code has seen that 80 row have already become NA, then there is no need to mark the remaining 24 rows in any way. My code constantly marks 104 rows. What am I doing wrong and how to fix it

I didn’t see any of the suggested topics with my specifics

Thanks for your valuable help


Solution

  • I wasn't sure if you want all values in a row to be NA even if a "yes" is present, or whether you only want three NAs. The first option NAs all vaues where "no" >= 3. The second option, which I'm guessing is the one you want, only NAs values if "no" >= 3 AND if "no" is present in the previous column. This distinction is demonstrated in row 5 of the results shown below. Note the .3 value returns 79.5 so you may need to change n() * .3 to ceiling(n() * .3) or nrow(df1) * .3 to ceiling(nrow(df1) * .3) if you want the threshold to be 80. In the sample df, row 109 is the last row where there are 3 "no" values AND is <= 30% of observations.

    Sample data:

    library(dplyr)
    set.seed(1)
    df <- data.frame(Y = 1:265,
                     is_red_ndvi_v_down = c(rep(c("no", "no", "yes"), 88), "yes"),
                     ndvi_v_down = round(runif(265, 0, 1), 2),
                     is_red_mtci_vi_max = c(rep(c("no", "yes", "no"), 88), "yes"),
                     mtci_vi_max = round(runif(265, 0, 1), 2),
                     is_red_ndvi_d85down = c(rep(c("no", "yes", "no", "no"), 66), "yes"),
                     ndvi_d85down = sample(150:178, 265, replace = TRUE),
                     is_red_rgvis_vi_min2 = rep(c("no", "no", "yes", "no", "no"), 53),
                     rgvis_vi_min2 = round(runif(265, 0, 1), 2))
    

    This option returns all NA even if only three NAs present (i'm guessing you don't want this):

    df1 <- df %>% 
      mutate(no_count = rowSums(across(everything(), ~ .x == "no")), # Get count of "no" values
             no_gte_3 = ifelse(no_count >= 3, 1, 0), # Define rows with >= 3 "no" values
             no_cutoff = cumsum(no_gte_3)) %>% # Assign cumulative value of "no" rows
      mutate_at(vars(-starts_with(c("is_red", "Y"))), # Select value cols
                list(~if_else(no_gte_3 == 1 & no_cutoff <= n() * .3, NA, .))) %>% # Use new columns to NA data
      select(-starts_with("no")) # Drop temporary columns
    
    df1[c(1:5, 109:114), ]
    Y is_red_ndvi_v_down ndvi_v_down is_red_mtci_vi_max mtci_vi_max is_red_ndvi_d85down ndvi_d85down is_red_rgvis_vi_min2 rgvis_vi_min2
    1                 no          NA                 no          NA                  no           NA                   no            NA
    2                 no        0.37                yes        0.06                 yes          151                   no          0.72
    3                yes        0.57                 no        0.75                  no          170                  yes          0.35
    4                 no          NA                 no          NA                  no           NA                   no            NA
    5                 no          NA                yes          NA                  no           NA                   no            NA
    109               no          NA                 no          NA                  no           NA                   no            NA
    110               no        0.60                yes        0.22                 yes          154                   no          0.29
    111              yes        0.98                 no        0.42                  no          172                   no          0.50
    112               no        0.73                 no        0.33                  no          155                   no          0.43
    113               no        0.36                yes        0.86                  no          168                  yes          0.61
    114              yes        0.43                 no        0.18                 yes          169                   no          0.93
    
    

    This option adds NAs only if preceding row value == "no" (guessing this is what you want). Note the change in values in row 5.

    df1 <- df %>% 
      mutate(no_count = rowSums(across(everything(), ~ .x == "no")),
             no_gte_3 = ifelse(no_count >= 3, 1, 0),
             no_cutoff = cumsum(no_gte_3))
    
    for(i in which(names(df1) %in% names(df1[, grepl("is_red", names(df1))]))) {
      
      df1[,i+1] <- ifelse(df1[,i] == "no" & 
                            df1$no_gte_3 == 1 & 
                            df1$no_cutoff <= nrow(df1) * .3,
                          NA, df1[,i+1])
      
    }
    
    df1 <- df1 %>%
      select(-starts_with("no"))
    
    df1[c(1:5, 109:114), ]
    Y is_red_ndvi_v_down ndvi_v_down is_red_mtci_vi_max mtci_vi_max is_red_ndvi_d85down ndvi_d85down is_red_rgvis_vi_min2 rgvis_vi_min2
    1                 no          NA                 no          NA                  no           NA                   no            NA
    2                 no        0.37                yes        0.06                 yes          151                   no          0.72
    3                yes        0.57                 no        0.75                  no          170                  yes          0.35
    4                 no          NA                 no          NA                  no           NA                   no            NA
    5                 no          NA                yes        0.17                  no           NA                   no            NA
    109               no          NA                 no          NA                  no           NA                   no            NA
    110               no        0.60                yes        0.22                 yes          154                   no          0.29
    111              yes        0.98                 no        0.42                  no          172                   no          0.50
    112               no        0.73                 no        0.33                  no          155                   no          0.43
    113               no        0.36                yes        0.86                  no          168                  yes          0.61
    114              yes        0.43                 no        0.18                 yes          169                   no          0.93