rdplyrbigdatainterpolationzoo

Interpolation over different groups of values with not enough non-NA values


I have a dataframe including all observation of distances (column DIST.y) and altitude (Z) and corrected altitude by moving average (ma_Z) encompassed within an interval fixed by the column LimAm and LimAv for a given point ID.x for a given distance Dist.x. My purpose do a regression of the DIST.y and the ma_Z for each group of ID.x. Here is the code I use:

df_sl %>% 
  do({
    mod = lm(ma_Z ~ DIST.y, data = .)
    data.frame(int = coef(mod)[1], slope = coef(mod)[2])
  })

This code works properly.

However, I would like to fill the NA in the column ma_Z by using an interpolation with the funtion zoo::na.approx() before doing the regression. Unfortunately, for some of the groups ID.x in the dataframe I have a error message saying there is not enough non-NA values for the code to works properly (in this example, the groups ID.x = "188473" and "188473").

I do not understand why I have this error message for the ID.x "188473" and "188474" having two non-NA values. While ID.x "9383" and "9384" does not trigger the error message but only have one non-NA value each.

df_sl <- structure(list(ID.x = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 
2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 
5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 9383, 9383, 9384, 9384, 
188473, 188473, 188473, 188473, 188474, 188474, 188474, 188474
), DIST.x = c(0, 0, 0, 0, 0, 0, 0, 0, 11.515675154, 11.515675154, 
11.515675154, 11.515675154, 11.515675154, 11.515675154, 11.515675154, 
11.515675154, 21.823439218, 21.823439218, 21.823439218, 21.823439218, 
21.823439218, 21.823439218, 21.823439218, 21.823439218, 21.988363443, 
21.988363443, 21.988363443, 21.988363443, 21.988363443, 21.988363443, 
21.988363443, 21.988363443, 32.961571068, 32.961571068, 32.961571068, 
32.961571068, 32.961571068, 32.961571068, 32.961571068, 32.961571068, 
43.934778692, 43.934778692, 43.934778692, 43.934778692, 43.934778692, 
43.934778692, 43.934778692, 43.934778692, 0, 0, 13.891845289, 
13.891845289, 0, 0, 0, 0, 0, 0, 0, 0), ID.y = c(1, 2, 3, 4, 5, 
6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 
3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 
8, 9383, 9384, 9383, 9384, 188473, 188474, 188475, 188476, 188473, 
188474, 188475, 188476), LimAm = c(-375, -363.484324846, -353.176560782, 
-353.011636557, -342.038428932, -331.065221308, -330.991532192, 
-320.018324568, -375, -363.484324846, -353.176560782, -353.011636557, 
-342.038428932, -331.065221308, -330.991532192, -320.018324568, 
-375, -363.484324846, -353.176560782, -353.011636557, -342.038428932, 
-331.065221308, -330.991532192, -320.018324568, -375, -363.484324846, 
-353.176560782, -353.011636557, -342.038428932, -331.065221308, 
-330.991532192, -320.018324568, -375, -363.484324846, -353.176560782, 
-353.011636557, -342.038428932, -331.065221308, -330.991532192, 
-320.018324568, -375, -363.484324846, -353.176560782, -353.011636557, 
-342.038428932, -331.065221308, -330.991532192, -320.018324568, 
-375, -361.108154711, -375, -361.108154711, -375, -375, -375, 
-362.193750234, -375, -375, -375, -362.193750234), LimAv = c(375, 
386.515675154, 396.823439218, 396.988363443, 407.961571068, 418.934778692, 
419.008467808, 429.981675432, 375, 386.515675154, 396.823439218, 
396.988363443, 407.961571068, 418.934778692, 419.008467808, 429.981675432, 
375, 386.515675154, 396.823439218, 396.988363443, 407.961571068, 
418.934778692, 419.008467808, 429.981675432, 375, 386.515675154, 
396.823439218, 396.988363443, 407.961571068, 418.934778692, 419.008467808, 
429.981675432, 375, 386.515675154, 396.823439218, 396.988363443, 
407.961571068, 418.934778692, 419.008467808, 429.981675432, 375, 
386.515675154, 396.823439218, 396.988363443, 407.961571068, 418.934778692, 
419.008467808, 429.981675432, 375, 388.891845289, 375, 388.891845289, 
375, 375, 375, 387.806249766, 375, 375, 375, 387.806249766), 
    DIST.y = c(0, 11.515675154, 21.823439218, 21.988363443, 32.961571068, 
    43.934778692, 44.008467808, 54.981675432, 0, 11.515675154, 
    21.823439218, 21.988363443, 32.961571068, 43.934778692, 44.008467808, 
    54.981675432, 0, 11.515675154, 21.823439218, 21.988363443, 
    32.961571068, 43.934778692, 44.008467808, 54.981675432, 0, 
    11.515675154, 21.823439218, 21.988363443, 32.961571068, 43.934778692, 
    44.008467808, 54.981675432, 0, 11.515675154, 21.823439218, 
    21.988363443, 32.961571068, 43.934778692, 44.008467808, 54.981675432, 
    0, 11.515675154, 21.823439218, 21.988363443, 32.961571068, 
    43.934778692, 44.008467808, 54.981675432, 0, 13.891845289, 
    0, 13.891845289, 0, 0, 0, 12.806249766, 0, 0, 0, 12.806249766
    ), Z = c(193.07513428, 193.15454102, 192.17289734, 192.17289734, 
    190.82974243, 190.63618469, 190.63618469, 189.45043945, 193.07513428, 
    193.15454102, 192.17289734, 192.17289734, 190.82974243, 190.63618469, 
    190.63618469, 189.45043945, 193.07513428, 193.15454102, 192.17289734, 
    192.17289734, 190.82974243, 190.63618469, 190.63618469, 189.45043945, 
    193.07513428, 193.15454102, 192.17289734, 192.17289734, 190.82974243, 
    190.63618469, 190.63618469, 189.45043945, 193.07513428, 193.15454102, 
    192.17289734, 192.17289734, 190.82974243, 190.63618469, 190.63618469, 
    189.45043945, 193.07513428, 193.15454102, 192.17289734, 192.17289734, 
    190.82974243, 190.63618469, 190.63618469, 189.45043945, 353.19342041, 
    353.02838135, 353.19342041, 353.02838135, 344.16003418, 344.16003418, 
    344.16003418, 344.1892395, 344.16003418, 344.16003418, 344.16003418, 
    344.1892395), ma_Z = c(193.07513428, NA, 192.800857546667, 
    NA, 192.0825195325, NA, 191.0687522875, NA, 193.07513428, 
    NA, 192.800857546667, NA, 192.0825195325, NA, 191.0687522875, 
    NA, 193.07513428, NA, 192.800857546667, NA, 192.0825195325, 
    NA, 191.0687522875, NA, 193.07513428, NA, 192.800857546667, 
    NA, 192.0825195325, NA, 191.0687522875, NA, 193.07513428, 
    NA, 192.800857546667, NA, 192.0825195325, NA, 191.0687522875, 
    NA, 193.07513428, NA, 192.800857546667, NA, 192.0825195325, 
    NA, 191.0687522875, NA, 353.19342041, NA, 353.19342041, NA, 
    344.16003418, NA, 344.16003418, NA, 344.16003418, NA, 344.16003418, 
    NA)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -60L), groups = structure(list(ID.x = c(1, 
2, 3, 4, 5, 6, 9383, 9384, 188473, 188474), .rows = structure(list(
    1:8, 9:16, 17:24, 25:32, 33:40, 41:48, 49:50, 51:52, 53:56, 
    57:60), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", 
"list"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-10L), .drop = TRUE))
df_sl %>% group_by(ID.x) %>% mutate(ma_Z = zoo::na.approx(ma_Z, DIST.y, na.rm = FALSE, rule = 2)) 

And the error that I get :

Error in `mutate()`:
ℹ In argument: `ma_Z = ifelse(...)`.
ℹ In group 9: `ID.x = 188473`.
Caused by error in `approx()`:
! need at least two non-NA values to interpolate

I still need the groups with NA values for the linear regression (it works even if there is only one non-NA values). Also the full dataset is composed of millions of observations. I cannot allow myself to check all of it.

I had the idea to use a ifelse() function to bypass the interpolation if the amount of non-NA is lower than 2. However the error is the same. If I rise the threshold for the condition (>2), there is no errors but it gives an unique value for all observations of each group (No interpolation then)...

df_sl %>% 
  group_by(ID.x) %>%
  mutate(ma_Z = ifelse(
    length(na.omit(ma_Z)) > 1, 
    zoo::na.approx(ma_Z, DIST.y, na.rm = FALSE, rule = 2),
    ma_Z)
  )

How could I fix the error for the whole dataset and obtain a proper interpolation in the ma_Z column?


Solution

  • Is this what you want?

    > df_sl |> 
    +   transform(ma_Z=ave(
    +     ma_Z, ID.x, FUN=\(x) {
    +       if (all(is.na(x))) {
    +         rep_len(NA_real_, length(x))
    +       }
    +       else if (len <- length(x) < 3L) {
    +         rep_len(na.omit(x), len)
    +       } 
    +       else {
    +         approx(x, xout=seq_along(x), rule=2)$y 
    +       }
    +     })
    +   )
         ID.x   DIST.x   ID.y     LimAm    LimAv   DIST.y        Z     ma_Z
    1       1  0.00000      1 -375.0000 375.0000  0.00000 193.0751 193.0751
    2       1  0.00000      2 -363.4843 386.5157 11.51568 193.1545 192.9380
    3       1  0.00000      3 -353.1766 396.8234 21.82344 192.1729 192.8009
    4       1  0.00000      4 -353.0116 396.9884 21.98836 192.1729 192.4417
    5       1  0.00000      5 -342.0384 407.9616 32.96157 190.8297 192.0825
    6       1  0.00000      6 -331.0652 418.9348 43.93478 190.6362 191.5756
    7       1  0.00000      7 -330.9915 419.0085 44.00847 190.6362 191.0688
    8       1  0.00000      8 -320.0183 429.9817 54.98168 189.4504 191.0688
    9       2 11.51568      1 -375.0000 375.0000  0.00000 193.0751 193.0751
    10      2 11.51568      2 -363.4843 386.5157 11.51568 193.1545 192.9380
    11      2 11.51568      3 -353.1766 396.8234 21.82344 192.1729 192.8009
    12      2 11.51568      4 -353.0116 396.9884 21.98836 192.1729 192.4417
    13      2 11.51568      5 -342.0384 407.9616 32.96157 190.8297 192.0825
    14      2 11.51568      6 -331.0652 418.9348 43.93478 190.6362 191.5756
    15      2 11.51568      7 -330.9915 419.0085 44.00847 190.6362 191.0688
    16      2 11.51568      8 -320.0183 429.9817 54.98168 189.4504 191.0688
    17      3 21.82344      1 -375.0000 375.0000  0.00000 193.0751 193.0751
    18      3 21.82344      2 -363.4843 386.5157 11.51568 193.1545 192.9380
    19      3 21.82344      3 -353.1766 396.8234 21.82344 192.1729 192.8009
    20      3 21.82344      4 -353.0116 396.9884 21.98836 192.1729 192.4417
    21      3 21.82344      5 -342.0384 407.9616 32.96157 190.8297 192.0825
    22      3 21.82344      6 -331.0652 418.9348 43.93478 190.6362 191.5756
    23      3 21.82344      7 -330.9915 419.0085 44.00847 190.6362 191.0688
    24      3 21.82344      8 -320.0183 429.9817 54.98168 189.4504 191.0688
    25      4 21.98836      1 -375.0000 375.0000  0.00000 193.0751 193.0751
    26      4 21.98836      2 -363.4843 386.5157 11.51568 193.1545 192.9380
    27      4 21.98836      3 -353.1766 396.8234 21.82344 192.1729 192.8009
    28      4 21.98836      4 -353.0116 396.9884 21.98836 192.1729 192.4417
    29      4 21.98836      5 -342.0384 407.9616 32.96157 190.8297 192.0825
    30      4 21.98836      6 -331.0652 418.9348 43.93478 190.6362 191.5756
    31      4 21.98836      7 -330.9915 419.0085 44.00847 190.6362 191.0688
    32      4 21.98836      8 -320.0183 429.9817 54.98168 189.4504 191.0688
    33      5 32.96157      1 -375.0000 375.0000  0.00000 193.0751 193.0751
    34      5 32.96157      2 -363.4843 386.5157 11.51568 193.1545 192.9380
    35      5 32.96157      3 -353.1766 396.8234 21.82344 192.1729 192.8009
    36      5 32.96157      4 -353.0116 396.9884 21.98836 192.1729 192.4417
    37      5 32.96157      5 -342.0384 407.9616 32.96157 190.8297 192.0825
    38      5 32.96157      6 -331.0652 418.9348 43.93478 190.6362 191.5756
    39      5 32.96157      7 -330.9915 419.0085 44.00847 190.6362 191.0688
    40      5 32.96157      8 -320.0183 429.9817 54.98168 189.4504 191.0688
    41      6 43.93478      1 -375.0000 375.0000  0.00000 193.0751 193.0751
    42      6 43.93478      2 -363.4843 386.5157 11.51568 193.1545 192.9380
    43      6 43.93478      3 -353.1766 396.8234 21.82344 192.1729 192.8009
    44      6 43.93478      4 -353.0116 396.9884 21.98836 192.1729 192.4417
    45      6 43.93478      5 -342.0384 407.9616 32.96157 190.8297 192.0825
    46      6 43.93478      6 -331.0652 418.9348 43.93478 190.6362 191.5756
    47      6 43.93478      7 -330.9915 419.0085 44.00847 190.6362 191.0688
    48      6 43.93478      8 -320.0183 429.9817 54.98168 189.4504 191.0688
    49   9383  0.00000   9383 -375.0000 375.0000  0.00000 353.1934 353.1934
    50   9383  0.00000   9384 -361.1082 388.8918 13.89185 353.0284 353.1934
    51   9384 13.89185   9383 -375.0000 375.0000  0.00000 353.1934 353.1934
    52   9384 13.89185   9384 -361.1082 388.8918 13.89185 353.0284 353.1934
    53 188473  0.00000 188473 -375.0000 375.0000  0.00000 344.1600 344.1600
    54 188473  0.00000 188474 -375.0000 375.0000  0.00000 344.1600 344.1600
    55 188473  0.00000 188475 -375.0000 375.0000  0.00000 344.1600 344.1600
    56 188473  0.00000 188476 -362.1938 387.8062 12.80625 344.1892 344.1600
    57 188474  0.00000 188473 -375.0000 375.0000  0.00000 344.1600 344.1600
    58 188474  0.00000 188474 -375.0000 375.0000  0.00000 344.1600 344.1600
    59 188474  0.00000 188475 -375.0000 375.0000  0.00000 344.1600 344.1600
    60 188474  0.00000 188476 -362.1938 387.8062 12.80625 344.1892 344.1600