I have a dataframe including all observation of distances (column DIST.y) and altitude (Z) and corrected altitude by moving average (ma_Z) encompassed within an interval fixed by the column LimAm and LimAv for a given point ID.x for a given distance Dist.x. My purpose do a regression of the DIST.y and the ma_Z for each group of ID.x. Here is the code I use:
df_sl %>%
do({
mod = lm(ma_Z ~ DIST.y, data = .)
data.frame(int = coef(mod)[1], slope = coef(mod)[2])
})
This code works properly.
However, I would like to fill the NA in the column ma_Z by using an interpolation with the funtion zoo::na.approx()
before doing the regression.
Unfortunately, for some of the groups ID.x in the dataframe I have a error message saying there is not enough non-NA values for the code to works properly (in this example, the groups ID.x = "188473" and "188473").
I do not understand why I have this error message for the ID.x "188473" and "188474" having two non-NA values. While ID.x "9383" and "9384" does not trigger the error message but only have one non-NA value each.
df_sl <- structure(list(ID.x = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5,
5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 9383, 9383, 9384, 9384,
188473, 188473, 188473, 188473, 188474, 188474, 188474, 188474
), DIST.x = c(0, 0, 0, 0, 0, 0, 0, 0, 11.515675154, 11.515675154,
11.515675154, 11.515675154, 11.515675154, 11.515675154, 11.515675154,
11.515675154, 21.823439218, 21.823439218, 21.823439218, 21.823439218,
21.823439218, 21.823439218, 21.823439218, 21.823439218, 21.988363443,
21.988363443, 21.988363443, 21.988363443, 21.988363443, 21.988363443,
21.988363443, 21.988363443, 32.961571068, 32.961571068, 32.961571068,
32.961571068, 32.961571068, 32.961571068, 32.961571068, 32.961571068,
43.934778692, 43.934778692, 43.934778692, 43.934778692, 43.934778692,
43.934778692, 43.934778692, 43.934778692, 0, 0, 13.891845289,
13.891845289, 0, 0, 0, 0, 0, 0, 0, 0), ID.y = c(1, 2, 3, 4, 5,
6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2,
3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7,
8, 9383, 9384, 9383, 9384, 188473, 188474, 188475, 188476, 188473,
188474, 188475, 188476), LimAm = c(-375, -363.484324846, -353.176560782,
-353.011636557, -342.038428932, -331.065221308, -330.991532192,
-320.018324568, -375, -363.484324846, -353.176560782, -353.011636557,
-342.038428932, -331.065221308, -330.991532192, -320.018324568,
-375, -363.484324846, -353.176560782, -353.011636557, -342.038428932,
-331.065221308, -330.991532192, -320.018324568, -375, -363.484324846,
-353.176560782, -353.011636557, -342.038428932, -331.065221308,
-330.991532192, -320.018324568, -375, -363.484324846, -353.176560782,
-353.011636557, -342.038428932, -331.065221308, -330.991532192,
-320.018324568, -375, -363.484324846, -353.176560782, -353.011636557,
-342.038428932, -331.065221308, -330.991532192, -320.018324568,
-375, -361.108154711, -375, -361.108154711, -375, -375, -375,
-362.193750234, -375, -375, -375, -362.193750234), LimAv = c(375,
386.515675154, 396.823439218, 396.988363443, 407.961571068, 418.934778692,
419.008467808, 429.981675432, 375, 386.515675154, 396.823439218,
396.988363443, 407.961571068, 418.934778692, 419.008467808, 429.981675432,
375, 386.515675154, 396.823439218, 396.988363443, 407.961571068,
418.934778692, 419.008467808, 429.981675432, 375, 386.515675154,
396.823439218, 396.988363443, 407.961571068, 418.934778692, 419.008467808,
429.981675432, 375, 386.515675154, 396.823439218, 396.988363443,
407.961571068, 418.934778692, 419.008467808, 429.981675432, 375,
386.515675154, 396.823439218, 396.988363443, 407.961571068, 418.934778692,
419.008467808, 429.981675432, 375, 388.891845289, 375, 388.891845289,
375, 375, 375, 387.806249766, 375, 375, 375, 387.806249766),
DIST.y = c(0, 11.515675154, 21.823439218, 21.988363443, 32.961571068,
43.934778692, 44.008467808, 54.981675432, 0, 11.515675154,
21.823439218, 21.988363443, 32.961571068, 43.934778692, 44.008467808,
54.981675432, 0, 11.515675154, 21.823439218, 21.988363443,
32.961571068, 43.934778692, 44.008467808, 54.981675432, 0,
11.515675154, 21.823439218, 21.988363443, 32.961571068, 43.934778692,
44.008467808, 54.981675432, 0, 11.515675154, 21.823439218,
21.988363443, 32.961571068, 43.934778692, 44.008467808, 54.981675432,
0, 11.515675154, 21.823439218, 21.988363443, 32.961571068,
43.934778692, 44.008467808, 54.981675432, 0, 13.891845289,
0, 13.891845289, 0, 0, 0, 12.806249766, 0, 0, 0, 12.806249766
), Z = c(193.07513428, 193.15454102, 192.17289734, 192.17289734,
190.82974243, 190.63618469, 190.63618469, 189.45043945, 193.07513428,
193.15454102, 192.17289734, 192.17289734, 190.82974243, 190.63618469,
190.63618469, 189.45043945, 193.07513428, 193.15454102, 192.17289734,
192.17289734, 190.82974243, 190.63618469, 190.63618469, 189.45043945,
193.07513428, 193.15454102, 192.17289734, 192.17289734, 190.82974243,
190.63618469, 190.63618469, 189.45043945, 193.07513428, 193.15454102,
192.17289734, 192.17289734, 190.82974243, 190.63618469, 190.63618469,
189.45043945, 193.07513428, 193.15454102, 192.17289734, 192.17289734,
190.82974243, 190.63618469, 190.63618469, 189.45043945, 353.19342041,
353.02838135, 353.19342041, 353.02838135, 344.16003418, 344.16003418,
344.16003418, 344.1892395, 344.16003418, 344.16003418, 344.16003418,
344.1892395), ma_Z = c(193.07513428, NA, 192.800857546667,
NA, 192.0825195325, NA, 191.0687522875, NA, 193.07513428,
NA, 192.800857546667, NA, 192.0825195325, NA, 191.0687522875,
NA, 193.07513428, NA, 192.800857546667, NA, 192.0825195325,
NA, 191.0687522875, NA, 193.07513428, NA, 192.800857546667,
NA, 192.0825195325, NA, 191.0687522875, NA, 193.07513428,
NA, 192.800857546667, NA, 192.0825195325, NA, 191.0687522875,
NA, 193.07513428, NA, 192.800857546667, NA, 192.0825195325,
NA, 191.0687522875, NA, 353.19342041, NA, 353.19342041, NA,
344.16003418, NA, 344.16003418, NA, 344.16003418, NA, 344.16003418,
NA)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -60L), groups = structure(list(ID.x = c(1,
2, 3, 4, 5, 6, 9383, 9384, 188473, 188474), .rows = structure(list(
1:8, 9:16, 17:24, 25:32, 33:40, 41:48, 49:50, 51:52, 53:56,
57:60), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr",
"list"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-10L), .drop = TRUE))
df_sl %>% group_by(ID.x) %>% mutate(ma_Z = zoo::na.approx(ma_Z, DIST.y, na.rm = FALSE, rule = 2))
And the error that I get :
Error in `mutate()`:
ℹ In argument: `ma_Z = ifelse(...)`.
ℹ In group 9: `ID.x = 188473`.
Caused by error in `approx()`:
! need at least two non-NA values to interpolate
I still need the groups with NA values for the linear regression (it works even if there is only one non-NA values). Also the full dataset is composed of millions of observations. I cannot allow myself to check all of it.
I had the idea to use a ifelse()
function to bypass the interpolation if the amount of non-NA is lower than 2. However the error is the same. If I rise the threshold for the condition (>2), there is no errors but it gives an unique value for all observations of each group (No interpolation then)...
df_sl %>%
group_by(ID.x) %>%
mutate(ma_Z = ifelse(
length(na.omit(ma_Z)) > 1,
zoo::na.approx(ma_Z, DIST.y, na.rm = FALSE, rule = 2),
ma_Z)
)
How could I fix the error for the whole dataset and obtain a proper interpolation in the ma_Z column?
Is this what you want?
> df_sl |>
+ transform(ma_Z=ave(
+ ma_Z, ID.x, FUN=\(x) {
+ if (all(is.na(x))) {
+ rep_len(NA_real_, length(x))
+ }
+ else if (len <- length(x) < 3L) {
+ rep_len(na.omit(x), len)
+ }
+ else {
+ approx(x, xout=seq_along(x), rule=2)$y
+ }
+ })
+ )
ID.x DIST.x ID.y LimAm LimAv DIST.y Z ma_Z
1 1 0.00000 1 -375.0000 375.0000 0.00000 193.0751 193.0751
2 1 0.00000 2 -363.4843 386.5157 11.51568 193.1545 192.9380
3 1 0.00000 3 -353.1766 396.8234 21.82344 192.1729 192.8009
4 1 0.00000 4 -353.0116 396.9884 21.98836 192.1729 192.4417
5 1 0.00000 5 -342.0384 407.9616 32.96157 190.8297 192.0825
6 1 0.00000 6 -331.0652 418.9348 43.93478 190.6362 191.5756
7 1 0.00000 7 -330.9915 419.0085 44.00847 190.6362 191.0688
8 1 0.00000 8 -320.0183 429.9817 54.98168 189.4504 191.0688
9 2 11.51568 1 -375.0000 375.0000 0.00000 193.0751 193.0751
10 2 11.51568 2 -363.4843 386.5157 11.51568 193.1545 192.9380
11 2 11.51568 3 -353.1766 396.8234 21.82344 192.1729 192.8009
12 2 11.51568 4 -353.0116 396.9884 21.98836 192.1729 192.4417
13 2 11.51568 5 -342.0384 407.9616 32.96157 190.8297 192.0825
14 2 11.51568 6 -331.0652 418.9348 43.93478 190.6362 191.5756
15 2 11.51568 7 -330.9915 419.0085 44.00847 190.6362 191.0688
16 2 11.51568 8 -320.0183 429.9817 54.98168 189.4504 191.0688
17 3 21.82344 1 -375.0000 375.0000 0.00000 193.0751 193.0751
18 3 21.82344 2 -363.4843 386.5157 11.51568 193.1545 192.9380
19 3 21.82344 3 -353.1766 396.8234 21.82344 192.1729 192.8009
20 3 21.82344 4 -353.0116 396.9884 21.98836 192.1729 192.4417
21 3 21.82344 5 -342.0384 407.9616 32.96157 190.8297 192.0825
22 3 21.82344 6 -331.0652 418.9348 43.93478 190.6362 191.5756
23 3 21.82344 7 -330.9915 419.0085 44.00847 190.6362 191.0688
24 3 21.82344 8 -320.0183 429.9817 54.98168 189.4504 191.0688
25 4 21.98836 1 -375.0000 375.0000 0.00000 193.0751 193.0751
26 4 21.98836 2 -363.4843 386.5157 11.51568 193.1545 192.9380
27 4 21.98836 3 -353.1766 396.8234 21.82344 192.1729 192.8009
28 4 21.98836 4 -353.0116 396.9884 21.98836 192.1729 192.4417
29 4 21.98836 5 -342.0384 407.9616 32.96157 190.8297 192.0825
30 4 21.98836 6 -331.0652 418.9348 43.93478 190.6362 191.5756
31 4 21.98836 7 -330.9915 419.0085 44.00847 190.6362 191.0688
32 4 21.98836 8 -320.0183 429.9817 54.98168 189.4504 191.0688
33 5 32.96157 1 -375.0000 375.0000 0.00000 193.0751 193.0751
34 5 32.96157 2 -363.4843 386.5157 11.51568 193.1545 192.9380
35 5 32.96157 3 -353.1766 396.8234 21.82344 192.1729 192.8009
36 5 32.96157 4 -353.0116 396.9884 21.98836 192.1729 192.4417
37 5 32.96157 5 -342.0384 407.9616 32.96157 190.8297 192.0825
38 5 32.96157 6 -331.0652 418.9348 43.93478 190.6362 191.5756
39 5 32.96157 7 -330.9915 419.0085 44.00847 190.6362 191.0688
40 5 32.96157 8 -320.0183 429.9817 54.98168 189.4504 191.0688
41 6 43.93478 1 -375.0000 375.0000 0.00000 193.0751 193.0751
42 6 43.93478 2 -363.4843 386.5157 11.51568 193.1545 192.9380
43 6 43.93478 3 -353.1766 396.8234 21.82344 192.1729 192.8009
44 6 43.93478 4 -353.0116 396.9884 21.98836 192.1729 192.4417
45 6 43.93478 5 -342.0384 407.9616 32.96157 190.8297 192.0825
46 6 43.93478 6 -331.0652 418.9348 43.93478 190.6362 191.5756
47 6 43.93478 7 -330.9915 419.0085 44.00847 190.6362 191.0688
48 6 43.93478 8 -320.0183 429.9817 54.98168 189.4504 191.0688
49 9383 0.00000 9383 -375.0000 375.0000 0.00000 353.1934 353.1934
50 9383 0.00000 9384 -361.1082 388.8918 13.89185 353.0284 353.1934
51 9384 13.89185 9383 -375.0000 375.0000 0.00000 353.1934 353.1934
52 9384 13.89185 9384 -361.1082 388.8918 13.89185 353.0284 353.1934
53 188473 0.00000 188473 -375.0000 375.0000 0.00000 344.1600 344.1600
54 188473 0.00000 188474 -375.0000 375.0000 0.00000 344.1600 344.1600
55 188473 0.00000 188475 -375.0000 375.0000 0.00000 344.1600 344.1600
56 188473 0.00000 188476 -362.1938 387.8062 12.80625 344.1892 344.1600
57 188474 0.00000 188473 -375.0000 375.0000 0.00000 344.1600 344.1600
58 188474 0.00000 188474 -375.0000 375.0000 0.00000 344.1600 344.1600
59 188474 0.00000 188475 -375.0000 375.0000 0.00000 344.1600 344.1600
60 188474 0.00000 188476 -362.1938 387.8062 12.80625 344.1892 344.1600