I have a set of 10 years (2009-2020), 30-min interval meteorological datasets, but the data has missing values during night (~17:00 to ~08:00 next day) for 2 two years (2015-2017) due to battery failure of the instrument. Variables are: air temperature, humidity, wind, radiations.
I tried to do using fill_by_function(fun = mean)
of padR, but it uses mean of all datasets and a single value.
Below I have put a slice example of my data frame structure:
chhota_1 <- data %>%
slice(89845:89900) %>%
pad() %>%
fill_by_value(na.pad=TRUE)
You could accomplish it via data.table
(among many other methods):
library(data.table)
data = data.table(
"timestep" = rep(1:5, 4),
"year" = rep(2010:2013, each = 5),
"value" = rnorm(20)
)
# create NA year
data[year == 2012, value := NA_real_]
> data
timestep year value
1: 1 2010 0.7383885
2: 2 2010 0.6547628
3: 3 2010 -0.9825121
4: 4 2010 0.2670737
5: 5 2010 0.8688922
6: 1 2011 0.1509628
7: 2 2011 1.2482106
8: 3 2011 -0.9492589
9: 4 2011 1.2311409
10: 5 2011 -0.1162351
11: 1 2012 NA
12: 2 2012 NA
13: 3 2012 NA
14: 4 2012 NA
15: 5 2012 NA
16: 1 2013 -1.0179958
17: 2 2013 0.4368148
18: 3 2013 0.7547140
19: 4 2013 -0.4759922
20: 5 2013 -0.2393624
fill NA
by mean for that timestep (over other years):
data[, value := ifelse(is.na(value), mean(value, na.rm = TRUE), value), by = "timestep"]
> data
timestep year value
1: 1 2010 0.73838849
2: 2 2010 0.65476283
3: 3 2010 -0.98251205
4: 4 2010 0.26707371
5: 5 2010 0.86889218
6: 1 2011 0.15096278
7: 2 2011 1.24821056
8: 3 2011 -0.94925891
9: 4 2011 1.23114088
10: 5 2011 -0.11623511
11: 1 2012 -0.04288152
12: 2 2012 0.77992939
13: 3 2012 -0.39235232
14: 4 2012 0.34074078
15: 5 2012 0.17109823
16: 1 2013 -1.01799582
17: 2 2013 0.43681478
18: 3 2013 0.75471400
19: 4 2013 -0.47599225
20: 5 2013 -0.23936237
EDIT:
From the sample data provided and comments:
> library(lubridate)
> library(data.table)
>
> data = fread("~/Downloads/test_data.csv")
>
> data[, timestamp := as.POSIXct(fast_strptime(timestamp, format = "%m/%d/%Y %H:%M"))]
>
> data[, date := format(timestamp, "%m-%d")]
> data[, year := format(timestamp, "%Y")]
>
> data[is.na(value), ]
timestamp value time date year
1: 2012-06-22 11:00:00 NA 11:00:00 06-22 2012
2: 2015-02-22 18:00:00 NA 18:00:00 02-22 2015
3: 2015-02-22 18:30:00 NA 18:30:00 02-22 2015
4: 2015-02-22 19:00:00 NA 19:00:00 02-22 2015
5: 2015-02-22 19:30:00 NA 19:30:00 02-22 2015
---
16194: 2017-04-07 05:30:00 NA 5:30:00 04-07 2017
16195: 2017-04-07 06:30:00 NA 6:30:00 04-07 2017
16196: 2017-04-07 18:30:00 NA 18:30:00 04-07 2017
16197: 2017-04-07 23:00:00 NA 23:00:00 04-07 2017
16198: 2017-04-08 19:00:00 NA 19:00:00 04-08 2017
>
> data[, value := ifelse(is.na(value), mean(value, na.rm = TRUE), value), by = c("time", "date")]
> data[is.na(value), ]
Empty data.table (0 rows and 5 cols): timestamp,value,time,date,year