rlagcumsumlead

Extract rows with consecutive zero values if they are preceded by significant values in R


I'm working with a dataset in R called "data," originating from data collection on a Fronius inverter. This dataset contains one record per minute and a column named "pac_w," which represents the wattage of generated energy. The inverter has a protection system that interrupts energy generation in case of overvoltage. When this happens, the "pac_w" column is recorded as zero for four consecutive minutes (remembering that each row represents one minute), and it takes an additional two minutes to stabilize energy generation. These interruptions have occurred frequently in recent months, significantly impacting energy generation.

Example below with real data.

EDIT

Now with more rows.

pac_w <- c(3336,3294,0,0,0,0,742,1620,2530,3438,2626,3704,2321,3088,1672,2722,
           1953,0,0,0,0,836,1746,2654,3566,0,0,0,0,995,1908,2800)

day_energy_wh <- c(2479,2536,2555,2555,2555,2555,2560,2580,2615,2665,2717,2766,
                   2811,2868,2903,2944,2966,2979,2979,2979,2979,2986,3008,3045,
                   3097,3097,3097,3097,3097,3106,3131,3171)

date_time <- c("2023-12-23,08:13:00","2023-12-23,08:14:00","2023-12-23,08:15:00",
               "2023-12-23,08:16:00","2023-12-23,08:17:00","2023-12-23,08:18:00",
               "2023-12-23,08:19:00","2023-12-23,08:20:00","2023-12-23,08:21:00",
               "2023-12-23,08:22:00","2023-12-23,08:23:00","2023-12-23,08:24:00",
               "2023-12-23,08:25:00","2023-12-23,08:26:00","2023-12-23,08:27:00",
               "2023-12-23,08:28:00","2023-12-23,08:29:00","2023-12-23,08:30:00",
               "2023-12-23,08:31:00","2023-12-23,08:32:00","2023-12-23,08:33:00",
               "2023-12-23,08:34:00","2023-12-23,08:35:00","2023-12-23,08:36:00",
               "2023-12-23,08:37:00","2023-12-23,08:38:00","2023-12-23,08:39:00",
               "2023-12-23,08:40:00","2023-12-23,08:41:00","2023-12-23,08:42:00",
               "2023-12-23,08:43:00","2023-12-23,08:44:00")

data <- data.frame(pac_w,day_energy_wh,date_time)

My goal is to estimate how many watt-hours the inverter was unable to generate due to this overvoltage protection.

The day_energy_wh column shows the accumulated energy for the day up to the time in the date_time column.

I want to estimate the energy not generated by calculating the average between the value immediately before the failure (in case 3294) and the value after stabilization (in case 2530)

(3294 + 2530) / 2 = 2912

In the example data, the estimate of how many watt-hours the inverter stopped generating is 252.

round(sum(2912 - pac_w[3:8])/60) = 252

At the beginning and end of the day it is common to have low values and even values equal to zero. So I only want to estimate the energy not generated when the value of pac_w immediately before the four values equal to zero is equal to or greater than 500.

EDIT

r2evans, your first solution presents correct values, but is not immune to variation in the number of consecutive zeros.

The second solution is not affected by the variation of consecutive zeros, but something means that only the first calculation relating to the occurrences of consecutive zeros has the correct value.

r <- rle(data$pac_w == 0)

four0 <- setdiff(which(r$values), c(1L, length(r$values)))

four0 <- four0[r$lengths[four0 + 1] >= 3]

lapply(four0, function(f0) {
  indprev <- sum(r$lengths[1:(f0-1)])
  indtween <- (f0-1):sum(r$lengths[1:f0])+2
  indnext <- max(indtween)+1
  val <- sum(
    mean(data$pac_w[ c(indprev, indnext) ]) - data$pac_w[indtween]
  ) / 60
  cbind(data[indprev+1,], data.frame(lost = val))
}) |>
  do.call(rbind, args = _)

#   pac_w day_energy_wh           date_time     lost
# 3      0          2555 2023-12-23,08:15:00 251.8333 # correct
# 18     0          2979 2023-12-23,08:30:00 246.1417 # incorrect
# 26     0          3097 2023-12-23,08:38:00 690.9000 # incorrect


data |> 
  mutate(
    starts = cumsum(zoo::rollapply(pac_w == 0, 4, align="left", partial=TRUE, FUN=all)),
    prev_pac_w = lag(pac_w)
  ) |>
  summarize(
    .by = starts,
    date_time = first(date_time),
    lost = if (first(pac_w) == 0) {
      sum(mean(c(first(prev_pac_w), pac_w[which(pac_w > 0)[1]+2])) -
            pac_w[1:(which(pac_w > 0)[1]+1)]) / 60
    } else NA
  )

# starts           date_time     lost
# 1      0 2023-12-23,08:13:00       NA
# 2      1 2023-12-23,08:15:00 251.8333 # correct
# 3      2 2023-12-23,08:30:00 187.3167 # correct
# 4      3 2023-12-23,08:38:00 269.9167 # correct


Solution

  • Edit: perhaps a run-length encoding approach is best. Now using the updated data with three "four 0s" blocks.

    r <- rle(data$pac_w == 0)
    # ignore first or last, we cannot recover when those are the case
    four0 <- setdiff(which(r$values), c(1L, length(r$values)))
    # ignore those where we don't have sufficient data after the episode
    four0 <- four0[r$lengths[four0 + 1] >= 3]
    sapply(four0, function(f0) {
      indprev <- sum(r$lengths[1:(f0-1)])
      indtween <- (sum(r$lengths[1:(f0-1)])+1):(sum(r$lengths[1:f0])+2)
      indnext <- max(indtween)+1
      sum(
        mean(data$pac_w[ c(indprev, indnext) ]) - data$pac_w[indtween]
      ) / 60
    })
    # [1] 251.8333 187.3167 269.9167
    

    If you need each value identified with a timestamp, then perhaps

    lapply(four0, function(f0) {
      indprev <- sum(r$lengths[1:(f0-1)])
      indtween <- (sum(r$lengths[1:(f0-1)])+1):(sum(r$lengths[1:f0])+2)
      indnext <- max(indtween)+1
      val <- sum(
        mean(data$pac_w[ c(indprev, indnext) ]) - data$pac_w[indtween]
      ) / 60
      cbind(data[indprev+1,], data.frame(lost = val))
    }) |>
      do.call(rbind, args = _)
    #    pac_w day_energy_wh           date_time     lost
    # 3      0          2555 2023-12-23,08:15:00 251.8333
    # 18     0          2979 2023-12-23,08:30:00 187.3167
    # 26     0          3097 2023-12-23,08:38:00 269.9167
    

    Previous answer that requires four in a row:

    library(dplyr)
    data |>
      mutate(
        starts = cumsum(zoo::rollapply(pac_w == 0, 4, align="left", partial=TRUE, FUN=all)),
        prev_pac_w = lag(pac_w)
      ) |>
      summarize(
        .by = starts,
        date_time = first(date_time),
        lost = if (first(pac_w) == 0) {
          sum(mean(c(first(prev_pac_w), pac_w[which(pac_w > 0)[1]+2])) -
                pac_w[1:(which(pac_w > 0)[1]+1)]) / 60
          } else NA
      )
    #   starts           date_time     lost
    # 1      0 2023-12-23 08:13:00       NA
    # 2      1 2023-12-23 08:15:00 251.8333
    

    The NA row is not a time-lost, so you can safely |> filter(!is.na(lost)).

    This is doing a rolling window, 4-wide, to find where a sequence of 4 zeroes starts; the cumsum then groups all rows based on the start of each of these sequence. From there, per-group (.by=starts) it internally selects the third non-zero pac_w and does your formula. (There is likely a way to clean that up, it looks rather "busy" as it stands.)

    The use of .by= requires dplyr_1.1.0 or newer; if you have an older version, change from mutate(.by=c(..), stuff) to group_by(..) |> mutate(stuff) |> ungroup().