rdatetidyversedata-manipulationbinary-data

Mutating detection data into binary


Currently I have a dataframe of bear detections that I want to convert into a binary detection history (14 columns of day1, day2, day3, etc. where:

What I'm trying to do is start a 14 day detection history (e.g. 001010001...) for each plot, starting with "actual_date_out" and ending 14 days after that. The problem is that sometimes the first detection (start_time) is e.g. 3 days after the deployment date and some dates don't have data, so I would have to fill in a 0 for some plots starting at the first deployment date (actual_date_out) and then for every preceding date that doesn't have a "start_time" column value (only dates that have start time row/presence have a 1). Some plots have 1 detection the whole time, others multiple for each day. The idea is each day just gets 0 or 1. Some plots may have multiple start_time values but each plot will only have one actual_date_out (each camera is only deployed once.)

Currently I have code that kind of works except it's putting a 1 for the first column and only 0s for all the preceding instead of searching for a start_time and assigning a 1. Only dates that also have a start_date get a value of 1.

library(tidyverse)
d <- read_csv(file.choose())

#Change actual_date_out to date format
d$actual_date_out <- as.Date(d$actual_date_out, format = "%m/%d/%Y")
d$retrieval_date <- as.Date(d$retrieval_date, format = "%m/%d/%Y")
d$start_time <- as.Date(d$start_time, format = "%m/%d/%Y")

#For each plot, find the earliest date in actual_date_out and then add the next 13 days. For each plot, each of the remaining 13 days should be added with a pres_abs = 0
d1 <- d |>
  group_by(plot) |>
  mutate(start_time = min(actual_date_out)) |>
  complete(actual_date_out = seq.Date(first(start_time), first(start_time) + 20, by = "day")) |>
  mutate(pres_abs = if_else(is.na(pres_abs), 0, pres_abs)) |>
  ungroup()

# Removes duplicate entries by taking max of pres_abs
d2 <- d1 |>
  group_by(plot, actual_date_out) |>
  slice_max(pres_abs, with_ties = FALSE) |>
  ungroup()

# Create news days variable as a sequence
d2 <- d2 |>
  group_by(plot) |>
  arrange(actual_date_out) |>
  mutate(day = paste0("day_", row_number())) |>
  ungroup()

# For each plot, take the actual_lat and actual_long from the first entry and propogate it to all the other entries for that plot
# For each plot, record the very first day in the 14 day period as actual_date_out
d2 <- d2 |>
  group_by(plot) |>
  mutate(actual_lat = first(actual_lat),
         actual_long = first(actual_long),
         first_day = first(actual_date_out)) |>
  ungroup()

# Pivots to wider so each of the 14 days is a binarized variable
d3 <- d2 |>
  select(-c(retrieval_date,start_time, actual_date_out)) |>
  pivot_wider(id_cols = c(plot, actual_lat, actual_long, first_day),
              names_from = day,
              values_from = pres_abs)
> dput(d)
structure(list(plot = c("d22142", "d22142", "d22489", "d22489", 
"d23081", "d23081", "d23081", "d23302", "d23544", "d23544", "d23544", 
"d23544", "d23544", "d23544", "d23569", "d23569", "d23579", "d23647"
), actual_date_out = structure(c(17158, 17158, 17229, 17229, 
17273, 17273, 17273, 17272, 17326, 17326, 17326, 17326, 17326, 
17326, 17303, 17303, 17303, 17309), class = "Date"), retrieval_date = structure(c(17178, 
17178, 17250, 17250, 17293, 17293, 17293, 17291, 17349, 17349, 
17349, 17349, 17349, 17349, 17324, 17324, 17327, 17324), class = "Date"), 
    actual_lat = c(35.5767, 35.5767, 35.5901, 35.5901, 35.2851, 
    35.2851, 35.2851, 35.3086, 35.9439, 35.9439, 35.9439, 35.9439, 
    35.9439, 35.9439, 35.0581, 35.0581, 35.1264, 35.3453), actual_long = c(-82.4956, 
    -82.4956, -82.5901, -82.5901, -83.1089, -83.1089, -83.1089, 
    -82.5258, -82.6275, -82.6275, -82.6275, -82.6275, -82.6275, 
    -82.6275, -83.4274, -83.4274, -83.0983, -82.781), start_time = structure(c(17161, 
    17161, 17248, 17248, 17281, 17283, 17281, 17273, 17336, 17336, 
    17347, 17349, 17336, 17336, 17309, 17315, 17316, 17311), class = "Date"), 
    pres_abs = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1)), row.names = c(NA, -18L), spec = structure(list(
    cols = list(plot = structure(list(), class = c("collector_character", 
    "collector")), actual_date_out = structure(list(), class = c("collector_character", 
    "collector")), retrieval_date = structure(list(), class = c("collector_character", 
    "collector")), actual_lat = structure(list(), class = c("collector_double", 
    "collector")), actual_long = structure(list(), class = c("collector_double", 
    "collector")), start_time = structure(list(), class = c("collector_character", 
    "collector")), pres_abs = structure(list(), class = c("collector_double", 
    "collector"))), default = structure(list(), class = c("collector_guess", 
    "collector")), delim = ","), class = "col_spec"), class = c("spec_tbl_df", 
"tbl_df", "tbl", "data.frame"))

Solution

  • We can simply do a summarize() and find the dates that have data in the 14-day window after actual_date_out.

    library(tidyverse)
    
    d_binary <- d %>%
      summarize(
        actual_date_out = first(actual_date_out),
        actual_lat = first(actual_lat),
        actual_long = first(actual_long),
        detection_history = paste0(as.numeric((actual_date_out + 0:13) 
                                              %in% start_time), collapse = ""),
        .by = plot
      )
    
    #> # A tibble: 8 × 5
    #>   plot   actual_date_out actual_lat actual_long detection_history
    #>   <chr>  <date>               <dbl>       <dbl> <chr>            
    #> 1 d22142 2016-12-23            35.6       -82.5 00010000000000   
    #> 2 d22489 2017-03-04            35.6       -82.6 00000000000000   
    #> 3 d23081 2017-04-17            35.3       -83.1 00000000101000   
    #> 4 d23302 2017-04-16            35.3       -82.5 01000000000000   
    #> 5 d23544 2017-06-09            35.9       -82.6 00000000001000   
    #> 6 d23569 2017-05-17            35.1       -83.4 00000010000010   
    #> 7 d23579 2017-05-17            35.1       -83.1 00000000000001   
    #> 8 d23647 2017-05-23            35.3       -82.8 00100000000000
    

    Here, I have done a simple visualization of the results;

    d_binary %>%
      rowwise() %>%
      mutate(
        days = list(1:14),
        detections = list(as.numeric(strsplit(detection_history, "")[[1]])),
        dates = list(actual_date_out + 0:13)
      ) %>%
      unnest(c(days, detections, dates)) %>%
      ungroup() %>%
      mutate(
        date_label = if_else(
          detections == 1, 
          str_glue("{month(dates)}/{day(dates)}/{year(dates)}"), "")
      ) %>% 
      ggplot(., aes(x = days, y = reorder(plot, desc(plot)), 
                    fill = factor(detections))) +
      geom_tile(color = "white", linewidth = 0.8, width = 0.9, height = 0.8) +
      geom_text(aes(label = date_label), 
                size = 2.5, color = "white", fontface = "bold") +
      scale_fill_manual(
        values = c("0" = "#d73027", "1" = "#1a9850"), 
        name = "Detection Status", 
        labels = c("0" = "Not Detected", "1" = "Detected")
      ) +
      scale_x_continuous(
        breaks = 1:14, 
        expand = c(0, 0)
      ) +
      scale_y_discrete(expand = c(0, 0)) +
      labs(
        title = "Bear Detection History",
        x = "Study Day",
        y = "Plot ID"
      ) +
      theme_minimal() +
      theme(
        axis.text.x = element_text(hjust = 1, size = 9),
        axis.text.y = element_text(size = 9),
        panel.grid = element_blank(),
        panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5),
        plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
        legend.position = "bottom",
        legend.title = element_text(face = "bold")
      )
    

    Created on 2025-08-12 with reprex v2.1.1