rdplyr

Identify first constant value within a period in R tidyverse


Given a dataframe (df) with item_id and price, I would like to obtain the first price that is constant i.e., lasts for a period of at least 90 days for each item (item_id), when available. Am stuck on this.

Each item is available on the market from the season_start until season_end where each row can be loosely thought of as a "season", and multiple rows can make up a period - where we can check if price was constant.

Example: For the item_id = 1, the first non-missing price is 450, and comes from the season_start date "2022-06-15", however that "season" is less than 90 days (season_end - season_start). But the next two rows/seasons after it have similar price and the period covered by these 3 rows is more than 90 days (season_end (row3) - season_start (row1), making the price of 450 the first constant value within a 90 days period for this item which is the desired result and can be added to the dataframe in variable first_constant_price.

One might start like this so that records are properly ordered from the earliest (season_start) per item. Then the rest follows... Shall greatly appreciate tidyverse-styled solutions.

df %>% 
group_by(item_id) %>% 
arrange(season_start, .by_group = TRUE)  

Below is an example dataset.

df <- structure(list(item_id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27,
                                 
                                 27, 27, 27, 27, 32, 32, 32, 32, 32, 32, 32, 32, 158, 158, 158,
                                 
                                 158, 158), season_end = structure(c(19173, 19173, 19265, 19357,
                                                                     
                                                                     19447, 19447, 19538, 19630, 19722, 18992, 18992, 19082, 19173,
                                                                     
                                                                     19265, 19357, 19265, 19265, 19447, 19447, 19538, 19630, 19630,
                                                                     
                                                                     19630, 19538, 19538, 19538, 19630, 19630), class = "Date"), season_start = structure(c(19157,
                                                                                                                                                            
                                                                                                                                                            19158, 19158, 19158, 19398, 19398, 19494, 19592, 19662, 18973,
                                                                                                                                                            
                                                                                                                                                            18974, 19013, 19013, 19013, 19013, 19259, 19263, 19388, 19438,
                                                                                                                                                            
                                                                                                                                                            19534, 19541, 19541, 19599, 19522, 19522, 19529, 19541, 19613
                                                                                                                                                            
                                                                     ), class = "Date"), price = c(NA, 450, 450, 450, NA, 1050, 450,
                                                                                                   
                                                                                                   150, NA, NA, 150, 50, 50, 50, 50, 100, 150, 100, 50, 20, 20,
                                                                                                   
                                                                                                   20, 40, NA, 700, 300, 600, 900)), row.names = c(NA, -28L), class = c("tbl_df",
                                                                                                                                                                        
                                                                                                                                                                        "tbl", "data.frame"))

Here is the desired results

desired_result_df <- structure(list(item_id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 
                                                27, 27, 27, 27, 32, 32, 32, 32, 32, 32, 32, 32, 158, 158, 158, 
                                                158, 158), season_end = structure(c(19173, 19173, 19265, 19357, 
                                                                                    19447, 19447, 19538, 19630, 19722, 18992, 18992, 19082, 19173, 
                                                                                    19265, 19357, 19265, 19265, 19447, 19447, 19538, 19630, 19630, 
                                                                                    19630, 19538, 19538, 19538, 19630, 19630), class = "Date"), season_start = structure(c(19157, 
                                                                                                                                                                           19158, 19158, 19158, 19398, 19398, 19494, 19592, 19662, 18973, 
                                                                                                                                                                           18974, 19013, 19013, 19013, 19013, 19259, 19263, 19388, 19438, 
                                                                                                                                                                           19534, 19541, 19541, 19599, 19522, 19522, 19529, 19541, 19613
                                                                                    ), class = "Date"), price = c(NA, 450, 450, 450, NA, 1050, 450, 
                                                                                                                  150, NA, NA, 150, 50, 50, 50, 50, 100, 150, 100, 50, 20, 20, 
                                                                                                                  20, 40, NA, 700, 300, 600, 900), id = 1:28, first_constant_price = c(NA, 
                                                                                                                                                                                       450, NA, NA, NA, NA, NA, NA, NA, NA, NA, 50, NA, NA, NA, NA, 
                                                                                                                                                                                       NA, NA, NA, 20, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, 
                                                                                                                                                                                                                                                       -28L), class = c("tbl_df", "tbl", "data.frame"))



Solution

  • A tidyverse solution

    library(dplyr)
    
    group_by(df, item_id) |> 
      arrange(season_start, .by_group = TRUE) |> 
      mutate(
        season_length = season_end - season_start,
        price_group = consecutive_id(price)
        ) |> 
      group_by(item_id, price_group, price) |> 
      summarise(season_length = sum(season_length)) |> 
      group_by(item_id) |> 
      filter(!is.na(price), season_length >= 90) |> 
      summarise(price = first(price))
    
    #> # A tibble: 3 × 2
    #> # Groups:   item_id [3]
    #>   item_id price
    #>     <dbl> <dbl>
    #> 1       1   450
    #> 2      27    50
    #> 3      32    20
    

    With explanation:

    # Arrange each item's prices by date
    group_by(df, item_id) |> 
      arrange(season_start, .by_group = TRUE) |> 
    
      mutate(
        ## Calculate length of season
        season_length = season_end - season_start,
       
        ## dplyr::consecutive_id() lets us assign the same
        ## value to rows with consecutive prices
        price_group = consecutive_id(price)
        ) |> 
    
      ## Get total days that each price remains 'fixed'
      group_by(item_id, price_group, price) |> 
      summarise(season_length = sum(season_length)) |> 
    
      ## Drop missing prices, keep only consistent periods > 90
      group_by(item_id) |> 
      filter(!is.na(price), season_length >= 90) |> 
    
      ## Get first price that meets our criteria
      summarise(price = first(price))
    
    #> # A tibble: 3 × 2
    #> # Groups:   item_id [3]
    #>   item_id price
    #>     <dbl> <dbl>
    #> 1       1   450
    #> 2      27    50
    #> 3      32    20
    

    Created on 2024-12-03 with reprex v2.1.1