rclickstream

User paths in R


The following is a sample of the dataset I am working on. I am trying to assess which users create a request on the contact form and are successful. So, the button click that tells me that the user has begun a request is "createrequestButtonClick" and the button click that denotes a successfully sent request is "SendButtonClick".

The problem I have is that the path to "SendButtonClick" is uncertain: it could come 6 or 4 steps after "createrequestButtonClick". Also, a user can create and send (or not send) multiple requests.

Through R code, how can I assess whether a "createrequestButtonClick" precedes a "SendButtonClick" or vice versa? If there isn't a "SendButtonClick" after a "createrequestButtonClick", it means that the user initiated a request, but did not submit it successfully (and this needs to be flagged).

# Sample clickstream data: one row per recorded page view or button click.
# Path_ID numbers the rows 1..n within each session and Path_Length repeats
# that session's row count. The object is bound to `paths`, the name the
# solution code pipes from, so the example runs as written.
paths <- structure(
  list(
    session_id = c(
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    ),
    User_ID = c(
      "123", "123", "123", "123", "123", "123", "123", "123", "123", "123",
      "345", "345", "345", "345", "345", "345", "345", "345", "345", "345",
      "345"
    ),
    Page = c(
      # session 1: one request, created and sent
      "home", "contact", "createrequestButtonClick", "requestform",
      "requestform", "FormValueChange", "FormContactSelection",
      "FormValueChange", "SendButtonClick", "home",
      # session 2: two requests, both created and sent
      "home", "contact", "createrequestButtonClick", "requestform",
      "FormValueChange", "SendButtonClick", "contact",
      "createrequestButtonClick", "requestform", "FormValueChange",
      "SendButtonClick"
    ),
    Path_ID = c(
      1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
      1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L
    ),
    Path_Length = c(
      10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
      11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L
    )
  ),
  row.names = c(NA, -21L),
  class = c("tbl_df", "tbl", "data.frame")
)

Solution

  • You can use cumsum() to create identifiers for all created requests. Then check if the send button was clicked in each request with any().

    library(tidyverse)

    # Flag each session by whether every request started in it was also sent.
    # The running count below relies on the rows already being in click order
    # within each session. Sessions with no create click drop out of the
    # result entirely.
    paths %>%
      group_by(session_id) %>%
      # Each "createrequestButtonClick" opens a new request, so the running
      # count of those clicks serves as a per-session request id.
      mutate(request_id = cumsum(Page == "createrequestButtonClick")) %>%
      # Rows before a session's first create click have id 0 and belong to no
      # request.
      filter(request_id > 0) %>%
      group_by(request_id, .add = TRUE) %>%
      # Keep the session_id grouping explicitly: the next summarise() depends
      # on it, and stating .groups avoids dplyr's regrouping message.
      summarise(
        request_was_succesful = any(Page == "SendButtonClick"),
        .groups = "drop_last"
      ) %>%
      # A session counts as successful only if all of its requests were sent.
      summarise(session_was_succesful = all(request_was_succesful))
    #> # A tibble: 2 × 2
    #>   session_id session_was_succesful
    #>        <dbl> <lgl>                
    #> 1          1 TRUE                 
    #> 2          2 TRUE
    

    A couple of simplified examples:

    # Six toy sessions covering the interesting orderings of create/send.
    # `times` gives the number of actions recorded for sessions 1 through 6.
    sessions <- data.frame(
      session_id = rep(c(1, 2, 3, 4, 5, 6), times = c(2, 3, 4, 1, 3, 2)),
      action = c(
        "create", "send",                      # session 1
        "create", "change", "send",            # session 2
        "create", "send", "create", "send",    # session 3
        "create",                              # session 4
        "create", "create", "send",            # session 5
        "send", "create"                       # session 6
      )
    )
    
    sessions
    #>    session_id action
    #> 1           1 create
    #> 2           1   send
    #> 3           2 create
    #> 4           2 change
    #> 5           2   send
    #> 6           3 create
    #> 7           3   send
    #> 8           3 create
    #> 9           3   send
    #> 10          4 create
    #> 11          5 create
    #> 12          5 create
    #> 13          5   send
    #> 14          6   send
    #> 15          6 create
    

    And the corresponding classifications:

    # Classify each toy session: TRUE only if every "create" is followed by a
    # "send" before the next "create" (or the end of the session). Relies on
    # the rows being in action order within each session.
    sessions %>%
      group_by(session_id) %>%
      # The running count of "create" actions is the per-session request id.
      mutate(request_id = cumsum(action == "create")) %>%
      # Rows before a session's first "create" have id 0 and belong to no
      # request.
      filter(request_id > 0) %>%
      group_by(request_id, .add = TRUE) %>%
      # Keep the session_id grouping explicitly: the next summarise() depends
      # on it, and stating .groups avoids dplyr's regrouping message.
      summarise(
        request_was_succesful = any(action == "send"),
        .groups = "drop_last"
      ) %>%
      summarise(session_was_succesful = all(request_was_succesful))
    #> # A tibble: 6 × 2
    #>   session_id session_was_succesful
    #>        <dbl> <lgl>                
    #> 1          1 TRUE                 
    #> 2          2 TRUE                 
    #> 3          3 TRUE                 
    #> 4          4 FALSE                
    #> 5          5 FALSE                
    #> 6          6 FALSE