In a longitudinal dataset, how to create a status variable 0/1 depending on a second, diagnostic variable that varies with time

I work on sexually transmitted diseases, such as gonorrhoea, and their possible consequences, such as ectopic pregnancy in women. My dataset (df_gono) contains longitudinal epidemiological data broken down by year category, age category, sex and HIV status. It also has a variable, n_gono, that counts (as an integer from 0 to n) the number of gonorrhoea diagnoses per age_cat, year_cat and HIV status. I would like to create a variable, status_gono, that records the "gonorrhoea status" of each patient: it should take the value 1 as soon as the patient has been diagnosed at least once in a given age_cat and year_cat. In other words, for each patient, as soon as n_gono is greater than 0, status_gono takes the value 1 for that row and for all following rows of that patient. The result should look like df_gono2.

# Example input data: 21 rows for five patients (A-E), all female.
# Values are laid out one patient per line (A: 6 rows, B: 3, C: 5, D: 4, E: 3).
# n_gono is the number of gonorrhoea diagnoses recorded on each row.
df_gono <- data.frame(
  patient = rep(c("A", "B", "C", "D", "E"), times = c(6, 3, 5, 4, 3)),
  sex = rep("Female", times = 21),
  age_current_cat = c(
    5, 5, 6, 6, 6, 7,
    7, 8, 8,
    5, 6, 6, 7, 7,
    3, 3, 4, 4,
    6, 7, 7
  ),
  calyear_current_cat = c(
    2, 2, 2, 3, 4, 4,
    2, 2, 3,
    2, 2, 3, 3, 4,
    2, 3, 3, 4,
    4, 4, 4
  ),
  age_cat = c(
    "30-34", "30-34", "35-39", "35-39", "35-39", "40-44",
    "40-44", "45-49", "45-49",
    "30-34", "35-39", "35-39", "40-44", "40-44",
    "20-24", "20-24", "25-29", "25-29",
    "35-39", "40-44", "40-44"
  ),
  year_cat = c(
    "2011-2013", "2011-2013", "2011-2013", "2014-2017", "2018-2020", "2018-2020",
    "2011-2013", "2011-2013", "2014-2017",
    "2011-2013", "2011-2013", "2014-2017", "2014-2017", "2018-2020",
    "2011-2013", "2014-2017", "2014-2017", "2018-2020",
    "2018-2020", "2018-2020", "2018-2020"
  ),
  hiv = c(
    0, 1, 1, 1, 1, 1,
    0, 0, 0,
    0, 0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 1
  ),
  # presumably person-years of follow-up per row -- confirm with the data owner
  pat_year = c(
    0.05475702, 2.52635181, 0.41957563, 3.00068446, 1.57973990, 1.91649555,
    0.08555784, 2.91512663, 2.08076660,
    0.08418891, 2.91649555, 2.08350445, 0.91718001, 3.49623546,
    3.00068446, 0.83299110, 2.16769336, 0.99657769,
    0.25188227, 0.18343600, 2.22861054
  ),
  n_gono = c(
    0, 1, 0, 2, 0, 0,
    0, 4, 4,
    0, 0, 0, 0, 0,
    7, 1, 1, 0,
    0, 2, 7
  )
)

# Expected result: the same 21 rows plus status_gono, which is 1 from a
# patient's first row with n_gono > 0 onwards and 0 before that.
# Values are laid out one patient per line (A: 6 rows, B: 3, C: 5, D: 4, E: 3).
df_gono2 <- data.frame(
  patient = rep(c("A", "B", "C", "D", "E"), times = c(6, 3, 5, 4, 3)),
  sex = rep("Female", times = 21),
  age_current_cat = c(
    5, 5, 6, 6, 6, 7,
    7, 8, 8,
    5, 6, 6, 7, 7,
    3, 3, 4, 4,
    6, 7, 7
  ),
  calyear_current_cat = c(
    2, 2, 2, 3, 4, 4,
    2, 2, 3,
    2, 2, 3, 3, 4,
    2, 3, 3, 4,
    4, 4, 4
  ),
  age_cat = c(
    "30-34", "30-34", "35-39", "35-39", "35-39", "40-44",
    "40-44", "45-49", "45-49",
    "30-34", "35-39", "35-39", "40-44", "40-44",
    "20-24", "20-24", "25-29", "25-29",
    "35-39", "40-44", "40-44"
  ),
  year_cat = c(
    "2011-2013", "2011-2013", "2011-2013", "2014-2017", "2018-2020", "2018-2020",
    "2011-2013", "2011-2013", "2014-2017",
    "2011-2013", "2011-2013", "2014-2017", "2014-2017", "2018-2020",
    "2011-2013", "2014-2017", "2014-2017", "2018-2020",
    "2018-2020", "2018-2020", "2018-2020"
  ),
  hiv = c(
    0, 1, 1, 1, 1, 1,
    0, 0, 0,
    0, 0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 1
  ),
  pat_year = c(
    0.05475702, 2.52635181, 0.41957563, 3.00068446, 1.57973990, 1.91649555,
    0.08555784, 2.91512663, 2.08076660,
    0.08418891, 2.91649555, 2.08350445, 0.91718001, 3.49623546,
    3.00068446, 0.83299110, 2.16769336, 0.99657769,
    0.25188227, 0.18343600, 2.22861054
  ),
  n_gono = c(
    0, 1, 0, 2, 0, 0,
    0, 4, 4,
    0, 0, 0, 0, 0,
    7, 1, 1, 0,
    0, 2, 7
  ),
  status_gono = c(
    0, 1, 1, 1, 1, 1,
    0, 1, 1,
    0, 0, 0, 0, 0,
    1, 1, 1, 1,
    0, 1, 1
  )
)

Solution

Try this solution; you'll need to have the dplyr and tidyr packages installed:

library(dplyr)
library(tidyr)

# Build status_gono2 per patient: mark rows with at least one diagnosis as 1
# and leave every other row missing, carry the 1 forward within the patient
# with fill(), then turn the rows that are still missing (those before any
# diagnosis) into 0. Only the two status columns are kept for comparison.
df_gono2 |>
  group_by(patient) |>
  mutate(status_gono2 = if_else(n_gono >= 1, 1, NA_real_)) |>
  fill(status_gono2, .direction = "down") |>
  mutate(status_gono2 = coalesce(status_gono2, 0)) |>
  select(status_gono, status_gono2)
#> Adding missing grouping variables: `patient`
#> # A tibble: 21 × 3
#> # Groups:   patient [5]
#>    patient status_gono status_gono2
#>    <chr>         <dbl>        <dbl>
#>  1 A                 0            0
#>  2 A                 1            1
#>  3 A                 1            1
#>  4 A                 1            1
#>  5 A                 1            1
#>  6 A                 1            1
#>  7 B                 0            0
#>  8 B                 1            1
#>  9 B                 1            1
#> 10 C                 0            0
#> 11 C                 0            0
#> 12 C                 0            0
#> 13 C                 0            0
#> 14 C                 0            0
#> 15 D                 1            1
#> 16 D                 1            1
#> 17 D                 1            1
#> 18 D                 1            1
#> 19 E                 0            0
#> 20 E                 1            1
#> 21 E                 1            1

Data given:

# Data used by the solution: 21 rows for five patients (A-E), including the
# expected status_gono column to compare the computed status against.
# Values are laid out one patient per line (A: 6 rows, B: 3, C: 5, D: 4, E: 3).
df_gono2 <- data.frame(
  patient = rep(c("A", "B", "C", "D", "E"), times = c(6, 3, 5, 4, 3)),
  sex = rep("Female", times = 21),
  age_current_cat = c(
    5, 5, 6, 6, 6, 7,
    7, 8, 8,
    5, 6, 6, 7, 7,
    3, 3, 4, 4,
    6, 7, 7
  ),
  calyear_current_cat = c(
    2, 2, 2, 3, 4, 4,
    2, 2, 3,
    2, 2, 3, 3, 4,
    2, 3, 3, 4,
    4, 4, 4
  ),
  age_cat = c(
    "30-34", "30-34", "35-39", "35-39", "35-39", "40-44",
    "40-44", "45-49", "45-49",
    "30-34", "35-39", "35-39", "40-44", "40-44",
    "20-24", "20-24", "25-29", "25-29",
    "35-39", "40-44", "40-44"
  ),
  year_cat = c(
    "2011-2013", "2011-2013", "2011-2013", "2014-2017", "2018-2020", "2018-2020",
    "2011-2013", "2011-2013", "2014-2017",
    "2011-2013", "2011-2013", "2014-2017", "2014-2017", "2018-2020",
    "2011-2013", "2014-2017", "2014-2017", "2018-2020",
    "2018-2020", "2018-2020", "2018-2020"
  ),
  hiv = c(
    0, 1, 1, 1, 1, 1,
    0, 0, 0,
    0, 0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 1
  ),
  pat_year = c(
    0.05475702, 2.52635181, 0.41957563, 3.00068446, 1.57973990, 1.91649555,
    0.08555784, 2.91512663, 2.08076660,
    0.08418891, 2.91649555, 2.08350445, 0.91718001, 3.49623546,
    3.00068446, 0.83299110, 2.16769336, 0.99657769,
    0.25188227, 0.18343600, 2.22861054
  ),
  n_gono = c(
    0, 1, 0, 2, 0, 0,
    0, 4, 4,
    0, 0, 0, 0, 0,
    7, 1, 1, 0,
    0, 2, 7
  ),
  status_gono = c(
    0, 1, 1, 1, 1, 1,
    0, 1, 1,
    0, 0, 0, 0, 0,
    1, 1, 1, 1,
    0, 1, 1
  )
)