I work on sexually transmitted diseases, such as gonorrhoea, and their possible consequences, such as ectopic pregnancy in women. My dataset (df_gono) contains longitudinal epidemiological data broken down by year category, age category, sex and HIV status. It also has a variable, n_gono, that counts (as an integer from 0 to n) the number of gonorrhoea diagnoses per age_cat, year_cat and HIV status. I would like to create a variable, status_gono, that records the "gonorrhoea status" of each patient: it should take the value 1 as soon as the patient has been diagnosed at least once in a given age_cat and year_cat, and 0 before that. In other words, for each patient, as soon as n_gono is greater than 0, status_gono takes the value 1 for that row and for all following rows of the same patient. The expected result is shown in df_gono2.
# Example input data: 21 rows for five patients (A-E), with the number of
# gonorrhoea diagnoses recorded on each row in `n_gono`.
df_gono <- data.frame(
  patient = rep(c("A", "B", "C", "D", "E"), times = c(6, 3, 5, 4, 3)),
  sex = rep("Female", times = 21),
  age_current_cat = c(
    5, 5, 6, 6, 6, 7, 7, 8, 8, 5, 6, 6, 7, 7, 3, 3, 4, 4, 6, 7, 7
  ),
  calyear_current_cat = c(
    2, 2, 2, 3, 4, 4, 2, 2, 3, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 4, 4
  ),
  age_cat = c(
    "30-34", "30-34", "35-39", "35-39", "35-39", "40-44", "40-44",
    "45-49", "45-49", "30-34", "35-39", "35-39", "40-44", "40-44",
    "20-24", "20-24", "25-29", "25-29", "35-39", "40-44", "40-44"
  ),
  year_cat = c(
    "2011-2013", "2011-2013", "2011-2013", "2014-2017", "2018-2020",
    "2018-2020", "2011-2013", "2011-2013", "2014-2017", "2011-2013",
    "2011-2013", "2014-2017", "2014-2017", "2018-2020", "2011-2013",
    "2014-2017", "2014-2017", "2018-2020", "2018-2020", "2018-2020",
    "2018-2020"
  ),
  hiv = c(
    0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
  ),
  pat_year = c(
    0.05475702, 2.52635181, 0.41957563, 3.00068446, 1.57973990,
    1.91649555, 0.08555784, 2.91512663, 2.08076660, 0.08418891,
    2.91649555, 2.08350445, 0.91718001, 3.49623546, 3.00068446,
    0.83299110, 2.16769336, 0.99657769, 0.25188227, 0.18343600,
    2.22861054
  ),
  n_gono = c(
    0, 1, 0, 2, 0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 7, 1, 1, 0, 0, 2, 7
  )
)
# Expected result: the same 21 rows as the input data plus `status_gono`,
# which is 1 from a patient's first row with n_gono > 0 onwards and 0 before.
df_gono2 <- data.frame(
  patient = rep(LETTERS[1:5], times = c(6, 3, 5, 4, 3)),
  sex = rep("Female", times = 21),
  age_current_cat = c(
    5, 5, 6, 6, 6, 7, 7, 8, 8, 5, 6, 6, 7, 7, 3, 3, 4, 4, 6, 7, 7
  ),
  calyear_current_cat = c(
    2, 2, 2, 3, 4, 4, 2, 2, 3, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 4, 4
  ),
  age_cat = c(
    "30-34", "30-34", "35-39", "35-39", "35-39", "40-44", "40-44",
    "45-49", "45-49", "30-34", "35-39", "35-39", "40-44", "40-44",
    "20-24", "20-24", "25-29", "25-29", "35-39", "40-44", "40-44"
  ),
  year_cat = c(
    "2011-2013", "2011-2013", "2011-2013", "2014-2017", "2018-2020",
    "2018-2020", "2011-2013", "2011-2013", "2014-2017", "2011-2013",
    "2011-2013", "2014-2017", "2014-2017", "2018-2020", "2011-2013",
    "2014-2017", "2014-2017", "2018-2020", "2018-2020", "2018-2020",
    "2018-2020"
  ),
  hiv = c(
    0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
  ),
  pat_year = c(
    0.05475702, 2.52635181, 0.41957563, 3.00068446, 1.57973990,
    1.91649555, 0.08555784, 2.91512663, 2.08076660, 0.08418891,
    2.91649555, 2.08350445, 0.91718001, 3.49623546, 3.00068446,
    0.83299110, 2.16769336, 0.99657769, 0.25188227, 0.18343600,
    2.22861054
  ),
  n_gono = c(
    0, 1, 0, 2, 0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 7, 1, 1, 0, 0, 2, 7
  ),
  status_gono = c(
    0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1
  )
)
Try this solution; you'll need to install the dplyr and tidyr packages:
library(dplyr)
library(tidyr)
# Compute the cumulative gonorrhoea status per patient (`status_gono2`) and
# print it next to the expected `status_gono` column for comparison.
# NOTE(review): fill() works in row order, so rows are assumed to already be
# in chronological order within each patient -- confirm for real data.
df_gono2 |>
  group_by(patient) |>
  # Flag rows with at least one diagnosis; all other rows stay missing so they
  # can be filled from above. if_else() with NA_real_ always yields a double
  # column, whereas ifelse(cond, 1, NA) returns a logical vector when no row
  # is flagged, and replace_na() would then store FALSE instead of 0.
  mutate(status_gono2 = if_else(n_gono >= 1, 1, NA_real_)) |>
  # Carry the flag down to every later row of the same patient.
  fill(status_gono2, .direction = "down") |>
  # Rows before a patient's first diagnosis are still NA: give them status 0.
  replace_na(list(status_gono2 = 0)) |>
  # The data is still grouped by patient, so select() keeps `patient` too.
  select(status_gono, status_gono2)
#> Adding missing grouping variables: `patient`
#> # A tibble: 21 × 3
#> # Groups: patient [5]
#> patient status_gono status_gono2
#> <chr> <dbl> <dbl>
#> 1 A 0 0
#> 2 A 1 1
#> 3 A 1 1
#> 4 A 1 1
#> 5 A 1 1
#> 6 A 1 1
#> 7 B 0 0
#> 8 B 1 1
#> 9 B 1 1
#> 10 C 0 0
#> 11 C 0 0
#> 12 C 0 0
#> 13 C 0 0
#> 14 C 0 0
#> 15 D 1 1
#> 16 D 1 1
#> 17 D 1 1
#> 18 D 1 1
#> 19 E 0 0
#> 20 E 1 1
#> 21 E 1 1
Data given:
# Data used by the solution above: 21 rows for five patients (A-E), including
# the expected `status_gono` column that the computed status is compared to.
df_gono2 <- data.frame(
  patient = rep(c("A", "B", "C", "D", "E"), times = c(6, 3, 5, 4, 3)),
  sex = rep("Female", times = 21),
  age_current_cat = c(
    5, 5, 6, 6, 6, 7, 7, 8, 8, 5, 6, 6, 7, 7, 3, 3, 4, 4, 6, 7, 7
  ),
  calyear_current_cat = c(
    2, 2, 2, 3, 4, 4, 2, 2, 3, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 4, 4
  ),
  age_cat = c(
    "30-34", "30-34", "35-39", "35-39", "35-39", "40-44", "40-44",
    "45-49", "45-49", "30-34", "35-39", "35-39", "40-44", "40-44",
    "20-24", "20-24", "25-29", "25-29", "35-39", "40-44", "40-44"
  ),
  year_cat = c(
    "2011-2013", "2011-2013", "2011-2013", "2014-2017", "2018-2020",
    "2018-2020", "2011-2013", "2011-2013", "2014-2017", "2011-2013",
    "2011-2013", "2014-2017", "2014-2017", "2018-2020", "2011-2013",
    "2014-2017", "2014-2017", "2018-2020", "2018-2020", "2018-2020",
    "2018-2020"
  ),
  hiv = c(
    0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
  ),
  pat_year = c(
    0.05475702, 2.52635181, 0.41957563, 3.00068446, 1.57973990,
    1.91649555, 0.08555784, 2.91512663, 2.08076660, 0.08418891,
    2.91649555, 2.08350445, 0.91718001, 3.49623546, 3.00068446,
    0.83299110, 2.16769336, 0.99657769, 0.25188227, 0.18343600,
    2.22861054
  ),
  n_gono = c(
    0, 1, 0, 2, 0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 7, 1, 1, 0, 0, 2, 7
  ),
  status_gono = c(
    0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1
  )
)