I have a composite score (BASFI) which consists of components Q1-Q10 and a bunch of variables I want unchanged. I wanted to go from a wide format to a long format. The composite score and components have been measured at 4 time points. Please see dataset for more details on this.
Reproducible dataset:
structure(list(composite_baseline = c(40, NA, 16), composite_6month = c(NA_real_,
NA_real_, NA_real_), composite_12month = c(NA, 0, NA), composite_24month = c(NA,
12, NA), q1_socks_baseline = c(0, NA, 0), q1_socks_6month = c(NA_real_,
NA_real_, NA_real_), q1_socks_12month = c(NA, 0, NA), q1_socks_24month = c(NA,
20, NA), q2_pen_baseline = c(0, NA, 20), q2_pen_6month = c(NA_real_,
NA_real_, NA_real_), q2_pen_12month = c(NA, 0, NA), q2_pen_24month = c(NA,
10, NA), q3_shelf_baseline = c(10, NA, 40), q3_shelf_6month = c(NA_real_,
NA_real_, NA_real_), q3_shelf_12month = c(NA, 0, NA), q3_shelf_24month = c(NA,
10, NA), q4_chair_baseline = c(30, NA, 10), q4_chair_6month = c(NA_real_,
NA_real_, NA_real_), q4_chair_12month = c(NA, 0, NA), q4_chair_24month = c(NA,
10, NA), q5_floor_baseline = c(50, NA, 20), q5_floor_6month = c(NA_real_,
NA_real_, NA_real_), q5_floor_12month = c(NA, 0, NA), q5_floor_24month = c(NA,
10, NA), q6_standing_baseline = c(30, NA, 0), q6_standing_6month = c(NA_real_,
NA_real_, NA_real_), q6_standing_12month = c(NA, 0, NA), q6_standing_24month = c(NA,
10, NA), q7_steps_baseline = c(70, NA, 10), q7_steps_6month = c(NA_real_,
NA_real_, NA_real_), q7_steps_12month = c(NA, 0, NA), q7_steps_24month = c(NA,
10, NA), q8_shoulder_baseline = c(50, NA, 0), q8_shoulder_6month = c(NA_real_,
NA_real_, NA_real_), q8_shoulder_12month = c(NA, 0, NA), q8_shoulder_24month = c(NA,
10, NA), q9_activities_baseline = c(70, NA, 30), q9_activities_6month = c(NA_real_,
NA_real_, NA_real_), q9_activities_12month = c(NA, 0, NA), q9_activities_24month = c(NA,
20, NA), q10_day_baseline = c(90, NA, 30), q10_day_6month = c(NA_real_,
NA_real_, NA_real_), q10_day_12month = c(NA, 0, NA), q10_day_24month = c(NA,
10, NA), age_at_diagnosis = c(31, 38, 34), hla_b27 = c("pos",
"neg", "neg"), years_since_symptoms_onset = c(25L, 3L, 4L), smoking_status = c("never",
"past", "past"), bio_drug_name = c("golimumab", "adalimumab",
"golimumab"), bmi = c(25.7629805786762, 24.3910588392529, 21.9713358572201
), nsaid_baseline = c("0", NA, "1"), concom_csdmard_baseline = c(1,
1, 0), bio_drug_start_year = c(2011L, 2007L, 2011L), country = c("CH",
"CH", "CH"), asas_criteria_3_enthesitis = c(1L, 0L, 1L), asas_criteria_2_arthritis = c(1L,
1L, 0L), asas_criteria_6_psoriasis = c(0L, 0L, 0L), asas_criteria_7_ibd = c(0L,
1L, 0L), asas_criteria_4_uveitis = c(0L, 0L, 0L), crp_baseline = c(11,
NA, 7), gender = c("female", "male", "female"), ID = 1:3), row.names = c(NA,
-3L), class = c("data.table", "data.frame"))
Looks like this
composite_baseline composite_6month composite_12month composite_24month q1_socks_baseline q1_socks_6month q1_socks_12month q1_socks_24month
1: 40 NA NA NA 0 NA NA NA
2: NA NA 0 12 NA NA 0 20
3: 16 NA NA NA 0 NA NA NA
q2_pen_baseline q2_pen_6month q2_pen_12month q2_pen_24month q3_shelf_baseline q3_shelf_6month q3_shelf_12month q3_shelf_24month q4_chair_baseline
1: 0 NA NA NA 10 NA NA NA 30
2: NA NA 0 10 NA NA 0 10 NA
3: 20 NA NA NA 40 NA NA NA 10
q4_chair_6month q4_chair_12month q4_chair_24month q5_floor_baseline q5_floor_6month q5_floor_12month q5_floor_24month q6_standing_baseline
1: NA NA NA 50 NA NA NA 30
2: NA 0 10 NA NA 0 10 NA
3: NA NA NA 20 NA NA NA 0
q6_standing_6month q6_standing_12month q6_standing_24month q7_steps_baseline q7_steps_6month q7_steps_12month q7_steps_24month q8_shoulder_baseline
1: NA NA NA 70 NA NA NA 50
2: NA 0 10 NA NA 0 10 NA
3: NA NA NA 10 NA NA NA 0
q8_shoulder_6month q8_shoulder_12month q8_shoulder_24month q9_activities_baseline q9_activities_6month q9_activities_12month q9_activities_24month
1: NA NA NA 70 NA NA NA
2: NA 0 10 NA NA 0 20
3: NA NA NA 30 NA NA NA
q10_day_baseline q10_day_6month q10_day_12month q10_day_24month age_at_diagnosis hla_b27 years_since_symptoms_onset smoking_status bio_drug_name
1: 90 NA NA NA 31 pos 25 never golimumab
2: NA NA 0 10 38 neg 3 past adalimumab
3: 30 NA NA NA 34 neg 4 past golimumab
bmi nsaid_baseline concom_csdmard_baseline bio_drug_start_year country asas_criteria_3_enthesitis asas_criteria_2_arthritis
1: 25.76298 0 1 2011 CH 1 1
2: 24.39106 <NA> 1 2007 CH 0 1
3: 21.97134 1 0 2011 CH 1 0
asas_criteria_6_psoriasis asas_criteria_7_ibd asas_criteria_4_uveitis crp_baseline gender ID
1: 0 0 0 11 female 1
2: 0 1 0 NA male 2
3: 0 0 0 7 female 3
Code I have tried:
library(data.table)
dat_long <- melt(setDT(dat), id.vars = 45:62,
measure=c("composite", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10"),
value.name=c("basfi_composite", "basfi_1_socks", "basfi_2_pen", "basfi_3_shelf", "basfi_4_chair",
"basfi_5_floor", "basfi_6_standing", "basfi_7_steps", "basfi_8_shoulder", "basfi_9_activities_demanding", "basfi_10_activities_day"))
Basically gives 24 observations when 12 were expected (4 time points * 3 observations).
This case is somewhat complicated by the fact that time and measure are separated by "_" but some measures also have "_" in their names.
Also, unfortunately melt does not support names_sep like pivot_longer
and when you melt in multiple columns, the melted variables are given an index instead of a name
First, i melt all in one column into a temporary DT to obtain variable names. Then, by using strsplit and a lookbehind in the regexp (to avoid the underscore in q1..10 names) i create the measure and time variable.
tmp <- melt(setDT(dat) , id.vars = names(dat)[!grepl("^q\\d|composite", names(dat))],
variable.factor = FALSE)
tmp[, c("variable", "time") := as.data.table(
do.call(rbind, strsplit(variable, "(?<!\\d)_", perl=T)))]
Finally you can dcast the result to get the 12 variables (plus the id vars)
dcast(tmp, ... ~ time + ID, value.var = "value" )