rpivotreshapemeltlong-format-data

transform data from wide to long in r


I have a dataset similar to this:

id_male<-rep(c(1:100),times=1)
df1<-data.frame(id_male)
df1$id_female <- sample(500:1000, length(df1$id_male), replace=TRUE)
df1$var_ma_1 <- sample(500:1000, length(df1$id_male))
df1$var_ma_2 <- sample(500:1000, length(df1$id_male))
df1$var_ma_3 <- sample(500:1000, length(df1$id_male))
df1$var_ma_4 <- sample(500:1000, length(df1$id_male))
df1$diff_1 <- sample(900:1000, length(df1$id_male))
df1$diff_2 <- sample(800:1000, length(df1$id_male))
df1$diff_3 <- sample(100:1000, length(df1$id_male))
df1$diff_4 <- sample(200:1000, length(df1$id_male))
df1$var <- sample(0:1, length(df1$id_male), replace=TRUE)

 id_male id_female var_ma_1 var_ma_2 var_ma_3 var_ma_4 diff_1 diff_2 diff_3 diff_4 var
       1       522      839      996      866      661    904    888    333    701   1
       2       548      706      967      855      682    974    838    257    572   0
       3       799      580      903      671      636    911    889    104    225   1
       4       518      745      565      719      707    959    983    465    239   0
       5       769      719      760      951      687    943    997    474    562   0
       6       745      642     1000      926      659    950    859    168    849   0

and I wanted to reshape this data to a long format:

id_male id_female visit var_ma  diff    var
  1        522      1     839    904     1
  1        522      2     996    888     1
  1        522      3     866    333     1
  1        522      4     661    701     1
  2        548      1     967    974     0
  2        548      2     855    838     0
  2        548      3     682    257     0
  2        548      4     947    572     0

I tried pivot_wider, reshape, and melt, but can't get it exactly what I want. Thank you in advance for your help!


Solution

  • Since part of some column names are kept as columns and part as a value in a new column, I think we can use names_pattern to separate out the number from the column name to form visit, and keep the rest.

    tidyr::pivot_longer(df1, -c(id_male, id_female, var),
      names_pattern = "(.*)_([0-9]+)$",
      names_to = c(".value", "visit"))
    # # A tibble: 24 x 6
    #    id_male id_female   var visit var_ma  diff
    #      <int>     <int> <int> <chr>  <int> <int>
    #  1       1       522     1 1        839   904
    #  2       1       522     1 2        996   888
    #  3       1       522     1 3        866   333
    #  4       1       522     1 4        661   701
    #  5       2       548     0 1        706   974
    #  6       2       548     0 2        967   838
    #  7       2       548     0 3        855   257
    #  8       2       548     0 4        682   572
    #  9       3       799     1 1        580   911
    # 10       3       799     1 2        903   889
    # # ... with 14 more rows
    

    If you're unfamiliar with regex,

    (.*)_([0-9]+)$
     ^^             0 or more of anything
    ^^^^            parens capture this as the first "group"
        ^           the literal underscore character
          ^^^^^^    1 or more digit (0-9 only)
         ^^^^^^^^   parens capture this as the second "group"
                 ^  end of string
    

    The two "groups" correspond to the names_to argument.


    Data

    df1 <- structure(list(id_male = 1:6, id_female = c(522L, 548L, 799L, 518L, 769L, 745L), var_ma_1 = c(839L, 706L, 580L, 745L, 719L, 642L), var_ma_2 = c(996L, 967L, 903L, 565L, 760L, 1000L), var_ma_3 = c(866L, 855L, 671L, 719L, 951L, 926L), var_ma_4 = c(661L, 682L, 636L, 707L, 687L, 659L), diff_1 = c(904L, 974L, 911L, 959L, 943L, 950L), diff_2 = c(888L, 838L, 889L, 983L, 997L, 859L), diff_3 = c(333L, 257L, 104L, 465L, 474L, 168L), diff_4 = c(701L, 572L, 225L, 239L, 562L, 849L), var = c(1L, 0L, 1L, 0L, 0L,  0L)), class = "data.frame", row.names = c(NA, -6L))