r data-processing

How to correctly convert objects with different lengths into tabular data


I have data stored as objects like this:

set.seed(1)
 # Simulate one series: the sign of a 10-step cumulative sum of normal draws,
 # run-length encoded by accelerometry::rle2() with start/stop indices included.
 make_rle <- function() rnorm(10) |> cumsum() |> sign() |> accelerometry::rle2(indices = TRUE)
 # Build a list of 10 such run-length matrices; their row counts differ.
 X <- lapply(1:10, \(x) make_rle())
 X
 [[1]]
     value start stop length
[1,]    -1     1    2      2
[2,]     1     3    3      1
[3,]    -1     4    6      3
[4,]     1     7   10      4

[[2]]
     value start stop length
[1,]     1     1    3      3
[2,]    -1     4    4      1
[3,]     1     5    5      1
[4,]    -1     6    6      1
[5,]     1     7   10      4

[[3]]
     value start stop length
[1,]     1     1   10     10

[[4]]
     value start stop length
[1,]     1     1    6      6
[2,]    -1     7   10      4



and so on

How can I correctly convert this data into tabular data in which every row has the same number of columns? The tabular data is needed for machine learning.

For example, I can pad each matrix with NA rows:

# Pad matrix `x` with all-NA rows at the bottom until it has `n` rows.
#   x: a matrix with at most `n` rows.
#   n: target number of rows (default 10).
# Returns a matrix with `n` rows and the same columns as `x`.
# Stops with an error if `x` already has more than `n` rows.
add_na <- function(x, n = 10) {
  n_missing <- n - nrow(x)
  if (n_missing < 0) {
    stop(
      "`x` has more rows (", nrow(x), ") than `n` (", n, ")",
      call. = FALSE
    )
  }
  # Size the filler from `x` itself instead of assuming four columns.
  rbind(x, matrix(NA, nrow = n_missing, ncol = ncol(x)))
}
 # Pad every element of X to the default 10 rows so all share one shape.
 Xna <- lapply(X, \(m) add_na(m))
 Xna
[[1]]
      value start stop length
 [1,]    -1     1    2      2
 [2,]     1     3    3      1
 [3,]    -1     4    6      3
 [4,]     1     7   10      4
 [5,]    NA    NA   NA     NA
 [6,]    NA    NA   NA     NA
 [7,]    NA    NA   NA     NA
 [8,]    NA    NA   NA     NA
 [9,]    NA    NA   NA     NA
[10,]    NA    NA   NA     NA

[[2]]
      value start stop length
 [1,]     1     1    3      3
 [2,]    -1     4    4      1
 [3,]     1     5    5      1
 [4,]    -1     6    6      1
 [5,]     1     7   10      4
 [6,]    NA    NA   NA     NA
 [7,]    NA    NA   NA     NA
 [8,]    NA    NA   NA     NA
 [9,]    NA    NA   NA     NA
[10,]    NA    NA   NA     NA

and so on

and then convert each matrix to a vector, thereby obtaining tabular data:

# Flatten each padded matrix into a vector (column by column) and stack the
# vectors so that every element of Xna becomes one row of to_tab.
to_tab <- t(sapply(Xna, as.vector))

My question is whether I'm doing this right, or whether some information will be lost during such a conversion. What is the correct solution in my case?


Solution

  • (maxrows <- max(vapply(X, nrow, integer(1))))
    # [1] 5
    
    # Pad each element with all-NA rows up to maxrows. Indexing rows with NA
    # integers and drop = FALSE works for both matrices and data frames
    # (z[1, ] would collapse a matrix to a vector and fail).
    X2 <- lapply(X, function(z) {
      rbind(z, z[rep(NA_integer_, maxrows - nrow(z)), , drop = FALSE])
    })
    X2
    # [[1]]
    #      value start stop length
    # [1,]    -1     1    2      2
    # [2,]     1     3    3      1
    # [3,]    -1     4    6      3
    # [4,]     1     7   10      4
    # NA      NA    NA   NA     NA
    # [[2]]
    #      value start stop length
    # [1,]     1     1    3      3
    # [2,]    -1     4    4      1
    # [3,]     1     5    5      1
    # [4,]    -1     6    6      1
    # [5,]     1     7   10      4
    # [[3]]
    #      value start stop length
    # [1,]     1     1   10     10
    # NA      NA    NA   NA     NA
    # NA.1    NA    NA   NA     NA
    # NA.2    NA    NA   NA     NA
    # NA.3    NA    NA   NA     NA
    # [[4]]
    #      value start stop length
    # [1,]     1     1    6      6
    # [2,]    -1     7   10      4
    # NA      NA    NA   NA     NA
    # NA.1    NA    NA   NA     NA
    # NA.2    NA    NA   NA     NA
    

    You can wrap the result in do.call(cbind, ...) to get a single wider matrix:

    # Column-bind all padded elements of X2 into one wide table.
    do.call(cbind, X2)
    #      value start stop length value start stop length value start stop length value start stop length
    # [1,]    -1     1    2      2     1     1    3      3     1     1   10     10     1     1    6      6
    # [2,]     1     3    3      1    -1     4    4      1    NA    NA   NA     NA    -1     7   10      4
    # [3,]    -1     4    6      3     1     5    5      1    NA    NA   NA     NA    NA    NA   NA     NA
    # [4,]     1     7   10      4    -1     6    6      1    NA    NA   NA     NA    NA    NA   NA     NA
    # NA      NA    NA   NA     NA     1     7   10      4    NA    NA   NA     NA    NA    NA   NA     NA
    

    Data

    # Sample input: a list of four data frames with integer columns
    # value, start, stop and length, having 4, 5, 1 and 2 rows respectively.
    X <- list(
      data.frame(
        value = c(-1L, 1L, -1L, 1L),
        start = c(1L, 3L, 4L, 7L),
        stop = c(2L, 3L, 6L, 10L),
        length = c(2L, 1L, 3L, 4L)
      ),
      data.frame(
        value = c(1L, -1L, 1L, -1L, 1L),
        start = c(1L, 4L, 5L, 6L, 7L),
        stop = c(3L, 4L, 5L, 6L, 10L),
        length = c(3L, 1L, 1L, 1L, 4L)
      ),
      data.frame(value = 1L, start = 1L, stop = 10L, length = 10L),
      data.frame(
        value = c(1L, -1L),
        start = c(1L, 7L),
        stop = c(6L, 10L),
        length = c(6L, 4L)
      )
    )