rmatrixape

How to calculate the group mean for each group in a very large matrix, after subsetting the matrix according to the rows/column names?


So I have a distance matrix of pairwise distances between DNA sequences where the name of each sequence is something identical to ID0001|Species name. The code I used to generate it is the following:

library(ape)
myseq <- read.dna("sequences.fasta", format = "fasta")
mydist <- dist.dna(myseq)

The resulting matrix looks like this: subset of the original matrix

What I am trying to do is to calculate the average distance within each genus. The way I have found to do this is by converting the matrix to a data frame, then separating the sequence ID and the species name, and finally extracting the genus (i.e. the first word of the species name):

library(reshape2)
library(stringr)
library(tidyr)

df_dist<-setNames(reshape2::melt(as.matrix(mydist)), c('rows', 'vars', 'values'))
names(df_dist)<-c("record1","record2","dist")
df_dist2<-separate(df_dist,record1,into = c("ID1","species1"),sep = "\\|",remove = FALSE,extra = "merge")
df_dist3<-separate(df_dist2,record2,into = c("ID2","species2"),sep = "\\|",remove = FALSE,extra = "merge")
df_dist3$genus1<- str_extract(df_dist3$species1, '[A-Za-z]+')
df_dist3$genus2<- str_extract(df_dist3$species2, '[A-Za-z]+')

Then what I do is I retain only the distances between the same genus, and after that I remove the instances where the distance was calculated between sequences belonging to the same species within a genus (not useful for my analysis):

congeneric<-df_dist3[df_dist3$genus1==df_dist3$genus2,]
congeneric2<-congeneric[congeneric$species1 != congeneric$species2,]

Then I calculate the mean distance of each genus and save them as a data frame:

congeneric_distances<-congeneric2 %>%
  group_by(genus1) %>%
  summarise_at(vars(dist), list(name = mean))

This code works for some matrices, but for very large ones, I am not being able to run the code, I get returned a message such as this:

Error: cannot allocate vector of size X Gb

I am wondering if there is a way to smooth out the code and make it less memory intensive. One of the steps where the code mostly "freezes" is when using the separate function, but sometimes it fails in other places. I have also been told that I shouldn't convert it to a data frame, as it is more memory intensive than the matrix itself.


Solution

  • It is a bit hard to answer your question without a reproducible example or some information about your computer. Below is my suggestion.

    Some dummy data resembling your dataset:

    # Create a dummy example
    set.seed(1)
    dummy_mat <- matrix(rnorm(n=9*9), ncol = 9, nrow = 9)
    diag(dummy_mat) <- 0
    dummy_mat[lower.tri(dummy_mat)] <- dummy_mat[upper.tri(dummy_mat)]
    
    # Create col and row names
    x <- paste0("Genus", LETTERS[1:3])
    y <- paste0(" species", 1:3)
    
    set.seed(1)
    ids <- paste0(sample(LETTERS, 9, replace = T), sample(1000:9999, 9))
    nms <- paste0(ids, "|", do.call(paste0, expand.grid(x, y)))
    
    colnames(dummy_mat) <- nms
    rownames(dummy_mat) <- nms
    

    This is the output:

    > dummy_mat
                          Y9788|GenusA species1 D2300|GenusB species1 G9521|GenusC species1 A2798|GenusA species2 B9228|GenusB species2 W2128|GenusC species2 K9920|GenusA species3 N1877|GenusB species3 R9676|GenusC species3
    Y9788|GenusA species1             0.0000000            -0.3053884             0.8212212           -1.47075238            -0.3942900            -0.7074952           1.433023702            0.02800216           0.610726353
    D2300|GenusB species1            -0.3053884             0.0000000             0.5939013           -0.47815006            -0.0593134             0.3645820           1.980399899           -0.74327321          -0.934097632
    G9521|GenusC species1             0.8212212             1.1000254             0.0000000            0.41794156             1.1000254             0.7685329          -0.367221476            0.18879230          -1.253633400
    A2798|GenusA species2             0.5939013             0.7631757             1.4330237            0.00000000             0.7631757            -0.1123462          -1.044134626           -1.80495863           0.291446236
    B9228|GenusB species2            -1.4707524            -0.7074952             1.9803999            0.02800216             0.0000000             0.8811077           0.569719627            1.46555486          -0.443291873
    W2128|GenusC species2            -0.4781501             0.3645820            -0.3672215           -0.74327321             0.1532533             0.0000000          -0.135054604            0.15325334           0.001105352
    K9920|GenusA species3             0.4179416             0.7685329            -1.0441346            0.18879230             2.1726117            -1.2536334           0.000000000            2.17261167           0.074341324
    N1877|GenusB species3            -0.3942900            -0.1123462             0.5697196           -1.80495863             0.6107264             0.2914462           0.001105352            0.00000000          -0.589520946
    R9676|GenusC species3            -0.0593134             0.8811077            -0.1350546            1.46555486            -0.9340976            -0.4432919           0.074341324           -0.58952095           0.000000000
    

    Now we can try to answer your question.

    1] First I would suggest to list all unique genera you have in your dataset. I will use the library stringr (but it can also be done in base R) and the fact that all the genera are delimited by "|" and " ".

    # Identify all genera you have
    nms <- colnames(dummy_mat)
    
    library(stringr)
    nms <- stringr::str_extract(nms, "(?<=\\|).*(?= )") # delimiters of the Genus "|" and " " (whitespace)
    nms <- unique(nms)
    

    2] Now we can create a data.frame that will store all your average distances (preallocation is always a good thing when dealing with large datasets)

    # Create a data frame to pre allocate memory 
    df <- data.frame(genus = nms, meanDist = rep(as.numeric(NA), length(nms)))
    
    > df
       genus meanDist
    1 GenusA       NA
    2 GenusB       NA
    3 GenusC       NA
    

    3] Compute the average distance. Within each iteration we only need the upper triangular matrix (diagonal is always 0, lower triangular is equal to upper triangular matrix). Instead of recreating new data.frames at each iteration, we will try to subset the existing matrix.

    for(n in seq_along(df$genus)){
      
      pattern <- df$genus[n] 
      ind     <- grep(pattern, colnames(dummy_mat))
      
      m <- dummy_mat[ind, ind]
      
      df$meanDist[n] <- mean(m[upper.tri(m)])
      
    }
    

    4] Our results

    > df
       genus   meanDist
    1 GenusA -0.3606211
    2 GenusB  0.2209894
    3 GenusC -0.1613317
    

    Let me know if this solve your problem.

    UPDATE

    Using the profiling tool provided by the library profvis we can see that te loop itself doesn't use up much memory even with a distance matrix 10.000x10.000

    library(profvis)
    
    profvis({
        
        # Create a dummy example
        set.seed(1)
        dummy_mat <- matrix(rnorm(n=10000*10000), ncol = 10000, nrow = 10000)
        diag(dummy_mat) <- 0
        dummy_mat[lower.tri(dummy_mat)] <- dummy_mat[upper.tri(dummy_mat)]
        
        # Create col and row names
        x <- paste0("Genus", LETTERS[1:10])
        y <- paste0(" species", 1:1000)
        
        set.seed(1)
        ids <- paste0(sample(LETTERS, 9, replace = T), sample(1000:9999, 9))
        nms <- paste0(ids, "|", do.call(paste0, expand.grid(x, y)))
        
        colnames(dummy_mat) <- nms
        rownames(dummy_mat) <- nms
        
        
        # Identify all genera you have
        nms <- colnames(dummy_mat)
        
        library(stringr)
        nms <- stringr::str_extract(nms, "(?<=\\|).*(?= )") # delimiters of the Genus "|" and " " (whitespace)
        nms <- unique(nms)
        
        # Create a data frame to pre allocate memory 
        df <- data.frame(genus = nms, meanDist = rep(as.numeric(NA), length(nms)))
        
        
        # Compute mean distances
        for(n in seq_along(df$genus)){
          
          pattern <- df$genus[n]
          ind     <- grep(pattern, colnames(dummy_mat))
          
          m <- dummy_mat[ind, ind]
          
          df$meanDist[n] <- mean(m[upper.tri(m)])
          
        }
        
    }) #profvis end
    

    performance profile profvis