rperformancenested-loopsmutual-information

R: performance issues when computing mutual information matrix with NAs


I realized that computing mutual information on a dataframe with NA using R's infotheo package does not yield errors but incorrect results. The problem is described in more detail here but while I now have a mathematically correct solution which only removes pairwise incomplete cases instead of across all columns the performance for large data sets it catastrophic. I guess it is the nested for loop which causes the long compute times, does anyone have an idea how to improve performance of the below code?

library(infotheo)

v1 <- c(1,2,3,4,5,NA,NA,NA,NA,NA)
v2 <- c(1,NA,3,NA,5,NA,7,NA,9,NA)
v3 <- c(NA,2,3,NA,NA,6,7,NA,7,NA)
v4 <- c(NA,NA,NA,NA,NA,6,7,8,9,10)
df <- cbind.data.frame(v1,v2,v3,v4)

ColPairMap<-function(df){
t <- data.frame(matrix(ncol = ncol(df), nrow = ncol(df)))
colnames(t) <- colnames(df)
rownames(t) <- colnames(df)
for (j in 1:ncol(df)) {
               for (i in 1:ncol(df)) {
                                c(1:ncol(df))
                                if (nrow(df[complete.cases(df[,c(i,j)]),])>0) {
                                    t[j,i] <- natstobits(mutinformation(df[complete.cases(df[,c(i,j)]),j], df[complete.cases(df[,c(i,j)]),i]))
                                } else {
                                    t[j,i] <- 0
                                }
               }
}
return(t)
}


ColPairMap(df)

Thanks in advance!


Solution

  • I found a tweak which is not helping for toy data sets as df above but for real world data sets, especially when executed on some proper H/W I've seen examples where it reduces a 2.5hrs compute time to 14min! The code below is a complete copy&pastable exmple which incorporates Rui's solution using a nested for loop and building on this idea another solution using a nested 'foreach' loop parallelizing the task on 75% of the available cores. You can control the size of the data set and consequently the compute time by adjusting n.

    library(foreach)
    library(parallel)
    library(doParallel)
    library(infotheo)
    
    n <- 500 #creates an nXn matrix, the larger the more compute time is required
    df <- (discretize(matrix(rnorm(4*n*n,n,n/10),ncol=n)))
    
    ## pairwise complete mutual information via nested for loop ##
    start_for <- Sys.time()
    ColPairMap<-function(df){
    t <- data.frame(matrix(ncol = ncol(df), nrow = ncol(df)))
    colnames(t) <- colnames(df)
    rownames(t) <- colnames(df)
    for (j in 1:ncol(df)) {
                   for (i in 1:ncol(df)) {
                                    c(1:ncol(df))
                                    if (nrow(df[complete.cases(df[,c(i,j)]),])>0) {
                                        t[j,i] <- natstobits(mutinformation(df[complete.cases(df[,c(i,j)]),j], df[complete.cases(df[,c(i,j)]),i]))
                                    } else {
                                        t[j,i] <- 0
                                    }
                   }
    }
    return(t)
    }
    ColPairMap(df)
    end_for <- Sys.time()
    end_for-start_for
    
    
    ## pairwise complete mutual information via nested foreach loop ##
    start_foreach <- Sys.time()
    ncl <- max(2,floor(detectCores()*0.75)) #number of cores
    clst <- makeCluster(n=ncl,type="TERR") #create cluster
    #e <- new.env() #new environment to export libraries to cores
    #e$libs <- .libPaths()
    #clusterExport(clst, "libs", envir=e) #export required packages to all cores
    #clusterEvalQ(clst, .libPaths(libs)) #export required packages to all cores
    clusterEvalQ(clst, { #export required packages to all cores
        library(infotheo)
    })
    registerDoParallel(cl = clst) #register cluster
    
    t <- foreach (j=1:ncol(df), .combine="c") %:% #parallellized nested loop for computing normalized pairwise complete MI between all columns
            foreach (i=j:ncol(df), .combine="c", .packages="infotheo") %dopar% {
                combine="c"
                compl_cases <- complete.cases(df[,c(i,j)])
                if (sum(compl_cases) > 0) {
                    natstobits(mutinformation(df[compl_cases,][,j], df[compl_cases,][,i]))
                } else {
                    0
                }
            }
    
    RCA_MI_Matrix <- matrix(0, ncol = ncol(df), nrow = ncol(df), dimnames = list(colnames(df), colnames(df))) #set-up empty matrix for MI values
    RCA_MI_Matrix[lower.tri(RCA_MI_Matrix, diag=TRUE)] <- t #fill lower triangle with MI values from nested loop
    RCA_MI_Matrix[upper.tri(RCA_MI_Matrix)] <- t(RCA_MI_Matrix)[upper.tri(RCA_MI_Matrix)] #mirror lower triangle of matrix into upper one
    end_foreach <- Sys.time()
    end_foreach-start_foreach
    
    stopCluster(cl=clst) #stop cluster