rmemory-leaksrdcomclient

Memory Leak When Opening Data Frame With RDCOMClient


I am experimenting with using the RDCOMClient package to open a dataframe on the fly into Excel. I have functional code which will open a dataframe into Excel, however the issue I am running into is that the memory resources from loading the data into Excel are not being released by rsession.exe when the code completes. Even after closing the Excel application the memory resources are not released. Is there something I am missing, that I could use to release the memory from rsession.exe? The only way I have been able to release the memory is by closing out of RStudio and opening it back up again.

Code for the function

in.xl <- function(data, headers = TRUE, rownames = FALSE) {
  require(RDCOMClient)

  # Attempt to coerce non dataframe data into a dataframe
  if (!is.data.frame(data)) {
    data <- as.data.frame(data)
  }

  # Set row range for data
  if (headers == TRUE) {
    d.row.start <- 2
    d.row.end <- nrow(data) + 1
  } else {
    d.row.start <- 1
    d.row.end <- nrow(data)
  }

  # Set column range for data
  if (rownames == TRUE) {
    d.col.start <- 2
    d.col.end <- ncol(data) + 1
  } else {
      d.col.start <- 1
      d.col.end <- ncol(data)
    }

  # Create COM Connection to Excel
  xlApp <- COMCreate("Excel.Application")
  xlWB <- xlApp[["Workbooks"]]$Add()
  xlSheet <- xlWB$Sheets(1)

  # Check if headers should be included
  if (headers == TRUE) {

    # Create a dataframe from headers
    headers <- t(as.data.frame(colnames(data)))

    # Set range for header values
    startCell <- xlSheet$Cells(1, d.col.start)
    endCell <- xlSheet$Cells(1, d.col.end)
    rng <- xlSheet$Range(startCell, endCell)

    # Add headers to Excel sheet
    rng[["Value"]] <- asCOMArray(headers)

    # Remove header dataframe
    rm(headers)
  }

  # Check if rownames should be included
  if(rownames == TRUE) {

    # Create dataframe from row names
    if (is.null(rownames(data))) {
      rnames = as.data.frame(1:nrow(data))
    } else {
        rnames = as.data.frame(rownames(data))
      }

    # Set range for row name values
    startCell <- xlSheet$Cells(d.row.start, 1)
    endCell <- xlSheet$Cells(d.row.end, 1)
    rng <- xlSheet$Range(startCell, endCell)

    # Add row names to Excel sheet
    rng[["Value"]] <- asCOMArray(rnames)

    # Remove row name dataframe
    rm(rnames)
  } 

  xlApp[["ScreenUpdating"]] <- FALSE

  nblocks <- ceiling(nrow(data) / 2000)
  pb <- txtProgressBar(min = 0, max = nblocks, initial = 0, style = 3, width = 20)
  data.start <- d.row.start
  block <- 1
  d.row.end <- d.row.start
  df.row.start <- 1

  while(d.row.end < nrow(data)) {
    d.row.end <- d.row.start + 1999
    df.row.end <- df.row.start + 1999

    if (d.row.end > nrow(data) + data.start) {
      d.row.end <- nrow(data) + data.start - 1
    }

    if (df.row.end > nrow(data)) {
      df.row.end <- nrow(data)
    }

    xlApp[["StatusBar"]] <- paste("Processing block", block, "of", nblocks)

    # Set range for data values
    rng <- xlSheet$Range(xlSheet$Cells(d.row.start, d.col.start), xlSheet$Cells(d.row.end, d.col.end))

    # Add data to Excel sheet
    rng[["Value"]] <- asCOMArray(data[df.row.start:df.row.end, ])

    d.row.start <- d.row.end + 1
    df.row.start <- df.row.end + 1

    if (block != nblocks) {
      block <- block + 1
    }

    # update the progress bar with the current value
    setTxtProgressBar(pb,block)
    rm(rng, vals)
    gc()
  }

  xlApp[["StatusBar"]] <- "Formatting Columns..."

  # Auto adjust column widths
  for(c in 1:d.col.end) {
    col <- xlSheet$Columns(c)
    col[["EntireColumn"]]$AutoFit()
  }

  xlApp[["StatusBar"]] <- ""
  xlApp[["ScreenUpdating"]] <- TRUE

  # Show Excel application
  xlApp[["Visible"]] <- TRUE

  gc()
} 

Code to generate a large dataframe for testing. (Makes it easier to see the memory usage issue when looking at the rsession.exe process in Task Manager)

df <- data.frame(replicate(20, sample(replicate(10, paste(sample(LETTERS, 15, rep = TRUE), collapse = "")), 100000, rep = TRUE)))

in.xl(df)

Solution

  • You can use the package callr. With the following code, the memory is released :

    df <- data.frame(replicate(20, sample(replicate(10, paste(sample(LETTERS, 15, rep = TRUE), collapse = "")), 1000000, rep = TRUE)))
    
    library(callr)
    callr::r(func = in.xl, args = list(data = df))