Tags: r, memory, docx, qdap

Why is my R script using all the computer's memory?


I wrote a simple script which opens each .docx document from my folders and looks for certain words. If any of the words exist, it extracts an ID number and then moves on to the next document. The problem is that after ~1500 documents, it consumes all the computer's memory and R gets stuck and aborts. I don't know why it happens - my script should not be using so much memory. Any suggestion would be appreciated!

# Scan every file under the per-year folders for any spelling of "HI TEC"
# ("HI TEC", "HITEC", "HI-TEC", case-insensitive) and, for each document that
# mentions it, collect the digits found on its "ID:" lines.
#
# Output: `results`. Its first element is the placeholder 0 that the script
# has always started with (kept so downstream code sees the same layout);
# the extracted digit strings follow, in year order and then file order.
#
# NOTE(review): the post this script comes from attributes the runaway memory
# use to qdapTools::read_docx itself rather than to this loop - confirm, and
# consider a different reader if memory still grows with the file count.

rm(list = ls()) # clean environment
library(qdapTools)
setwd("C:/DocxArchive/ParentFolder")
results <- 0 # placeholder first element; hits are appended after the loop
years_list <- c("2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017",
                "2018", "2019", "2020")

# One slot per year, filled inside the loop and flattened once at the end, so
# `results` is not re-copied every time a document matches.
hits_by_year <- vector("list", length(years_list))

for (year_index in seq_along(years_list)) {
  parent_dir <- years_list[year_index]
  file_list <- list.files(path = parent_dir, recursive = TRUE)

  # Drop Word's temporary/lock files (any path containing '~'). A logical mask
  # is used on purpose: `file_list[-grep(...)]` returns an EMPTY vector when
  # nothing matches, which silently skipped every folder without temp files.
  file_list <- file_list[!grepl("~", file_list, fixed = TRUE)]

  hits <- vector("list", length(file_list))
  for (file_num in seq_along(file_list)) {
    DOCX <- read_docx(file = file.path(parent_dir, file_list[file_num]))

    # A single pattern covers "HI TEC", "HITEC" and "HI-TEC". any() replaces
    # the old which.max() sum test, which could not tell "no match" from
    # "match on line 1" (both give 1) and errored on an empty document.
    has_hitec <- any(grepl("HI[ -]?TEC", DOCX, ignore.case = TRUE))

    if (has_hitec) {
      id_lines <- DOCX[grepl("ID:", DOCX, fixed = TRUE)] # lines carrying an ID
      hits[[file_num]] <- gsub("[^0-9]", "", id_lines) # keep the digits only
    }
  }
  hits_by_year[[year_index]] <- unlist(hits)
}

results <- c(results, unlist(hits_by_year))

Solution

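  • Update: the memory leak was caused by qdapTools::read_docx. Switching to officer::read_docx fixed the problem. (Note that officer::read_docx returns an rdocx object rather than a character vector of lines; the text can be extracted from it with officer::docx_summary().)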