Tags: java, out-of-memory, apache-poi, batch-processing, fileupdate

Appending to a workbook using poi sxssfworkbook


I need to append rows to a sheet of a workbook. I am using org.apache.poi.xssf.streaming.SXSSFWorkbook but I am not able to achieve a low memory footprint. Following is the code:

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.xssf.streaming.SXSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

/**
 * Helper for writing tabular data to an .xlsx file in batches through
 * Apache POI's streaming {@link SXSSFWorkbook}.
 */
public class ExcelHelper {
    /** Window size passed to every {@link SXSSFWorkbook} constructor call. */
    private static final int ROW_ACCESS_WINDOW_SIZE = 1000;

    /** Buffer size, in bytes, used when reading an existing workbook file. */
    private static final int INPUT_BUFFER_SIZE = 1000;

    /**
     * Writes one batch of rows to the first sheet of the workbook stored in
     * {@code fileName}.
     *
     * <p>When {@code rowNum} is 0 a new workbook with a single sheet named
     * "Sheet 1" is created, {@code fieldNames} is written as the header in
     * row 0 and the batch follows from row 1. The header and the batch go into
     * the same streaming workbook, so the file is written exactly once and is
     * never read back.
     *
     * <p>When {@code rowNum} is greater than 0 the existing file is opened and
     * the batch is written starting at row {@code rowNum}. NOTE(review): this
     * path still parses the existing file through {@link XSSFWorkbook}, which
     * is expected to hold the whole document in memory; callers that need a
     * low footprint should pass all data in a single call with
     * {@code rowNum == 0} - confirm against the POI documentation.
     *
     * <p>Only {@link String} and {@link Double} values are written; a value of
     * any other type (or a missing key) leaves an empty cell.
     *
     * <p>I/O failures are not propagated: the stack trace is printed and the
     * method returns normally.
     *
     * @param data       rows to write; each map is looked up by field name
     * @param fieldNames column keys, in column order; also used as the header
     * @param fileName   path of the .xlsx file to create or append to
     * @param rowNum     index of the first row to write, or 0 to create the file
     */
    public static void createExcelFileWithLowMemFootprint(
            ArrayList<HashMap<String, Object>> data,
            ArrayList<String> fieldNames, String fileName, int rowNum) {
        SXSSFWorkbook workbook = null;
        try {
            Sheet worksheet;
            int firstDataRow = rowNum;
            if (rowNum == 0) {
                workbook = new SXSSFWorkbook(ROW_ACCESS_WINDOW_SIZE);
                worksheet = workbook.createSheet("Sheet 1");
                writeHeader(worksheet, fieldNames);
                // Row 0 now holds the header, so the data starts one row below.
                firstDataRow = 1;
            } else {
                workbook = new SXSSFWorkbook(loadExisting(fileName),
                        ROW_ACCESS_WINDOW_SIZE);
                worksheet = workbook.getSheetAt(0);
            }
            appendRows(worksheet, data, fieldNames, firstDataRow);
            save(workbook, fileName);
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler covers both.
            e.printStackTrace();
        } finally {
            if (workbook != null) {
                // Remove the temporary files backing the streaming workbook.
                workbook.dispose();
            }
        }
    }

    /** Writes the field names as string cells into row 0 of the sheet. */
    private static void writeHeader(Sheet worksheet, ArrayList<String> fieldNames) {
        Row header = worksheet.createRow(0);
        int columnIndex = 0;
        for (String fieldName : fieldNames) {
            header.createCell(columnIndex++).setCellValue(fieldName);
        }
    }

    /** Writes one row per map, starting at {@code firstRowIndex}. */
    private static void appendRows(Sheet worksheet,
            ArrayList<HashMap<String, Object>> data,
            ArrayList<String> fieldNames, int firstRowIndex) {
        int rowIndex = firstRowIndex;
        for (HashMap<String, Object> rowContent : data) {
            Row row = worksheet.createRow(rowIndex++);
            int columnIndex = 0;
            for (String key : fieldNames) {
                // The cell is created even when no value is written, as before.
                Cell cell = row.createCell(columnIndex++);
                Object value = rowContent.get(key);
                if (value instanceof String) {
                    cell.setCellValue((String) value);
                } else if (value instanceof Double) {
                    // No explicit setCellType: setCellValue(double) is expected
                    // to make the cell numeric on its own.
                    cell.setCellValue(((Double) value).doubleValue());
                }
            }
        }
    }

    /** Parses the existing workbook file and always closes the input stream. */
    private static XSSFWorkbook loadExisting(String fileName) throws IOException {
        InputStream fileIn = new BufferedInputStream(
                new FileInputStream(fileName), INPUT_BUFFER_SIZE);
        try {
            return new XSSFWorkbook(fileIn);
        } finally {
            fileIn.close();
        }
    }

    /** Writes the workbook to {@code fileName} and always closes the output stream. */
    private static void save(SXSSFWorkbook workbook, String fileName)
            throws IOException {
        FileOutputStream fileOut = new FileOutputStream(fileName);
        try {
            workbook.write(fileOut);
            fileOut.flush();
        } finally {
            fileOut.close();
        }
    }
}

I am appending to the file by passing the content in batches (so as to save on JVM memory) and incrementing the variable rowNum with each batch.

As per my understanding, when I am re-opening the file with

SXSSFWorkbook workbook = new SXSSFWorkbook(new XSSFWorkbook(fileIn),1000);

the constructor for XSSFWorkbook reloads the complete file into memory, resulting in a "GC overhead limit exceeded" error.

I went through http://poi.apache.org/spreadsheet/how-to.html but was unable to find a suitable solution to my usecase.

Can you guys please suggest how to fix this to achieve a low memory footprint for appending rows to the workbook?


Solution

  • An SXSSFWorkbook does not need to be written out and then loaded back in to get good memory management. Just write all of the data in one go. If you load a whole existing workbook, it is held in memory; when you write everything through a single SXSSFWorkbook, rows outside the window are kept in temporary storage on disk instead. Also, 1000 is a large window size for some machines. If you want, try passing 100 or some other lower number to the constructor instead of 1000.