Tags: python, json, csv, firebase-realtime-database, ijson

What is the proper way to import large amounts of data into a Firebase database?


I'm working with a dataset of political campaign contributions that ends up as an approximately 500mb JSON file (originally a 124mb CSV). It's far too big to import through the Firebase web interface (trying to do so crashed the tab in Google Chrome). So I attempted to upload objects manually as they were parsed from the CSV: using a CSVtoJSON converter, each row becomes a JSON object, which I then uploaded to Firebase as it was produced.

Here's the code I used.

var firebase = require('firebase');
var Converter = require("csvtojson").Converter;

firebase.initializeApp({
  serviceAccount: "./credentials.json",
  databaseURL: "url went here"
});

var converter = new Converter({
  constructResult: false,  // don't accumulate the parsed result in memory
  workerNum: 4
});
var db = firebase.database();
var ref = db.ref("/");

var lastindex = 0;
var count = 0;
var section = 0;
var sectionRef;

converter.on("record_parsed", function (resultRow, rawRow, rowIndex) {
  if (rowIndex >= 0) {
    // Write each row under /reports<section>/<Report_ID>
    sectionRef = ref.child("reports" + section);
    var reportRef = sectionRef.child(resultRow.Report_ID);
    reportRef.set(resultRow);
    console.log("Report uploaded, count at " + count + ", section at " + section);
    count += 1;
    lastindex = rowIndex;
    // Start a new section every 1000 rows
    if (count >= 1000) {
      count = 0;
      section += 1;
    }
    // Bail out after 100 sections, logging where we stopped
    if (section >= 100) {
      console.log("last completed index: " + lastindex);
      process.exit();
    }
  } else {
    console.log("we out of indices");
    process.exit();
  }
});

var readStream = require("fs").createReadStream("./vUPLOAD_MASTER.csv");
readStream.pipe(converter);

However, that ran into memory issues and never completed the dataset. Doing it in chunks wasn't viable either, since Firebase wasn't showing all of the uploaded data and I couldn't tell where I had left off. (When I left the Firebase database open in Chrome I could see data coming in, but eventually the tab would crash, and after reloading a lot of the later data was missing.)
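In retrospect, the likely culprit is that record_parsed fires much faster than the SDK can flush writes over the network, so every pending set() call piles up in memory. A rough backpressure sketch (not the code I ran), assuming set() returns a Promise as it does in the Firebase Node SDK, and that pausing the source stream stops the converter from emitting further rows:

// Rough sketch: pause the CSV stream while each write is in flight so
// unsent rows don't accumulate in memory. Rows already buffered inside
// the converter may still be emitted after pause(), but that buffer is bounded.
var fs = require("fs");
var firebase = require("firebase");
var Converter = require("csvtojson").Converter;

firebase.initializeApp({
  serviceAccount: "./credentials.json",
  databaseURL: "url went here"
});

var ref = firebase.database().ref("/reports");
var converter = new Converter({ constructResult: false });
var readStream = fs.createReadStream("./vUPLOAD_MASTER.csv");

converter.on("record_parsed", function (resultRow) {
  readStream.pause();                                   // stop feeding the converter
  ref.child(resultRow.Report_ID).set(resultRow).then(function () {
    readStream.resume();                                // next row only after the ack
  });
});

readStream.pipe(converter);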

I then tried using Firebase Streaming Import, but that throws this error:

started at 1469471482.77
Traceback (most recent call last):
  File "import.py", line 90, in <module>
    main(argParser.parse_args())
  File "import.py", line 20, in main
    for prefix, event, value in parser:
  File "R:\Python27\lib\site-packages\ijson\common.py", line 65, in parse
    for event, value in basic_events:
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 185, in basic_parse
    for value in parse_value(lexer):
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 127, in parse_value
    raise UnexpectedSymbol(symbol, pos)
ijson.backends.python.UnexpectedSymbol: Unexpected symbol u'\ufeff' at 0

Looking up that last line (the error from ijson), I found this SO thread, but I'm not sure how to apply it to get Firebase Streaming Import working.
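The gist is that the u'\ufeff' at position 0 is a UTF-8 byte order mark, which the parser chokes on, so the fix is to strip the first three bytes before streaming the file. A one-off Node sketch of that (the filenames are placeholders):

// One-off sketch: drop the 3-byte UTF-8 BOM (0xEF 0xBB 0xBF) if present.
// readFileSync loads the whole file, which is tolerable for a single pass.
var fs = require("fs");
var buf = fs.readFileSync("./data.json");
if (buf[0] === 0xEF && buf[1] === 0xBB && buf[2] === 0xBF) {
  buf = buf.slice(3);
}
fs.writeFileSync("./data_nobom.json", buf);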

Using Vim, I removed the Byte Order Mark from the JSON file I was trying to upload, and now I get this error after a minute or so of running the importer:

Traceback (most recent call last):
  File "import.py", line 90, in <module>
    main(argParser.parse_args())
  File "import.py", line 20, in main
    for prefix, event, value in parser:
  File "R:\Python27\lib\site-packages\ijson\common.py", line 65, in parse
    for event, value in basic_events:
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 185, in basic_parse
    for value in parse_value(lexer):
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 116, in parse_value
    for event in parse_array(lexer):
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 138, in parse_array
    for event in parse_value(lexer, symbol, pos):
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 119, in parse_value
    for event in parse_object(lexer):
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 170, in parse_object
    pos, symbol = next(lexer)
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 51, in Lexer
    buf += data
MemoryError

Firebase Streaming Import is supposed to handle files upwards of 250mb, and I'm fairly certain I have more than enough RAM for this file. Any ideas why this error is appearing?

If seeing the actual JSON file I'm trying to upload with Firebase Streaming Import would help, here it is.


Solution

  • I got around the problem by giving up on Firebase Streaming Import and writing my own tool, which uses csvtojson to convert the CSV and the Firebase Node API to upload each object one at a time.

    Here's the script:

    var firebase = require("firebase");
    firebase.initializeApp({
      serviceAccount: "./credentials.json",
      databaseURL: "https://necir-hackathon.firebaseio.com/"
    });
    
    var db = firebase.database();
    var ref = db.ref("/reports");
    var fs = require('fs');
    var Converter = require("csvtojson").Converter;
    var header = "Report_ID,Status,CPF_ID,Filing_ID,Report_Type_ID,Report_Type_Description,Amendment,Amendment_Reason,Amendment_To_Report_ID,Amended_By_Report_ID,Filing_Date,Reporting_Period,Report_Year,Beginning_Date,Ending_Date,Beginning_Balance,Receipts,Subtotal,Expenditures,Ending_Balance,Inkinds,Receipts_Unitemized,Receipts_Itemized,Expenditures_Unitemized,Expenditures_Itemized,Inkinds_Unitemized,Inkinds_Itemized,Liabilities,Savings_Total,Report_Month,UI,Reimbursee,Candidate_First_Name,Candidate_Last_Name,Full_Name,Full_Name_Reverse,Bank_Name,District_Code,Office,District,Comm_Name,Report_Candidate_First_Name,Report_Candidate_Last_Name,Report_Office_District,Report_Comm_Name,Report_Bank_Name,Report_Candidate_Address,Report_Candidate_City,Report_Candidate_State,Report_Candidate_Zip,Report_Treasurer_First_Name,Report_Treasurer_Last_Name,Report_Comm_Address,Report_Comm_City,Report_Comm_State,Report_Comm_Zip,Category,Candidate_Clarification,Rec_Count,Exp_Count,Inkind_Count,Liab_Count,R1_Count,CPF9_Count,SV1_Count,Asset_Count,Savings_Account_Count,R1_Item_Count,CPF9_Item_Count,SV1_Item_Count,Filing_Mechanism,Also_Dissolution,Segregated_Account_Type,Municipality_Code,Current_Report_ID,Location,Individual_Or_Organization,Notable_Contributor,Currently_Accessed"
    var count = 0;
    
    // Read the CSV one line at a time so the whole file never sits in memory
    var lineReader = require('readline').createInterface({
      input: fs.createReadStream('test.csv')
    });
    
    lineReader.on('line', function (line) {
        // Escape single quotes, then prepend the header so each line
        // parses as a complete one-row CSV document
        line = line.replace(/'/g, "\\'");
        var csvString = header + '\n' + line;
        var converter = new Converter({});
        converter.fromString(csvString, function (err, result) {
            if (err) {
                // Log conversion failures to a file instead of aborting the run
                fs.appendFile('converter_error_log.txt', err + '\n', function (err) {
                    if (err) {
                        console.log("Converter: Append Log File Error Below:");
                        console.error(err);
                        process.exit(1);
                    } else {
                        console.log("Converter Error Saved");
                    }
                });
            } else {
                // Blank out fields that get filled in later
                result[0].Location = "";
                result[0].Individual_Or_Organization = "";
                result[0].Notable_Contributor = "";
                result[0].Currently_Accessed = "";
                // Key each report by its Report_ID under /reports
                var reportRef = ref.child(result[0].Report_ID);
                count += 1;
                reportRef.set(result[0]);
                console.log("Sent #" + count);
            }
        });
    });
    

    The only caveat is that although the script can send out all the objects quickly, Firebase apparently needs the connection to stay open while it saves them: closing the script as soon as everything was sent left a lot of objects missing from the database. (I waited 20 minutes to be sure before exiting, though a shorter wait might suffice.)
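
    If I were doing it again, I would wait on the write promises instead of guessing at a delay. A minimal sketch, assuming set() returns a Promise (it does in the Firebase Node SDK); trackWrite and readerDone are names invented here for illustration:

    // Minimal sketch: count outstanding writes and exit only once the server
    // has acknowledged every set(). If the reader closes while converter
    // callbacks are still queued, this can fire early; a real version would
    // track those callbacks as well.
    var outstanding = 0;
    var readerDone = false;
    
    function trackWrite(promise) {
        outstanding += 1;
        promise.then(function () {
            outstanding -= 1;
            if (readerDone && outstanding === 0) {
                console.log("All writes acknowledged, safe to exit");
                process.exit(0);
            }
        });
    }
    
    // In the converter callback above, replace reportRef.set(result[0]) with:
    //     trackWrite(reportRef.set(result[0]));
    
    lineReader.on('close', function () {
        readerDone = true;
    });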