I am trying to use the JSONStream library in Node.js to stream JSON (94 MB) from an HTTP request to a local file. However, the app crashes with an out-of-memory error, even though it is invoked with a 256 MB heap limit: node --max-old-space-size=256 .\main.js
Isn't the point of streaming to work with small chunks of data, rather than storing the whole JSON in memory? Or am I doing something wrong?
Here is the source code:
const http = require('http');
const https = require('https');
const JSONStream = require('JSONStream');
const { pipeline, Transform } = require('stream');
const fs = require('fs');

const mb100 = 'https://raw.githubusercontent.com/seductiveapps/largeJSON/master/100mb.json';

// pick the http or https client based on the URL scheme
const isHttps = (url) => url.startsWith('https:');

new Promise((res, rej) => {
  const url = mb100;
  const protocol = isHttps(url) ? https : http;
  const parse = JSONStream.parse('*');
  const write = fs.createWriteStream('file.json');
  const str = new Transform({
    objectMode: true,
    transform: function (chunk, encoding, callback) {
      // some computations, just an example:
      callback(null, JSON.stringify(chunk));
    }
  });
  protocol.get(url, (networkStream) => {
    pipeline(networkStream, parse, str, write, (err) => {
      console.log(err ?? 'finish');
      err ? rej(err) : res();
    });
  });
});
Your code works as expected. JSONStream.parse('*') emits each top-level value of the JSON as a single, fully materialized JS value, so because your JSON contains some very large top-level values, you receive very large chunks. Since you have capped Node's heap, the memory Node itself needs plus those chunks exceeds the 256 MB limit. To process this JSON file with a small memory footprint you need to consume the stream at a finer granularity than whole top-level values.
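As a quick, self-contained illustration of that chunking behaviour (the tiny JSON document here is made up), every direct child of the root arrives as one fully built value, however large it is:
const JSONStream = require('JSONStream');

// made-up document with one small and one larger top-level value
const json = JSON.stringify({ small: { a: 1 }, big: { blob: 'x'.repeat(1000) } });

const parse = JSONStream.parse('*'); // '*' = every direct child of the root
parse.on('data', (chunk) => {
  // the whole value is already materialized in memory at this point
  console.log('chunk size:', JSON.stringify(chunk).length);
});
parse.write(json);
parse.end();
Applying the same idea to your actual code, logging inside the transform shows how big the real chunks are: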
const JSONStream = require('JSONStream');
const https = require('https');
const { pipeline, Transform } = require('stream');
const fs = require('fs');

const mb100 = 'https://raw.githubusercontent.com/seductiveapps/largeJSON/master/100mb.json';
const url = mb100;
const parse = JSONStream.parse('*');
const write = fs.createWriteStream('file.json');
const str = new Transform({
  objectMode: true,
  transform: function (chunk, encoding, callback) {
    callback(null, JSON.stringify(chunk));
    // log the serialized size of every key in this chunk
    console.log(Object.entries(chunk).map(([key, val]) => `${key}: ${JSON.stringify(val).length}`));
  }
});

https.get(url, (networkStream) => {
  pipeline(networkStream, parse, str, write, (err) => {
    console.log(err ?? 'finish');
  });
});
You get a very big chunk at the end: one value alone serializes to 59827472 characters, which is roughly 59827472 * 2 bytes (about 114 MB) as an in-memory JS string, since a JS character takes two bytes:
(1) ['here: 647']
(1) ['here: 260101']
(2) ['credits: 49', 'right-click here!: 28708']
(1) ['here: 1594']
(1) ['<div style="z-index:0;width:100%;height:100%;colo…wer/bg.gif) repeat;">key with backdrop;</div>: 5']
(1) ['{"check":{"this":"out"}}: 1477']
(4) ['292460: 2919', '51928711: 15', 'keys: 9', 'hmo: 48']
(2) ['fnki: 15', 'budp: 59827472']
You could try @streamparser/json, which lets you consume the JSON token by token; that will work in your case, you just have to decide how to process and write the values:
const https = require('https');
const { Tokenizer } = require('@streamparser/json');

class MyTokenizer extends Tokenizer {
  onToken({ token, value }) {
    console.log(token, value);
  }
}

const tokenizer = new MyTokenizer();
const url = 'https://raw.githubusercontent.com/seductiveapps/largeJSON/master/100mb.json';

https.get(url, (networkStream) => {
  networkStream.on('readable', function () {
    // There is some data to read now.
    let data;
    while ((data = this.read(1000)) !== null) {
      tokenizer.write(data);
    }
  });
});
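If you also want to keep writing the JSON to a local file, as in your original pipeline, one option (just a sketch, built on the same Tokenizer API as above, with the per-token processing left as a stub) is to pass the raw bytes through to the file and feed the same chunks to the tokenizer on the side:
const https = require('https');
const fs = require('fs');
const { pipeline, Transform } = require('stream');
const { Tokenizer } = require('@streamparser/json');

const url = 'https://raw.githubusercontent.com/seductiveapps/largeJSON/master/100mb.json';

const tokenizer = new Tokenizer();
tokenizer.onToken = ({ token, value }) => {
  // inspect or process individual tokens here; no huge object is ever buffered
};

// Pass-through transform: hands each raw chunk to the tokenizer and forwards
// the same bytes unchanged to the next stream (the file).
const inspect = new Transform({
  transform(chunk, encoding, callback) {
    tokenizer.write(chunk);
    callback(null, chunk);
  }
});

https.get(url, (res) => {
  pipeline(res, inspect, fs.createWriteStream('file.json'), (err) => {
    console.log(err ?? 'finish');
  });
});
The file receives the bytes unchanged and the tokenizer never builds whole top-level values, so memory stays flat no matter how large individual values are.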