I spent the better (or rather worse) part of today hunting a bug that caused Node.js to randomly run out of memory while searching a giant log file line by line with a regex.
The cause was that I kept the matches in an array, and those matches kept hidden references to entire chunks of the original file. The number of matches was not the problem; their scattering across the file was. For reference, this is the code I use to break binary stream inputs into line strings:
/**
 * Reads a binary stream chunk by chunk and yields decoded lines one at a time.
 * @param {ReadableStreamDefaultReader<Uint8Array>} reader
 * @param {boolean} [cancelReader=true] whether to cancel the reader when iteration stops early
 * @returns {AsyncGenerator<string>}
 */
export default async function* asyncLineIterator(reader, cancelReader = true) {
  let prefetch = null;
  let wasDone = false;
  try {
    const utf8Decoder = new TextDecoder("utf-8");
    // const response = await fetch(fileURL);
    // const reader = response.body.getReader();
    let { value: binaryChunk, done: readerDone } = await reader.read();
    let chunk = binaryChunk ? utf8Decoder.decode(binaryChunk, { stream: true }) : "";
    const newline = /\r?\n/gm;
    let startIndex = 0;
    prefetch = reader.read();
    while (true) {
      const result = newline.exec(chunk);
      if (!result) {
        if (readerDone) {
          break;
        }
        // No newline in the buffered text yet: pull in the next chunk.
        const remainder = chunk.substr(startIndex);
        ({ value: binaryChunk, done: readerDone } = await prefetch);
        if (!readerDone) {
          prefetch = reader.read();
          chunk = remainder + (binaryChunk ? utf8Decoder.decode(binaryChunk, { stream: true }) : "");
        }
        else {
          prefetch = null;
          chunk = remainder + utf8Decoder.decode(); // flush any buffered partial character
        }
        startIndex = newline.lastIndex = 0;
        continue;
      }
      yield chunk.substring(startIndex, result.index);
      startIndex = newline.lastIndex;
      // Guard against unbounded chunk growth (e.g. extremely long lines).
      if (chunk.length > 10 * 1024 * 1024) {
        throw new Error("Line too long, aborting.");
      }
    }
    if (startIndex < chunk.length) {
      // Last line didn't end in a newline char
      yield chunk.substr(startIndex);
    }
    wasDone = readerDone;
  }
  catch (e) {
    console.trace(e);
    throw e;
  }
  finally {
    if (prefetch) {
      await prefetch;
    }
    //console.log("Done reading lines.");
    if (cancelReader && !wasDone) {
      await reader.cancel();
    }
  }
}
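Roughly, this is how I consume the generator and collect matches (a simplified sketch; fileURL, myRegex and myMatches stand in for my real code, and this runs inside an async function):
const response = await fetch(fileURL);
const reader = response.body.getReader();
const myMatches = [];
for await (const line of asyncLineIterator(reader)) {
  const match = myRegex.exec(line);
  if (match) {
    myMatches.push({ matches: [...match], matchedLine: line });
  }
}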
What was happening in my code and causing the crash:
- substr above creates a sliced string that keeps a hidden reference to the original 66 kB chunk
- RegExp.exec likewise keeps a reference to a slice of a slice, again retaining the 66 kB chunk
This is what I saw once I was finally able to obtain a memory snapshot before a crash: one of my matches held a 66 kB string, despite being only about 60 characters long.
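In other words, the retention looks roughly like this (a minimal, hypothetical illustration, not my actual code):
const chunk = "x".repeat(66 * 1024);      // stands in for one decoded file chunk
const line = chunk.substring(100, 160);   // only 60 characters...
const matches = [line];                   // ...but V8 may represent it as a slice of chunk,
                                          // so keeping it alive retains the whole 66 kB chunk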
My solution for now is the following:
/**
* @template TInput
* @param {TInput} str
* @returns {TInput}
*/
function unrefLine(str) {
  return JSON.parse(JSON.stringify(str));
}
Which I call with the match array:
const match = myRegex.exec(lineStr);
if (match) {
  myMatches.push({ matches: unrefLine([...match]), matchedLine: unrefLine(lineStr) });
}
This solved all crashes. My question here is whether there's a faster and less ugly way than JSON.stringify. The goal is to get V8 to forget that the substring belongs to the original chunk from the file I am reading.
@jmrk's answer gives a lot of context and explains why it's not easy to optimize this in V8, and correctly points out that there is no "official" way to do it.
Still, here is another hack that will be faster than your JSON.parse(JSON.stringify(...)) approach:
function unrefLine(str) {
  // Using the (possibly sliced) string as a property key makes V8 internalize it,
  // flattening it in place and dropping the reference to its parent string.
  let g = {};
  g[str] = undefined;
}
This works because V8 internalizes strings that are used as object keys. And internalization only works for sequential strings, which means that when a sliced string is used as a key, it will first be converted to a sequential string, thus removing the pointer to the parent string.
A word of caution though: this behavior is an implementation detail and could easily change without notice, which makes this solution not very future-proof.
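For completeness, this is roughly how it would slot into your call site (a sketch reusing your myRegex / myMatches / lineStr names; unlike your version, this variant flattens the strings in place and returns nothing):
const match = myRegex.exec(lineStr);
if (match) {
  unrefLine(lineStr);                 // flattens lineStr in place
  for (const m of match) {
    unrefLine(m);                     // flattens each captured substring as well
  }
  myMatches.push({ matches: [...match], matchedLine: lineStr });
}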
Quick performance comparison with your approach:
// Note: print() is a JS-shell function (e.g. in d8); use console.log in Node or the browser.
function unrefLineWithJSON(str) { return JSON.parse(JSON.stringify(str)); }
function unrefLineWithInternalization(str) { let g = {}; g[str] = undefined; }

let str = "a".repeat(1000000);

for (let string_size of [20, 50, 100, 1000, 10000]) {
  print(`string_size=${string_size}`);

  // JSON-based
  let time = performance.now();
  for (let i = 0; i < 1000000; i++) {
    unrefLineWithJSON(str.substring(10, 10 + string_size));
  }
  time = performance.now() - time;
  print(`  > JSON: ${time}`);

  // Internalization-based
  time = performance.now();
  for (let i = 0; i < 1000000; i++) {
    unrefLineWithInternalization(str.substring(10, 10 + string_size));
  }
  time = performance.now() - time;
  print(`  > Internalization: ${time}\n`);
}
Which produces:
string_size=20
> JSON: 268.409
> Internalization: 110.47300000000001
string_size=50
> JSON: 301.63500000000005
> Internalization: 113.80700000000002
string_size=100
> JSON: 371.19499999999994
> Internalization: 118.72199999999998
string_size=1000
> JSON: 955.9929999999997
> Internalization: 191.00199999999995
string_size=10000
> JSON: 7491.878999999999
> Internalization: 855.7530000000006
So, it seems that relying on internalization rather than JSON stringify+parse scales much better. However, if your strings are small, I doubt that this will be the bottleneck of your application.