javascript node.js v8

How to correctly unref a V8 substring ("sliced string") from its source string


I spent the better (or rather worse) part of today hunting a bug that caused Node.js to randomly run out of memory while searching a giant log file line by line with a regex.

The cause was that I kept the matches in an array, which kept hidden references to entire chunks of the original file. The number of matches was not the problem; the fact that they were scattered across the file was. For reference, this is the code I use to break binary stream inputs into line strings:

/**
 * Reads a binary stream chunk by chunk, decodes it as UTF-8 and yields one line at a time.
 * @param {ReadableStreamDefaultReader<Uint8Array>} reader
 * @param {boolean} [cancelReader=true] cancel the reader when the generator exits early
 * @returns {AsyncGenerator<string>}
 */
export default async function* asyncLineIterator(reader, cancelReader = true) {
  let prefetch = null;
  let wasDone = false;
  try {
    const utf8Decoder = new TextDecoder("utf-8");
    // const response = await fetch(fileURL);
    // const reader = response.body.getReader();
    let { value: binaryChunk, done: readerDone } = await reader.read();
    // Use {stream: true} so a multi-byte character split across chunk boundaries isn't mangled
    let chunk = binaryChunk ? utf8Decoder.decode(binaryChunk, { stream: true }) : "";

    const newline = /\r?\n/gm;
    let startIndex = 0;

    prefetch = reader.read();

    while (true) {
      const result = newline.exec(chunk);
      if (!result) {
        if (readerDone) {
          break;
        }

        const remainder = chunk.substring(startIndex);
        if (remainder.length > 10 * 1024 * 1024) {
          // Abort before accumulating even more data for a single runaway line
          throw new Error("Line too long, aborting.");
        }
        ({ value: binaryChunk, done: readerDone } = await prefetch);

        if (!readerDone) {
          prefetch = reader.read();
          chunk = remainder + (binaryChunk ? utf8Decoder.decode(binaryChunk, {stream: true}) : "");
        }
        else {
          prefetch = null;
          chunk = remainder;
        }

        startIndex = newline.lastIndex = 0;
        continue;
      }
      yield chunk.substring(startIndex, result.index);
      startIndex = newline.lastIndex;
    }

    if (startIndex < chunk.length) {
      // Last line didn't end in a newline char
      yield chunk.substring(startIndex);
    }
    wasDone = readerDone;
  }
  catch (e) {
    console.trace(e);
    throw e;
  }
  finally {
    if(prefetch) {
      await prefetch;
    }
    //console.log("Done reading lines.");
    if(cancelReader && !wasDone) {
      await reader.cancel();
    }
  }
}
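
For context, here is a minimal sketch of how such a generator might be consumed (the URL and the regex are just placeholders, not my actual code):

const response = await fetch("https://example.com/huge.log");
const reader = response.body.getReader();

const myRegex = /ERROR: (.*)/;
const myMatches = [];

for await (const lineStr of asyncLineIterator(reader)) {
  const match = myRegex.exec(lineStr);
  if (match) {
    // Each entry silently pins the whole source chunk in memory (see below).
    myMatches.push({ matches: [...match], matchedLine: lineStr });
  }
}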

What was happening in my code and causing the crash:

This is what I saw after I was finally able to obtain a memory snapshot before a crash (screenshot: a match array containing a reference to a 66176 kB string).

This is one of my matches, holding on to a 66 kB string despite the match itself being only about 60 characters long.
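
The mechanism, as far as I understand it, boils down to this (sizes are illustrative):

// Illustrative only: a tiny substring of a huge source string.
const chunk = "x".repeat(64 * 1024 * 1024);   // ~64 MB decoded file chunk
const line = chunk.substring(0, 60);          // V8 may represent this as a "sliced string"
// `line` looks like 60 characters, but as long as it stays reachable,
// it keeps the entire 64 MB `chunk` alive in the heap.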

My solution for now is the following:

/**
 * @template TInput
 * @param {TInput} str
 * @returns {TInput}
 */
function unrefLine(str) {
  return JSON.parse(JSON.stringify(str));
}

I call it with the match array (and the matched line) like this:

const match = myRegex.exec(lineStr);
if(match) {
   myMatches.push({matches: unrefLine([...match]), matchedLine: unrefLine(lineStr)});
}

This solved all crashes. My question is whether there is a faster and less ugly way than JSON.stringify. The goal is to get V8 to forget that the substring belongs to the original chunk of the file I am reading.


Solution

  • @jmrk's answer gives a lot of context and explains why it's not easy to optimize this in V8, and correctly points out that there is no "official" way to do it.

    Still, here is another hack that will be faster than your JSON.parse(JSON.stringify(...)) approach:

    function unrefLine(str) {
      let g = {};
      g[str] = undefined; // forces internalization: str becomes a flat string with no parent pointer
      return str;
    }
    

    This works because V8 internalizes strings that are used as object keys. And internalization only works for sequential strings, which means that when a sliced string is used as a key, it will first be converted to a sequential string, thus removing the pointer to the parent string.

    A word of caution though: this behavior is an implementation detail and could easily change without notice, which makes this solution not very future-proof.
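
    For the question's call pattern, which passes an array of match strings rather than a single string, the trick would have to be applied per element; using the whole array as a key would only internalize its joined string representation, not the individual captures. A possible sketch (the helper name is mine, not part of the answer):

    function unrefMatch(match) {
      for (const s of match) {
        if (typeof s === "string") { // skip non-participating capture groups (undefined)
          let g = {};
          g[s] = undefined;          // internalize each captured string individually
        }
      }
      return [...match];
    }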


    Quick performance comparison with your approach:

    function unrefLineWithJSON(str) { return JSON.parse(JSON.stringify(str)); }
    function unrefLineWithInternalization(str) { let g = {}; g[str] = undefined; }
    
    let str = "a".repeat(1000000);
    
    for (let string_size of [20, 50, 100, 1000, 10000]) {
      console.log(`string_size=${string_size}`);
      // JSON-based
      let time = performance.now();
      for (let i = 0; i < 1000000; i++) {
        unrefLineWithJSON(str.substring(10, 10+string_size));
      }
      time = performance.now() - time;
      console.log(` > JSON: ${time}`);
    
      // Internalization-based
      time = performance.now();
      for (let i = 0; i < 1000000; i++) {
        unrefLineWithInternalization(str.substring(10, 10+string_size));
      }
      time = performance.now() - time;
      console.log(` > Internalization: ${time}\n`);
    }
    

    Which produces:

    string_size=20
     > JSON: 268.409
     > Internalization: 110.47300000000001
    
    string_size=50
     > JSON: 301.63500000000005
     > Internalization: 113.80700000000002
    
    string_size=100
     > JSON: 371.19499999999994
     > Internalization: 118.72199999999998
    
    string_size=1000
     > JSON: 955.9929999999997
     > Internalization: 191.00199999999995
    
    string_size=10000
     > JSON: 7491.878999999999
     > Internalization: 855.7530000000006
    

    So, it seems that relying on internalization rather than JSON stringify+parse scales much better. However, if your strings are small, I doubt that this will be the bottleneck of your application.