javascript, node.js, node-modules, node.js-stream

Fetching Large File For Processing Using Node.js


I have a Node.js application that needs to fetch this 6GB zip file from Census.gov and then process its content. However, when fetching the file using the Node.js https API, the download stops at a different file size each time. Sometimes it fails at 2GB, sometimes at 1.8GB, and so on. I am never able to fully download the file using the application, but it is fully downloaded when using the browser. Is there any way to download the full file? I cannot start processing the zip until it's fully downloaded, so my processing code waits for the download to complete before executing.

// Stream the HTTP response body for `url` into `fileName`, printing a running
// count of the megabytes received so far.
const file = fs.createWriteStream(fileName);
http.get(url).on("response", function (res) {
      // Total number of bytes received from the response so far.
      let downloaded = 0;
      res
        .on("data", function (chunk) {
          // The return value of write() is discarded and `res` is never paused,
          // so chunks keep being written regardless of how full the write
          // stream's internal buffer is.
          file.write(chunk);
          downloaded += chunk.length;
          // Progress in decimal megabytes (bytes / 1,000,000); the trailing \r
          // keeps the output on a single, continuously rewritten line.
          process.stdout.write(`Downloaded ${(downloaded / 1000000).toFixed(2)} MB of ${fileName}\r`);
        })
        .on("end", async function () {
          // Close the write stream once the response has ended. The success
          // message is printed immediately after calling end(), without
          // waiting for any event from the write stream.
          file.end();
          console.log(`${fileName} downloaded successfully.`);
        });
    });

Solution

  • You have no flow control on the file.write(chunk). You need to pay attention to the return value from file.write(chunk) and when it returns false, you have to wait for the drain event before writing more. Otherwise, you can overflow the buffer on the writestream, particularly when writing large things to a slow medium like disk.

    When you lack flow control when attempting to write large things faster than the disk can keep up, you will probably blow up your memory usage because the stream has to accumulate more data in its buffer than is desirable.

    Since your data is coming from a readable, when you get false back from the file.write(chunk), you will also have to pause the incoming read stream so it doesn't keep spewing data events at you while you're waiting for the drain event on the writestream. When you get the drain event, you can then resume the readstream.

    FYI, if you don't need the progress info, you can let pipeline() do all the work (including the flow control) for you. You don't have to write that code yourself. You may even be able to still gather the progress info, by just watching the writestream activity when using pipeline().

    Here's one way to implement the flow control yourself, though I'd recommend you use the pipeline() function in the stream module and let it do all this for you if you can:

    // Download `url` into `fileName` with manual flow control: the response is
    // paused whenever the write stream's buffer is full and resumed on "drain".
    const file = fs.createWriteStream(fileName);
    file.on("error", err => console.log(err));
    // "finish" fires only after file.end() has flushed every buffered byte, so
    // this is the earliest point at which the file on disk is really complete.
    file.on("finish", () => console.log(`${fileName} downloaded successfully.`));
    http.get(url).on("response", function(res) {
        let downloaded = 0;
        // If writing to disk fails, stop the download instead of leaving the
        // response running (or paused forever waiting for a "drain").
        file.once("error", () => res.destroy());
        res.on("data", function(chunk) {
            const readyForMore = file.write(chunk);
            if (!readyForMore) {
                // pause readstream until drain event comes
                res.pause();
                file.once('drain', () => {
                    res.resume();
                });
            }
            downloaded += chunk.length;
            process.stdout.write(`Downloaded ${(downloaded / 1000000).toFixed(2)} MB of ${fileName}\r`);
        }).on("end", function() {
            if (!res.complete) {
                // The connection was cut before the whole body arrived.
                console.log(`${fileName} download ended before the response was complete.`);
                file.destroy();
                return;
            }
            // Success is reported by the "finish" handler above, once flushed.
            file.end();
        }).on("error", err => {
            console.log(err);
            // Do not leave the partially written file open.
            file.destroy();
        });
    }).on("error", err => {
        // Request-level failures (DNS lookup, refused/reset connection) are
        // emitted on the request object, not on the response.
        console.log(err);
        file.destroy();
    });
    

    There also appeared to be a timeout issue in the http request. When I added this:

    // set client timeout to 24 hours
    // (24 hours * 60 minutes * 60 seconds * 1000 milliseconds);
    // `res` is the response object passed to the "response" handler
    res.setTimeout(24 * 60 * 60 * 1000);
    

    I was then able to download the whole 7GB ZIP file.

    Here's turnkey code that worked for me:

    // Turnkey script: download a very large ZIP over HTTPS to a local file with
    // manual flow control and a single-line progress display.
    const fs = require('fs');
    const https = require('https');
    const url =
        "https://www2.census.gov/programs-surveys/acs/summary_file/2020/data/5_year_entire_sf/All_Geographies_Not_Tracts_Block_Groups.zip";
    const fileName = "census-data2.zip";

    const file = fs.createWriteStream(fileName);
    file.on("error", err => {
        console.log(err);
    });
    // "finish" fires only after file.end() has flushed every buffered byte, so
    // success is reported here rather than right after calling file.end().
    file.on("finish", () => {
        console.log(`${fileName} downloaded successfully.`);
    });
    const options = {
        headers: {
            // NOTE(review): nothing below decompresses the body, so if the
            // server ever honours this header the saved file would be the
            // encoded bytes rather than the ZIP itself - confirm it is needed.
            "accept-encoding": "gzip, deflate, br",
        }
    };
    https.get(url, options).on("response", function(res) {
        if (res.statusCode !== 200) {
            // A redirect or error page must not be saved as if it were the ZIP.
            console.log(`Request failed with status code ${res.statusCode}`);
            res.resume(); // discard the body so the socket is released
            file.destroy();
            return;
        }

        const startTime = Date.now();

        // Minutes elapsed since the response started arriving.
        function elapsed() {
            const delta = Date.now() - startTime;
            // convert to minutes
            const mins = (delta / (1000 * 60));
            return mins;
        }

        let downloaded = 0;
        console.log(res.headers);
        // content-length is optional (e.g. chunked responses); without it the
        // percentage and time-remaining estimates cannot be computed.
        const contentLength = Number(res.headers["content-length"]);
        const hasContentLength = Number.isFinite(contentLength) && contentLength > 0;
        if (hasContentLength) {
            console.log(`Expecting download length of ${(contentLength / (1024 * 1024)).toFixed(2)} MB`);
        } else {
            console.log("No content-length header; total download size is unknown");
        }
        // If writing to disk fails, stop the download instead of leaving the
        // response running (or paused forever waiting for a "drain").
        file.once("error", () => res.destroy());
        // set timeout to 24 hours
        res.setTimeout(24 * 60 * 60 * 1000);
        res.on("data", function(chunk) {
            const readyForMore = file.write(chunk);
            if (!readyForMore) {
                // pause readstream until drain event comes
                res.pause();
                file.once('drain', () => {
                    res.resume();
                });
            }
            downloaded += chunk.length;
            const elapsedMins = elapsed();
            const downloadedMB = (downloaded / (1024 * 1024)).toFixed(2);
            if (!hasContentLength) {
                // Size unknown: report only what can actually be measured.
                process.stdout.write(
                    `  ${elapsedMins.toFixed(2)} mins, downloaded ${downloadedMB} MB of ${fileName}                                 \r`
                );
                return;
            }
            const downloadPortion = downloaded / contentLength;
            const percent = downloadPortion * 100;
            // Linear extrapolation: total time = elapsed / fraction completed.
            const totalEstimateMins = (1 / downloadPortion) * elapsedMins;
            const remainingMins = totalEstimateMins - elapsedMins;

            // Trailing spaces blank out leftovers of a longer previous line;
            // \r keeps the progress display on one line.
            process.stdout.write(
                `  ${elapsedMins.toFixed(2)} mins, ${percent.toFixed(1)}% complete, ${Math.ceil(remainingMins)} mins remaining, downloaded ${downloadedMB} MB of ${fileName}                                 \r`
            );
        }).on("end", function() {
            if (!res.complete) {
                // The connection was cut before the whole body arrived.
                console.log(`${fileName} download ended before the response was complete.`);
                file.destroy();
                return;
            }
            // Success is reported by the "finish" handler above, once flushed.
            file.end();
        }).on("error", err => {
            console.log(err);
            // Do not leave the partially written file open.
            file.destroy();
        }).on("timeout", () => {
            console.log("got timeout event");
        });
    }).on("error", err => {
        // Request-level failures (DNS lookup, refused/reset connection) are
        // emitted on the request object, not on the response.
        console.log(err);
        file.destroy();
    });