Tags: node.js, async-await, promise, spawn, ytdl

Can't break while loop from within after obtaining result from a spawned process


I have been trying to make a video listing function that makes use of node-js's spawn to spawn a yt-dlp process, whose output gets stored in a database.

It works when I give it the size of the playlist it must process (though not quite as expected, since even then the save order gets messed up). But when the size of the submitted playlist is not known, I can't stop the while loop that I have been using to run it.

Here is the function:

const { Sequelize, DataTypes } = require('sequelize'); // including this just in case
const { spawn } = require("child_process");
/**
 * Lists a playlist in chunks: each loop iteration spawns one yt-dlp process for
 * the range [start_num, stop_num] and, when that process closes, stores every
 * printed "title<TAB>id<TAB>url" line through vid_list.findOrCreate.
 *
 * NOTE(review): nothing inside the while loop is awaited. The loop body only
 * spawns the process and registers event handlers, so all ten iterations run
 * back to back and the "close" handlers (the only place dont_stop is assigned)
 * run afterwards. That is why dont_stop cannot serve as the loop condition here.
 *
 * @param body_url   playlist URL handed to yt-dlp and stored as `reference`
 * @param start_num  first playlist index; chunk_size is added before its first use
 * @param stop_num   last playlist index; chunk_size is added before its first use
 * @param chunk_size amount added to start_num and stop_num on every iteration
 */
async function list_background(body_url, start_num, stop_num, chunk_size) {
    // sleep just to make it possible to catch
    // await sleep(2 * 1000);
    console.log('\nlisting in background');
    var i = 0;
    var dont_stop = true;
    // need to find a way to make the loop work only until the time we get a response
    // empty response means we should stop
    //  while (dont_stop) { // this is disastrous as the variable never gets updated
    while (i < 10) {
        // prepare an empty string to append all the data to
        // NOTE(review): `var` is function-scoped, so every iteration and every
        // handler registered below shares this single `response` variable
        var response = '';
        // make the start and stop numbers
        start_num = parseInt(start_num) + chunk_size;
        stop_num = parseInt(stop_num) + chunk_size;

        console.log("\nsupplied data:", "\ni:", i, "\nbody_url:", body_url, "\nstart_num:", start_num, "\nstop_num:", stop_num, "\nchunk_size", chunk_size);
        // actually spawn the thing
        const yt_list = spawn("yt-dlp", ["--playlist-start", start_num, "--playlist-end", stop_num, "--flat-playlist",
            "--print", '%(title)s\t%(id)s\t%(webpage_url)s', body_url]);
        yt_list.stdout.on("data", async data => {
            response += data;
        });
        // stderr output and spawn errors overwrite (not append to) the collected response
        yt_list.stderr.on("data", data => {
            response = `stderr: ${data}`;
        });
        yt_list.on('error', (error) => {
            response = `error: ${error.message}`;
        });
        // apparently await has no effect on this expression
        // but then how are we supposed to know when to stop?
        // the listing only ends when dont_stop is false
        yt_list.on("close", async (code) => {
            // NOTE(review): end, response_list and item_available are assigned without
            // a declaration in this function (implicit globals unless declared elsewhere)
            end = `child process exited with code ${code}`;
            response_list = response.split("\n");
            // remove the "" from the end of the list
            response_list.pop();
            // get the status at the end
            console.log("\ndata after processing\ni:", i, "response:\n", response, "\nresponse_list:", response_list, "\nresponse_list.length:", response_list.length, "\n");
            // loose comparison: an empty array coerces to '' and therefore matches
            if (response_list == '') {
                // basically when the resonse is empty it means that all 
                // the items have been listed and the function can just return 
                // this should then break the outer listing loop
                console.log("no vidoes found", "\ni:", i, "\n");
                // break wont work as `Jump target cannot cross function boundary.ts(1107)`
                // so I am returning false to dont_stop and if dont_stop is is true then the loop 
                // should stop in the next iteration
                dont_stop = false;
            } else {
                // adding the items to db
                console.log("adding items to db", "\ni:", i, "\n");
                // all findOrCreate calls of this chunk are started concurrently
                await Promise.all(response_list.map(async (element) => {
                    var items = element.split("\t");
                    // console.log(items, items.length, "\ni:", i, "\n");
                    // update the vidoes too here by looking for any changes that could have been made
                    // use find or create here to update the entries
                    if (items.length == 3) {
                        try {
                            if (items[0] == "[Deleted video]" || items[0] == "[Private video]") {
                                item_available = false;
                            } else {
                                item_available = true;
                            }
                            const [found, created] = await vid_list.findOrCreate({
                                where: { url: items[2] },
                                defaults: {
                                    id: items[1],
                                    reference: body_url,
                                    title: items[0],
                                    downloaded: false,
                                    available: item_available
                                }
                            })
                            //if (created)
                            //console.log("\nsaved", items[0], "\ni:", i, "\n");
                            //else 
                            if (found) {
                                if (!item_available) {
                                    found.available = false;
                                    //console.log("\nfound", items[0], "updated", "\ni:", i, "\n");
                                }
                                else {
                                    //console.log("\nfound", items[0], "no changes", "\ni:", i, "\n");
                                }
                                // NOTE(review): no save() call is visible after this, so the
                                // changes on `found` are presumably never persisted - confirm
                                found.changed('updatedAt', true);
                            }
                        } catch (error) {
                            // remember to uncomment this later, the sequelize erros are not relevant here now
                            // console.error(error);
                        }
                    }
                }));
                dont_stop = true;
            }
        });
        console.log('\n\ndont_stop', dont_stop, "\ni:", i, "\n");
        i++;
    }
    console.log('\noutside the loop, and persumably done', "\ni:", i, "\n");
}

This is the test data that I use:

// Sample playlist used for manual testing; `size` is the known number of entries.
const daft_punk_essentials = {
    url: "https://www.youtube.com/playlist?list=PLSdoVPM5WnneERBKycA1lhN_vPM6IGiAg",
    size: 22
};
// The main method lists the first 10 entries, so this call should yield total - 10 videos.
list_background(daft_punk_essentials.url, 1, 10, 10);

I recorded the output of the execution to find out what is happening: can't_stop.log

From my observations, I have found that the spawned processes don't start until after the loop has finished. I had to limit the loop to 10 iterations, because without a limit it just crashes my computer (see the log file for how it is happening).

Now, I know that await Promise.all() can be used to wait for all of the promises passed to it to complete, but I don't get how to implement this for a while loop that needs to process parts of a list in order to add them to a db.

I am not sure if this is the right approach to do this. I used while loop because there can be up to 5000 videos in a playlist and using a for loop to make chunks would be wasteful if the playlist has like < 500 videos.


Solution

  • The beauty of using promises and async/await is that you can use normal flow of control programming with loops, break, return, etc... because your code isn't running inside of event triggered callback functions which have no control over the higher level scope.

    So, the first thing to clean up here is to take all the .on() event handling from the spawn() and wrap it into a promise so that it can all be abstracted away in a separate function that you can use await on.

    Then, I'd also suggest breaking some of the complication you have into separate functions as that will also allow you to more simply see and control the flow.

    I did not follow everything you were trying to do in this loop or how you want to handle all possible error conditions so I'm sure this will need some further tweaking, but here's the general idea.

    Synopsis of Changes

    1. Put the spawn operation into a separate function which I called getVideoInfo() that returns a promise that resolves/rejects when it's done. This wraps all the .on() event handlers in a promise that the caller can more simply deal with.

    2. Break out the functionality that adds items to the DB into its own function. This is done just to simplify the code and make the main control flow easier to follow and see and write.

    3. Just use a while (true) loop and when you're done, you can simply return. No need for stop loop variables or any of that.

    Here's the general idea for how that could look (you will likely have to fix up some details and error handling since I can't run this myself).

    const { Sequelize, DataTypes } = require('sequelize'); // including this just in case
    const { spawn } = require("child_process");
    
    /**
     * Run yt-dlp once for a slice of a playlist and collect what it prints.
     *
     * One line per entry is requested in the form "title<TAB>id<TAB>webpage_url".
     *
     * @param body_url  playlist URL passed to yt-dlp
     * @param start_num first playlist index (passed as --playlist-start)
     * @param stop_num  last playlist index (passed as --playlist-end)
     * @returns a promise that resolves with the complete stdout text when the
     *          process exits with code 0, and rejects with an Error when the
     *          process cannot be started or exits with a non-zero code / signal
     *          (the Error message carries the collected stderr text)
     */
    function getVideoInfo(body_url, start_num, stop_num) {
      return new Promise((resolve, reject) => {
        let stdout = "";
        let stderr = "";
        // spawn() documents its args as strings, so convert the numeric indices explicitly
        const yt_list = spawn("yt-dlp", [
          "--playlist-start",
          String(start_num),
          "--playlist-end",
          String(stop_num),
          "--flat-playlist",
          "--print", '%(title)s\t%(id)s\t%(webpage_url)s',
          body_url
        ]);
        // Decode in the stream so that a multi-byte character split across two
        // chunks is not corrupted by per-chunk Buffer-to-string conversion.
        yt_list.stdout.setEncoding("utf8");
        yt_list.stderr.setEncoding("utf8");
        yt_list.stdout.on("data", (chunk) => {
          stdout += chunk;
        });
        // Output on stderr alone is not treated as a failure (it may only be
        // warnings); it is kept for the error message and the exit status decides.
        yt_list.stderr.on("data", (chunk) => {
          stderr += chunk;
        });
        // Emitted when the process could not be spawned at all (e.g. yt-dlp missing).
        yt_list.on("error", reject);
        yt_list.on("close", (code, signal) => {
          if (code === 0) {
            resolve(stdout);
            return;
          }
          const reason = signal ? `signal ${signal}` : `code ${code}`;
          reject(new Error(`yt-dlp exited with ${reason}: ${stderr.trim()}`));
        });
      });
    }
    
    /**
     * Store one chunk of playlist entries through vid_list.findOrCreate.
     *
     * Entries are processed one after another so rows are created in playlist
     * order. Lines that do not split into exactly three tab-separated fields are
     * skipped. A failure on one entry is logged and does not stop the others.
     *
     * @param response_list array of "title<TAB>id<TAB>url" lines
     * @param body_url      playlist URL stored as `reference` on newly created rows
     */
    async function addItemsToDb(response_list, body_url) {
      // adding the items to db
      console.log("adding items to db");
      for (const element of response_list) {
        const items = element.split("\t");
        if (items.length !== 3) {
          continue;
        }
        const [title, id, url] = items;
        // deleted/private placeholders mean the video is NOT available
        const item_available = title !== "[Deleted video]" && title !== "[Private video]";
        try {
          const [found, created] = await vid_list.findOrCreate({
            where: { url },
            defaults: {
              id,
              reference: body_url,
              title,
              downloaded: false,
              available: item_available
            }
          });
          if (!created) {
            // existing row: record that it became unavailable and touch updatedAt
            if (!item_available) {
              found.available = false;
            }
            found.changed('updatedAt', true);
            // without save() neither of the changes above is persisted
            await found.save();
          }
        } catch (error) {
          // keep going with the remaining entries, but never hide the failure
          console.error(`failed to save ${url}:`, error);
        }
      }
    }


    /**
     * List a playlist chunk by chunk until yt-dlp returns no more entries.
     *
     * The first range requested is [start_num + chunk_size, stop_num + chunk_size];
     * the range supplied by the caller itself is not requested (kept from the
     * original code, where the first chunk is listed by the caller).
     *
     * @param body_url   playlist URL
     * @param start_num  first index of the range preceding the first requested chunk
     * @param stop_num   last index of the range preceding the first requested chunk
     * @param chunk_size positive number of entries to advance per iteration
     * @returns a promise that resolves once an empty chunk is received; it rejects
     *          with RangeError/TypeError on invalid numeric arguments, or with
     *          whatever getVideoInfo rejects with
     */
    async function list_background(body_url, start_num, stop_num, chunk_size) {
      console.log('\nlisting in background');
      const step = Number.parseInt(chunk_size, 10);
      // a zero, negative or non-numeric step would request the same range forever
      if (!Number.isInteger(step) || step <= 0) {
        throw new RangeError(`chunk_size must be a positive integer, got ${chunk_size}`);
      }
      let first = Number.parseInt(start_num, 10);
      let last = Number.parseInt(stop_num, 10);
      if (!Number.isInteger(first) || !Number.isInteger(last)) {
        throw new TypeError(`start_num and stop_num must be integers, got ${start_num} and ${stop_num}`);
      }

      while (true) {
        // make the start and stop numbers
        first += step;
        last += step;

        const response = await getVideoInfo(body_url, first, last);
        // drop empty lines instead of blindly popping the last element, so output
        // without a trailing newline does not lose its final entry
        const response_list = response.split(/\r?\n/).filter((line) => line !== "");
        if (response_list.length === 0) {
          // an empty chunk means the whole playlist has been listed
          return;
        }
        await addItemsToDb(response_list, body_url);
      }
    }
    

    P.S. I don't understand why you're adding chunk_size to start_num before you ever use it. It seems like you'd want to do that after you do the first iteration so you start at start_num, not start at start_num + chunk_size. But, this is how your original code was written so I left it that way.