google-sheetsgoogle-apps-script

Working Around Large Dataset that gives me 413 Error


I'm working on a project where I'm attempting to use Sheets API to grab the data from a large set of data. It's about 400K rows with 11 columns. I have seen methods where people split the load into batches of 100K rows at a time but I'm at a loss at how I would do this while performing some cleanup. I have have to loop through the rows to remove any rows where one of the columns equals 1 of 36 values. Here is a sample of a script that I attempted but I get stuck at the dataSourceArr gathering of data as that is the sheet that contains the large dataset.

  //Data from Content Column. These are the excluded items
  excludedContentArr = Sheets.Spreadsheets.Values.get(tss.getId(), `'` + excludedContentSht.getName() + `'!A2:A`).values;  
  
  //This is the large dataset that is giving me an error right now
  dataSourceArr = Sheets.Spreadsheets.Values.get(sourceSS.getId(), `'` + dataSourceSht.getName() + `'!A1:` + columnToLetter(dataSourceSht.getLastColumn())).values;
  var headersDataSrcArr = dataSourceArr[0];//Get the first and only inner array
  var eidDataSrcCN = headersDataSrcArr.indexOf('EID') + 1;//Arrays are zero indexed- add 1.
  var contentDataSrcCN = headersDataSrcArr.indexOf('Content') + 1;//Arrays are zero indexed- add 1.
  var statusDataSrcCN = headersDataSrcArr.indexOf('Status') + 1;//Arrays are zero indexed- add 1.
  var compDateDataSrcCN = headersDataSrcArr.indexOf('Completion Date') + 1;//Arrays are zero indexed- add 1.
  var pathwayDataSrcCN = headersDataSrcArr.indexOf('Pathway') + 1;//Arrays are zero indexed- add 1.
  
  var values = [];
  for(var i in dataSourceArr){    
    var excludedContentIndex = excludedContentArr.map(r => r[0]).indexOf(dataSourceArr[i][contentDataSrcCN-1]);
    if(excludedContentIndex == -1){
      //Action if not found
      values.push([
        dataSourceArr[i][eidDataSrcCN-1],
        dataSourceArr[i][contentDataSrcCN-1],
        dataSourceArr[i][statusDataSrcCN-1],
        dataSourceArr[i][compDateDataSrcCN-1],
        dataSourceArr[i][pathwayDataSrcCN-1]
      ]);

    }
  }
  Logger.log('Found ' + values.length + ' rows of data that met the criteria.');

  //Paste into the destination sheet.
  Sheets.Spreadsheets.Values.update({ values }, tss.getId(), `'` + pathwayContentCompSht.getName() + `'!A2`, { valueInputOption: "USER_ENTERED" });

You can see where I am looping to exclude any rows that contain any of the excluded content. Example of data to be excluded:

Content
Back to the Future 1
Back to the Future 2
Back to the Future 3

Example of Large Dataset:

EID Content Status Completion Date Pathway
a1 Rocky 5 Complete 01/01/2024 abc123
a2 Back to the Future 1 Complete 01/01/2024 abc123
a3 Rocky 4 Complete 01/01/2024 abc123
a4 Rocky 3 Complete 01/01/2024 abc123
a5 Back to the Future 2 Complete 01/01/2024 abc123
a6 Rocky 2 Complete 01/01/2024 abc123
a7 Rocky 1 Complete 01/01/2024 abc123
a8 Matrix Revolutions Complete 01/01/2024 abc123
a9 Back to the Future 3 Complete 01/01/2024 abc123
a10 Matrix Complete 01/01/2024 abc123

Example of End Result:

EID Content Status Completion Date Pathway
a1 Rocky 5 Complete 01/01/2024 abc123
a3 Rocky 4 Complete 01/01/2024 abc123
a4 Rocky 3 Complete 01/01/2024 abc123
a6 Rocky 2 Complete 01/01/2024 abc123
a7 Rocky 1 Complete 01/01/2024 abc123
a8 Matrix Revolutions Complete 01/01/2024 abc123
a10 Matrix Complete 01/01/2024 abc123

Solution

  • From your following reply,

    The error message is HttpResponseException: Response Code: 413. Message: response too large.

    The error occurs on the second line with dataSourceArr = Sheets.Spreadsheets.Values.get(...

    I understand that your error of HttpResponseException: Response Code: 413. Message: response too large. occurs at dataSourceArr = Sheets.Spreadsheets.Values.get(sourceSS.getId(), '+ dataSourceSht.getName() +'!A1: + columnToLetter(dataSourceSht.getLastColumn())).values;. If your other parts except for this work fine, how about the following modification?

    In this modification, the values are retrieved and put by separating rows.

    Modified script:

    Before you use this script, please enable Sheets API at Advanced Google services.

    // ref: https://tanaikech.github.io/2022/04/25/report-handling-10000000-cells-in-google-spreadsheet-using-google-apps-script/
    function getValues_({ spreadsheetId, sheetName, start = 1, maxRow, limit = 100000 }) {
      return [...Array(Math.ceil(maxRow / limit))].flatMap((_) => {
        const last = start - 1 + limit;
        const range = `'${sheetName}'!A${start}:${last > maxRow ? maxRow : last}`;
        const temp = Sheets.Spreadsheets.Values.get(spreadsheetId, range).values;
        start += limit;
        return temp;
      });
    }
    
    // ref: https://tanaikech.github.io/2022/04/25/report-handling-10000000-cells-in-google-spreadsheet-using-google-apps-script/
    function setValues_({ spreadsheetId, sheetName, start = 1, maxRow, limit = 100000, values }) {
      Array.from(Array(Math.ceil(maxRow / limit))).forEach((_) => {
        const v = values.splice(0, limit);
        Sheets.Spreadsheets.Values.update({ values: v }, spreadsheetId, `'${sheetName}'!A${start}`, { valueInputOption: "USER_ENTERED" });
        start += limit;
      });
    }
    
    function myFunction() {
      // Please set your variables.
      var tss = 
      var excludedContentSht = 
      var sourceSS = 
      var dataSourceSht = 
      var pathwayContentCompSht = 
    
    
    
      // --- I modified the below script.
      excludedContentArr = getValues_({ spreadsheetId: tss.getId(), sheetName: excludedContentSht.getName(), start: 2, maxRow: excludedContentSht.getLastRow(), limit: 100000 });
      dataSourceArr = getValues_({ spreadsheetId: sourceSS.getId(), sheetName: dataSourceSht.getName(), start: 1, maxRow: dataSourceSht.getLastRow(), limit: 100000 });
      // ---
    
    
      var headersDataSrcArr = dataSourceArr[0];//Get the first and only inner array
      var eidDataSrcCN = headersDataSrcArr.indexOf('EID') + 1;//Arrays are zero indexed- add 1.
      var contentDataSrcCN = headersDataSrcArr.indexOf('Content') + 1;//Arrays are zero indexed- add 1.
      var statusDataSrcCN = headersDataSrcArr.indexOf('Status') + 1;//Arrays are zero indexed- add 1.
      var compDateDataSrcCN = headersDataSrcArr.indexOf('Completion Date') + 1;//Arrays are zero indexed- add 1.
      var pathwayDataSrcCN = headersDataSrcArr.indexOf('Pathway') + 1;//Arrays are zero indexed- add 1.
    
      var values = [];
      for (var i in dataSourceArr) {
        var excludedContentIndex = excludedContentArr.map(r => r[0]).indexOf(dataSourceArr[i][contentDataSrcCN - 1]);
        if (excludedContentIndex == -1) {
          //Action if not found
          values.push([
            dataSourceArr[i][eidDataSrcCN - 1],
            dataSourceArr[i][contentDataSrcCN - 1],
            dataSourceArr[i][statusDataSrcCN - 1],
            dataSourceArr[i][compDateDataSrcCN - 1],
            dataSourceArr[i][pathwayDataSrcCN - 1]
          ]);
    
        }
      }
      Logger.log('Found ' + values.length + ' rows of data that met the criteria.');
    
    
      // --- I modified the below script.
      setValues_({ spreadsheetId: tss.getId(), sheetName: pathwayContentCompSht.getName(), start: 2, maxRow: values.length, limit: 100000, values });
      // ---
    }