Tags: google-sheets, google-apps-script, web-scraping, pagination

Pagination error while accessing data using Google Apps Script


I am trying to extract URL data (the clickable titles) from this table. The script gets the first page correctly, but I could not find a way to get the data from the second page. Here is the sample script:

 /**
  * Fetches the first page of the opportunities search and passes its HTML to
  * extractAndModifyUrls.
  *
  * The page is requested twice: the first request is only used to collect the
  * cookies the server sets, which are then replayed on the second request.
  */
 function scrapeTitlesData() {
   const url = "https://notices.philgeps.gov.ph/GEPSNONPILOT/Tender/SplashOpportunitiesSearchUI.aspx?menuIndex=3&BusCatID=157&type=category&ClickFrom=OpenOpp";

   const cookieResponse = UrlFetchApp.fetch(url);

   // getAllHeaders() returns an array only when a header occurs more than once;
   // a single Set-Cookie is a plain string and a missing one is undefined, so
   // normalise to an array before mapping.
   const setCookie = cookieResponse.getAllHeaders()['Set-Cookie'];
   const cookie = [].concat(setCookie || [])
     .map((c) => c.split(';')[0]) // keep "name=value", drop attributes such as path/HttpOnly
     .join(';');

   const options = {
     method: 'get',
     // Do not send an empty Cookie header when the server set no cookies.
     headers: cookie ? { cookie } : {},
   };

   const response = UrlFetchApp.fetch(url, options).getContentText();
   extractAndModifyUrls(response);
 }

    /**
     * Extracts the bid-notice links from a search-results page and converts
     * them to absolute URLs.
     *
     * @param {string} html - HTML of a search-results page.
     * @returns {string[]} Absolute notice URLs with "&amp;" decoded to "&",
     *     in document order. The list is also logged to the console.
     */
    function extractAndModifyUrls(html) {
      const baseUrl = 'https://notices.philgeps.gov.ph/GEPSNONPILOT/Tender/';

      // Attribute values in raw HTML escape "&" as "&amp;", so the separator in
      // front of refID is accepted in both its escaped and unescaped form.
      const linkPattern = /SplashBidNoticeAbstractUI\.aspx\?menuIndex=3&(?:amp;)?refID=\d+&[^"]+/g;

      const modifiedUrls = Array.from(
        html.matchAll(linkPattern),
        // Decode the HTML entity so the result is a usable URL.
        (match) => `${baseUrl}${match[0].replace(/&amp;/g, '&')}`,
      );

      console.log(modifiedUrls);

      return modifiedUrls;
    }

I have tried adding the page number to the URL, but it does not go to the next page at all. Any guidance on resolving this would be much appreciated.


Solution

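  • The "next page" link does not change the URL; it submits the page's form back to the server (note the `__EVENTTARGET` field below). You therefore need to make a POST request with the page's form data and the session cookies. Something like this should work: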

    /**
     * Generator that yields the HTML of each result page, one page at a time.
     *
     * The first page is fetched with a plain GET. While the current page still
     * contains the "next" pager control, the id/value pairs found on it are
     * posted back to the same URL, with `__EVENTTARGET` set to that control and
     * the cookies from the first response attached.
     *
     * NOTE(review): the loop ends only when a page no longer contains the
     * string 'pgCtrlDetailedSearch$nextLB'. This presumes the last page omits
     * the control; confirm against the live site.
     *
     * @param {string} url - URL of the search-results page.
     * @yields {string} HTML of the next page.
     */
    function* getNextPage(url) {
      const nextControl = 'pgCtrlDetailedSearch$nextLB';

      const response = UrlFetchApp.fetch(url);

      // getAllHeaders() returns an array only when a header occurs more than
      // once; a single Set-Cookie is a plain string and a missing one is
      // undefined, so normalise to an array before mapping.
      const setCookie = response.getAllHeaders()['Set-Cookie'];
      const cookie = [].concat(setCookie || [])
        .map((c) => c.split(';')[0]) // keep "name=value", drop cookie attributes
        .join(';');

      let html = response.getContentText();
      yield html;

      while (html.includes(nextControl)) {
        // Collect [id, value] pairs of the elements carrying both attributes.
        const inputs = Array.from(html.matchAll(/id="(\S+)" value="(\S+)"/g), (m) => m.slice(1));

        const data = {
          ...Object.fromEntries(inputs),
          // Listed last so a scraped field can never override the postback target.
          '__EVENTTARGET': nextControl,
          '__EVENTARGUMENT': '',
        };

        const options = {
          method: 'post',
          // Do not send an empty Cookie header when the server set no cookies.
          headers: cookie ? { cookie } : {},
          payload: data,
        };

        html = UrlFetchApp.fetch(url, options).getContentText();
        yield html;
      }
    }
    
    
    /**
     * Walks every result page of the category search and logs, per page, the
     * absolute URLs of the notice links found on it.
     */
    function main() {
      const pageUrl = 'https://notices.philgeps.gov.ph/GEPSNONPILOT/Tender/SplashOpportunitiesSearchUI.aspx?menuIndex=3&BusCatID=53&type=category&ClickFrom=OpenOpp';
      const baseUrl = 'https://notices.philgeps.gov.ph/GEPSNONPILOT/Tender';

      for (const page of getNextPage(pageUrl)) {
        const matches = page.matchAll(/id="dgSearchResult_\w+" href="([^"]+)"/g);
        const links = Array.from(
          matches,
          // href values come from raw HTML, where "&" is escaped as "&amp;".
          ([, href]) => `${baseUrl}/${href.replaceAll('&amp;', '&')}`,
        );
        console.log(links);
      }
    }
    

    Note: getNextPage is a generator function (it yields one page at a time).