I am trying to extract URL data (the clickable titles) from this table. The script gets the first page correctly, but I could not find a way to get the data from the second page. Here is the sample script:
/**
 * Fetches the first page of the opportunities search and hands its HTML to
 * extractAndModifyUrls.
 *
 * The page is requested twice: the first request is used only to collect the
 * cookies the server sets, the second replays them in a `cookie` header.
 */
function scrapeTitlesData() {
  const url = "https://notices.philgeps.gov.ph/GEPSNONPILOT/Tender/SplashOpportunitiesSearchUI.aspx?menuIndex=3&BusCatID=157&type=category&ClickFrom=OpenOpp";
  const cookieResponse = UrlFetchApp.fetch(url);
  // getAllHeaders() yields an array only when a header occurs more than once;
  // a single Set-Cookie arrives as a plain string and a missing one as
  // undefined, so normalize to an array before mapping.
  const setCookie = cookieResponse.getAllHeaders()['Set-Cookie'] || [];
  // Keep only the leading "name=value" pair of each cookie (drop attributes).
  const cookie = [].concat(setCookie).map((c) => c.split(';')[0]).join(';');
  const options = {
    method: 'get',
    headers: { cookie },
  };
  const response = UrlFetchApp.fetch(url, options).getContentText();
  extractAndModifyUrls(response);
}
/**
 * Extracts the bid-notice links from a results page and turns them into
 * absolute URLs.
 *
 * @param {string} html - Raw HTML of a search-results page.
 * @returns {string[]} Absolute notice URLs in document order (also logged).
 */
function extractAndModifyUrls(html) {
  // Ampersands inside href attributes are normally entity-encoded as "&amp;"
  // in HTML source, so accept both the encoded and the raw form.
  const regex = /SplashBidNoticeAbstractUI\.aspx\?menuIndex=3&(?:amp;)?refID=\d+&[^"]+/g;
  const modifiedUrls = Array.from(html.matchAll(regex), (match) => {
    const newUrl = 'https://notices.philgeps.gov.ph/GEPSNONPILOT/Tender/' + match[0];
    // Decode the entity so the result is a usable URL.
    return newUrl.replaceAll('&amp;', '&');
  });
  console.log(modifiedUrls);
  return modifiedUrls;
}
I have tried adding the page number to the URL, but it does not go to the next page at all. Any guidance to resolve this is much appreciated.
You need to make a POST request with form data and cookies. Something like this should work:
/**
 * Generator that yields the HTML of each results page, one page at a time.
 *
 * The first page is fetched with a plain GET. While the current page still
 * contains the "next" postback target, the next page is requested by POSTing
 * the page's form fields back together with the cookies from the first
 * response.
 *
 * @param {string} url - Address of the search-results page.
 * @yields {string} HTML text of one results page.
 */
function* getNextPage(url) {
  const response = UrlFetchApp.fetch(url);
  // getAllHeaders() yields an array only when a header occurs more than once;
  // a single Set-Cookie arrives as a plain string and a missing one as
  // undefined, so normalize to an array before mapping.
  const setCookie = response.getAllHeaders()['Set-Cookie'] || [];
  // Keep only the leading "name=value" pair of each cookie (drop attributes).
  const cookie = [].concat(setCookie).map((c) => c.split(';')[0]).join(';');
  let html = response.getContentText();
  yield html;
  while (html.includes('pgCtrlDetailedSearch$nextLB')) {
    // Collect every id/value attribute pair on the current page so it can be
    // echoed back to the server as form data.
    const inputs = Array.from(
      html.matchAll(/id="(\S+)" value="(\S+)"/g),
      (match) => match.slice(1),
    );
    const data = {
      '__EVENTTARGET': 'pgCtrlDetailedSearch$nextLB',
      '__EVENTARGUMENT': '',
      ...Object.fromEntries(inputs),
    };
    const options = {
      method: 'post',
      headers: { cookie },
      payload: data,
    };
    html = UrlFetchApp.fetch(url, options).getContentText();
    yield html;
  }
}
/**
 * Walks every results page produced by getNextPage and logs, per page, the
 * absolute URLs of the result links found on it.
 */
function main() {
  const pageUrl = 'https://notices.philgeps.gov.ph/GEPSNONPILOT/Tender/SplashOpportunitiesSearchUI.aspx?menuIndex=3&BusCatID=53&type=category&ClickFrom=OpenOpp';
  const baseUrl = 'https://notices.philgeps.gov.ph/GEPSNONPILOT/Tender';
  for (const page of getNextPage(pageUrl)) {
    const matches = page.matchAll(/id="dgSearchResult_\w+" href="([^"]+)"/g);
    // href values in HTML source carry entity-encoded ampersands ("&amp;");
    // decode them so the logged links are usable URLs.
    const links = Array.from(
      matches,
      ([, href]) => `${baseUrl}/${href.replaceAll('&amp;', '&')}`,
    );
    console.log(links);
  }
}
Note: getNextPage is a generator function (it yields one page at a time).