Long story short, I've made an app for web scraping and in order for it to be able to simultaneously run more then 1 process at a time (more than 1 Chromium opened), i used puppeteer-cluster. I've got it to run several processes at once, but the cluster won't stop afterwards, it permanently runs. Along the way, I've encountered the following error (1)
await cluster.close(); // Gives the following error -> cluster.close is not a function (1)
If i use it like this (2)
(await cluster).close(); // This returns no error (2)
Anywho, the main problem is that when the code gets to the line where cluster gets closed (at the end of the code, check code below), it freezes there. Won't return any error, as i tried catching it, but at the same time it won't crash either.
const puppeteer = require('puppeteer');
const { Cluster } = require('puppeteer-cluster/');
function delay(time) {
return new Promise(function(resolve) {
setTimeout(resolve, time)
});
}
(async () => {
const cluster = Cluster.launch({
concurrency: Cluster.CONCURRENCY_BROWSER,
maxConcurrency: 2,
});
const test = async () => {
const user = process.argv[2];
const pass = process.argv[3];
const smis = process.argv[4];
const nrinreg = process.argv[5];
const browser = await puppeteer.launch({headless: false, defaultViewport: null, args:['--start-fullscreen']}); // you can also use '--start-fullscreen']});
const page = await browser.newPage();
//await page.setViewport({width: 1200, height: 900});
await page._client.send('Page.setDownloadBehavior', {behavior: 'allow', downloadPath: './'+smis+''}).catch(e => {});
await page.goto('https://aplicatii2014.smis.fonduri-ue.ro/smis2014app/').catch(e => {});
await delay(2000);
// await page.waitFor(3000);
//scriere credentiale pt elementele de tip HTML user&parola
await page.type('#j_idt38 > .step-content > .step-pane > .col-md-12 > .form-group > input[name="j_idt38:utilizator"]', user).catch(e => {});
await delay(1000);
await page.type('#j_idt38 > .step-content > .step-pane > .col-md-12 > .form-group > input[name="j_idt38:pass"]', pass).catch(e => {});
await delay(1000);
page.click('#j_idt38 > .actions > a').catch(e => {});
await delay(2000);
page.click('#idPanelGroup > #headerPanel > div > #j_idt18 > tbody > tr > #topMenuCell > #j_idt22').catch(e => {});
await delay(2000);
await page.type('#dialogCereriFinantare > .ui-dialog-content > #formCereriFinantare > table > tbody > tr > td > input[id="formCereriFinantare:idSmisAll"]', smis).catch(e => {});
await delay(1000);
page.click('#dialogCereriFinantare > .ui-dialog-content > #formCereriFinantare > table > tbody > tr > td > a[id="formCereriFinantare:commandBtnSearch"]').catch(e => {});
await delay(1000);
page.click('span.ui-chkbox-icon.ui-icon.ui-icon-blank.ui-c').catch(e => {});
await delay(1000);
await page.type('#dialogAcordConfidentialitate > .ui-dialog-content > #formAcordConfidentialitate > div[id="formAcordConfidentialitate:j_idt167"] > .ui-scrollpanel-container > .ui-scrollpanel-content > .col-md-12 > .row > .col-md-3 > input[id="formAcordConfidentialitate:CNP"]', pass).catch(e => {});
await delay(1000);
page.click('#dialogAcordConfidentialitate > .ui-dialog-content > #formAcordConfidentialitate > div[id="formAcordConfidentialitate:j_idt167"] > .ui-scrollpanel-container > .ui-scrollpanel-content > .col-md-12 > .row > a[id="formAcordConfidentialitate:btnConfirmContent"]').catch(e => {});
await delay(1000);
page.click('#dialogCereriFinantare > .ui-dialog-content > #formCereriFinantare > div > div > a > .ui-icon-seek-end').catch(e => {});
await delay(2000);
const doc_details = await page.evaluate(() => {
//Extract each doc's basic details
let table = document.querySelector('#dialogCereriFinantare > .ui-dialog-content > #formCereriFinantare > div[id="formCereriFinantare:tableCereriFinantare"] > .ui-datatable-tablewrapper > table > tbody');
let doc_panels = Array.from(table.children);
// Loop through each doc and get their details
let doc_info = doc_panels.map(doc_panel => {
let codsmis = doc_panel.querySelector("tr > td:nth-child(1)").textContent;
let titlu = doc_panel.querySelector("tr > td:nth-child(2)").textContent;
let versiune = doc_panel.querySelector("tr > td:nth-child(3)").textContent;
let contractare = doc_panel.querySelector("tr > td:nth-child(4)").textContent;
return { versiune, contractare };
});
return doc_info;
});
doc_details.sort((a, b) => (parseInt(a.versiune) < parseInt(b.versiune) ? 1 : -1));
let res = new Array();
res = doc_details.filter(a => a.contractare.length > 0);
/**
* Get first elem from a array
* // [...res].shift()
*/
const [first] = res;
/**
* If no element exist
*/
if (first === null) {
page.click('#dialogCereriFinantare > .ui-dialog-content > #formCereriFinantare > div > div > a > .ui-icon-seek-prev').catch(e => { });
}
else {
let version = first["versiune"];
await delay(1000);
const example = await page.$('#dialogCereriFinantare > .ui-dialog-titlebar');
const bounding_box = await example.boundingBox();
await page.mouse.move(bounding_box.x + bounding_box.width / 2, bounding_box.y + bounding_box.height / 2);
await page.mouse.down();
await page.mouse.move(126, 19);
await page.mouse.up();
await delay(1000);
await page.waitForXPath("//tr/td[3][contains(., '"+ version +"')]");
const [projects] = await page.$x("//tr/td[3][contains(., '"+ version +"')]");
projects.click().catch(e => {});
await delay(2000);
await page.goto("https://aplicatii2014.smis.fonduri-ue.ro/smis2014app/faces/pages/comunicare.xhtml").catch(e => {});
await delay(2000);
await page.evaluate(() => {
document.querySelector('#j_idt68 > div > #idPanelContent > #j_idt140 > #j_idt140_content > #j_idt142 > div > .ui-datatable-tablewrapper > table > tbody').scrollIntoView();
}).catch(e => {});
await delay(2000).catch(e => {});
const [com] = await page.$x("//tr/td[1][contains(., '37114')]").catch(e => {});
if (com){
com.click().catch(e => {});
}
else
{
let [com2] = await page.$x("//tr/td[1][contains(., '"+ nrinreg +"')]");
do{
page.click('#j_idt68 > div > #idPanelContent > #j_idt140 > #j_idt140_content > #j_idt142 > div > div[id="j_idt142:idComunicareTable_paginator_bottom"] > .ui-paginator-next').catch(e => {});
await delay(2000).catch(e => {});
let [com2] = await page.$x("//tr/td[1][contains(., '"+ nrinreg +"')]").catch(e => {});
if (com2){
break;
}
}
while(!com2);
let [com3] = await page.$x("//tr/td[1][contains(., '"+ nrinreg +"')]").catch(e => {});
com3.click().catch(e => {});
await delay(2000).catch(e => {});
await page.evaluate(() => {
document.querySelector('#j_idt68 > div > #idPanelContent > #j_idt140 > div > #idDetaliicomunicare').scrollIntoView();
}).catch(e => {});
await delay(2000).catch(e => {});
}
await delay(2000);
const listadownload = await page.$$('#j_idt68 > div > #idPanelContent > div > div > #idDetaliicomunicare > div > div > div > ul > li > .ui-treenode-children > li > span');
for (let iteminlistadownload of listadownload){
await iteminlistadownload.click({button: 'right',}).catch(e => {});
await delay(2000);
let [viz] = await page.$x('//*[@id="idDetaliicomunicare:j_idt163"]/ul/li/a').catch(e => {});
viz.click().catch(e => {});
await delay(2000);
}
}
await delay(3000);
await browser.close();
};
(await cluster).queue(test);
(await cluster).idle();
(await cluster).close();
})();
I have been scraping the internet for a solution, looking for fixes on GitHub and nothing seems to work. What is it that I'm doing wrong that simply just won't terminate the process? PS : I added the whole code in hopes of it being relevant.
Cluster.launch
return a Promise. If you just call const cluster = Cluster.launch
, now cluster
is Promise, when you call (await cluster).close();
, (await cluster)
will return a Cluster
instance -> It work!
Let’s use cluster
as a Cluster
instance instead of a Promise object:
const cluster = await Cluster.launch({ // wait until it "launch" finish
concurrency: Cluster.CONCURRENCY_BROWSER,
maxConcurrency: 2,
});