javascript · node.js · web-scraping · apify

Apify - How to Enqueue URL Variations Efficiently


I am creating a new actor in Apify with Cheerio to read an input file of URLs and return primarily two items: (1) the HTTP status code and (2) the HTML title. As part of our process, I would like to be able to try up to 4 variations of each input URL, such as:

  1. HTTP://WWW.SOMEURL.COM
  2. HTTPS://WWW.SOMEURL.COM
  3. HTTP://SOMEURL.COM
  4. HTTPS://SOMEURL.COM

If one of the 4 variations is successful, then the process should ignore the other variations and move to the next input URL.

I read the original input list into a RequestList, and then would like to create the variations in a RequestQueue. Is this the most efficient way to do it? Please see code below, and thank you!

const Apify = require('apify');
const {
    utils: { enqueueLinks },
} = Apify;
const urlParse = require('url');

Apify.main(async () => {
    const input = await Apify.getInput();
    const inputFile = input.inputFile;
    console.log('INPUT FILE: ' + inputFile);
    
    const requestList = await Apify.openRequestList('urls', [
        { requestsFromUrl: inputFile, userData: { isFromUrl: true } },
    ]);
    const requestQueue = await Apify.openRequestQueue();

    const proxyConfiguration = await Apify.createProxyConfiguration();

    const handlePageFunction = async ({ $, request, response }) => {
        let parsedHost = urlParse.parse(request.url).host;
        let simplifiedHost = parsedHost.replace('www.', '');
        
        const urlPrefixes = ['HTTP://WWW.', 'HTTPS://WWW.', 'HTTP://', 'HTTPS://'];
        let i;
        for (i = 0; i < urlPrefixes.length; i++) {
            let newUrl = urlPrefixes[i] + simplifiedHost;
            console.log('NEW URL: ' + newUrl);
            await requestQueue.addRequest({ url: newUrl });
        }
  
        console.log(`Processing ${request.url}`);
        const results = {
            inputUrl: request.url,
            httpCode: response.statusCode,
            title: $('title').first().text().trim(),
            responseUrl: response.url
        };
        await Apify.pushData(results);
    };

    const crawler = new Apify.CheerioCrawler({
        proxyConfiguration,
        maxRequestRetries: 0,
        handlePageTimeoutSecs: 60,
        requestTimeoutSecs: 60,
        requestList,
        requestQueue,
        handlePageFunction,   
        handleFailedRequestFunction: async ({ request }) => {
            await Apify.pushData({ inputUrl: request.url, httpCode: '000', title: '', responseUrl: ''});
        }
    });

    await crawler.run();
});

Solution

  • You should create your URL list beforehand. The handlePageFunction is only used for the actual scraping part, and it should only contain the Apify.pushData call:

        //...
    
        const initRequestList = await Apify.openRequestList('urls', [
            { requestsFromUrl: inputFile },
        ]);
        
        const parsedRequests = [];
        let req;
    
        // Drain the initial list and build the four URL variations for each host.
        while ((req = await initRequestList.fetchNextRequest())) {
          const parsedHost = urlParse.parse(req.url).host;
          const simplifiedHost = parsedHost.replace('www.', '');
            
          const urlPrefixes = ['HTTP://WWW.', 'HTTPS://WWW.', 'HTTP://', 'HTTPS://'];
    
          for (let i = 0; i < urlPrefixes.length; i++) {
            let newUrl = urlPrefixes[i] + simplifiedHost;
            console.log('NEW URL: ' + newUrl);
            parsedRequests.push({ 
              url: newUrl,
              userData: { isFromUrl: true }
            });
          }
        } 
    
        const requestList = await Apify.openRequestList('starturls', parsedRequests);
    
        //...
    
        const crawler = new Apify.CheerioCrawler({
            proxyConfiguration,
            maxRequestRetries: 0,
            handlePageTimeoutSecs: 60,
            requestTimeoutSecs: 60,
            handlePageFunction,   
            requestList,
            handleFailedRequestFunction: async ({ request }) => {
                await Apify.pushData({ inputUrl: request.url, httpCode: '000', title: '', responseUrl: ''});
            }
        });
    
        //...
    
    

    requestsFromUrl is a greedy function that tries to parse all URLs from the given resource, so you'll have to perform the variation processing as an additional step.
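
    The solution above enqueues all four variations for every host, so a host where several variations respond will produce several dataset items. If you also want the original requirement of ignoring the remaining variations once one has succeeded, a minimal sketch of one way to do it (not part of the answer above; the Set name succeededHosts is made up) is to remember which simplified hosts have already produced a result and return early for the rest:

        // Sketch only: skip the remaining variations of a host once one has succeeded.
        const succeededHosts = new Set();

        const handlePageFunction = async ({ $, request, response }) => {
            const host = urlParse.parse(request.url).host.replace('www.', '');

            // Another variation of this host already returned a page; ignore this one.
            if (succeededHosts.has(host)) return;
            succeededHosts.add(host);

            await Apify.pushData({
                inputUrl: request.url,
                httpCode: response.statusCode,
                title: $('title').first().text().trim(),
            });
        };

    Note that with the crawler's default concurrency the variations of one host may be processed in any order, so this keeps whichever variation succeeds first, not necessarily the HTTP://WWW. one.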