Tags: php, curl, libcurl, curl-multi

Loading 30,000 URLs with PHP curl_multi_init returning empty string


I'm trying to load around 30,000 URLs in PHP. To complete this task as quickly as possible, I'm trying to use curl_multi_init(). However, it appears to be loading all 30,000 at once, whereas my understanding was that it would process 10 at a time unless otherwise specified by CURLMOPT_MAXCONNECTS.

I believe it's trying to load all 30,000 at once because the code runs for about 8 seconds (the timeout set below) and then returns empty content for most of the URLs, as if the requests failed.

The code runs as expected for a smaller number of domains, e.g. under 100.

How can I ensure it only processes 10 requests at a time?

    // Transfer settings applied identically to every easy handle.
    $curlOptions = [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => 8,
        CURLOPT_CONNECTTIMEOUT => 5,
        CURLOPT_SSL_VERIFYHOST => false,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_HEADER         => false,
        CURLOPT_FOLLOWLOCATION => TRUE,
        CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    ];

    $mh = curl_multi_init();
    $requests = [];

    // One easy handle per URL; every handle is attached to the same multi
    // handle before any transfer is driven.
    foreach ($urls as $key => $address) {
        $handle = curl_init($address);
        curl_setopt_array($handle, $curlOptions);
        $requests[$key] = $handle;
        curl_multi_add_handle($mh, $handle);
    }

    // Keep calling curl_multi_exec() until it reports no running transfers.
    $active = null;
    for (;;) {
        curl_multi_exec($mh, $active);
        if (!$active) {
            break;
        }
    }

    // Gather each body in the order the handles were created, then detach
    // and release the handle.
    $responses = [];
    foreach ($requests as $handle) {
        $responses[] = curl_multi_getcontent($handle);
        curl_multi_remove_handle($mh, $handle);
        curl_close($handle);
    }

Solution

  • Give this a try. It splits $urls into 100 element arrays, and sends a multi request for each group of 100.

    // Number of URLs transferred concurrently in one batch.
    $chunkSize = 100;

    // Preserve the original keys so every response can be matched to its URL.
    $chunks = array_chunk($urls, $chunkSize, true);

    // Initialised once, outside the batch loop, so the results of earlier
    // batches are not discarded when the next batch starts.
    $responses = [];

    foreach ($chunks as $chunk) {
        $mh = curl_multi_init();
        $requests = [];

        // Attach only the URLs of the current batch (not the full $urls list)
        // to this batch's multi handle.
        foreach ($chunk as $i => $url) {
            $requests[$i] = curl_init($url);
            curl_setopt($requests[$i], CURLOPT_RETURNTRANSFER, true);
            curl_setopt($requests[$i], CURLOPT_TIMEOUT, 8);
            curl_setopt($requests[$i], CURLOPT_CONNECTTIMEOUT, 5);
            // NOTE: TLS host/peer verification is switched off, so the
            // identity of the remote servers is not checked.
            curl_setopt($requests[$i], CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($requests[$i], CURLOPT_SSL_VERIFYPEER, false);
            curl_setopt($requests[$i], CURLOPT_HEADER, false);
            curl_setopt($requests[$i], CURLOPT_FOLLOWLOCATION, TRUE);
            curl_setopt($requests[$i], CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36');
            curl_multi_add_handle($mh, $requests[$i]);
        }

        // Drive the batch until no transfer is running. curl_multi_select()
        // blocks until there is socket activity (or its timeout elapses), so
        // the loop does not spin at 100% CPU while waiting on the network.
        $active = null;
        do {
            $status = curl_multi_exec($mh, $active);
            if ($active) {
                curl_multi_select($mh);
            }
        } while ($active && $status === CURLM_OK);

        // Collect each body under the key its URL had in $urls, then detach
        // and free the easy handle.
        foreach ($requests as $i => $request) {
            $responses[$i] = curl_multi_getcontent($request);
            curl_multi_remove_handle($mh, $request);
            curl_close($request);
        }

        // Release the per-batch multi handle before starting the next batch.
        curl_multi_close($mh);
    }