I'm trying to load around 30,000 URLs in PHP. To complete this task as quickly as possible, I'm trying to use curl_multi_init(). However, it appears to be loading all 30,000 at once, whereas my understanding was that it would process 10 at a time unless otherwise specified by CURLMOPT_MAXCONNECTS.
I believe it's trying to load all 30,000 at once because the code runs for about 8 seconds (the timeout set below) and then returns empty content for most of the URLs, as if the requests failed.
The code runs as expected for a smaller number of domains, e.g. under 100.
How can I ensure it only processes 10 requests at a time?
// Create one multi handle and register an easy handle for every URL up front.
$mh = curl_multi_init();
$requests = [];
foreach ($urls as $key => $url) {
    $handle = curl_init($url);
    // Same per-request options as before, applied in the same order.
    curl_setopt_array($handle, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 8,
        CURLOPT_CONNECTTIMEOUT => 5,
        CURLOPT_SSL_VERIFYHOST => false,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_HEADER => false,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    ]);
    $requests[$key] = $handle;
    curl_multi_add_handle($mh, $handle);
}

// Drive all transfers until curl reports that none are still running.
$active = null;
while (true) {
    curl_multi_exec($mh, $active);
    if (!$active) {
        break;
    }
}

// Collect each body in the order the handles were created, then release the handle.
$responses = [];
foreach ($requests as $handle) {
    $responses[] = curl_multi_getcontent($handle);
    curl_multi_remove_handle($mh, $handle);
    curl_close($handle);
}
Give this a try. It splits $urls into arrays of 100 elements each and sends one multi request per group of 100.
// Fetch $urls in fixed-size batches so that only a bounded number of
// transfers are in flight at any time. Response bodies are appended to
// $responses in the same order as $urls.

// Maximum number of URLs requested concurrently per batch.
$chunkSize = 100;

// Options shared by every request; built once because they never change.
$options = [
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_TIMEOUT => 8,
    CURLOPT_CONNECTTIMEOUT => 5,
    CURLOPT_SSL_VERIFYHOST => false,
    CURLOPT_SSL_VERIFYPEER => false,
    CURLOPT_HEADER => false,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
];

// Initialised once, outside the batch loop, so results from earlier
// batches are not discarded when the next batch starts.
$responses = [];

$chunks = array_chunk($urls, $chunkSize);
foreach ($chunks as $chunk) {
    $mh = curl_multi_init();
    $requests = [];

    // Register only the URLs of the current batch (not the full $urls list).
    foreach ($chunk as $i => $url) {
        $requests[$i] = curl_init($url);
        curl_setopt_array($requests[$i], $options);
        curl_multi_add_handle($mh, $requests[$i]);
    }

    // Run the batch to completion.
    $active = null;
    do {
        $status = curl_multi_exec($mh, $active);
        if ($active) {
            // Wait for socket activity instead of spinning the CPU.
            curl_multi_select($mh, 1.0);
        }
    } while ($active && $status === CURLM_OK);

    // Collect the bodies and release every easy handle of this batch.
    foreach ($requests as $request) {
        $responses[] = curl_multi_getcontent($request);
        curl_multi_remove_handle($mh, $request);
        curl_close($request);
    }

    // Release the multi handle before starting the next batch.
    curl_multi_close($mh);
}