Tags: perl, web-scraping, webbrowser-control, lwp-useragent

Perl LWP::UserAgent simulate browser


I am trying to automate fetching a web page using LWP::UserAgent, but I get a 403 Forbidden error, whereas running wget https://dreaminislam.com/a/ or curl from the console returns the page normally. How do I set the correct options for LWP::UserAgent so that it can fetch this page (and similar ones), i.e. simulate a real browser? Here is sample code.

    use strict;
    use warnings;
    use HTTP::CookieJar::LWP ();
    use LWP::UserAgent;

    my $url = qq{https://dreaminislam.com/a/};
    my $content = getUrl($url);
    exit;

    sub getUrl {
        my $url = shift;

        # Carry cookies across requests and identify as a desktop Firefox.
        my $jar = HTTP::CookieJar::LWP->new;
        my $ua  = LWP::UserAgent->new(
            timeout           => 180,
            cookie_jar        => $jar,
            protocols_allowed => ['http', 'https'],
        );
        $ua->agent(qq{Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0});

        my $response = $ua->get($url);

        if ($response->is_success) {
            return $response->decoded_content;
        }

        printf "Get url error [%d] %s.\n", $response->code, $response->message;
        return;
    }

Solution

  • It looks like there is some anti-bot protection installed on this site. It seems to require at least a User-Agent and an Accept header. By default, LWP::UserAgent identifies itself as libwww-perl/#.## and sends no Accept header, which filters like this commonly block, while curl and wget both send Accept: */* out of the box, which is likely why they get through:

    use strict;
    use warnings;
    use LWP::UserAgent;
    use HTTP::Request;
    
    my $ua = LWP::UserAgent->new();
    
    # Build the request by hand so we control exactly which headers go out.
    my $req = HTTP::Request->new(GET => 'https://dreaminislam.com/a/');
    $req->header('User-Agent' => 'Mozilla/5.0');
    $req->header('Accept'     => '*/*');
    my $response = $ua->request($req);
    
    die $response->code if !$response->is_success;
    print $response->decoded_content;
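
If you would rather keep the cookie jar and the getUrl wrapper from the question, the same fix can be applied once on the UserAgent object via its default_header method, so every request made through that agent carries the browser-like headers. A minimal sketch, reusing the URL and sub name from the question:

    use strict;
    use warnings;
    use HTTP::CookieJar::LWP ();
    use LWP::UserAgent;

    sub getUrl {
        my $url = shift;

        my $ua = LWP::UserAgent->new(
            timeout           => 180,
            cookie_jar        => HTTP::CookieJar::LWP->new,
            protocols_allowed => ['http', 'https'],
            # Browser-like agent string, as in the question.
            agent             => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
        );
        # Sent with every request made through this agent.
        $ua->default_header('Accept' => '*/*');

        my $response = $ua->get($url);
        return $response->decoded_content if $response->is_success;

        printf "Get url error [%d] %s.\n", $response->code, $response->message;
        return;
    }

    print getUrl('https://dreaminislam.com/a/') // '';

To see exactly which headers LWP sends (useful when guessing what a filter objects to), you can dump each outgoing request with a handler, as shown in the LWP::UserAgent documentation:

    $ua->add_handler("request_send", sub { shift->dump; return });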