web-scraping web-inspector

Scraping Barchart.com financial data


I am trying to scrape the financial data in the tables on https://www.barchart.com/stocks/quotes/IBM/income-statement/

Using Inspect Element I didn't see any XHR/fetch requests. It looks like the data is generated by a JS file named global-MBHFEFVQ.js, but it's hard to follow the obfuscated code.

Some other data on the barchart.com website can apparently be scraped via an API call, as explained in this post: "How can I webscrape these ticker symbols from barchart.com?" But I'm not sure whether the same applies to the income statement data. Any help is appreciated, as I am new to web scraping.

I will be using PHP to scrape the data, but other languages would work too.

Right now I am just fetching the whole page and extracting the data I'm interested in as substrings, but this is not ideal: I have to sort through all the other overhead in the page and have to loop through each 'reportPage' in the URL to get the data for each year.

    // Download one page of the quarterly income statement and cut out the raw
    // HTML of the row that holds the report dates.
    $url = "https://www.barchart.com/stocks/quotes/IBM/income-statement/quarterly?reportPage=2";
    // file_get_contents() returns false (not a string) when the request fails.
    $html = file_get_contents($url);
    if ($html === false) {
        throw new RuntimeException("Could not download {$url}");
    }
    // The dates row is located by its CSS class name (case-insensitive search).
    $date_start = stripos($html, "report__row-dates");
    if ($date_start === false) {
        throw new RuntimeException("Marker 'report__row-dates' not found in the page");
    }
    // The row ends at the first closing </tr> after the marker; the third
    // argument is the search offset.
    $date_end = stripos($html, "</tr>", $date_start);
    if ($date_end === false) {
        throw new RuntimeException("No closing </tr> found after the dates row");
    }
    $dates = substr($html, $date_start, $date_end - $date_start);

Solution

  • it looks like the data is generated via a js file named global-MBHFEFVQ.js

    No idea where you got that from; it's all embedded in the HTML:

    <?php

    declare(strict_types=1);

    // Scrape the annual income-statement table from the Barchart page and dump
    // it as a list of rows, each row keyed by the text of the table's first
    // row (the header cells).

    $url = 'https://www.barchart.com/stocks/quotes/IBM/income-statement/annual';
    $html = file_get_contents($url);
    // file_get_contents() returns false on failure; under strict_types that
    // would otherwise surface later as a TypeError inside loadHTML().
    if ($html === false || $html === '') {
        throw new RuntimeException("Could not download {$url}");
    }

    $dom = new DOMDocument();
    // Real-world HTML makes libxml emit many warnings. Collect them internally
    // rather than hiding every possible error with the @ operator, then
    // restore the previous libxml setting.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $loaded = $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);
    if ($loaded === false) {
        throw new RuntimeException("Could not parse the HTML returned by {$url}");
    }

    $xp = new DOMXPath($dom);
    // Find the <tbody> that directly contains a financial-report row.
    $tbody = $xp->query('//tr[contains(@class,"bc-financial-report")]/parent::tbody')->item(0);
    if ($tbody === null) {
        throw new RuntimeException('Financial report table not found; the page layout may have changed');
    }
    // $tbody was selected as the parent of a <tr>, so this list has at least one row.
    $trs = $xp->query('./tr', $tbody);

    // The first row supplies the keys used for every following row.
    $data_keys = [];
    foreach ($trs->item(0)->getElementsByTagName('td') as $td) {
        $data_keys[] = trim($td->textContent);
    }

    $data = [];
    for ($i = 1; $i < $trs->length; ++$i) {
        $tr = $trs->item($i);
        $tds = $xp->query('./td', $tr);
        $row = [];
        foreach ($tds as $td) {
            $row[] = trim($td->textContent);
        }
        // array_combine() requires equally sized arrays; fail with a message
        // that names the offending row instead of an opaque error.
        if (count($row) !== count($data_keys)) {
            throw new RuntimeException(sprintf(
                'Row %d has %d cells but the header row has %d',
                $i,
                count($row),
                count($data_keys)
            ));
        }
        $data[] = array_combine($data_keys, $row);
    }
    var_export($data);
    

    gives

    array (
      0 => 
      array (
        ' ' => 'Sales',
        '12-2022' => '60,530,000',
        '12-2021' => '57,350,000',
        '12-2020' => '55,179,000',
        '12-2019' => '57,714,000',
        '12-2018' => '79,591,000',
      ),
      1 => 
      array (
        ' ' => 'Cost of Goods',
        '12-2022' => '27,842,000',
        '12-2021' => '25,865,000',
        '12-2020' => '24,314,000',
        '12-2019' => '26,180,000',
        '12-2018' => '42,654,000',
      ),
      2 => 
      array (
        ' ' => 'Gross Profit',
        '12-2022' => '32,687,000',
        '12-2021' => '31,486,000',
        '12-2020' => '30,865,000',
        '12-2019' => '31,533,000',
        '12-2018' => '36,936,000',
      ),
      3 => 
      array (
        ' ' => 'Operating Expenses',
        '12-2022' => '25,176,000',
        '12-2021' => '25,233,000',
        '12-2020' => '26,823,000',
        '12-2019' => '24,634,000',
        '12-2018' => '24,745,000',
      ),
      4 => 
      array (
        ' ' => 'Operating Income',
        '12-2022' => '7,512,000',
        '12-2021' => '6,252,000',
        '12-2020' => '4,042,000',
        '12-2019' => '6,900,000',
        '12-2018' => '12,192,000',
      ),
      5 => 
      array (
        ' ' => 'Interest Expense',
        '12-2022' => '1,216,000',
        '12-2021' => '1,155,000',
        '12-2020' => '1,288,000',
        '12-2019' => '1,344,000',
        '12-2018' => '723,000',
      ),
      6 => 
      array (
        ' ' => 'Other Income',
        '12-2022' => '-5,140,000',
        '12-2021' => '-260,000',
        '12-2020' => '-182,000',
        '12-2019' => '1,650,000',
        '12-2018' => '-127,000',
      ),
      7 => 
      array (
        ' ' => 'Pre-tax Income',
        '12-2022' => '1,156,000',
        '12-2021' => '4,837,000',
        '12-2020' => '2,572,000',
        '12-2019' => '7,206,000',
        '12-2018' => '11,342,000',
      ),
      8 => 
      array (
        ' ' => 'Income Tax',
        '12-2022' => '-626,000',
        '12-2021' => '124,000',
        '12-2020' => '-1,360,000',
        '12-2019' => '60,000',
        '12-2018' => '2,619,000',
      ),
      9 => 
      array (
        ' ' => 'Net Income Continuous',
        '12-2022' => '1,783,000',
        '12-2021' => '4,712,000',
        '12-2020' => '3,932,000',
        '12-2019' => '7,146,000',
        '12-2018' => '8,723,000',
      ),
      10 => 
      array (
        ' ' => 'Net Income Discontinuous',
        '12-2022' => '-143,000',
        '12-2021' => '1,030,000',
        '12-2020' => '1,658,000',
        '12-2019' => '2,285,000',
        '12-2018' => '5,000',
      ),
      11 => 
      array (
        ' ' => 'Net Income',
        '12-2022' => '$1,640,000',
        '12-2021' => '$5,742,000',
        '12-2020' => '$5,590,000',
        '12-2019' => '$9,431,000',
        '12-2018' => '$8,728,000',
      ),
      12 => 
      array (
        ' ' => 'EPS Basic Total Ops',
        '12-2022' => '1.82',
        '12-2021' => '6.41',
        '12-2020' => '6.28',
        '12-2019' => '10.63',
        '12-2018' => '9.57',
      ),
      13 => 
      array (
        ' ' => 'EPS Basic Continuous Ops',
        '12-2022' => '1.97',
        '12-2021' => '5.26',
        '12-2020' => '4.42',
        '12-2019' => '8.05',
        '12-2018' => '9.56',
      ),
      14 => 
      array (
        ' ' => 'EPS Basic Discontinuous Ops',
        '12-2022' => '-0.16',
        '12-2021' => '1.15',
        '12-2020' => '1.86',
        '12-2019' => '2.58',
        '12-2018' => '0.01',
      ),
      15 => 
      array (
        ' ' => 'EPS Diluted Total Ops',
        '12-2022' => '1.80',
        '12-2021' => '6.35',
        '12-2020' => '6.23',
        '12-2019' => '10.56',
        '12-2018' => '9.52',
      ),
      16 => 
      array (
        ' ' => 'EPS Diluted Continuous Ops',
        '12-2022' => '1.95',
        '12-2021' => '5.21',
        '12-2020' => '4.38',
        '12-2019' => '8.00',
        '12-2018' => '9.51',
      ),
      17 => 
      array (
        ' ' => 'EPS Diluted Discontinuous Ops',
        '12-2022' => '-0.16',
        '12-2021' => '1.14',
        '12-2020' => '1.85',
        '12-2019' => '2.56',
        '12-2018' => '0.01',
      ),
      18 => 
      array (
        ' ' => 'EPS Diluted Before Non-Recurring Items',
        '12-2022' => '9.13',
        '12-2021' => '7.93',
        '12-2020' => '8.67',
        '12-2019' => '12.81',
        '12-2018' => '13.81',
      ),
      19 => 
      array (
        ' ' => 'EBITDA(a)',
        '12-2022' => '$12,314,000',
        '12-2021' => '$12,669,000',
        '12-2020' => '$10,737,000',
        '12-2019' => '$12,959,000',
        '12-2018' => '$16,672,000',
      ),
    )