python, beautifulsoup

Extracting JavaScript Variables into Python Dictionaries


I understand that I have to use PyQt5 together with BeautifulSoup so that the page's JavaScript runs on my client before I parse the resulting HTML. With that in place, I am trying to convert the JavaScript variable _Flourish_data into a Python dictionary.

Is there an easy way to extract the JavaScript variable _Flourish_data into a Python dictionary? Here is my current Python code, which uses PyQt5 and BeautifulSoup to extract the script:

import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl

class Page(QWebEnginePage):
    """Load *url* in a headless web-engine page and capture the rendered HTML.

    Constructing an instance blocks: it starts the Qt event loop and returns
    only after the page's HTML has been delivered to ``Callable``.  The
    resulting markup is then available as ``self.html``.

    Args:
        url: Address of the page to load, as a string.
    """

    def __init__(self, url):
        # A QApplication must exist before any web-engine object is created.
        # Reuse the running one, if any, so that more than one Page can be
        # built in the same process.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        super().__init__()
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        # Blocks here until Callable() calls quit().
        self.app.exec_()

    def _on_load_finished(self):
        """Request the page's HTML once loading has finished."""
        # toHtml() is asynchronous: it hands the markup to the callback and
        # returns nothing, so its result must not be stored in self.html.
        self.toHtml(self.Callable)

    def Callable(self, html_str):
        """Store the HTML delivered by ``toHtml`` and stop the event loop."""
        self.html = html_str
        self.app.quit()

# Constructing Page blocks until the page has loaded and its HTML is in page.html.
page = Page('https://flo.uri.sh/visualisation/2451841/embed?auto=1')
# Parse the captured markup with the standard-library HTML parser.
soup = bs.BeautifulSoup(page.html, 'html.parser')
# Collect every <script> element in the document.
js_test = soup.find_all('script')
# Sixth <script> tag; a bare expression like this only displays a value in an
# interactive session (REPL/notebook) - presumably how this snippet was run.
js_test[5]

The output of the existing code is

<script>
        function _Flourish_unflattenInto(dest, src) {
            dest = dest || {};
            for (var k in src) {
                var t = dest;
                for (var i = k.indexOf("."), p = 0; i >= 0; i = k.indexOf(".", p = i+1)) {
                    var s = k.substring(p, i);
                    if (!(s in t)) t[s] = {};
                    t = t[s];
                }
                t[k.substring(p)] = src[k];
            }
            return dest;
        }
        var _Flourish_settings = {"cell_fill_1":"#ffffff","cell_fill_2":"#ebebeb","cell_fill_direction":"horizontal","cell_font_size":"1","cell_height":20,"cell_horizontal_alignment":"center","cell_link_color":"#2886b2","cell_padding_horizontal":16,"cell_padding_vertical":11,"column_width_mode":"auto","column_widths":"10%, 10%, 10%, 10%, 50%, 10%","header_fill":"#181f6c","header_font_color":"#ffffff","header_font_default":false,"header_font_size":1.1,"header_horizontal_alignment":"center","header_style_default":true,"layout.body_font":{"name":"Source Sans Pro","url":"https://fonts.googleapis.com/css?family=Source+Sans+Pro:400,700"},"layout.layout_order":"stack-default","layout.space_between_sections":"0.5","mobile.view":true,"no_results_text":"Use the search bar to find your state","pagination_amount":41,"pagination_amount_search":"5","search_enabled":false,"search_hide_table":false,"search_placeholder":"Search to find your state","search_resize":true,"search_width":15};
        _Flourish_unflattenInto(window.template.state, _Flourish_settings);

        var _Flourish_data_column_names = {"rows":{"columns":["State ","Earliest/Planned Start Date for 20/21 Academic Year ","","","",""]}},
                _Flourish_data = {"rows":[{"columns":["Alabama","Varies by district","","","",""]},{"columns":["Alaska","Varies by district","","","",""]},{"columns":["American Samoa","Unknown","","","",""]},{"columns":["Arizona","Varies by district","","","",""]},{"columns":["Arkansas","Varies by district","","","",""]},{"columns":["Bureau of Indian Education","Varies by district","","","",""]},{"columns":["California","Varies by district","","","",""]},{"columns":["Colorado","Varies by district","","","",""]},{"columns":["Connecticut","Not yet determined","","","",""]},{"columns":["Delaware","Varies by district","","","",""]},{"columns":["Department of Defense Education Activity\n  ","Varies by district","","","",""]},{"columns":["District of Columbia","8/31/2020","","","",""]},{"columns":["Florida","Unknown","","","",""]},{"columns":["Georgia","Unknown","","","",""]},{"columns":["Guam","Unknown","","","",""]},{"columns":["Hawaii","Not yet determined","","","",""]},{"columns":["Idaho","Varies by District","","","",""]},{"columns":["Illinois","Varies by district","","","",""]},{"columns":["Indiana","Not yet determined","","","",""]},{"columns":["Iowa","Varies by district","","","",""]},{"columns":["Kansas","Not yet determined","","","",""]},{"columns":["Kentucky","Unknown","","","",""]},{"columns":["Louisiana","Varies by district","","","",""]},{"columns":["Maine","Varies by district","","","",""]},{"columns":["Maryland","Not yet determined","","","",""]},{"columns":["Massachusetts","Not yet determined","","","",""]},{"columns":["Michigan","Not yet determined","","","",""]},{"columns":["Minnesota","Not yet determined","","","",""]},{"columns":["Mississippi ","Varies by district","","","",""]},{"columns":["Missouri","Varies by district","","","",""]},{"columns":["Montana","Varies by district","","","",""]},{"columns":["Nebraska","Varies by district","","","",""]},{"columns":["Nevada","Varies by district","","","",""]},{"columns":["New Hampshire","Not yet 
determined","","","",""]},{"columns":["New Jersey","Varies by district","","","",""]},{"columns":["New Mexico","Unknown","","","",""]},{"columns":["New York","Not yet determined","","","",""]},{"columns":["North Carolina","8/17/2020","","","",""]},{"columns":["North Dakota","Varies by district","","","",""]},{"columns":["Northern Marianas","Unknown","","","",""]},{"columns":["Ohio","Not yet determined","","","",""]},{"columns":["Oklahoma","Varies by district","","","",""]},{"columns":["Oregon","Not yet determined","","","",""]},{"columns":["Pennsylvania","Varies by district","","","",""]},{"columns":["Puerto Rico","Unknown","","","",""]},{"columns":["Rhode Island","Not yet determined","","","",""]},{"columns":["South Carolina","Not yet determined","","","",""]},{"columns":["South Dakota","Varies by district","","","",""]},{"columns":["Tennessee","Varies by district","","","",""]},{"columns":["Texas","Varies by district","","","",""]},{"columns":["U.S. Virgin Islands\n  ","Not yet determined","","","",""]},{"columns":["Utah","Varies by district","","","",""]},{"columns":["Vermont","Not yet determined","","","",""]},{"columns":["Virginia","Not yet determined","","","",""]},{"columns":["Washington","Varies by District","","","",""]},{"columns":["West Virginia","Not yet determined","","","",""]},{"columns":["Wisconsin","Varies by district","","","",""]},{"columns":["Wyoming","Not yet determined","","","",""]}]};
        for (var _Flourish_dataset in _Flourish_data) {
                window.template.data[_Flourish_dataset] = _Flourish_data[_Flourish_dataset];
                window.template.data[_Flourish_dataset].column_names = _Flourish_data_column_names[_Flourish_dataset];
        }
        window.template.draw();
    </script>

I just want the variable _Flourish_data from that <script> tag, as shown below:

        _Flourish_data = {"rows":[{"columns":["Alabama","Varies by district","","","",""]},{"columns":["Alaska","Varies by district","","","",""]},{"columns":["American Samoa","Unknown","","","",""]},{"columns":["Arizona","Varies by district","","","",""]},{"columns":["Arkansas","Varies by district","","","",""]},{"columns":["Bureau of Indian Education","Varies by district","","","",""]},{"columns":["California","Varies by district","","","",""]},{"columns":["Colorado","Varies by district","","","",""]},{"columns":["Connecticut","Not yet determined","","","",""]},{"columns":["Delaware","Varies by district","","","",""]},{"columns":["Department of Defense Education Activity\n  ","Varies by district","","","",""]},{"columns":["District of Columbia","8/31/2020","","","",""]},{"columns":["Florida","Unknown","","","",""]},{"columns":["Georgia","Unknown","","","",""]},{"columns":["Guam","Unknown","","","",""]},{"columns":["Hawaii","Not yet determined","","","",""]},{"columns":["Idaho","Varies by District","","","",""]},{"columns":["Illinois","Varies by district","","","",""]},{"columns":["Indiana","Not yet determined","","","",""]},{"columns":["Iowa","Varies by district","","","",""]},{"columns":["Kansas","Not yet determined","","","",""]},{"columns":["Kentucky","Unknown","","","",""]},{"columns":["Louisiana","Varies by district","","","",""]},{"columns":["Maine","Varies by district","","","",""]},{"columns":["Maryland","Not yet determined","","","",""]},{"columns":["Massachusetts","Not yet determined","","","",""]},{"columns":["Michigan","Not yet determined","","","",""]},{"columns":["Minnesota","Not yet determined","","","",""]},{"columns":["Mississippi ","Varies by district","","","",""]},{"columns":["Missouri","Varies by district","","","",""]},{"columns":["Montana","Varies by district","","","",""]},{"columns":["Nebraska","Varies by district","","","",""]},{"columns":["Nevada","Varies by district","","","",""]},{"columns":["New Hampshire","Not yet 
determined","","","",""]},{"columns":["New Jersey","Varies by district","","","",""]},{"columns":["New Mexico","Unknown","","","",""]},{"columns":["New York","Not yet determined","","","",""]},{"columns":["North Carolina","8/17/2020","","","",""]},{"columns":["North Dakota","Varies by district","","","",""]},{"columns":["Northern Marianas","Unknown","","","",""]},{"columns":["Ohio","Not yet determined","","","",""]},{"columns":["Oklahoma","Varies by district","","","",""]},{"columns":["Oregon","Not yet determined","","","",""]},{"columns":["Pennsylvania","Varies by district","","","",""]},{"columns":["Puerto Rico","Unknown","","","",""]},{"columns":["Rhode Island","Not yet determined","","","",""]},{"columns":["South Carolina","Not yet determined","","","",""]},{"columns":["South Dakota","Varies by district","","","",""]},{"columns":["Tennessee","Varies by district","","","",""]},{"columns":["Texas","Varies by district","","","",""]},{"columns":["U.S. Virgin Islands\n  ","Not yet determined","","","",""]},{"columns":["Utah","Varies by district","","","",""]},{"columns":["Vermont","Not yet determined","","","",""]},{"columns":["Virginia","Not yet determined","","","",""]},{"columns":["Washington","Varies by District","","","",""]},{"columns":["West Virginia","Not yet determined","","","",""]},{"columns":["Wisconsin","Varies by district","","","",""]},{"columns":["Wyoming","Not yet determined","","","",""]}]};

Any help would be greatly appreciated!


Solution

  • You don't need to execute the JavaScript. It can be done with the json and re modules.

    For example:

    import re
    import json
    import requests
    
    url = 'https://flo.uri.sh/visualisation/2451841/embed?auto=1'

    # Fail fast on network problems instead of hanging or parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html_data = response.text

    # Find where the variable is assigned; the value itself is parsed by the
    # JSON decoder below, not by the regex, so "});" inside a string value or
    # line breaks inside the object cannot truncate the match.
    match = re.search(r'_Flourish_data\s*=\s*', html_data)
    if match is None:
        raise ValueError('_Flourish_data assignment not found in page')

    # raw_decode() parses exactly one JSON value starting at the given index
    # and ignores whatever follows it (the trailing ";" and the rest of the page).
    data, _ = json.JSONDecoder().raw_decode(html_data, match.end())

    # uncomment this to print all data:
    # print(json.dumps(data, indent=4))

    # Print the first two columns of each row, left-aligning the first to 55 chars.
    for row in data['rows']:
        print('{:<55}{}'.format(*map(str.strip, row['columns'][:2])))
    

    Prints:

    Alabama                                                Varies by district
    Alaska                                                 Varies by district
    American Samoa                                         Unknown
    Arizona                                                Varies by district
    Arkansas                                               Varies by district
    Bureau of Indian Education                             Varies by district
    California                                             Varies by district
    Colorado                                               Varies by district
    Connecticut                                            Not yet determined
    Delaware                                               Varies by district
    Department of Defense Education Activity               Varies by district
    District of Columbia                                   8/31/2020
    Florida                                                Unknown
    Georgia                                                Unknown
    Guam                                                   Unknown
    Hawaii                                                 Not yet determined
    Idaho                                                  Varies by District
    Illinois                                               Varies by district
    Indiana                                                Not yet determined
    Iowa                                                   Varies by district
    Kansas                                                 Not yet determined
    Kentucky                                               Unknown
    Louisiana                                              Varies by district
    Maine                                                  Varies by district
    Maryland                                               Not yet determined
    Massachusetts                                          Not yet determined
    Michigan                                               Not yet determined
    Minnesota                                              Not yet determined
    Mississippi                                            Varies by district
    Missouri                                               Varies by district
    Montana                                                Varies by district
    Nebraska                                               Varies by district
    Nevada                                                 Varies by district
    New Hampshire                                          Not yet determined
    New Jersey                                             Varies by district
    New Mexico                                             Unknown
    New York                                               Not yet determined
    North Carolina                                         8/17/2020
    North Dakota                                           Varies by district
    Northern Marianas                                      Unknown
    Ohio                                                   Not yet determined
    Oklahoma                                               Varies by district
    Oregon                                                 Not yet determined
    Pennsylvania                                           Varies by district
    Puerto Rico                                            Unknown
    Rhode Island                                           Not yet determined
    South Carolina                                         Not yet determined
    South Dakota                                           Varies by district
    Tennessee                                              Varies by district
    Texas                                                  Varies by district
    U.S. Virgin Islands                                    Not yet determined
    Utah                                                   Varies by district
    Vermont                                                Not yet determined
    Virginia                                               Not yet determined
    Washington                                             Varies by District
    West Virginia                                          Not yet determined
    Wisconsin                                              Varies by district
    Wyoming                                                Not yet determined