pythonjsondata-structures

Generating Recursive Data Structure From Table


I'm trying to convert this Wikipedia table into a nested JSON object.

I want to group each entry into its parent category. e.g.,

{
   "name": "Mining, quarrying, and oil and gas extraction", 
   "value": 593300, 
   "children": [
       {
           "name": "Oil and gas extraction",
           "value": 118500,
           "children": []
       },
       {
           "name": "Mining, except oil and gas",
           "value": 189700,
           "children": [
               {
                   "name": "Coal mining",
                   "value": 44100,
                   "children": []
               },
               {
                   "name": "Metal ore mining",
                   "value": 44100,
                   "children": []
               },
               {
                   "name": "Nonmetallic mineral mining and quarrying",
                   "value": 102700,
                   "children": []
               },
           ]
       },
       {
           "name": "Support activities for mining",
           "value": 290100,
           "children": []
       }
   ]
}

Using bs4 to parse the page, I can iterate through the table rows as a list and use the "style: padding" attribute to determine the indentation level*, but I'm unable to convert the indentation into a meaningful parent/child representation.

I've tried iterating through the list with a recursive approach:

import bs4

f = open("./webpage.html", "r", encoding="utf8")
soup = bs4.BeautifulSoup(f, "html.parser")

table = soup.find(
    "table", attrs={"class": "collapsible wikitable mw-collapsible mw-made-collapsible"}
)
rows = table.find_all("tr")

stash = []
def recurse(start_idx):
    for idx, row in enumerate(rows[start_idx:]):
        cols = row.find_all("td")

        # skip headers
        if len(cols) == 0:
            continue
        if not cols[0].attrs['style']:
            continue

        indent = int(cols[0].attrs["style"].strip(';')[-3])
        entry = {
            "name": cols[0].text,
            "value": cols[1].text,
            "indent": indent,
            "children": [],
        }
        while stash:
            parent = stash[-1]
            for inner_idx, inner_row in enumerate(rows[start_idx + idx + 1:]):
                inner_cols = inner_row.find_all("td")
                inner_indent = int(inner_cols[0].attrs["style"].strip(';')[-3])
                inner_entry = {
                    "name": inner_cols[0].text,
                    "value": inner_cols[1].text,
                    "indent": inner_indent,
                    "children": [],
                }

                if indent == parent["indent"]:
                    child = stash.pop()
                    stash[-1]["children"].append(child)
                    stash.append(entry)

                if parent["indent"] - indent == -1: 
                    stash.append(entry) # add row to stash
                    recurse(start_idx + idx + 1) # calculate children of row
                    parent["children"].append(entry) # add row with children to row parent

                if indent < parent["indent"]:
                    return

        stash.append(entry)

recurse(0)

This approach correctly appends child nodes to its parent, but fails to append children of children properly.

I suspect there's a data structure approach that fits this problem neatly but I haven't found anything in my research.

Any guidance is much appreciated!

*I needed to manually add a style="padding-left: 0em" to the first non-header <td> of the .html for my index method to start from 0.


Solution

  • I would separate the code needed to parse the HTML from the code that uses the parsed data to build the hierarchy. So two functions:

    import bs4
    import re
    
    def parse_table(soup):
        table = soup.find(
            "table", attrs={"class": "collapsible wikitable mw-collapsible mw-made-collapsible"}
        )
        for row in table.find_all("tr"):
            cols = row.find_all("td")
            if len(cols) != 2:
                continue
            parts = re.match(r"padding-left: *(\d+)em", cols[0].attrs.get('style', ''))
            indent = int(parts[1]) if parts else 0
            yield indent, cols[0].text, int(re.sub(r",|\D.*", "", cols[1].text))
    
    def create_hieararchy(soup):
        stash = []
        path = [stash]
        for indent, sector, count in parse_table(soup):
            children = []
            del path[indent+1:]
            path[-1].append({
                "name": sector,
                "value": count,
                "indent": indent,
                "children": children,
            })
            path.append(children)
        return stash
    
    f = open("./webpage.html", "r", encoding="utf8")
    soup = bs4.BeautifulSoup(f, "html.parser")
    stash = create_hieararchy(soup)