I'm trying to convert this Wikipedia table into a nested JSON object.
I want to group each entry into its parent category. e.g.,
{
"name": "Mining, quarrying, and oil and gas extraction",
"value": 593300,
"children": [
{
"name": "Oil and gas extraction",
"value": 118500,
"children": []
},
{
"name": "Mining, except oil and gas",
"value": 189700,
"children": [
{
"name": "Coal mining",
"value": 44100,
"children": []
},
{
"name": "Metal ore mining",
"value": 44100,
"children": []
},
{
"name": "Nonmetallic mineral mining and quarrying",
"value": 102700,
"children": []
},
]
},
{
"name": "Support activities for mining",
"value": 290100,
"children": []
}
]
}
Using bs4 to parse the page, I can iterate through the table rows as a list and use the "style: padding"
attribute to determine the indentation level*, but I'm unable to convert the indentation into a meaningful parent/child representation.
I've tried iterating through the list with a recursive approach:
import bs4
f = open("./webpage.html", "r", encoding="utf8")
soup = bs4.BeautifulSoup(f, "html.parser")
table = soup.find(
"table", attrs={"class": "collapsible wikitable mw-collapsible mw-made-collapsible"}
)
rows = table.find_all("tr")
stash = []
def recurse(start_idx):
for idx, row in enumerate(rows[start_idx:]):
cols = row.find_all("td")
# skip headers
if len(cols) == 0:
continue
if not cols[0].attrs['style']:
continue
indent = int(cols[0].attrs["style"].strip(';')[-3])
entry = {
"name": cols[0].text,
"value": cols[1].text,
"indent": indent,
"children": [],
}
while stash:
parent = stash[-1]
for inner_idx, inner_row in enumerate(rows[start_idx + idx + 1:]):
inner_cols = inner_row.find_all("td")
inner_indent = int(inner_cols[0].attrs["style"].strip(';')[-3])
inner_entry = {
"name": inner_cols[0].text,
"value": inner_cols[1].text,
"indent": inner_indent,
"children": [],
}
if indent == parent["indent"]:
child = stash.pop()
stash[-1]["children"].append(child)
stash.append(entry)
if parent["indent"] - indent == -1:
stash.append(entry) # add row to stash
recurse(start_idx + idx + 1) # calculate children of row
parent["children"].append(entry) # add row with children to row parent
if indent < parent["indent"]:
return
stash.append(entry)
recurse(0)
This approach correctly appends child nodes to its parent, but fails to append children of children properly.
I suspect there's a data structure approach that fits this problem neatly but I haven't found anything in my research.
Any guidance is much appreciated!
*I needed to manually add a style="padding-left: 0em"
to the first non-header <td>
of the .html for my index method to start from 0.
I would separate the code needed to parse the HTML from the code that uses the parsed data to build the hierarchy. So two functions:
import bs4
import re
def parse_table(soup):
table = soup.find(
"table", attrs={"class": "collapsible wikitable mw-collapsible mw-made-collapsible"}
)
for row in table.find_all("tr"):
cols = row.find_all("td")
if len(cols) != 2:
continue
parts = re.match(r"padding-left: *(\d+)em", cols[0].attrs.get('style', ''))
indent = int(parts[1]) if parts else 0
yield indent, cols[0].text, int(re.sub(r",|\D.*", "", cols[1].text))
def create_hieararchy(soup):
stash = []
path = [stash]
for indent, sector, count in parse_table(soup):
children = []
del path[indent+1:]
path[-1].append({
"name": sector,
"value": count,
"indent": indent,
"children": children,
})
path.append(children)
return stash
f = open("./webpage.html", "r", encoding="utf8")
soup = bs4.BeautifulSoup(f, "html.parser")
stash = create_hieararchy(soup)