I am trying to retrieve the keywords of a particular IEEE document. I came across this code here:
# NOTE(review): the original paste had no indentation; the nesting below is the
# most plausible reconstruction -- confirm against the original source.

# Download the page at `link` (timeout=180) and parse it with the lxml parser.
ieee_content = requests.get(link, timeout=180)
soup = BeautifulSoup(ieee_content.text, 'lxml')
# Collect every <script> element found in the parsed page.
tag = soup.find_all('script')
#metadata = "".join(re.findall('global.document.metadata=(.*)', tag[9].text)).replace(";", '').replace('global.document.metadata=', '')
# Only the script at index 9 is examined, so this depends on the page's exact
# script order.
# NOTE(review): iterating a Tag presumably yields its child nodes -- confirm.
for i in tag[9]:
    # The dots in this pattern are unescaped, so each one matches any character.
    metadata_format = re.compile(r'global.document.metadata=.*', re.MULTILINE)
    metadata = re.findall(metadata_format, i)
    if len(metadata) != 0:
        # Round-trip the list of matches through JSON text; for a list of
        # strings, x ends up equal to metadata.
        convert_to_json = json.dumps(metadata)
        x = json.loads(convert_to_json)
        # Take the first match, swap single quotes for double quotes and drop
        # every semicolon.
        s = x[0].replace("'", '"').replace(";", '')
The problem is that my metadata variable is always empty. I tried iterating over all the tags rather than using only tag[9], but metadata is still empty in every case. I also tried using 'xml' instead of 'lxml', but the result is the same. I'd appreciate some help with this.
import json
import re
from pprint import pprint
import requests
from bs4 import BeautifulSoup
# Fetch the IEEE Xplore document page. The keywords are looked up in the text
# of the page's <script> elements.
ieee_content = requests.get("https://ieeexplore.ieee.org/document/7845555", timeout=180)
# Fail loudly on an HTTP error status instead of quietly parsing an error page
# and printing an empty dict.
ieee_content.raise_for_status()
soup = BeautifulSoup(ieee_content.content, "html.parser")
scripts = soup.find_all("script")

# Match the JSON array of objects that directly follows the "keywords": key.
# The look-behind keeps the key itself out of the match, so the matched text
# can be handed straight to json.loads.
pattern = re.compile(r"(?<=\"keywords\":)\[{.*?}\]")

# Maps each keyword type (the "type" field) to its list of keywords ("kwd").
keywords_dict = {}
for script in scripts:
    script_text = script.string
    # A tag without a single text child has .string set to None; skip it
    # rather than searching the literal text "None".
    if script_text is None:
        continue
    keywords = pattern.findall(str(script_text))
    # Only accept a script that contains exactly one keywords array.
    if len(keywords) == 1:
        raw_keywords_list = json.loads(keywords[0])
        for keyword_type in raw_keywords_list:
            type_name = keyword_type["type"].strip()
            keywords_dict[type_name] = [kwd.strip() for kwd in keyword_type["kwd"]]
pprint(keywords_dict)