python · web-scraping · web-crawler · ieee

Python Web Scraper IEEE


I am trying to retrieve the keywords of a particular IEEE document. I came across this code:

        # Download the document page (`link` is defined outside this snippet)
        # and parse the response text with the lxml parser.
        ieee_content = requests.get(link, timeout=180)
        soup = BeautifulSoup(ieee_content.text, 'lxml')
        # Collect every <script> element found in the parsed page.
        tag = soup.find_all('script')
        #metadata = "".join(re.findall('global.document.metadata=(.*)', tag[9].text)).replace(";", '').replace('global.document.metadata=', '')
        # Only the tenth <script> element (hard-coded index 9) is inspected;
        # iterating a Tag yields its direct child nodes, so `i` is each child.
        for i in tag[9]:
            # The dots in the pattern are unescaped, so each one matches any
            # character rather than a literal '.'. re.MULTILINE only affects
            # '^' and '$', neither of which appears in this pattern.
            metadata_format = re.compile(r'global.document.metadata=.*', re.MULTILINE)
            metadata = re.findall(metadata_format, i)
            if len(metadata) != 0:
               # Serialise the list of matches to JSON text and parse it
               # straight back, which yields an equal list of strings.
               convert_to_json = json.dumps(metadata)
               x = json.loads(convert_to_json)
               # Take the first match, turn single quotes into double quotes
               # and remove every semicolon.
               s = x[0].replace("'", '"').replace(";", '')

The problem is that my metadata variable is always empty. I tried iterating over all the script tags rather than using only tag[9], but metadata is still empty in every case. I also tried using 'xml' instead of 'lxml' as the parser, but the result is the same. I'd appreciate some help with this.


Solution

  • import json
    import re
    from pprint import pprint
    
    import requests
    from bs4 import BeautifulSoup
    
    # Fetch the IEEE Xplore document page; the keyword metadata is looked up
    # inside the text of the page's inline <script> elements.
    ieee_content = requests.get("https://ieeexplore.ieee.org/document/7845555", timeout=180)
    # Fail loudly on an HTTP error status instead of silently parsing an
    # error page and printing an empty dict.
    ieee_content.raise_for_status()
    soup = BeautifulSoup(ieee_content.content, "html.parser")
    scripts = soup.find_all("script")

    # Match the JSON array that directly follows `"keywords":`. The lookbehind
    # keeps the key itself out of the match, and the non-greedy body stops at
    # the first `}]`, i.e. the end of the array of keyword-group objects.
    pattern = re.compile(r"(?<=\"keywords\":)\[{.*?}\]")
    keywords_dict = {}
    for script in scripts:
        # `.string` is None when the tag has no single text child, so there
        # is nothing to search in that script.
        if script.string is None:
            continue
        keywords = pattern.findall(script.string)
        # Only accept a script that contains exactly one keywords array.
        if len(keywords) != 1:
            continue
        # Each array element is an object with a "type" label and a "kwd"
        # list; map the stripped label to its stripped keywords.
        for keyword_type in json.loads(keywords[0]):
            keywords_dict[keyword_type["type"].strip()] = [kwd.strip() for kwd in keyword_type["kwd"]]

    pprint(keywords_dict)