python web-scraping beautifulsoup imdb

HTML scraping using Python and Beautiful Soup (from IMDb)


I want to get the movie ratings from this page, for example, and print them line by line. I've extracted the names and release years with BS4, but I don't know how to handle the ratings...

import requests
from bs4 import BeautifulSoup
import urllib.request


# Download the IMDb list page; the `with` block closes the HTTP response
# as soon as the body has been read.
with urllib.request.urlopen('http://imdb.com/list/ls097228983/') as response:
    content = response.read()
soup = BeautifulSoup(content, 'lxml')

# Each <h3 class="lister-item-header"> is searched for the title link and
# the span that carries the release year.
for header in soup.find_all('h3', attrs={'class': 'lister-item-header'}):
    year_tag = header.find('span', attrs={'class': 'lister-item-year text-muted unbold'})
    # Read the span's text directly instead of stripping the tags from its
    # HTML source; a header without a year span yields '' rather than "None".
    year = year_tag.get_text() if year_tag is not None else ''
    name = header.find('a').contents[0]
    print(name + ' ' + year)

    >> I want: Solaris (1972) 8.1

Solution

  • You need to search for the parent element, 'class':'lister-item-content' (a div), instead of 'class':'lister-item-header' (the h3), so that the rating can be found as well.

    import requests
    from bs4 import BeautifulSoup
    import urllib.request
    
    
    # Download the IMDb list page; the `with` block closes the HTTP response
    # as soon as the body has been read.
    with urllib.request.urlopen('http://imdb.com/list/ls097228983/') as response:
        content = response.read()
    soup = BeautifulSoup(content, 'lxml')

    # Search the parent <div class="lister-item-content"> of each entry so the
    # title, the year and the rating can all be looked up from one element.
    for item in soup.find_all('div', {'class': 'lister-item-content'}):
        year_tag = item.find('span', attrs={'class': 'lister-item-year text-muted unbold'})
        # Read the span's text directly instead of stripping the tags from its
        # HTML source; a missing year span yields '' rather than "None".
        year = year_tag.get_text() if year_tag is not None else ''
        name = item.find('a').contents[0]
        rating_tag = item.find('span', class_='ipl-rating-star__rating')
        # An entry without a rating span would otherwise raise AttributeError
        # on `.text`; print an empty rating for it instead.
        rating = rating_tag.text if rating_tag is not None else ''
        # Print once per entry. The f-string already interpolates the values,
        # so no extra .format() call is needed (it would fail on titles
        # containing '{' or '}'); name + ' ' + year + ' ' + rating is equivalent.
        print(f'{name} {year} {rating}')