[SOLVED] HTML scraping using Python and Beautiful Soup (from IMDb)

HTML scraping using Python and Beautiful Soup (from IMDb)

I want to get the movie ratings from this page, for example, and print them line by line. I've extracted the names and release years with BS4, but I don't know how to handle the ratings...

import requests
from bs4 import BeautifulSoup
import urllib.request


# Fetch the raw HTML of the IMDb list and hand it to the lxml-backed parser.
url = urllib.request.urlopen('http://imdb.com/list/ls097228983/')
content = url.read()
soup = BeautifulSoup(content, 'lxml')

# Markup surrounding the year once the <span> tag is rendered back to a string.
year_open_tag = '<span class="lister-item-year text-muted unbold">'
year_close_tag = '</span>'

# Each <h3 class="lister-item-header"> holds one title link and one year span.
headers = soup.find_all('h3', attrs={'class': 'lister-item-header'})
for header in headers:
    year_span = header.find(
        'span', attrs={'class': 'lister-item-year text-muted unbold'}
    )
    # Render the tag as HTML text, then strip the wrapping markup so only
    # the span's inner text is left.
    year = str(year_span).replace(year_open_tag, '').replace(year_close_tag, '')
    # The first child of the first link in the header is the movie name.
    name = header.find('a').contents[0]
    print(' '.join((name, year)))

    >> I want: Solaris (1972) 8.1

Solution

You need to iterate over the parent element, the `div` with 'class':'lister-item-content', instead of the `h3` with 'class':'lister-item-header', so that the rating can be found inside each item.

import requests
from bs4 import BeautifulSoup
import urllib.request


# Download the list page; the context manager closes the HTTP response as
# soon as the body has been read.
with urllib.request.urlopen('http://imdb.com/list/ls097228983/') as url:
    content = url.read()
soup = BeautifulSoup(content, 'lxml')

# Iterate over the 'lister-item-content' containers rather than the
# 'lister-item-header' elements: the rating span is looked up inside this
# parent container, together with the title link and the year span.
for div in soup.find_all('div', {'class': 'lister-item-content'}):
    title_link = div.find('a')
    if title_link is None:
        # Without a title link there is nothing meaningful to print.
        continue
    name = title_link.get_text(strip=True)

    # Read the text of each tag directly instead of stringifying the tag and
    # stripping its markup by hand; a missing tag no longer prints "None".
    year_tag = div.find(
        'span', attrs={'class': 'lister-item-year text-muted unbold'}
    )
    year = year_tag.get_text(strip=True) if year_tag is not None else ''

    # Items without a rating span would otherwise raise AttributeError.
    rating_tag = div.find('span', class_='ipl-rating-star__rating')
    rating = rating_tag.get_text(strip=True) if rating_tag is not None else 'N/A'

    # Print one line per movie, e.g. "Solaris (1972) 8.1". An f-string such
    # as f'{name} {year} {rating}' would work as well, but must not be
    # chained with .format(): the f-string is already interpolated, and
    # .format() would then choke on titles containing '{' or '}'.
    print(' '.join(part for part in (name, year, rating) if part))