I started learning screen scraping using BeautifulSoup. To get started, I took a Wikipedia article containing a table in the following format:
<table class="wikitable sortable jquery-tablesorter">
<caption></caption>
<thead>
<tr>
<th colspan="2" style="width: 6%;" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Opening</th>
<th style="width: 20%;" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Title</th>
<th style="width: 10%;" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Director</th>
<th style="width: 45%;" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Cast</th>
<th style="width: 30%;" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Production company</th>
<th class="unsortable" style="width: 1%;"><abbr title="Reference(s)">Ref.</abbr></th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="3" style="text-align: center; background: #77bc83;">
<b>
O<br />
C<br />
T
</b>
</td>
<td rowspan="1" style="text-align: center; background: #77bc83;"><b>11</b></td>
<td style="text-align: center;">
<i><a href="/wiki/Viswam_(film)" title="Viswam (film)">Viswam</a></i>
</td>
<td>Sreenu Vaitla</td>
<td>
<link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374" />
<div class="hlist">
<ul>
<li><a href="/wiki/Gopichand_(actor)" title="Gopichand (actor)">Gopichand</a></li>
<li><a href="/wiki/Kavya_Thapar" title="Kavya Thapar">Kavya Thapar</a></li>
<li><a href="/wiki/Vennela_Kishore" title="Vennela Kishore">Vennela Kishore</a></li>
<li><a href="/wiki/Sunil" title="Sunil">Sunil</a></li>
<li><a href="/wiki/Naresh" title="Naresh">Naresh</a></li>
</ul>
</div>
</td>
<td>
Chitralayam Studios<br />
People Media Factory
</td>
<td style="text-align: center;">
<sup id="cite_ref-180" class="reference">
<a href="#cite_note-180"><span class="cite-bracket">[</span>178<span class="cite-bracket">]</span></a>
</sup>
</td>
</tr>
<tr>
<td rowspan="2" style="text-align: center; background: #77bc83;"><b>31</b></td>
<td style="text-align: center;">
<i><a href="/wiki/Lucky_Baskhar" title="Lucky Baskhar">Lucky Baskhar</a></i>
</td>
<td><a href="/wiki/Venky_Atluri" title="Venky Atluri">Venky Atluri</a></td>
<td>
<link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374" />
<div class="hlist">
<ul>
<li><a href="/wiki/Dulquer_Salmaan" title="Dulquer Salmaan">Dulquer Salmaan</a></li>
<li><a href="/wiki/Meenakshi_Chaudhary" title="Meenakshi Chaudhary">Meenakshi Chaudhary</a></li>
</ul>
</div>
</td>
<td><a href="/wiki/S._Radha_Krishna" title="S. Radha Krishna">Sithara Entertainments</a></td>
<td style="text-align: center;">
<sup id="cite_ref-181" class="reference">
<a href="#cite_note-181"><span class="cite-bracket">[</span>179<span class="cite-bracket">]</span></a>
</sup>
</td>
</tr>
<tr>
<td style="text-align: center;">
<i><a href="/wiki/Mechanic_Rocky" title="Mechanic Rocky">Mechanic Rocky</a></i>
</td>
<td>Ravi Teja Mullapudi</td>
<td>
<link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374" />
<div class="hlist">
<ul>
<li><a href="/wiki/Vishwak_Sen" title="Vishwak Sen">Vishwak Sen</a></li>
<li><a href="/wiki/Meenakshi_Chaudhary" title="Meenakshi Chaudhary">Meenakshi Chaudhary</a></li>
</ul>
</div>
</td>
<td>SRT Entertainments</td>
<td style="text-align: center;">
<sup id="cite_ref-182" class="reference">
<a href="#cite_note-182"><span class="cite-bracket">[</span>180<span class="cite-bracket">]</span></a>
</sup>
</td>
</tr>
<tr>
<td style="text-align: center; background: #77ea83;">
<b>
N<br />
O<br />
V
</b>
</td>
<td style="text-align: center; background: #77ea83;"><b>9</b></td>
<td style="text-align: center;">
<i><a href="/wiki/Telusu_Kada" title="Telusu Kada">Telusu Kada</a></i>
</td>
<td><a href="/wiki/Neeraja_Kona" title="Neeraja Kona">Neeraja Kona</a></td>
<td>
<link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374" />
<div class="hlist">
<ul>
<li><a href="/wiki/Siddhu_Jonnalagadda" title="Siddhu Jonnalagadda">Siddhu Jonnalagadda</a></li>
<li><a href="/wiki/Raashii_Khanna" title="Raashii Khanna">Raashii Khanna</a></li>
<li><a href="/wiki/Srinidhi_Shetty" title="Srinidhi Shetty">Srinidhi Shetty</a></li>
</ul>
</div>
</td>
<td>People Media Factory</td>
<td style="text-align: center;">
<sup id="cite_ref-183" class="reference">
<a href="#cite_note-183"><span class="cite-bracket">[</span>181<span class="cite-bracket">]</span></a>
</sup>
</td>
</tr>
<tr>
<td rowspan="2" style="text-align: center; background: #f4ca16; textcolor: #000;">
<b>
D<br />
E<br />
C
</b>
</td>
<td rowspan="1" style="text-align: center; background: #f8de7e;"><b>6</b></td>
<td style="text-align: center;">
<i><a href="/wiki/Pushpa_2:_The_Rule" title="Pushpa 2: The Rule">Pushpa 2: The Rule</a></i>
</td>
<td><a href="/wiki/Sukumar" title="Sukumar">Sukumar</a></td>
<td>
<link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374" />
<div class="hlist">
<ul>
<li><a href="/wiki/Allu_Arjun" title="Allu Arjun">Allu Arjun</a></li>
<li><a href="/wiki/Fahadh_Faasil" title="Fahadh Faasil">Fahadh Faasil</a></li>
<li><a href="/wiki/Rashmika_Mandanna" title="Rashmika Mandanna">Rashmika Mandanna</a></li>
</ul>
</div>
</td>
<td><a href="/wiki/Mythri_Movie_Makers" title="Mythri Movie Makers">Mythri Movie Makers</a></td>
<td style="text-align: center;">
<sup id="cite_ref-184" class="reference">
<a href="#cite_note-184"><span class="cite-bracket">[</span>182<span class="cite-bracket">]</span></a>
</sup>
</td>
</tr>
<tr>
<td rowspan="1" style="text-align: center; background: #f8de7e;"><b>20</b></td>
<td style="text-align: center;"><i>Robinhood</i></td>
<td><a href="/wiki/Venky_Kudumula" title="Venky Kudumula">Venky Kudumula</a></td>
<td>
<link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374" />
<div class="hlist">
<ul>
<li><a href="/wiki/Nithiin" title="Nithiin">Nithiin</a></li>
<li><a href="/wiki/Sreeleela" title="Sreeleela">Sreeleela</a></li>
</ul>
</div>
</td>
<td><a href="/wiki/Mythri_Movie_Makers" title="Mythri Movie Makers">Mythri Movie Makers</a></td>
<td style="text-align: center;">
<sup id="cite_ref-185" class="reference">
<a href="#cite_note-185"><span class="cite-bracket">[</span>183<span class="cite-bracket">]</span></a>
</sup>
</td>
</tr>
</tbody>
<tfoot></tfoot>
</table>
This is the Python script that I wrote:
# Scrape every caption-less "wikitable sortable" table in html_page and turn
# each body row into a dict keyed by `headers`; the dicts are collected in
# data_dict_list and printed at the end.
# (Block nesting restored; the statements themselves are unchanged.)
soup = BeautifulSoup(html_page, "html.parser")
# NOTE(review): a multi-word class string is compared against the whole class
# attribute value, so this would not match a table whose class is
# "wikitable sortable jquery-tablesorter" -- confirm against the HTML that is
# actually fetched.
tables = soup.find_all("table",{"class":"wikitable sortable"})
headers = ['month','day','movie','director','cast','producer','reference']
movie_tables = []
total_movies = 0
for table in tables:
    caption = table.find("caption")
    # Keep only tables whose <caption> is missing or blank.
    if not caption or not caption.get_text().strip():
        movie_tables.append(table)
#captions = soup.find_all("caption")
max_columns = len(headers)
# List to store dictionaries
data_dict_list = []
movies= []
for movie_table in movie_tables:
    # [1:] drops the first <tr> found under <tbody>.
    # NOTE(review): presumably that is the header row in the fetched page; if
    # the headers live in <thead> this drops a data row instead -- verify.
    table_rows = movie_table.find("tbody").find_all("tr")[1:]
    for table_row in table_rows:
        total_movies += 1
        columns = table_row.find_all('td')
        row_data = [col.get_text(strip=True) for col in columns]
        # If the row has fewer columns than the max, pad it with None
        # NOTE(review): rows that sit under an earlier rowspan cell have no
        # month/day <td> of their own; they get None here instead of the value
        # from the spanning row, which is why later rows show 'month': None.
        if len(row_data) == 6:
            row_data.insert(0, None)
        elif len(row_data) == 5:
            row_data.insert(0, None)
            row_data.insert(1, None)
        # NOTE(review): for rows with 5-7 cells row_data already has
        # len(headers) entries at this point, so everything appended in this
        # loop lands past the last header and is discarded by zip() below.
        for col in columns:
            li_tags = col.find_all('li')
            if li_tags:
                cast=""
                for li in li_tags:
                    li_values = li.get_text(strip=True)
                    # NOTE(review): li_values is a single string, so join()
                    # iterates over its characters, and `cast` is overwritten
                    # on every pass -- only the last <li> survives.
                    cast = ', '.join(li_values)
                row_data.append(cast)
            else:
                row_data.append(col.get_text())
        # Create a dictionary mapping headers to row data
        row_dict = dict(zip(headers, row_data))
        # Append the dictionary to the list
        data_dict_list.append(row_dict)
# Print the list of dictionaries
for row_dict in data_dict_list:
    print(row_dict)
This is the output I am getting (just showing a few items here):
{'month': 'OCT', 'day': '11', 'movie': 'Viswam', 'director': 'Sreenu Vaitla', 'cast': 'GopichandKavya ThaparVennela KishoreSunilNaresh', 'producer': 'Chitralayam StudiosPeople Media Factory', 'reference': '[178]'}
{'month': None, 'day': '31', 'movie': 'Lucky Baskhar', 'director': 'Venky Atluri', 'cast': 'Dulquer SalmaanMeenakshi Chaudhary', 'producer': 'Sithara Entertainments', 'reference': '[179]'}
{'month': None, 'day': None, 'movie': 'Mechanic Rocky', 'director': 'Ravi Teja Mullapudi', 'cast': 'Vishwak SenMeenakshi Chaudhary', 'producer': 'SRT Entertainments', 'reference': '[180]'}
{'month': 'NOV', 'day': '9', 'movie': 'Telusu Kada', 'director': 'Neeraja Kona', 'cast': 'Siddhu JonnalagaddaRaashii KhannaSrinidhi Shetty', 'producer': 'People Media Factory', 'reference': '[181]'}
{'month': 'DEC', 'day': '6', 'movie': 'Pushpa 2: The Rule', 'director': 'Sukumar', 'cast': 'Allu ArjunFahadh FaasilRashmika Mandanna', 'producer': 'Mythri Movie Makers', 'reference': '[182]'}
{'month': None, 'day': '20', 'movie': 'Robinhood', 'director': 'Venky Kudumula', 'cast': 'NithiinSreeleela', 'producer': 'Mythri Movie Makers', 'reference': '[183]'}
This is what I am trying to get (just showing the last item here):
{'month': 'DEC', 'day': '20', 'movie': 'Robinhood', 'director': 'Venky Kudumula', 'cast': 'Nithiin|Sreeleela', 'producer': 'Mythri Movie Makers', 'reference': '[183]'}
I've been trying to debug this for the last day or so but I cannot figure out where I went wrong.
I am expecting:
You can do most of the work using a pandas DataFrame:
from io import StringIO
import pandas as pd
# Let pandas parse the markup and keep the first table it finds.
df = pd.read_html(StringIO(html_page))[0]
df.columns = ['month', 'day', 'movie', 'director', 'cast', 'producer', 'reference']
# Split each month value on single spaces and glue the pieces back together,
# i.e. remove the spaces between the letters.
df['month'] = df['month'].str.split(' ').apply(''.join)
# Split the cast text on runs of two or more whitespace characters and
# re-join the pieces with ', '.
df['cast'] = df['cast'].str.split(r'\s{2,}', regex=True).apply(', '.join)
# One dict per table row, keyed by column name.
data_dict_list = df.to_dict('records')
for row_dict in data_dict_list:
    print(row_dict)
Output
{'month': 'OCT', 'day': 11, 'movie': 'Viswam', 'director': 'Sreenu Vaitla', 'cast': 'Gopichand, Kavya Thapar, Vennela Kishore, Sunil, Naresh', 'producer': 'Chitralayam Studios People Media Factory', 'reference': '[178]'}
{'month': 'OCT', 'day': 31, 'movie': 'Lucky Baskhar', 'director': 'Venky Atluri', 'cast': 'Dulquer Salmaan, Meenakshi Chaudhary', 'producer': 'Sithara Entertainments', 'reference': '[179]'}
{'month': 'OCT', 'day': 31, 'movie': 'Mechanic Rocky', 'director': 'Ravi Teja Mullapudi', 'cast': 'Vishwak Sen, Meenakshi Chaudhary', 'producer': 'SRT Entertainments', 'reference': '[180]'}
{'month': 'NOV', 'day': 9, 'movie': 'Telusu Kada', 'director': 'Neeraja Kona', 'cast': 'Siddhu Jonnalagadda, Raashii Khanna, Srinidhi Shetty', 'producer': 'People Media Factory', 'reference': '[181]'}
{'month': 'DEC', 'day': 6, 'movie': 'Pushpa 2: The Rule', 'director': 'Sukumar', 'cast': 'Allu Arjun, Fahadh Faasil, Rashmika Mandanna', 'producer': 'Mythri Movie Makers', 'reference': '[182]'}
{'month': 'DEC', 'day': 20, 'movie': 'Robinhood', 'director': 'Venky Kudumula', 'cast': 'Nithiin, Sreeleela', 'producer': 'Mythri Movie Makers', 'reference': '[183]'}
To add the links as well, you will have to use BeautifulSoup. You didn't mention the format, but you can do something like:
# Add an 'href' column that holds, for every table row, the list of link
# targets found in that row.
df['href'] = None
soup = BeautifulSoup(html_page, 'html.parser')
# df was built from the first table only (read_html(...)[0]), so restrict the
# row search to the first <table>; scanning every <tr> in the document would
# make the indices drift and let df.at append extra rows.
first_table = soup.find('table')
# Skip the first <tr> so that row i here lines up with df row i.
rows = first_table.find_all('tr')[1:]
for i, row in enumerate(rows):
    # Only anchors that actually carry an href attribute.
    hrefs = row.find_all('a', href=True)
    df.at[i, 'href'] = [href['href'] for href in hrefs]
data_dict_list = df.to_dict('records')
for row_dict in data_dict_list:
    print(row_dict)
Output
{'month': 'OCT', 'day': 11, 'movie': 'Viswam', 'director': 'Sreenu Vaitla', 'cast': 'Gopichand, Kavya Thapar, Vennela Kishore, Sunil, Naresh', 'producer': 'Chitralayam Studios People Media Factory', 'reference': '[178]', 'href': ['/wiki/Viswam_(film)', '/wiki/Gopichand_(actor)', '/wiki/Kavya_Thapar', '/wiki/Vennela_Kishore', '/wiki/Sunil', '/wiki/Naresh', '#cite_note-180']}
{'month': 'OCT', 'day': 31, 'movie': 'Lucky Baskhar', 'director': 'Venky Atluri', 'cast': 'Dulquer Salmaan, Meenakshi Chaudhary', 'producer': 'Sithara Entertainments', 'reference': '[179]', 'href': ['/wiki/Lucky_Baskhar', '/wiki/Venky_Atluri', '/wiki/Dulquer_Salmaan', '/wiki/Meenakshi_Chaudhary', '/wiki/S._Radha_Krishna', '#cite_note-181']}
{'month': 'OCT', 'day': 31, 'movie': 'Mechanic Rocky', 'director': 'Ravi Teja Mullapudi', 'cast': 'Vishwak Sen, Meenakshi Chaudhary', 'producer': 'SRT Entertainments', 'reference': '[180]', 'href': ['/wiki/Mechanic_Rocky', '/wiki/Vishwak_Sen', '/wiki/Meenakshi_Chaudhary', '#cite_note-182']}
{'month': 'NOV', 'day': 9, 'movie': 'Telusu Kada', 'director': 'Neeraja Kona', 'cast': 'Siddhu Jonnalagadda, Raashii Khanna, Srinidhi Shetty', 'producer': 'People Media Factory', 'reference': '[181]', 'href': ['/wiki/Telusu_Kada', '/wiki/Neeraja_Kona', '/wiki/Siddhu_Jonnalagadda', '/wiki/Raashii_Khanna', '/wiki/Srinidhi_Shetty', '#cite_note-183']}
{'month': 'DEC', 'day': 6, 'movie': 'Pushpa 2: The Rule', 'director': 'Sukumar', 'cast': 'Allu Arjun, Fahadh Faasil, Rashmika Mandanna', 'producer': 'Mythri Movie Makers', 'reference': '[182]', 'href': ['/wiki/Pushpa_2:_The_Rule', '/wiki/Sukumar', '/wiki/Allu_Arjun', '/wiki/Fahadh_Faasil', '/wiki/Rashmika_Mandanna', '/wiki/Mythri_Movie_Makers', '#cite_note-184']}
{'month': 'DEC', 'day': 20, 'movie': 'Robinhood', 'director': 'Venky Kudumula', 'cast': 'Nithiin, Sreeleela', 'producer': 'Mythri Movie Makers', 'reference': '[183]', 'href': ['/wiki/Venky_Kudumula', '/wiki/Nithiin', '/wiki/Sreeleela', '/wiki/Mythri_Movie_Makers', '#cite_note-185']}