Problem:
Code:
from selenium import webdriver
import time
from bs4 import BeautifulSoup as Soup
from urllib.request import urlopen
import datetime as dt
import csv
import pandas as pd
driver = webdriver.Firefox(executable_path='C://Downloads//webdrivers//geckodriver.exe')
c1 = 'amazon_data_' + dt.datetime.now().strftime("%d_%b_%y_%I_%M_%p")
# d = open(str(c1) + '.csv', 'x', encoding='utf-8')
d = open(str(c1) + '.xlsx', 'x', encoding='utf-8')
for c in range(1):
    a = f'https://www.flipkart.com/search?q=sony+headphones&as=on&as-show=on&otracker=AS_Query_HistoryAutoSuggest_1_4_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_4_na_na_na&as-pos=1&as-type=HISTORY&suggestionId=sony+headphones&requestId=ad797917-16ae-401e-98df-1c79a43d40c3&as-backfill=on&page={c}'
    '''
    request_response = requests.head(a)
    status_code = request_response.status_code
    if status_code == 200:
        print(True)
    else:
        print(False)
    '''
    driver.get(a)
    # time.sleep(1)
    page_soup = Soup(urlopen(a), 'html5lib')
    container = page_soup.find_all('div', {'class': '_4ddWXP'})
    for containers in container:
        find_url = containers.find('a')['href']
        new_url = 'https://www.flipkart.com' + find_url
        fetch = driver.get(new_url)
        # time.sleep(1)
        page_source = driver.page_source
        page_soup = Soup(page_source, 'html.parser')
        for data in page_soup:
            try:
                product_name = data.find('span', {'class': 'B_NuCI'}).text.strip()
                price = data.find('div', {'class': "_30jeq3 _16Jk6d"}).text.strip()
                current_url = new_url
            except:
                print('Not Available')
            # print(product_name, '\n', price, '\n', current_url, '\n')
            d.write(product_name + price + current_url + '\n')
Error I got:

Things I tried:
When I write the output data as .csv, the file saves properly. But when I open the file, the data contains some special characters and is not written in a single cell.
**Output of a single cell while writing data through the .csv method:**
JBL a noise cancellation enabled Bluetooth~
Things I want:
Answer:

I see a few problems:

- With `open()` and `write()` you can't create a valid `.xlsx`, because `.xlsx` is really a set of `.xml` files compressed with zip (see the first sketch after this list).
- Some data contains `,`, which is normally used as the separator between columns, so such values have to be wrapped in `" "` to create the columns correctly. Better to use the module `csv` or `pandas`, which add the `" "` automatically (see the second sketch after this list). This may well be your main problem.
- You mix `selenium` with `beautifulsoup`, and sometimes that makes a mess.
- You iterate with `for data in page_soup`, which gets all children of the page and runs the same code on each of them; you should read the values directly from `page_soup`.
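To see why plain `open()` + `write()` can't produce a valid `.xlsx`, you can list what a real workbook contains - it is simply a zip archive of XML files. A minimal sketch (`test.xlsx` is a placeholder for any existing workbook on disk):

import zipfile

# a real .xlsx is a zip archive of XML files - listing its contents shows this
# ('test.xlsx' is a placeholder for any existing workbook on disk)
with zipfile.ZipFile('test.xlsx') as z:
    for name in z.namelist():
        print(name)  # e.g. [Content_Types].xml, xl/workbook.xml, xl/worksheets/sheet1.xml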
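And the second sketch: how the module `csv` quotes values that contain `,` (the rows here are made up just for illustration):

import csv

# made-up example rows - note the commas inside the values
rows = [['JBL Tune, noise cancellation enabled', '₹2,999', 'https://www.flipkart.com/...']]

with open('test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'price', 'url'])  # header row
    writer.writerows(rows)

# the file now contains:
#   name,price,url
#   "JBL Tune, noise cancellation enabled","₹2,999",https://www.flipkart.com/...
# values with , inside were wrapped in " " automatically, so a spreadsheet
# program puts every value in its own cell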
I would put all the data in a list - every item as a sublist - and later convert it to a `pandas.DataFrame` and save it with `to_csv()` or `to_excel()`.
I would even use `selenium` itself to find the elements (i.e. `find_elements_by_xpath`) instead of `beautifulsoup`, but I skipped this idea in the main code.
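A minimal sketch of that selenium-only idea (I use Selenium 4's `find_elements(By.XPATH, ...)` here instead of the older `find_elements_by_xpath`; the class names are the ones from your code and may change whenever Flipkart regenerates its markup):

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Firefox()
driver.get('https://www.flipkart.com/search?q=sony+headphones')
time.sleep(3)

# collect all hrefs first - after the next driver.get() the old elements become stale
links = [a.get_attribute('href')
         for a in driver.find_elements(By.XPATH, '//div[@class="_4ddWXP"]//a')]

for link in links:
    driver.get(link)
    time.sleep(3)
    try:
        name = driver.find_element(By.CLASS_NAME, 'B_NuCI').text.strip()
        # the price <div> has two classes, so XPath is safer than By.CLASS_NAME here
        price = driver.find_element(By.XPATH, '//div[@class="_30jeq3 _16Jk6d"]').text.strip()
        print(name, price, link)
    except Exception as ex:
        print('Not Available:', ex)

And here is the full version with `beautifulsoup`: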
from selenium import webdriver
import time
from bs4 import BeautifulSoup as BS
import datetime as dt
import pandas as pd

# - before loop -

all_rows = []

#driver = webdriver.Firefox(executable_path='C:\\Downloads\\webdrivers\\geckodriver.exe')
driver = webdriver.Firefox()  # I have `geckodriver` in folder `/home/furas/bin` so I don't have to set `executable_path`

# - loop -

for page in range(1):  # use range(10) to scrape more pages
    print('--- page:', page, '---')

    url = f'https://www.flipkart.com/search?q=sony+headphones&as=on&as-show=on&otracker=AS_Query_HistoryAutoSuggest_1_4_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_4_na_na_na&as-pos=1&as-type=HISTORY&suggestionId=sony+headphones&requestId=ad797917-16ae-401e-98df-1c79a43d40c3&as-backfill=on&page={page}'
    driver.get(url)
    time.sleep(3)

    soup = BS(driver.page_source, 'html5lib')

    all_containers = soup.find_all('div', {'class': '_4ddWXP'})
    for container in all_containers:
        find_url = container.find('a')['href']
        print('find_url:', find_url)

        item_url = 'https://www.flipkart.com' + find_url
        driver.get(item_url)
        time.sleep(3)

        item_soup = BS(driver.page_source, 'html.parser')

        try:
            product_name = item_soup.find('span', {'class': 'B_NuCI'}).text.strip()
            price = item_soup.find('div', {'class': "_30jeq3 _16Jk6d"}).text.strip()

            print('product_name:', product_name)
            print('price:', price)
            print('item_url:', item_url)
            print('---')

            # every item as a sublist - one row for the DataFrame
            row = [product_name, price, item_url]
            all_rows.append(row)
        except Exception as ex:
            print('Not Available:', ex)
            print('---')

# - after loop -

df = pd.DataFrame(all_rows)

filename = dt.datetime.now().strftime("amazon_data_%d_%b_%y_%I_%M_%p.csv")
df.to_csv(filename)

#filename = dt.datetime.now().strftime("amazon_data_%d_%b_%y_%I_%M_%p.xlsx")
#df.to_excel(filename)
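If you don't want the extra index column in the file and prefer named headers, a small tweak (the column names here are my own choice):

df = pd.DataFrame(all_rows, columns=['product_name', 'price', 'url'])
df.to_csv(filename, index=False)
#df.to_excel(filename, index=False)  # writing .xlsx needs an engine such as `openpyxl` installed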