python selenium-webdriver parsing beautifulsoup html-parsing

Parser in Python returns an empty list (I guess it's an HTML class selection issue)


The idea is: I want to collect the name and price of every flat on the website as a list. I've made a simple parser in Python, but it looks like I can't get any values, since it returns an empty list.

My best guess is that I simply can't find the right class/container that holds this info, which is why it returns an empty list.

# Importing Selenium, BeautifulSoup, csv, time, and webdriver-manager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import csv
import time
from webdriver_manager.chrome import ChromeDriverManager

# Running the browser in the background without GPU and Sandbox
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')

# Using Service and CDM to specify the driver path
service = Service(ChromeDriverManager().install())

# Initializing the driver
driver = webdriver.Chrome(service=service, options=chrome_options)

# Opening the developer's URL
print("Opening the page...")
driver.get('https://etalongroup.ru/msk/object/voxhall/')
print("The page is opened.")

# Delay for the page to fully load
time.sleep(30)

# Getting the HTML
page_source = driver.page_source

# Closing the driver
driver.quit()

# Parsing HTML with bs4
soup = BeautifulSoup(page_source, 'html.parser')

# List with apartment data
apartments = []

# Searching for prices in <span> text-scarlet
price_elements = soup.find_all('span', class_='th-h4 text-scarlet')

# Searching for titles in <div> elements with an 'aria-label' attribute
title_elements = soup.find_all('div', {'aria-label': True})

# Collecting data
for price_element, title_element in zip(price_elements, title_elements):
    price = price_element.text.strip()
    title = title_element['aria-label'].strip()
    apartments.append({'Title': title, 'Price': price})

print(apartments)

# Script completion message
print("The script has finished executing.")

I'm expecting a list or a dictionary in return that runs through the website and collects the data [n123-30 000 000] etc. for every object presented.


Solution

  • Here's the bare minimum code to extract the desired data.

    import time
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    
    # Headless Chrome with the automation flags hidden
    options = Options()
    options.add_argument('--headless')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    driver = webdriver.Chrome(options=options)
    print("Opening the page...")
    driver.get('https://etalongroup.ru/msk/object/voxhall/')
    print("The page is opened.")
    time.sleep(2)
    # All the listing data lives inside this container; grab its innerHTML
    container = driver.find_element(By.CSS_SELECTOR, '#card-object>div').get_attribute('innerHTML')
    driver.quit()
    soup = BeautifulSoup(container, 'html.parser')
    apartments = []
    # Each apartment card is a <div class="bg-white relative">
    result_container = soup.find_all('div', class_="bg-white relative")
    
    for result in result_container:
        # The first <a> inside the card carries the link and all the text fields
        root = result.find_next('a')
        # The area/floor line looks like "26.0 м² | 16 этаж"
        area_floor = root.select_one('section.flex.flex-col.gap-2>span.th-b1-regular').text.split(' | ')
        apartments.append({
            "link": f"https://etalongroup.ru{root['href']}",  # href already starts with "/"
            "price": root.select_one('span.th-h2').text.strip(),
            "title": root.select_one('span.th-h4').text.strip(),
            "area": area_floor[0],
            "floor": area_floor[1]
        })
    print(apartments)
    

    output:

    [
      {
        'link': 'https://etalongroup.ru/msk/choose/92334/',
        'price': '20 519 852 ₽',
        'title': 'Студия № 197',
        'area': '26.0 м²',
        'floor': '16 этаж'
      },
      {
        'link': 'https://etalongroup.ru/msk/choose/92437/',
        'price': '20 726 234 ₽',
        'title': 'Студия № 37',
        'area': '25.4 м²',
        'floor': '4 этаж'
      },
      {
        'link': 'https://etalongroup.ru/msk/choose/92445/',
        'price': '20 976 711 ₽',
        'title': 'Студия № 44',
        'area': '26.0 м²',
        'floor': '5 этаж'
      },
      {
        'link': 'https://etalongroup.ru/msk/choose/92453/',
        'price': '20 994 562 ₽',
        'title': 'Студия № 51',
        'area': '25.7 м²',
        'floor': '5 этаж'
      },
      {
        'link': 'https://etalongroup.ru/msk/choose/92483/',
        'price': '21 039 082 ₽',
        'title': 'Студия № 79',
        'area': '25.7 м²',
        'floor': '7 этаж'
      },
      {
        'link': 'https://etalongroup.ru/msk/choose/92255/',
        'price': '21 835 647 ₽',
        'title': 'Студия № 125',
        'area': '25.8 м²',
        'floor': '10 этаж'
      }
    ]
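
    Since the original script already imports csv, here's a minimal sketch of persisting the result to a file (reusing the apartments list of dicts built above; the filename apartments.csv is arbitrary):

    import csv
    
    # Write the scraped rows to a CSV file, one column per dict key
    with open('apartments.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['link', 'price', 'title', 'area', 'floor'])
        writer.writeheader()
        writer.writerows(apartments)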
    

    Note:

    1. All the data on this page resides under a child div of the div with id="card-object". So you can get the innerHTML of that element and pass it to BeautifulSoup for further parsing.

      driver.find_element(By.CSS_SELECTOR, '#card-object>div').get_attribute('innerHTML')
      
    2. The other thing to note is that you should inspect the soup object to find and fetch the target elements accordingly.
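
      For instance, a quick sanity check (a sketch reusing the soup and class name from the code above) confirms the cards are actually in the fetched HTML:

      # Spot-check: count the candidate card divs and preview the first one
      cards = soup.find_all('div', class_="bg-white relative")
      print(len(cards))
      if cards:
          print(cards[0].prettify()[:500])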

    The above code works and is self-explanatory.
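
    If the fixed time.sleep(2) ever proves unreliable, you can wait explicitly for the listings container instead of sleeping; a minimal sketch, assuming the same driver, By import, and #card-object selector as above:

    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    
    # Block for up to 15 seconds until the container shows up in the DOM
    wait = WebDriverWait(driver, 15)
    card = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#card-object>div')))
    container = card.get_attribute('innerHTML')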