python selenium-webdriver web-scraping

Issue in scraping data


I'm having trouble scraping school data. I need each school's email address and website URL, but everything I have tried returns empty results.

What's the best way to do this?

Here is the code:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import time

def google_search(query, num_results=10):
    """Run a Google search in headless Chrome and collect result links.

    Args:
        query: Search terms. They are URL-encoded before being placed in
            the request URL.
        num_results: Value sent as the ``num`` query parameter.

    Returns:
        A list with the non-empty ``href`` of every element matched by the
        CSS selector ``div.yuRUbf > a``, in document order. The list is
        empty when nothing matches.
    """
    from urllib.parse import quote_plus

    options = Options()
    for flag in (
        "--headless",  # Run headless browser
        "--disable-blink-features=AutomationControlled",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        options.add_argument(flag)

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        # Encode the query so characters such as spaces, '&' or '#' cannot
        # break or truncate the query string.
        encoded_query = quote_plus(str(query))
        search_url = f"https://www.google.com/search?q={encoded_query}&num={num_results}"
        driver.get(search_url)

        time.sleep(2)  # Fixed pause before reading the DOM.

        anchors = driver.find_elements(By.CSS_SELECTOR, 'div.yuRUbf > a')
        hrefs = [anchor.get_attribute('href') for anchor in anchors]
        return [href for href in hrefs if href]
    finally:
        # Always shut the browser down, even if navigation or lookup raised,
        # so no Chrome process is left running.
        driver.quit()


# Search term: "PSHE" restricted to hosts ending in .sch.uk.
query = "PSHE site:.sch.uk"
results = google_search(query, num_results=20)

# Print the collected links as a list numbered from 1.
for position, link in enumerate(results, start=1):
    print(f"{position}. {link}")

Solution

  • results = driver.find_elements(By.CSS_SELECTOR, 'div.yuRUbf > a')
    

    The line above is where the problem is.

    div.yuRUbf > a - This CSS selector matches only an <a> that is a direct child of the div, not deeper descendants.

    You need this:

    div.yuRUbf a - This matches any <a> descendant of the <div>, even when it is nested several levels deep.

    UPDATE: Try this refactored code. I have tested it, and it works on my system.

    Things refactored:

    1. Replaced the fixed time.sleep with explicit waits, which are smarter and more effective
    2. Removed the third-party library ChromeDriverManager and used the built-in Selenium Manager
    3. Used user-agent in ChromeOptions to avoid bot detection
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    
    
    def google_search(query, num_results=10):
        """Run a Google search in headless Chrome and collect result links.

        Args:
            query: Search terms. They are URL-encoded before being placed in
                the request URL.
            num_results: Value sent as the ``num`` query parameter.

        Returns:
            A list with the non-empty ``href`` of every element matched by
            the CSS selector ``div.yuRUbf a``, in document order.

        Raises:
            selenium.common.exceptions.TimeoutException: If no matching
                element is present within 10 seconds.
        """
        from urllib.parse import quote_plus

        options = Options()
        for flag in (
            "--headless",  # Run headless browser
            "--disable-blink-features=AutomationControlled",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            # Use a user-agent to avoid bot detection
            "user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        ):
            options.add_argument(flag)

        driver = webdriver.Chrome(options=options)
        try:
            # Encode the query so characters such as spaces, '&' or '#'
            # cannot break or truncate the query string.
            encoded_query = quote_plus(str(query))
            search_url = f"https://www.google.com/search?q={encoded_query}&num={num_results}"
            driver.get(search_url)
            driver.maximize_window()
            wait = WebDriverWait(driver, 10)

            # Optional: accept cookies first.
            # wait.until(EC.element_to_be_clickable((By.ID, "L2AGLb"))).click()
            anchors = wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.yuRUbf a'))
            )
            hrefs = [anchor.get_attribute('href') for anchor in anchors]
            return [href for href in hrefs if href]
        finally:
            # Always shut the browser down, including when the wait times
            # out, so no Chrome process is left running.
            driver.quit()
    
    
    # Search term: "PSHE" restricted to hosts ending in .sch.uk.
    query = "PSHE site:.sch.uk"
    results = google_search(query, num_results=20)

    # Print the collected links as a list numbered from 1.
    for position, link in enumerate(results, start=1):
        print(f"{position}. {link}")
    

    Output:

    1. https://www.wealdofkent.kent.sch.uk/quality-of-education/pshe
    2. https://www.bradstow.wandsworth.sch.uk/2245/what-we-teach-pshe
    3. https://www.leverton.essex.sch.uk/curriculum/pshe/a-list-of-useful-websites-2/
    4. https://www.kls.herts.sch.uk/home/personal-development/personal-social-health-and-economic-pshe/
    5. https://www.pudseylowtown.leeds.sch.uk/PSHE-11062021071900/
    6. https://www.longacre.surrey.sch.uk/prep/pshe/
    7. https://www.frimley.surrey.sch.uk/page/?title=PSHE+%26amp%3B+Relationship+Education&pid=89
    8. https://www.weydonschool.surrey.sch.uk/215/pshe
    9. https://www.ellenwilkinson.newham.sch.uk/page/?title=PSHE&pid=67
    10. https://www.fortismere.haringey.sch.uk/page/?title=PSHE+%28Personal%2C+Social%2C+Heath+%26amp%3B+Economic+Education%29+CURRICULUM&pid=261
    11. https://www.stags.herts.sch.uk/page/?title=Personal%2C+Social+and+Health+Education+%28PSHE%29&pid=178
    12. https://www.ivylane.wilts.sch.uk/page/?title=PSHE&pid=83
    13. https://www.mayfield.ealing.sch.uk/PSHCE/
    14. https://www.pennington-inf.hants.sch.uk/pshe/
    15. https://www.torriano.camden.sch.uk/subjects/pshe/
    16. https://www.goldington.beds.sch.uk/ckfinder/userfiles/files/Careers/PSHE%20framework%20linked%20to%20from%20careers%20programme.pdf
    17. https://www.colley.dudley.sch.uk/Learning/SchoolCurriculum/PSHE
    18. https://www.westfield-jun.leics.sch.uk/our-curriculum/pshe
    19. https://www.tomlinscote.surrey.sch.uk/page/?title=PSHE&pid=471
    20. https://www.potters-gate.surrey.sch.uk/PSHE/
    
    Process finished with exit code 0