I'm having trouble scraping school data: I need each school's email address and website URL. I've tried many approaches, but the code keeps returning empty results.
What's the best way to do this?
Here is the code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import time
def google_search(query, num_results=10):
    """Return result links scraped from a headless-Chrome Google search.

    Args:
        query: Search terms, interpolated as-is into the ``q`` URL parameter.
        num_results: Value passed as the ``num`` URL parameter.

    Returns:
        A list of the non-empty ``href`` values of the anchors matched by
        the CSS selector ``div.yuRUbf > a`` (direct children only), in the
        order ``find_elements`` returned them.
    """
    chrome_options = Options()
    for flag in (
        "--headless",  # Run headless browser
        "--disable-blink-features=AutomationControlled",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        chrome_options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

    driver.get(f"https://www.google.com/search?q={query}&num={num_results}")
    time.sleep(2)  # fixed pause before the page is queried

    anchors = driver.find_elements(By.CSS_SELECTOR, 'div.yuRUbf > a')
    hrefs = (anchor.get_attribute('href') for anchor in anchors)
    # Materialise the list before quitting: the elements are only usable
    # while the browser session is alive.
    links = [href for href in hrefs if href]
    driver.quit()
    return links
# Search for PSHE pages restricted to .sch.uk domains and print a numbered
# list of the links that came back.
query = "PSHE site:.sch.uk"
results = google_search(query, num_results=20)
for i, url in enumerate(results, start=1):
    print(f"{i}. {url}")
results = driver.find_elements(By.CSS_SELECTOR, 'div.yuRUbf > a')
The line above is where the problem is.
`div.yuRUbf > a` - This CSS selector only matches an <a> that is a direct child of the <div>, not deeper descendants.
You need this instead:
`div.yuRUbf a` - This matches any <a> inside the parent <div>, even when it is nested multiple levels deep.
UPDATE: Try this refactored code; I have tested it and it works on my system.
Things refactored:
- Removed time.sleep and introduced explicit waits, which are smarter and more effective.
- Removed ChromeDriverManager and used Selenium's built-in Selenium Manager instead.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
def google_search(query, num_results=10):
    """Return result links scraped from a headless-Chrome Google search.

    Args:
        query: Search terms. They are URL-encoded before being sent, so
            spaces and characters such as ``&``, ``#`` or ``+`` are safe.
        num_results: Value passed as the ``num`` URL parameter.

    Returns:
        A list of the non-empty ``href`` values of every ``<a>`` nested at
        any depth inside a ``div.yuRUbf`` element.

    Raises:
        selenium.common.exceptions.TimeoutException: Propagated from
            ``WebDriverWait.until`` when no matching anchor shows up within
            the 10-second wait. The browser is still shut down in that case.
    """
    # Local import keeps this snippet self-contained.
    from urllib.parse import urlencode

    options = Options()
    options.add_argument("--headless")  # Run headless browser
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Use a user-agent to avoid bot detection
    options.add_argument(
        "user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )

    # Encode the parameters instead of pasting the raw query into the URL;
    # otherwise a query containing '&' or '#' would truncate or corrupt it.
    params = urlencode({"q": query, "num": num_results})
    search_url = f"https://www.google.com/search?{params}"

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(search_url)
        driver.maximize_window()
        wait = WebDriverWait(driver, 10)
        # wait.until(EC.element_to_be_clickable((By.ID, "L2AGLb"))).click() # Accept cookies
        results = wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.yuRUbf a'))
        )
        # Read the hrefs while the session is still alive.
        links = []
        for result in results:
            href = result.get_attribute('href')
            if href:
                links.append(href)
        return links
    finally:
        # Always release the browser process, including when the wait times
        # out or navigation fails.
        driver.quit()
# Run the refactored search for PSHE pages on .sch.uk domains and print each
# returned link with its 1-based position.
query = "PSHE site:.sch.uk"
results = google_search(query, num_results=20)
for i, url in enumerate(results, start=1):
    print(f"{i}. {url}")
Output:
1. https://www.wealdofkent.kent.sch.uk/quality-of-education/pshe
2. https://www.bradstow.wandsworth.sch.uk/2245/what-we-teach-pshe
3. https://www.leverton.essex.sch.uk/curriculum/pshe/a-list-of-useful-websites-2/
4. https://www.kls.herts.sch.uk/home/personal-development/personal-social-health-and-economic-pshe/
5. https://www.pudseylowtown.leeds.sch.uk/PSHE-11062021071900/
6. https://www.longacre.surrey.sch.uk/prep/pshe/
7. https://www.frimley.surrey.sch.uk/page/?title=PSHE+%26amp%3B+Relationship+Education&pid=89
8. https://www.weydonschool.surrey.sch.uk/215/pshe
9. https://www.ellenwilkinson.newham.sch.uk/page/?title=PSHE&pid=67
10. https://www.fortismere.haringey.sch.uk/page/?title=PSHE+%28Personal%2C+Social%2C+Heath+%26amp%3B+Economic+Education%29+CURRICULUM&pid=261
11. https://www.stags.herts.sch.uk/page/?title=Personal%2C+Social+and+Health+Education+%28PSHE%29&pid=178
12. https://www.ivylane.wilts.sch.uk/page/?title=PSHE&pid=83
13. https://www.mayfield.ealing.sch.uk/PSHCE/
14. https://www.pennington-inf.hants.sch.uk/pshe/
15. https://www.torriano.camden.sch.uk/subjects/pshe/
16. https://www.goldington.beds.sch.uk/ckfinder/userfiles/files/Careers/PSHE%20framework%20linked%20to%20from%20careers%20programme.pdf
17. https://www.colley.dudley.sch.uk/Learning/SchoolCurriculum/PSHE
18. https://www.westfield-jun.leics.sch.uk/our-curriculum/pshe
19. https://www.tomlinscote.surrey.sch.uk/page/?title=PSHE&pid=471
20. https://www.potters-gate.surrey.sch.uk/PSHE/
Process finished with exit code 0