Tags: python, selenium, web-scraping, proxies

Selenium with proxy not working / wrong options?


I have the following working test solution, which outputs the IP address and related information -

Now I want to use this with my ScraperAPI account and other proxies. But when I uncomment these two lines -

# PROXY = f'http://scraperapi:{SCRAPER_API}@proxy-server.scraperapi.com:8001'
# options.add_argument('--proxy-server=%s' % PROXY) 

the script no longer works -

How can I use my proxies with Selenium / this code? (ScraperAPI recommends the selenium-wire module, but I don't like it because it depends on specific versions of other tools - so I would like to use the proxies without it.)

Is this possible?

import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from sys import platform
import os, sys
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent
from dotenv import load_dotenv, find_dotenv

WAIT = 10

load_dotenv(find_dotenv()) 
SCRAPER_API = os.environ.get("SCRAPER_API")
# PROXY = f'http://scraperapi:{SCRAPER_API}@proxy-server.scraperapi.com:8001'

srv=Service(ChromeDriverManager().install())
ua = UserAgent()
userAgent = ua.random
options = Options()
options.add_argument('--headless')
options.add_experimental_option ('excludeSwitches', ['enable-logging'])
options.add_argument("start-maximized")
options.add_argument('window-size=1920x1080')                                 
options.add_argument('--no-sandbox')
options.add_argument('--disable-gpu')  
options.add_argument(f'user-agent={userAgent}')     
# options.add_argument('--proxy-server=%s' % PROXY)     
path = os.path.abspath (os.path.dirname (sys.argv[0]))
if platform == "win32": cd = '/chromedriver.exe'
elif platform == "linux": cd = '/chromedriver'
elif platform == "darwin": cd = '/chromedriver'
driver = webdriver.Chrome (service=srv, options=options)    
waitWebDriver = WebDriverWait (driver, 10)  

link = "https://whatismyipaddress.com/"
driver.get (link)     
time.sleep(WAIT)
soup = BeautifulSoup (driver.page_source, 'html.parser')     
tmpIP = soup.find("span", {"id": "ipv4"})
tmpP = soup.find_all("p", {"class": "information"})
for e in tmpP:
  tmpSPAN = e.find_all("span")
  for e2 in tmpSPAN:
    print(e2.text)
print(tmpIP.text)

driver.quit()

Solution

  • There are a couple of things you need to look at:

    Your script carries some pieces that are never used - the os/sys/platform imports and the path/cd chromedriver lookup (webdriver_manager already installs the right driver for you), and the dotenv block isn't needed while no proxy is configured. Making those minor tweaks and optimizing your code:

    import time
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.support.ui import WebDriverWait
    from webdriver_manager.chrome import ChromeDriverManager
    from fake_useragent import UserAgent
    from bs4 import BeautifulSoup
    
    WAIT = 10
    srv=Service(ChromeDriverManager().install())
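    # rotate the user agent on each run using fake_useragent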
    ua = UserAgent()
    userAgent = ua.random
    options = Options()
    options.add_argument('--headless')
    options.add_experimental_option ('excludeSwitches', ['enable-logging'])
    options.add_argument("start-maximized")
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-gpu')  
    options.add_argument(f'user-agent={userAgent}')     
    driver = webdriver.Chrome (service=srv, options=options)    
    waitWebDriver = WebDriverWait (driver, 10)  
    
    link = "https://whatismyipaddress.com/"
    driver.get(link)
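    # save a screenshot to verify what the headless browser actually rendered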
    driver.save_screenshot("whatismyipaddress.png")
    time.sleep(WAIT)
    soup = BeautifulSoup (driver.page_source, 'html.parser')     
    tmpIP = soup.find("span", {"id": "ipv4"})
    tmpP = soup.find_all("p", {"class": "information"})
    for e in tmpP:
        tmpSPAN = e.find_all("span")
        for e2 in tmpSPAN:
            print(e2.text)
    print(tmpIP.text)
    driver.quit()
    

    Console Output:

    [WDM] -
    
    [WDM] - ====== WebDriver manager ======
    [WDM] - Current google-chrome version is 96.0.4664
    [WDM] - Get LATEST driver version for 96.0.4664
    [WDM] - Driver [C:\Users\Admin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache
    ISP:
    Jio
    City:
    Pune
    Region:
    Maharashtra
    Country:
    India
    123.12.234.23
    

    Saved Screenshot:

    [Screenshot: whatismyipaddress.png]


    Using the proxy - the same script, now with the dotenv imports wired in and the --proxy-server switch pointing at your ScraperAPI endpoint:

    import time
    import os
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.support.ui import WebDriverWait
    from webdriver_manager.chrome import ChromeDriverManager
    from fake_useragent import UserAgent
    from bs4 import BeautifulSoup
    from dotenv import load_dotenv, find_dotenv
    
    WAIT = 10
    
    load_dotenv(find_dotenv()) 
    SCRAPER_API = os.environ.get("SCRAPER_API")
    PROXY = f'http://scraperapi:{SCRAPER_API}@proxy-server.scraperapi.com:8001'
    
    srv=Service(ChromeDriverManager().install())
    ua = UserAgent()
    userAgent = ua.random
    options = Options()
    options.add_argument('--headless')
    options.add_experimental_option ('excludeSwitches', ['enable-logging'])
    options.add_argument("start-maximized")
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-gpu')  
    options.add_argument(f'user-agent={userAgent}') 
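    # send Chrome's traffic through the authenticated ScraperAPI proxy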
    options.add_argument('--proxy-server={}'.format(PROXY))
    driver = webdriver.Chrome (service=srv, options=options)    
    waitWebDriver = WebDriverWait (driver, 10)  
    
    link = "https://whatismyipaddress.com/"
    driver.get(link)
    driver.save_screenshot("whatismyipaddress.png")
    time.sleep(WAIT)
    soup = BeautifulSoup (driver.page_source, 'html.parser')     
    tmpIP = soup.find("span", {"id": "ipv4"})
    tmpP = soup.find_all("p", {"class": "information"})
    for e in tmpP:
        tmpSPAN = e.find_all("span")
        for e2 in tmpSPAN:
            print(e2.text)
    print(tmpIP.text)
    driver.quit()
    

    Note: print(f'http://scraperapi:{SCRAPER_API}@proxy-server.scraperapi.com:8001') and ensure that SCRAPER_API actually returns a value.
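
    A minimal sanity check along those lines - a sketch assuming your .env file defines SCRAPER_API under exactly that key name:

    import os
    from dotenv import load_dotenv, find_dotenv

    load_dotenv(find_dotenv())
    SCRAPER_API = os.environ.get("SCRAPER_API")

    # fail fast if the key did not load - an empty key would silently build
    # a proxy URL that the proxy server rejects
    if not SCRAPER_API:
        raise SystemExit("SCRAPER_API is not set - check your .env file")

    print(f'http://scraperapi:{SCRAPER_API}@proxy-server.scraperapi.com:8001')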

