python, selenium-webdriver, web-scraping, alfresco-webscripts, ironwebscraper

Web scraping problem: 'list' object has no attribute 'timeout'


I am trying to do web scraping with Selenium. I want to download images from Google Images, but I have several issues:

  1. I get the error AttributeError: 'list' object has no attribute 'timeout'.
  2. I can't handle base64-encoded images and can't download them.
  3. I have an issue with the function named download_image in my code.
  4. Some of the src values are regular image URLs. How do I download those? Can anyone help me?

Here is my code:

from urllib.parse import urlparse
from selenium import webdriver
import time as t
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time as t
import urllib
import base64
try:
    os.mkdir("G:/Smokking_Project")    
except:
    pass

name="smoked"

chrome_options = webdriver.ChromeOptions() 
chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
#driver = webdriver.Chrome(executable_path='chromedriver.exe',options=chrome_options)  
driver = webdriver.Chrome(options=chrome_options)  
wait = WebDriverWait(driver, 5)

strr="https://www.google.com/search?q=smokinng&tbm=isch&ved=2ahUKEwi8k9zn9eOBAxVtlycCHTa_DnUQ2-cCegQIABAA&oq=smokinng&gs_lcp=CgNpbWcQAzIJCAAQGBCABBAKMgkIABAYEIAEEAoyCQgAEBgQgAQQCjoECCMQJzoFCAAQgAQ6BggAEAUQHjoECAAQHjoICAAQgAQQsQM6BAgAEAM6BwgAEBgQgARQjwdY8xJg-RloAHAAeACAAb0BiAHsCZIBAzAuOZgBAKABAaoBC2d3cy13aXotaW1nwAEB&sclient=img&ei=uUwhZfzSFO2unsEPtv66qAc&bih=723&biw=1517&hl=en"

driver.get(strr)
t.sleep(3)
links=[]
x=1
last_height=0

def download_image(url,filename):
        resource = urllib.request.urlopen(url)
        output = open(filename,"wb")
        output.write(resource.read())
        output.close()
    
while True:
     driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
     t.sleep(4)    
     #try:
     img_link = wait.until(EC.presence_of_all_elements_located((By.XPATH,'//a[1]/div[1]/img')))
     t.sleep(1)

     for img in img_link:
        url = img.get_attribute('src')
        if url not in links:
            links.append(url)
            print (url)
            
            try:
                os.mkdir('G://Smokking_Project//'+name)
            except:
                pass
            try:
                os.mkdir('G://Smokking_Project//'+name)
            except:
                pass
            file_name='Smokking_Project//'+name+'//'+str(x)+'.jpg'
            download_image(img_link,file_name)
            
            x+=1
        #except:
            #print('-',end='')
     new_height = driver.execute_script("return document.body.scrollHeight")
     print(new_height)
     if new_height == last_height:
        break
     last_height = new_height
            
driver.close()

Below is the full error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
c:\Users\Geka\Desktop\openCV\vision_ahmed_ibrahim\webscrapping\webscrapping_Google.ipynb Cell 6 line 6
     62         pass
     63     file_name='Smokking_Project//'+name+'//'+str(x)+'.jpg'
---> 64     download_image(img_link,file_name)
     66     x+=1
     67 #except:
     68     #print('-',end='')

c:\Users\Geka\Desktop\openCV\vision_ahmed_ibrahim\webscrapping\webscrapping_Google.ipynb Cell 6 line 3
     33 def download_image(url,filename):
---> 34         resource = urllib.request.urlopen(url)
     35         output = open(filename,"wb")
     36         output.write(resource.read())

File c:\Users\Geka\anaconda3\Lib\urllib\request.py:216, in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    214 else:
    215     opener = _opener
--> 216 return opener.open(url, data, timeout)

File c:\Users\Geka\anaconda3\Lib\urllib\request.py:509, in OpenerDirector.open(self, fullurl, data, timeout)
    506     if data is not None:
    507         req.data = data
--> 509 req.timeout = timeout
    510 protocol = req.type
    512 # pre-process request

AttributeError: 'list' object has no attribute 'timeout'

Solution

  • The errors we identified in your code are:

    1) AttributeError: 'list' object has no attribute 'timeout': This error occurs because you are passing the list of elements (img_link) to the download_image function instead of a single URL string. Pass the url variable to the function instead.

    2)Handling Base64 images: To handle Base64 images, you need to decode the Base64 string and save it as an image file.

    3) Downloading images whose src is a regular URL: You can download these images by sending an HTTP request to the URL and saving the response content to a file.

    Let's modify your code to address these issues:

    import base64
    import os
    import time
    import urllib.request
    
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    # Create the root output folder; an already-existing folder is not an error.
    try:
        os.mkdir("G:/Smokking_Project")
    except FileExistsError:
        pass
    
    # Sub-folder name used when building the output file paths further down.
    name = "smoked"
    
    # Start Chrome with the 'enable-automation' switch excluded.
    # NOTE(review): presumably this hides the "controlled by automated test
    # software" banner - confirm against the ChromeDriver documentation.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
    driver = webdriver.Chrome(options=chrome_options)
    # Explicit wait helper bound to this driver with a timeout of 5.
    wait = WebDriverWait(driver, 5)
    
    # Google Images search results page that will be scrolled and scraped.
    strr = "https://www.google.com/search?q=smokinng&tbm=isch&ved=2ahUKEwi8k9zn9eOBAxVtlycCHTa_DnUQ2-cCegQIABAA&oq=smokinng&gs_lcp=CgNpbWcQAzIJCAAQGBCABBAKMgkIABAYEIAEEAoyCQgAEBgQgAQQCjoECCMQJzoFCAAQgAQ6BggAEAUQHjoECAAQHjoICAAQgAQQsQM6BAgAEAM6BwgAEBgQgARQjwdY8xJg-RloAHAAeACAAb0BiAHsCZIBAzAuOZgBAKABAaoBC2d3cy13aXotaW1nwAEB&sclient=img&ei=uUwhZfzSFO2unsEPtv66qAc&bih=723&biw=1517&hl=en"
    
    driver.get(strr)
    # Fixed pause after navigation before the page is queried.
    time.sleep(3)
    
    # x: running counter used to number the saved image files.
    x = 1
    # last_height: scroll height seen on the previous pass; the loop below
    # stops when a scroll no longer changes it.
    last_height = 0
    
    def download_image(url, filename):
        """Fetch ``url`` and write the raw response body to ``filename``.

        Args:
            url: A single URL string (anything ``urllib.request.urlopen``
                accepts, e.g. ``http(s):``, ``file:`` or ``data:`` URLs).
                Passing a list of elements here is what caused the original
                "'list' object has no attribute 'timeout'" error.
            filename: Path of the file to create or overwrite, in binary mode.

        Raises:
            urllib.error.URLError: If the URL cannot be opened.
            OSError: If ``filename`` cannot be written.
        """
        # Context managers close both the response and the output file even
        # if reading or writing fails part-way through. The URL is opened
        # first so that a failed request does not leave an empty file behind.
        with urllib.request.urlopen(url) as resource, open(filename, "wb") as output:
            output.write(resource.read())
    
    # Create the destination folder once, up front, so that BOTH the base64
    # branch and the URL branch below can write into it.
    out_dir = f'G:/Smokking_Project/{name}'
    os.makedirs(out_dir, exist_ok=True)
    
    # src values already saved; every scroll pass returns all images currently
    # on the page, so without this each pass would save the old ones again.
    seen_urls = set()
    
    try:
        while True:
            # Scroll to the bottom to make the page load more results.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(4)
    
            img_links = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//a[1]/div[1]/img')))
            time.sleep(1)
    
            for img in img_links:
                url = img.get_attribute('src')
                # Skip images without a src and ones handled on an earlier pass.
                if not url or url in seen_urls:
                    continue
                seen_urls.add(url)
    
                filename = f'{out_dir}/{x}.jpg'
                if url.startswith('data:image'):
                    # Inline image: everything after the first comma is the
                    # base64 payload; decode it and write the bytes directly.
                    img_data = url.split(',', 1)[1]
                    with open(filename, 'wb') as f:
                        f.write(base64.b64decode(img_data))
                else:
                    # Regular URL: fetch it over the network.
                    download_image(url, filename)
                x += 1
    
            # Stop once scrolling no longer increases the page height.
            new_height = driver.execute_script("return document.body.scrollHeight")
            print(new_height)
            if new_height == last_height:
                break
            last_height = new_height
    finally:
        # Always end the browser session, even if a download raised.
        driver.quit()
    

    This code should address the issues you mentioned. It handles Base64 images by decoding them and saves them as image files. Additionally, it downloads images from URLs by sending an HTTP request and saving the response content.