python, web, beautifulsoup, python-requests, bing

python image scraper, not working properly on bing


I am trying to build an image scraper. I first tried Google, but no images were scraped, so I tried Bing instead. It worked, but there are some issues:

  1. Only a small fraction of the image links shown by the search engine get scraped.
  2. The scraped images come from an unknown page of the previews shown.
  3. Images are scraped with the Safe-Mode filter enabled by default.

I wanted to scrape all the images (or at least a few pages of them) shown at bing.com/images/search, but instead the scraper gets very few of them.

After inspecting the page, I found that Bing stores the image links in elements with the 'thumb' class, so I scraped all the links that had the thumb class, but it looks like that isn't enough.

After looking through the page source, the thumb-class links were the only ones I found that actually ended in .jpg.

import requests
from bs4 import BeautifulSoup
import os
import random
from urllib.parse import urljoin


# Base address of the search engine; relative thumbnail links are resolved
# against it.
url = "https://www.bing.com"

# Directory the downloaded images are written to.
dir_name = "Result"

# Alphabet used for the random file names: a-z followed by A-Z.
_LETTERS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"


# for generating random sequence
def generateRandomSequence():
    """Return a random file-name stem made of five letter+number pairs.

    Each pair is one ASCII letter followed by an integer from 1 to 999
    (for example ``"a12B7c999d1E45"``).  The names are random, not
    guaranteed unique.

    This must be defined before the download loop runs: the script executes
    top to bottom, so calling it above its ``def`` raises ``NameError``.
    """
    return "".join(
        random.choice(_LETTERS) + str(random.randrange(1, 1000))
        for _ in range(5)
    )


def main():
    """Search Bing images for a term and optionally download the thumbnails.

    Prompts for a search term, collects the ``href`` of every ``<a>`` tag
    with the ``thumb`` class from the result page, reports how many were
    found and, if the user answers ``Y``/``y``, saves each one into
    ``dir_name`` under a random ``.jpg`` name.
    """
    search = input("enter the search term: ")
    r = requests.get(url + "/images/search", params={"q": search})

    soup = BeautifulSoup(r.content, "html.parser")

    # getting links from thumb class; anchors without an href are dropped and
    # relative links are made absolute so requests.get() can fetch them
    hrefs = (anchor.get("href") for anchor in soup.find_all("a", class_="thumb"))
    links = [urljoin(url, href) for href in hrefs if href]

    print("{0} results found with the search term: {1}".format(len(links), search))
    choice = input("Do You Want To Extract The Images? Y or N ")

    # Creating the Result named directory if it doesn't exist yet
    if not os.path.isdir(dir_name):
        print("[+] Creating Directory Named '{0}'".format(dir_name))
        os.mkdir(dir_name)

    if choice not in ("Y", "y"):
        return

    for n, link in enumerate(links, start=1):
        # One unreachable or failing link should not abort the whole run, and
        # an HTTP error page must not be saved as if it were an image.
        try:
            req = requests.get(link, timeout=30)
            req.raise_for_status()
        except requests.RequestException as exc:
            print("[-] Skipping {0}: {1}".format(link, exc))
            continue

        # The original file names taken from the links caused issues, so the
        # images are stored under randomly generated names instead.
        print("[+] Extracting Image #", n)
        target = os.path.join(dir_name, generateRandomSequence() + ".jpg")
        with open(target, "wb") as img:
            img.write(req.content)


if __name__ == "__main__":
    main()

Solution

  • Here is a scraper for you:

    import json

    import requests
    from bs4 import BeautifulSoup
    SEARCH_URL = 'https://bing.com/images/search'
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"


    def extract_image_url(metadata):
        """Return the ``murl`` value from an anchor's ``m`` attribute, or None.

        ``metadata`` is expected to be the text of a JSON object whose
        ``murl`` key holds the image URL.  It is parsed with ``json.loads``
        rather than ``eval``: the text comes from a remote page, so it must
        never be executed, and ``eval`` also fails on the JSON literals
        ``true``/``false``/``null``.

        Returns None when the attribute is missing, is not valid JSON, is not
        a JSON object, or has no ``murl`` key.
        """
        try:
            return json.loads(metadata)['murl']
        except (TypeError, ValueError, KeyError):
            return None


    def main():
        """Prompt for a search term and count, then print the image URLs found."""
        seartext = input("enter the search term: ")
        count = input("Enter the number of images you need:")
        adlt = 'off' # can be set to 'moderate'

        # Passing the query as params lets requests URL-encode it (spaces
        # become '+', and characters such as '&' or '#' are escaped).
        params = {'q': seartext.strip(), 'safeSearch': adlt, 'count': count}
        headers = {"user-agent": USER_AGENT}
        resp = requests.get(SEARCH_URL, params=params, headers=headers)
        print(resp.url)

        soup = BeautifulSoup(resp.content, "html.parser")
        for anchor in soup.find_all('a', class_='iusc'):
            image_url = extract_image_url(anchor.get('m'))
            if image_url is not None:
                print(image_url)
                print()


    if __name__ == "__main__":
        main()
    

    Here, you will find the query parameters for Bing.