Hi, I'm trying to scrape the front page images on digg.com with the code below. The issue is that 0.jpg through 6.jpg come out fine, but 7.jpg through 47.jpg are corrupt, and I'm not sure why.
Here is the code (also on GitHub: https://github.com/kenpeter/py_mm):
# os
import os
# http request
import requests
#
import pprint
import time
# import html from lxml
from lxml import html

# global
global_page_num = 0
pp = pprint.PrettyPrinter(indent=4)


# write to file
def download_image(img_urls):
    # total img urls
    amount = len(img_urls)
    # loop
    for index, value in enumerate(img_urls, start=0):
        # file name
        filename = 'img/%s.jpg' % (index)
        # dir
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))

        # open file
        with open(filename, 'wb') as f:
            # f write
            # time.sleep(1)
            f.write(requests.get(value).content)


def get_page_number(num):
    url = 'http://digg.com'
    response = requests.get(url).content
    selector = html.fromstring(response)

    img_urls = []
    img_urls = selector.xpath("//div[@class='digg-story__image--thumb']/a/img/@src")

    news_texts = []
    news_texts = selector.xpath("//div[@itemprop='description']/text()")

    # test
    # print('--- something ---')
    # pp.pprint(img_urls)
    # pp.pprint(news_texts)

    download_image(img_urls)
    return img_urls


if __name__ == '__main__':
    # input, page_number, everything into the var
    # page_number = input('Please enter the page number that you want to scrape:')
    # global_page_num
    # global_page_num = page_number;
    # print('hell world!');
    page_number = 4  # hardcode
    get_page_number(page_number)
The images come out "corrupt" because the markup changes part-way down the page: for lazily loaded images the real URL moves into the data-src attribute, while src (the attribute your code grabs) only holds a placeholder. Here is an example from the source of the scraped page showing both attributes:
<img
    class="digg-story__image-img js--digg-story__image-img lazy-image-img need-offset"
    data-src="http://static.digg.com/images/f0b92c2d8a2c4b7f829abbc0e58a408c_2oijd0Z_1_www_large_thumb.jpeg"
    src="http://static.digg.com/static/fe/944294/images/x_455x248.png"
    width="312"
    height="170"
    alt=""
/>
In other words, when building the list of image URLs you have to check both attributes, src and data-src, and give data-src priority over src.
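For illustration, here is a minimal per-element sketch of that idea (my own variant, not the code further below): iterating over the <img> elements keeps the URLs in page order and applies the data-src-over-src preference in one place. The placeholder filename x_455x248.png is taken from the snippet above.

import requests
from lxml import html

# Sketch: prefer data-src over src for each story thumbnail.
selector = html.fromstring(requests.get('http://digg.com').content)
img_urls = []
for img in selector.xpath("//div[@class='digg-story__image--thumb']/a/img"):
    url = img.get('data-src') or img.get('src')   # data-src holds the real (lazy-loaded) URL
    if url and 'x_455x248.png' not in url:        # skip the grey placeholder image
        img_urls.append(url)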
The following code does the trick and downloads the proper images:
# os
import os
# http request
import requests
#
import pprint
import time
# import html from lxml
from lxml import html

# global
global_page_num = 0
pp = pprint.PrettyPrinter(indent=4)


# write to file
def download_image(img_urls):
    # total img urls
    amount = len(img_urls)
    # loop
    for index, value in enumerate(img_urls, start=0):
        # file name
        filename = 'img/%s.jpg' % (index)
        # dir
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))

        # open file
        with open(filename, 'wb') as f:
            # f write
            # time.sleep(1)
            f.write(requests.get(value).content)


def get_page_number(num):
    url = 'http://digg.com'
    response = requests.get(url).content
    selector = html.fromstring(response)

    img_urls = []
    # images loaded eagerly expose the real URL in src ...
    img_urls_1a = selector.xpath("//div[@class='digg-story__image--thumb']/a/img/@src")
    # ... but drop the lazy-load placeholder thumbnails
    img_urls_1b = [item for item in img_urls_1a if 'x_455x248.png' not in item]
    # lazily loaded images keep the real URL in data-src
    img_urls_2 = selector.xpath("//div[@class='digg-story__image--thumb']/a/img/@data-src")
    img_urls = img_urls_1b + img_urls_2
    # print(img_urls)

    news_texts = []
    news_texts = selector.xpath("//div[@itemprop='description']/text()")

    # test
    # print('--- something ---')
    # pp.pprint(img_urls)
    # pp.pprint(news_texts)

    download_image(img_urls)
    return img_urls


if __name__ == '__main__':
    # input, page_number, everything into the var
    # page_number = input('Please enter the page number that you want to scrape:')
    # global_page_num
    # global_page_num = page_number;
    # print('hell world!');
    page_number = 4  # hardcode
    get_page_number(page_number)
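As an optional extra (not part of the fix above), you can also guard the download itself so that an error page or placeholder never gets silently written out as a .jpg. A small helper like the sketch below, using only the standard requests API, would make such failures visible immediately:

import requests

def fetch_image(url):
    # Fail loudly instead of saving non-image content to disk.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    if not resp.headers.get('Content-Type', '').startswith('image/'):
        raise ValueError('Not an image: %s' % url)
    return resp.content

You could then replace requests.get(value).content inside download_image with fetch_image(value).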