I am a newbie in web scraping. I have started learning the Scrapy framework.
I covered the basic tutorial of Scrapy. Now I am trying to scrape this page.
As per this tutorial, to get the whole HTML page content one should write the following code:
import scrapy

class ClothesSpider(scrapy.Spider):
    name = "clothes"

    start_urls = [
        'https://www.chumbak.com/women-apparel/GY1/c/',
    ]

    def parse(self, response):
        filename = 'clothes.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
This code runs fine, but I am not getting the expected result.
When I open clothes.html, the HTML data is not the same as what I see when I inspect the page in the browser. A lot of things are missing from clothes.html.
I don't understand what's going wrong here. Please help me move forward. Any help would be appreciated.
Thanks.
This page uses JavaScript to put the data on the page.
Using DevTools in Chrome/Firefox you can see which URLs the JavaScript uses to get this data from the server (Network tab, XHR filter).
Then you can try to fetch the data from those URLs directly.
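For example, for this page the Network tab shows XHR requests to api-cdn.chumbak.com. A quick sketch (outside Scrapy, using requests) to confirm that one of those URLs really returns the product data as plain JSON; this assumes the API needs nothing beyond a browser-like User-Agent:

import requests

# one of the XHR urls visible in the Network tab
url = 'https://api-cdn.chumbak.com/v1/category/474/products/?count_per_page=24&page=1'

response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
data = response.json()

print(list(data.keys()))      # top-level structure of the JSON
print(len(data['products']))  # how many products this page returns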
The code below generates URLs for 10 pages of JSON data and downloads them, saving each page in a separate file. It also builds the full URL for every image, and Scrapy downloads those images to the subfolder full. Scrapy also saves in output.json all the yielded data about the downloaded images.
#!/usr/bin/env python3

import scrapy
#from scrapy.commands.view import open_in_browser
import json

class MySpider(scrapy.Spider):

    name = 'myspider'

    #allowed_domains = []
    #start_urls = ['https://www.chumbak.com/women-apparel/GY1/c/']
    #start_urls = [
    #    'https://api-cdn.chumbak.com/v1/category/474/products/?count_per_page=24&page=1',
    #    'https://api-cdn.chumbak.com/v1/category/474/products/?count_per_page=24&page=2',
    #    'https://api-cdn.chumbak.com/v1/category/474/products/?count_per_page=24&page=3',
    #]

    def start_requests(self):
        pages = 10
        url_template = 'https://api-cdn.chumbak.com/v1/category/474/products/?count_per_page=24&page={}'

        for page in range(1, pages+1):
            url = url_template.format(page)
            yield scrapy.Request(url)

    def parse(self, response):
        print('url:', response.url)
        #open_in_browser(response)

        # get page number (the text after the last `=` in the url)
        page_number = response.url.split('=')[-1]

        # save JSON in a separate file
        filename = 'page-{}.json'.format(page_number)
        with open(filename, 'wb') as f:
            f.write(response.body)

        # convert JSON into Python's dictionary
        data = json.loads(response.text)

        # get urls for images
        for product in data['products']:
            #print('title:', product['title'])
            #print('url:', product['url'])
            #print('image_url:', product['image_url'])

            # create full url to image
            image_url = 'https://media.chumbak.com/media/catalog/product/small_image/260x455' + product['image_url']

            # send it to scrapy and it will download it
            yield {'image_urls': [image_url]}

        # download files
        #for href in response.css('img::attr(href)').extract():
        #    url = response.urljoin(href)
        #    yield {'file_urls': [url]}

        # download images and convert to JPG
        #for src in response.css('img::attr(src)').extract():
        #    url = response.urljoin(src)
        #    yield {'image_urls': [url]}

# --- it runs without a project and saves the items in `output.json` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',

    # save in CSV or JSON
    'FEED_FORMAT': 'json',      # 'csv', 'json', 'xml'
    'FEED_URI': 'output.json',  # 'output.csv', 'output.json', 'output.xml'

    # download files to `FILES_STORE/full`
    # it needs `yield {'file_urls': [url]}` in `parse()`
    #'ITEM_PIPELINES': {'scrapy.pipelines.files.FilesPipeline': 1},
    #'FILES_STORE': '/path/to/valid/dir',

    # download images and convert to JPG
    # it needs `yield {'image_urls': [url]}` in `parse()`
    #'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},
    #'IMAGES_STORE': '/path/to/valid/dir',

    'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},
    'IMAGES_STORE': '.',
})

c.crawl(MySpider)
c.start()
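Afterwards you can check which images were downloaded by reading output.json back with plain Python. A small sketch, assuming the default ImagesPipeline behavior (it adds an images field to every yielded item, with the local path of each downloaded file relative to IMAGES_STORE):

import json

with open('output.json') as f:
    items = json.load(f)

for item in items:
    for image in item.get('images', []):
        # `path` is relative to IMAGES_STORE, e.g. 'full/<checksum>.jpg'
        print(image['path'])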