html web-scraping scrapy

Scrapy returns a blank value for one field when scraping a web page


I am trying to scrape this web page with Scrapy, and I can get all the data I need except the distance. The link is https://www.thedogs.com.au/racing/albion-park/2024-05-30/10/tab-flying-amy-classic-h?trial=false

The distance is 520m. How do I get Scrapy to scrape this value? Please see the line marked in bold (wrapped in `***`) in the code below.

# Crawl rule: extract links from the anchors inside the race-time cells,
# pass each fetched page to parse_item, and keep following links from it.
rules = (
        Rule(LinkExtractor(restrict_xpaths="//td[@class='meetings-venues__race-time']/a"), callback='parse_item', follow=True),
    )

def parse_item(self, response):
    """Yield one dict of result fields for every runner row on the page."""
    # Placeholder only; it is rebound to a fresh dict on every loop iteration.
    item = {}

    hxs = Selector(response)
    # One <tr> per runner; every field lookup below is relative to that row.
    divs = hxs.xpath('//tr[@class="accordion__anchor race-runner"]')   
            
           # titles = [hxs.select('//tr[@class="index class_tr group-6487"] | //tr[@class="index class_tr group-6488"] | //tr[@class="index class_tr group-6489"]')]

    for div in divs:
        item = {
            'grade' : div.xpath(".//td[@class='race-runners__grade']/text()").extract(),
            'greyhound' : div.xpath('./td[3]/div[1]/a/text()').extract(),
            'position' : div.xpath('./td[1]/text()').extract(),
            'trainer' : div.xpath(".//div[@class='race-runners__name__trainer']/a/text()").extract(),
            'weight' : div.xpath(".//td[@class='race-runners__weight']/text()").extract(),
            'first_sec' : div.xpath(".//td[@class='race-runners__sectional']/text()").extract_first(),
            'second_sec' : div.xpath(".//td[@class='race-runners__sectional'][2]/text()").extract(),
            'time' : div.xpath(".//td[@class='race-runners__time']/text()").extract(),
            'margin' : div.xpath(".//td[@class='race-runners__margin']/text()").extract(),
            # NOTE(review): the surrounding *** is the question's bold markup, not Python.
            # This lookup starts at the runner row (`.//`), so it returns an empty
            # list whenever the race-header element is not a descendant of that
            # row - presumably the case here, since the header is page-level; confirm.
            ***'distance' : div.xpath(".//div[@class='race-header__info__grade']/a/text()").extract(),***
            'starting_price' : div.xpath(".//td[@class='race-runners__starting-price']/text()").extract(),
            # Date and track are taken from the URL path, counted from the end.
            'date' : response.url.split('/')[-3],
            'track' : response.url.split('/')[-4],
            'rug' : div.xpath('.//td[@class="table__cell--tight race-runners__box"]/sprite-svg/@name').get()
            #'rug' : div.xpath('//td[@class="table__cell--tight race-runners__box"]/sprite-svg/@name').extract()
            }

        yield item

Solution

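  • Grab the distance outside of the loop. Inside the loop every XPath is evaluated relative to a single runner row, and the race header is not part of that row, so the lookup comes back empty. I'm using CSS but you can equally use XPath. Note that the header text is returned as 'OPEN 520m'; if you only want '520m', take the last whitespace-separated token.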

    from scrapy import Spider, Request
    
    class MySpider(Spider):
        """Scrape one race-result page and emit a dict for every runner row."""

        name = "thedogs"

        start_urls = ["https://www.thedogs.com.au/racing/albion-park/2024-05-30/10/tab-flying-amy-classic-h?trial=false"]

        def start_requests(self):
            """Issue one request per start URL, handled by ``parse``."""
            yield from (Request(start_url, self.parse) for start_url in self.start_urls)

        async def parse(self, response):
            """Yield one result dict per runner row found in ``response``.

            The distance is read once from the whole page; every other field
            is read relative to the runner's own table row.
            """
            # Page-level value: selected from the response, not from a row.
            race_distance = response.css(".race-header__info__grade::text").get()

            # URL path looks like .../<track>/<date>/<race-no>/<slug>, so the
            # date and track are indexed from the end of the split path.
            url_parts = response.url.split('/')

            for row in response.xpath('//tr[@class="accordion__anchor race-runner"]'):
                grade = row.xpath(".//td[@class='race-runners__grade']/text()").getall()
                greyhound = row.xpath('./td[3]/div[1]/a/text()').getall()
                position = row.xpath('./td[1]/text()').getall()
                trainer = row.xpath(".//div[@class='race-runners__name__trainer']/a/text()").getall()
                weight = row.xpath(".//td[@class='race-runners__weight']/text()").getall()
                # First sectional is a single string; the second stays a list.
                first_sec = row.xpath(".//td[@class='race-runners__sectional']/text()").get()
                second_sec = row.xpath(".//td[@class='race-runners__sectional'][2]/text()").getall()
                run_time = row.xpath(".//td[@class='race-runners__time']/text()").getall()
                margin = row.xpath(".//td[@class='race-runners__margin']/text()").getall()
                starting_price = row.xpath(".//td[@class='race-runners__starting-price']/text()").getall()
                rug = row.xpath('.//td[@class="table__cell--tight race-runners__box"]/sprite-svg/@name').get()

                yield {
                    'distance': race_distance,
                    'grade': grade,
                    'greyhound': greyhound,
                    'position': position,
                    'trainer': trainer,
                    'weight': weight,
                    'first_sec': first_sec,
                    'second_sec': second_sec,
                    'time': run_time,
                    'margin': margin,
                    'starting_price': starting_price,
                    'date': url_parts[-3],
                    'track': url_parts[-4],
                    'rug': rug,
                }
    

    Output:

    {'distance': 'OPEN 520m', 'grade': ['5'], 'greyhound': ['Pronouns'], 'position': ['1st'], 'trainer': ['T: Tony Brett'], 'weight': ['26.60'], 'first_sec': '5.54', 'second_sec': ['17.09'], 'time': ['29.77'], 'margin': [], 'starting_price': ['$5.50'], 'date': '2024-05-30', 'track': 'albion-park', 'rug': 'rug_1'}
    {'distance': 'OPEN 520m', 'grade': ['4'], 'greyhound': ['Cindy Keeping'], 'position': ['2nd'], 'trainer': ['T: Charmaine Roberts'], 'weight': ['28.90'], 'first_sec': '5.58', 'second_sec': ['17.27'], 'time': ['30.05'], 'margin': ['4.00'], 'starting_price': ['$8.50'], 'date': '2024-05-30', 'track': 'albion-park', 'rug': 'rug_8'}
    {'distance': 'OPEN 520m', 'grade': ['5'], 'greyhound': ['Excavation'], 'position': ['3rd'], 'trainer': ['T: Jason Thompson'], 'weight': ['28.70'], 'first_sec': '5.56', 'second_sec': ['17.49'], 'time': ['30.21'], 'margin': ['6.25'], 'starting_price': ['$11.00'], 'date': '2024-05-30', 'track': 'albion-park', 'rug': 'rug_4'}
    {'distance': 'OPEN 520m', 'grade': ['5'], 'greyhound': ['Which Trap'], 'position': ['4th'], 'trainer': ['T: John Dart'], 'weight': ['32.30'], 'first_sec': '5.67', 'second_sec': ['17.59'], 'time': ['30.32'], 'margin': ['7.75'], 'starting_price': ['$101.00'], 'date': '2024-05-30', 'track': 'albion-park', 'rug': 'rug_6'}
    {'distance': 'OPEN 520m', 'grade': ['5'], 'greyhound': ['Mackenna'], 'position': ['5th'], 'trainer': ['T: Michelle Sultana'], 'weight': ['28.20'], 'first_sec': '5.54', 'second_sec': ['17.71'], 'time': ['30.52'], 'margin': ['10.50'], 'starting_price': ['$2.10'], 'date': '2024-05-30', 'track': 'albion-park', 'rug': 'rug_7'}
    {'distance': 'OPEN 520m', 'grade': ['4'], 'greyhound': ['Super Scrub'], 'position': ['6th'], 'trainer': ['T: Travis Elson'], 'weight': ['32.80'], 'first_sec': '5.57', 'second_sec': ['17.31'], 'time': ['30.57'], 'margin': ['11.50'], 'starting_price': ['$8.00'], 'date': '2024-05-30', 'track': 'albion-park', 'rug': 'rug_2'}
    {'distance': 'OPEN 520m', 'grade': ['4'], 'greyhound': ["History's Coming"], 'position': ['7th'], 'trainer': ['T: Tomas Rees'], 'weight': ['33.60'], 'first_sec': '5.58', 'second_sec': ['17.43'], 'time': ['30.61'], 'margin': ['12.00'], 'starting_price': ['$5.00'], 'date': '2024-05-30', 'track': 'albion-park', 'rug': 'rug_3'}