I'm scraping with Selenium and Scrapy. The main problem is linking `__init__` and `parse`: since `parse` never receives a usable response, the `driver.get(url)` call inside `parse` is never reached.
import scrapy
from scrapy import FormRequest
from scrapy.http import HtmlResponse
from datetime import datetime, timedelta
from bloomberg.items import BloombergItem
import json
from scrapy.shell import inspect_response
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
class HistoricalDataSpider(scrapy.Spider):
    """Scrape USD rows from the econcal.forexprostools.com economic calendar.

    Scrapy only schedules the request; the page itself is rendered by a
    Selenium-driven Chrome instance inside ``parse`` so that JS-generated
    rows are present before the CSS selectors run.
    """

    name = 'historical_data'
    allowed_domains = ['econcal.forexprostools.com']
    start_urls = ['http://econcal.forexprostools.com/']
    # The site answers Scrapy's plain HTTP client with 403 (see crawl log).
    # Without this, HttpErrorMiddleware drops the response before parse()
    # runs, so Selenium never gets a chance to fetch the page.
    handle_httpstatus_list = [403]
    output = []
    not_parsed_pages = 0

    def __init__(self, *args, **kwargs):
        # Forward *args/**kwargs so `scrapy crawl ... -a key=value` spider
        # arguments still work, and let Spider.__init__ do its setup.
        super().__init__(*args, **kwargs)
        # To run headless:
        # opts = Options(); opts.add_argument('--headless')
        # self.driver = webdriver.Chrome(options=opts)
        self.driver = webdriver.Chrome()

    @staticmethod
    def _extract_number(cells):
        """Return the numeric text of an act/fore/prev cell, or None.

        ``cells`` is the list from ``row.css('... span::text').extract()``;
        index 1 holds the displayed value (index 0 is a label span).
        Thousands separators are stripped before the numeric match.
        """
        if len(cells) < 2:
            return None
        text = cells[1].strip()
        if not text:
            return None
        match = re.search(r'[-0-9.]+', text.replace(',', ''))
        return match.group() if match else None

    def parse(self, response):
        # `response` may be the 403 shell Scrapy downloaded; only its URL is
        # used. The real, JS-rendered page comes from Selenium.
        self.driver.get(response.url)
        page_source = self.driver.page_source
        # The rendered page is identical for every 30-day window below, so
        # build the HtmlResponse once instead of once per iteration.
        html_response = HtmlResponse(
            url=self.driver.current_url, body=page_source, encoding='utf-8')
        # NOTE(review): self.start_date / self.end_date / self.scraped_dates
        # are not set anywhere in this file -- presumably injected via `-a`
        # spider arguments or an external caller. Confirm before deploying.
        for offset in range(0, (self.end_date - self.start_date).days + 1, 30):
            start_date = self.start_date + timedelta(offset)
            end_date = min(self.start_date + timedelta(offset + 30),
                           self.end_date)
            # Skip 30-day windows already covered by scraped_dates, except
            # near the end of the range (last 90 days are always re-scraped).
            skip = False
            for i, scraped in enumerate(self.scraped_dates):
                if (start_date <= scraped <= end_date
                        and (self.end_date - scraped).days > 90):
                    skip = True
                    self.scraped_dates = self.scraped_dates[i:]
                    break
            if skip:
                continue
            for row in html_response.css('button'):
                # extract_first() may be None on rows without a <div>; guard
                # so the 'in' test cannot raise TypeError.
                if 'USD' not in (row.css('div::text').extract_first() or ''):
                    continue
                stamp = row.css('button::attr(event_timestamp)').extract_first()
                event_datetime = datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
                date = event_datetime.strftime('%m/%d/%Y')
                # Named event_time (not `time`) to avoid shadowing the
                # imported `time` module.
                event_time = event_datetime.strftime('%H:%M')
                event_name = row.css('.left.event::text').extract_first().strip()
                actual = self._extract_number(row.css('.act span::text').extract())
                forecast = self._extract_number(row.css('.fore span::text').extract())
                prev = self._extract_number(row.css('.prev span::text').extract())
                new_row = [date, event_time, event_name, actual, forecast, prev]
                # The same page is re-parsed for every window; dedupe here.
                if new_row not in self.output:
                    self.output.append(new_row)
        if self.not_parsed_pages == 0:
            item = BloombergItem()
            item['data'] = self.output
            yield item

    def closed(self, reason):
        """Scrapy calls this once when the spider finishes; release Chrome.

        quit() terminates the driver and every window. The original code
        also called close() afterwards, which fails on an already-quit
        driver -- quit() alone is the correct cleanup.
        """
        self.driver.quit()
Running this, I get the errors below.
2024-06-09 04:55:54 [scrapy.middleware] INFO: Enabled item pipelines:
['bloomberg.pipelines.BloombergPipeline']
2024-06-09 04:55:54 [scrapy.core.engine] INFO: Spider opened
2024-06-09 04:55:54 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2024-06-09 04:55:54 [historical_data] INFO: Spider opened: historical_data
2024-06-09 04:55:54 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2024-06-09 04:55:55 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://econcal.forexprostools.com/> from <GET http://econcal.forexprostools.com/>
2024-06-09 04:55:55 [scrapy.core.engine] DEBUG: Crawled (403) <GET https://econcal.forexprostools.com/> (referer: None)
2024-06-09 04:55:55 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 https://econcal.forexprostools.com/>: HTTP status code is not handled or not allowed
2024-06-09 04:55:55 [scrapy.core.engine] INFO: Closing spider (finished)
Here, chromedriver starts without ever loading a URL and then closes. I suspect the crawler hits an error between `__init__` and `parse`. More precisely: occasionally I get the expected result, but most of the time the URLs are missed. Any help is appreciated.
Reinstall chromedriver and verify its path. Also, in my opinion, it's better to test with the Firefox browser.