r screen-scraping rvest

Problems storing authors when scraping Quotes to Scrape with rvest


I want to store the authors from Quotes to Scrape in a list. I have functions to get the quotes and the authors, and they work: in the console the scraping output looks correct, and the script finishes with a complete list of quotes. However, the script does not store the authors, and my list of authors stays empty.

Here are my functions (the script is very detailed because it is for a course):

library(rvest)
library(tidyverse)

# Entry point of the crawl.
start_url <- "http://quotes.toscrape.com/"
# Parsed copy of the landing page.
start_page <- start_url %>% read_html()
# Browsing session; get_quotes_pages() and get_author() read it as a global.
session <- session(start_url)

# Download the page at `page_url` and return all of its nodes that match
# the ".quote" CSS selector.
get_quotes_elements <- function(page_url) {
  page_url %>%
    read_html() %>%
    html_nodes(".quote")
}

# Build a list with the "Author", "Quote" and "Tags" of one quote node, and
# fetch the linked author page through get_author().
get_quote <- function(quote_element) {
  # Text of every descendant of the quote node matching a CSS selector.
  node_text <- function(css) {
    quote_element %>%
      html_nodes(css) %>%
      html_text()
  }
  text_value <- node_text(".text")
  author_value <- node_text(".author")
  tags_value <- node_text(".tags")

  result <- list()
  result["Author"] <- author_value
  result["Quote"] <- text_value
  result["Tags"] <- tags_value

  # The author link is relative, so prefix it with the site root.
  author_href <- quote_element %>%
    html_nodes("a[href*='/author/']") %>%
    html_attr("href")
  author_page_url <- paste0("https://quotes.toscrape.com", author_href)
  author <- get_author(author_page_url)
  # NOTE: `<-` creates a variable named `authors` that is local to this call;
  # the `authors` defined outside the function is read here but not modified.
  authors <- append(authors, author)
  print(paste(" Quote from ", author["Name"], " added"))
  result
}

# Collect the URLs of the quote listing pages by following the "next" link.
#
# Arguments:
#   start_url    URL of the first listing page.
#   pages_count  Maximum number of "next" links to follow (default 9, the
#                value that was previously hard-coded).
#
# Returns a list of page URLs, starting with `start_url`.
#
# Relies on the global `session` object for navigation.
get_quotes_pages <- function(start_url, pages_count = 9) {
  quotes_pages <- append(list(), start_url)
  for (i in seq_len(pages_count)) {
    current_page <- read_html(session %>% session_jump_to(quotes_pages[[i]]))
    next_href <- current_page %>%
      html_nodes("li.next a") %>%
      html_attr("href")
    # No "next" link on this page: stop here. Otherwise paste0() would return
    # the bare site root and a bogus page would be appended.
    if (length(next_href) == 0) {
      break
    }
    next_quotes_page_url <- paste0("https://quotes.toscrape.com", next_href)
    quotes_pages <- append(quotes_pages, next_quotes_page_url)
  }
  quotes_pages
}

# Visit an author page through the global `session` and return a list with
# the author's "Name", "BornDate" and "Description".
get_author <- function(author_page_url) {
  author_page <- read_html(session %>% session_jump_to(author_page_url))
  # Text of every node on the author page matching a CSS selector.
  field_text <- function(css) {
    author_page %>%
      html_nodes(css) %>%
      html_text()
  }
  name_value <- field_text(".author-title")
  born_value <- field_text(".author-born-date")
  description_value <- field_text(".author-description")

  author <- list()
  author["Name"] <- name_value
  author["BornDate"] <- born_value
  author["Description"] <- description_value
  author
}

And I launch it with this script (I have limited the example to the first page of quotes):

# Collect the listing page URLs, then keep only the first one for this example.
quotes_pages <- get_quotes_pages(start_url)
first_quotes_page <- quotes_pages[[1]]
# Global result lists; `authors` is the list that get_quote() refers to.
authors <- list()
quotes <- list()
for (quotes_page in first_quotes_page) {
  print(paste("Processing ", quotes_page))
  # One get_quote() call per ".quote" node found on the page.
  new_quotes <- quotes_page %>%
    get_quotes_elements() %>%
    lapply(get_quote)
  quotes <- c(quotes, new_quotes)
}

I thought that authors was declared as a global variable, so that my functions could fill it, but apparently that is not the case.

Many thanks in advance for your help!


Solution

  • Finally, as I said in the comments, I changed my function get_quote so that it appends to the list authors with a global assignment, since authors is a global variable. Here is the new code:

    # Build a list with the "Author", "Quote" and "Tags" of one quote node, and
    # append the linked author (fetched by get_author()) to the global `authors`.
    get_quote <- function(quote_element) {
      # Text of every descendant of the quote node matching a CSS selector.
      node_text <- function(css) {
        quote_element %>%
          html_nodes(css) %>%
          html_text()
      }
      text_value <- node_text(".text")
      author_value <- node_text(".author")
      tags_value <- node_text(".tags")

      result <- list()
      result["Author"] <- author_value
      result["Quote"] <- text_value
      result["Tags"] <- tags_value

      # The author link is relative, so prefix it with the site root.
      author_href <- quote_element %>%
        html_nodes("a[href*='/author/']") %>%
        html_attr("href")
      author_page_url <- paste0("https://quotes.toscrape.com", author_href)
      author <- get_author(author_page_url)
      # `<<-` assigns to the `authors` defined outside this function instead
      # of creating a local copy.
      authors <<- append(authors, author)
      print(paste(" Quote from ", author["Name"], " added"))
      result
    }