I'm web scraping in R using rvest and I'm trying to get the personal record single and average of each event of the chosen person. It does indeed return the PRs but it also returned data from each person's competitions, and I don't need that.
Here's my code:
#' get_person_data
#'
#' Gets person data from WCA website using the ID provided.
#'
#' @importFrom rvest read_html html_node html_text html_attr
#' @importFrom dplyr %>%
#' @param id ID of the person to be searched.
#' @param export_csv Whether or not the data should be exported. Set to false by default.
#' @export
get_person_data <- function(id, export_csv=FALSE, directory=NULL) {
html <- rvest::read_html(paste("https://www.worldcubeassociation.org/persons/", id, sep = ""))
name <- html %>%
rvest::html_node("div.text-center h2") %>%
rvest::html_text()
image_url <- html %>%
rvest::html_node("div.text-center img.avatar") %>%
rvest::html_attr("src")
country <- html %>%
rvest::html_node(".country") %>%
rvest::html_text()
gender <- html %>%
rvest::html_node("table.table tbody td:nth-child(3)") %>%
rvest::html_text()
comps <- html %>%
rvest::html_node("table.table tbody td:nth-child(4)") %>%
rvest::html_text()
comp_solves <- html %>%
rvest::html_node("table.table tbody td:nth-child(5)") %>%
rvest::html_text()
events <- html %>%
rvest::html_nodes("table.table tbody tr")
event_data <- lapply(events, function(event) {
event_name <- event %>%
rvest::html_node("td:nth-child(1)") %>%
rvest::html_text(trim=TRUE)
single <- event %>%
rvest::html_node("td:nth-child(5)") %>%
rvest::html_text(trim=TRUE)
average <- event %>%
rvest::html_node("td:nth-child(6)") %>%
rvest::html_text(trim=TRUE)
data.frame(
Event = event_name,
Single = single,
Average = average,
stringsAsFactors = FALSE
)
})
event_data <- do.call(rbind, event_data)
if(export_csv) {
key_data <- data.frame(
name,
id,
image_url,
country,
gender,
comps,
comp_solves
)
names(key_data) <- c("Name", "ID", "Avatar", "Country", "Gender", "Competitions", "Completed Solves")
final_data <- cbind(key_data, event_data)
write.csv(final_data, file=paste0(directory, "/person_data.csv"), fileEncoding = "UTF-8")
print("Saved to directory")
}
cat("Name:", name, "\n")
cat("ID:", id, "\n")
cat("Image URL:", image_url, "\n")
cat("Country:", country, "\n")
cat("Gender:", gender, "\n")
cat("Competitions:", comps, "\n")
cat("Completed Solves:", comp_solves, "\n")
cat("Event Data:\n")
print(event_data)
}
event_data
displays this:
Event Data:
Event Single Average
1 3x3x3 Cube 15.02 17.47
2 3x3x3 Cube <NA> <NA>
3 Avenida Chile VIII 2024 18.06
4 17.47
5 Monterrey Bogotá VI 2024 30.20
Using data from here: https://www.worldcubeassociation.org/persons/2024ESPI01 (that's not me btw)
I've tried doing this:
event_table <- html %>%
html_node(xpath = "//table[contains(@class, 'table table-striped table-condensed')][1]") %>%
html_node("tbody") %>%
html_nodes("tr")
But it just returned NULL.
Something like this ?
library(rvest)
options(pillar.sigfig = 7)
read_html("https://www.worldcubeassociation.org/persons/2024ESPI01") |>
html_element(".personal-records table") |>
html_table() |>
dplyr::select(Event, Single, Average)
#> # A tibble: 1 × 3
#> Event Single Average
#> <chr> <dbl> <dbl>
#> 1 3x3x3 Cube 15.02 17.47
There's also Unofficial World Cube Association API that uses daily data exports from https://www.worldcubeassociation.org/export/results :
jsonlite::fromJSON(paste0("https://raw.githubusercontent.com/robiningelbrecht/",
"wca-rest-api/master/api/persons/2024ESPI01.json")) |>
getElement("rank") |>
purrr::list_rbind(names_to = "type") |>
dplyr::select(type, eventId, best) |>
tidyr::pivot_wider(names_from = type, values_from = best)
#> # A tibble: 1 × 3
#> eventId singles averages
#> <chr> <int> <int>
#> 1 333 1502 1747
Created on 2024-06-25 with reprex v2.1.0