The page I am interested in scraping is here: https://www.nba.com/stats/teams/opponent-shots-general?GeneralRange=Pullups&SeasonType=Regular+Season
I have the following code which I have tried running
library(httr2)

# The page URL serves HTML (hence the "Unexpected content type 'text/html'"
# error); the table's data comes from the stats.nba.com JSON API instead.
req_url <- "https://stats.nba.com/stats/leaguedashoppptshot"

json <-
  request(req_url) |>
  # The stats API expects every query parameter to be present, even if blank.
  req_url_query(
    Conference = "", DateFrom = "", DateTo = "", Division = "",
    GameSegment = "", GeneralRange = "Pullups", LastNGames = "0",
    LeagueID = "00", Location = "", Month = "0", OpponentTeamID = "0",
    Outcome = "", PORound = "0", PerMode = "PerGame", Period = "0",
    PlayerExperience = "", PlayerPosition = "", Season = "2023-24",
    SeasonSegment = "", SeasonType = "Regular Season", TeamID = "0",
    VsConference = "", VsDivision = ""
  ) |>
  req_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") |>
  req_headers(
    Accept = "*/*",
    Origin = "https://www.nba.com",
    # The Referer header is what the NBA API checks; without it the
    # request is blocked or hangs.
    Referer = "https://www.nba.com/"
  ) |>
  req_perform() |>
  resp_body_json()
hdr <- json$resultSets$headers

# Build column names from the two-level header structure: each top-level
# group name spans three sub-columns, preceded by blanks for the columns
# the first header row skips.
top_level <- hdr[[1]]
group_names <- rep(unlist(top_level$columnNames), each = 3)
prefix <- c(rep("", top_level$columnsToSkip), group_names)
sub_names <- unlist(hdr[[2]]$columnNames)
clean_names <- janitor::make_clean_names(paste(prefix, sub_names))
clean_names
# Turn each unnamed row list into a named list so dplyr::bind_rows()
# can stack them into a single tibble.
rows <- json$resultSets$rowSet
named_rows <- lapply(rows, \(row) setNames(row, clean_names))
data <- dplyr::bind_rows(named_rows)
When I run this I get the following error
Error in `resp_body_json()`:
! Unexpected content type "text/html".
• Expecting type "application/json" or suffix "json".
Could someone point me to what the req_url variable should be, please?
Apologies — I do my coding in Python and it's been about 5 years since I touched R, but my Python solution converted to R is below.
As mentioned, there is an API to access this data, and it returns JSON. The key for the NBA site is that you need the Referer header in the request.
library(httr)
library(jsonlite)
library(dplyr)
library(janitor)
# Endpoint behind the opponent-shots page on nba.com.
base_url <- "https://stats.nba.com/stats/leaguedashoppptshot"

# Query parameters for the request. The stats API requires every key to
# be present, even when its value is blank.
params <- list(
  Conference = "",
  DateFrom = "",
  DateTo = "",
  Division = "",
  GameSegment = "",
  GeneralRange = "Pullups",
  LastNGames = "0",
  LeagueID = "00",
  Location = "",
  Month = "0",
  OpponentTeamID = "0",
  Outcome = "",
  PORound = "0",
  PerMode = "PerGame",
  Period = "0",
  PlayerExperience = "",
  PlayerPosition = "",
  Season = "2023-24",
  SeasonSegment = "",
  SeasonType = "Regular Season",
  TeamID = "0",
  VsConference = "",
  VsDivision = ""
)
# Request headers the NBA stats API expects; the Referer and the
# x-nba-stats-* pair are the ones it actually gates on.
headers <- c(
  "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Accept-Language" = "en-US,en;q=0.9",
  "Accept" = "application/json, text/plain, */*",
  "Cache-Control" = "no-cache",
  "Connection" = "keep-alive",
  "Host" = "stats.nba.com",
  "Origin" = "https://www.nba.com",
  "Pragma" = "no-cache",
  "Referer" = "https://www.nba.com/",
  "Sec-Fetch-Dest" = "empty",
  "Sec-Fetch-Mode" = "cors",
  "Sec-Fetch-Site" = "same-site",
  "x-nba-stats-origin" = "stats",
  "x-nba-stats-token" = "true"
)
# Make the GET request.
response <- GET(url = base_url, add_headers(.headers = headers), query = params, timeout(10))

# Check if the request was successful.
if (status_code(response) != 200) {
  print(paste("Request failed with status code", status_code(response)))
  print(content(response, as = "text"))
} else {
  # Parse the JSON response.
  json_text <- content(response, as = "text", encoding = "UTF-8")
  json_data <- fromJSON(json_text)
  # With jsonlite's default simplification, `resultSets` is a one-row data
  # frame whose `headers` and `rowSet` are list-columns. Index those
  # list-columns directly: `json_data$resultSets[[1]]` would return the
  # first *column* (`name`), not the first result set, leaving
  # `column_headers` NULL further down.
  result_sets <- json_data$resultSets
  column_headers <- unlist(result_sets$headers[[1]])
  # rowSet is already simplified to a character matrix, so it converts to
  # a data frame directly; no do.call(rbind, ...) needed.
  row_data <- result_sets$rowSet[[1]]
  df <- as.data.frame(row_data, stringsAsFactors = FALSE)
  names(df) <- column_headers
  # Clean column names: lower case, underscores instead of spaces.
  df <- df %>%
    clean_names()
  # Display the DataFrame.
  print(head(df))
}