How to Scrape NBA stats page using rvest

The page I am interested in scraping is here: https://www.nba.com/stats/teams/opponent-shots-general?GeneralRange=Pullups&SeasonType=Regular+Season

I have the following code which I have tried running

library(httr2)

# Address of the stats page that the request is sent to.
req_url <- "https://www.nba.com/stats/teams/opponent-shots-general?GeneralRange=Pullups&SeasonType=Regular+Season"

# Browser-like user agent string sent with the request.
browser_ua <- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

# Build the request step by step: user agent first, then the extra headers.
page_request <- request(req_url)
page_request <- req_user_agent(page_request, browser_ua)
page_request <- req_headers(
  page_request,
  Accept = "*/*",
  Origin = "https://www.nba.com",
  Referer = "https://www.nba.com/"
)

# Perform the request and parse the body as JSON
# (resp_body_json() errors if the response is not declared as JSON).
page_response <- req_perform(page_request)
json <- resp_body_json(page_response)

hdr <- json$resultSets$headers

# Flatten the two-level header structure into a single vector of names:
# level 1 carries the group labels (preceded by `columnsToSkip` blank
# entries, each label repeated three times), level 2 carries the labels of
# the individual columns.
group_level <- hdr[[1]]
group_labels <- c(
  rep("", group_level$columnsToSkip),
  rep(unlist(group_level$columnNames), each = 3)
)
column_labels <- unlist(hdr[[2]]$columnNames)

# Pair each group label with its column label, then normalise the result.
clean_names <- janitor::make_clean_names(paste(group_labels, column_labels))
clean_names


# Attach the column names to every row of `rowSet`; a list of named lists
# is something dplyr::bind_rows() can stack into a single tibble.
named_rows <- lapply(
  json$resultSets$rowSet,
  function(row) setNames(row, clean_names)
)
data <- dplyr::bind_rows(named_rows)

When I run this I get the following error

Error in `resp_body_json()`:
! Unexpected content type "text/html".
• Expecting type "application/json" or suffix "json".

Could someone please point me to what the `req_url` variable should be?

Solution

Apologies, as I do my coding in Python and it's been about 5 years since I touched R. My Python solution, converted to R, is below.

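As mentioned, there is an API that serves this data as JSON. The URL in the question is the HTML stats page itself, which is why `resp_body_json()` reports the content type "text/html"; the request has to go to the `stats.nba.com` endpoint instead. The key for the NBA site is that you also need the `Referer` header on the request.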

library(httr)
library(jsonlite)
library(dplyr)
library(janitor)

# Endpoint that the query below is sent to
base_url <- "https://stats.nba.com/stats/leaguedashoppptshot"

# Query-string parameters, in the order they are sent.
# Empty strings and "0" are passed through to the endpoint as-is.
params <- list(
  Conference = "",
  DateFrom = "",
  DateTo = "",
  Division = "",
  GameSegment = "",
  # Shot-type filter
  GeneralRange = "Pullups",
  LastNGames = "0",
  LeagueID = "00",
  Location = "",
  Month = "0",
  OpponentTeamID = "0",
  Outcome = "",
  PORound = "0",
  PerMode = "PerGame",
  Period = "0",
  PlayerExperience = "",
  PlayerPosition = "",
  # Season selection
  Season = "2023-24",
  SeasonSegment = "",
  SeasonType = "Regular Season",
  TeamID = "0",
  VsConference = "",
  VsDivision = ""
)

# Request headers sent with the call to the stats endpoint, as a named
# character vector (name = header field, value = header content).
headers <- c(
  # Browser identification, assembled from its three space-separated parts
  `User-Agent` = paste(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/120.0.0.0 Safari/537.36"
  ),
  # Content negotiation and caching
  `Accept-Language` = "en-US,en;q=0.9",
  `Accept` = "application/json, text/plain, */*",
  `Cache-Control` = "no-cache",
  `Connection` = "keep-alive",
  # Where the request goes and where it claims to come from
  `Host` = "stats.nba.com",
  `Origin` = "https://www.nba.com",
  `Pragma` = "no-cache",
  `Referer` = "https://www.nba.com/",
  # Fetch metadata
  `Sec-Fetch-Dest` = "empty",
  `Sec-Fetch-Mode` = "cors",
  `Sec-Fetch-Site` = "same-site",
  # NBA-specific fields
  `x-nba-stats-origin` = "stats",
  `x-nba-stats-token` = "true"
)

# Make the GET request: `params` becomes the query string, `headers` is sent
# as-is, and the call is abandoned after 10 seconds.
response <- GET(
  url = base_url,
  add_headers(.headers = headers),
  query = params,
  timeout(10)
)

# Check if the request was successful
if (status_code(response) != 200) {
  # Report the status and whatever body came back to help diagnose the failure
  print(paste("Request failed with status code", status_code(response)))
  print(content(response, as = "text", encoding = "UTF-8"))
} else {
  # Parse the JSON response WITHOUT simplification. With jsonlite's default
  # simplification, `resultSets` (an array of objects) is turned into a data
  # frame, so `resultSets[[1]]` would be its first column rather than the
  # first result set, and `$headers` on it would fail.
  json_text <- content(response, as = "text", encoding = "UTF-8")
  json_data <- fromJSON(json_text, simplifyVector = FALSE)

  # Extract column headers and row data from the first result set
  result_set <- json_data$resultSets[[1]]
  column_headers <- unlist(result_set$headers)

  # Turn each row into a named list: JSON nulls arrive as NULL and are
  # replaced by NA so every row keeps one value per column.
  named_rows <- lapply(result_set$rowSet, function(row) {
    row[vapply(row, is.null, logical(1))] <- list(NA)
    setNames(row, column_headers)
  })

  # Stack the rows into a data frame (bind_rows() keeps each column's own
  # type instead of coercing everything to character), then clean the
  # column names: lowercase, with underscores instead of spaces.
  df <- bind_rows(named_rows) %>%
    as.data.frame() %>%
    clean_names()

  # Display the DataFrame
  print(head(df))
}