r web-scraping rvest

How to Scrape NBA stats page using rvest


The page I am interested in scraping is here: https://www.nba.com/stats/teams/opponent-shots-general?GeneralRange=Pullups&SeasonType=Regular+Season

I have tried running the following code:

library(httr2)

# Address of the stats page being requested
req_url <- "https://www.nba.com/stats/teams/opponent-shots-general?GeneralRange=Pullups&SeasonType=Regular+Season"

# User agent string sent with the request
browser_ua <- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

# Build the request step by step: URL, user agent, then extra headers
nba_request <- request(req_url)
nba_request <- req_user_agent(nba_request, browser_ua)
nba_request <- req_headers(
  nba_request,
  Accept = "*/*",
  Origin = "https://www.nba.com",
  Referer = "https://www.nba.com/"
)

# Perform the request and parse the response body as JSON
nba_response <- req_perform(nba_request)
json <- resp_body_json(nba_response)

hdr <- json$resultSets$headers

# Column names come from a 2-level header: the first level is padded with
# empty strings for the skipped columns and each of its names is repeated
# 3 times, then pasted element-wise onto the second-level names.
top_level_names <- c(
  rep("", hdr[[1]]$columnsToSkip),
  rep(unlist(hdr[[1]]$columnNames), each = 3)
)
second_level_names <- unlist(hdr[[2]]$columnNames)

clean_names <- janitor::make_clean_names(
  paste(top_level_names, second_level_names)
)
clean_names


# Name every row of `rowSet` with the cleaned column names so that
# dplyr::bind_rows() can stack the named lists into a single tibble
named_rows <- lapply(
  json$resultSets$rowSet,
  function(row) setNames(row, clean_names)
)
data <- dplyr::bind_rows(named_rows)

When I run this, I get the following error:

Error in `resp_body_json()`:
! Unexpected content type "text/html".
• Expecting type "application/json" or suffix "json".

Could someone please tell me what the `req_url` variable should be?


Solution

  • Apologies, as I do my coding in Python and it's been about 5 years since I touched R, but my Python solution converted to R is below.

    As mentioned, there is an API that provides this data and returns it as JSON. The key for the NBA site is that you need the `Referer` header in the request.

    library(httr)
    library(jsonlite)
    library(dplyr)
    library(janitor)
    
    # Endpoint that is queried by the GET request
    base_url <- "https://stats.nba.com/stats/leaguedashoppptshot"

    # Query-string parameters sent with the request; every value is a string
    # and an empty string leaves that filter unset
    params <- list(
        Conference = "",
        DateFrom = "",
        DateTo = "",
        Division = "",
        GameSegment = "",
        GeneralRange = "Pullups",
        LastNGames = "0",
        LeagueID = "00",
        Location = "",
        Month = "0",
        OpponentTeamID = "0",
        Outcome = "",
        PORound = "0",
        PerMode = "PerGame",
        Period = "0",
        PlayerExperience = "",
        PlayerPosition = "",
        Season = "2023-24",
        SeasonSegment = "",
        SeasonType = "Regular Season",
        TeamID = "0",
        VsConference = "",
        VsDivision = ""
    )
    
    # Request headers sent to the stats API, as a named character vector.
    # The user agent is assembled from its three space-separated parts.
    browser_user_agent <- paste(
        c(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        collapse = " "
    )
    headers <- c(
        `User-Agent` = browser_user_agent,
        `Accept-Language` = "en-US,en;q=0.9",
        Accept = "application/json, text/plain, */*",
        `Cache-Control` = "no-cache",
        Connection = "keep-alive",
        Host = "stats.nba.com",
        Origin = "https://www.nba.com",
        Pragma = "no-cache",
        Referer = "https://www.nba.com/",
        `Sec-Fetch-Dest` = "empty",
        `Sec-Fetch-Mode` = "cors",
        `Sec-Fetch-Site` = "same-site",
        `x-nba-stats-origin` = "stats",
        `x-nba-stats-token` = "true"
    )
    
    # Make the GET request: `params` becomes the query string, `headers` is
    # sent as-is, and the call gives up after 10 seconds
    response <- GET(
        url = base_url,
        add_headers(.headers = headers),
        query = params,
        timeout(10)
    )
    
    # Check if the request was successful
    if (status_code(response) != 200) {
        # Report the status and the raw body to help diagnose the failure
        message("Request failed with status code ", status_code(response))
        message(content(response, as = "text", encoding = "UTF-8"))
    } else {
        # Parse the JSON response WITHOUT simplification. With jsonlite's
        # default simplification an array of objects is collapsed into a data
        # frame, so `resultSets[[1]]` would pick that data frame's first
        # column instead of the first result set.
        body_text <- content(response, as = "text", encoding = "UTF-8")
        json_data <- fromJSON(body_text, simplifyVector = FALSE)
        
        # Extract headers and row data from the first result set
        # (assumes each result set carries `headers` and `rowSet` entries)
        result_set <- json_data$resultSets[[1]]
        column_headers <- unlist(result_set$headers)
        
        # Turn every row into a character vector; a JSON null arrives as NULL
        # and is replaced by NA so that all rows keep the same length
        row_data <- lapply(result_set$rowSet, function(row) {
            vapply(
                row,
                function(cell) {
                    if (is.null(cell)) NA_character_ else as.character(cell)
                },
                character(1)
            )
        })
        
        # Create a data frame from the row data and column headers; building
        # the matrix from a character vector also works for an empty rowSet
        cell_matrix <- matrix(
            as.character(unlist(row_data)),
            ncol = length(column_headers),
            byrow = TRUE
        )
        df <- as.data.frame(cell_matrix, stringsAsFactors = FALSE)
        names(df) <- column_headers
        
        # Restore numeric/integer/logical column types from the text cells
        df <- type.convert(df, as.is = TRUE)
        
        # Clean column names (janitor::clean_names)
        df <- clean_names(df)
        
        # Display the first rows of the data frame
        print(head(df))
    }