The website https://www.supralift.com/uk/itemsearch/results uses a JavaScript-based pager that doesn't expose any parameters in the URL that I could alter to navigate through the result pages.
Looking at the "Network" tab of Chrome DevTools, I see that the website also exposes some relatively complete information under the URL /api/search/item/summary
. However, calling this API endpoint returns an error instead of data:
{
"type": "about:blank",
"title": "Method Not Allowed",
"status": 405,
"detail": "Method 'GET' is not supported.",
"instance": "/api/search/item/summary"
}
How can I use this hidden API to scrape the website?
Many thanks in advance.
From the Network tab of DevTools you can see the actual request method; in this case it's POST
and not GET
, so httr::GET()
, for example, would not be of much use here. With a POST
request you are usually expected to include a payload; in this case it would be JSON with the search parameters (check Network > Payload in DevTools).
To make the same request in R, you could start by copying request as cURL and translating it through https://curlconverter.com/r/ or similar.
Or if httr2
is also OK, with httr2::curl_translate()
.
Request method & Copy as cURL (go for bash if it's meant to be an input for some conversion tool, even in Windows):
The following is based on httr2::curl_translate()
output, though in this case it's basically just a convenient shortcut to get a valid payload; using req_body_raw()
automatically sets the request method to POST
. httr2
also provides tools to perform requests iteratively, with the next request based on the current response, e.g. cycling through pages while checking whether we have reached the final page.
req_perform_iterative()
is used with the iterate_with_offset()
helper to generate the next request based on the current response; resp_pages
is an anonymous function that extracts the totalPages
value from the first JSON response, and resp_complete
checks whether last
is set in the JSON response. The iterator stops once resp_complete
returns TRUE
or the maximum number of requests (max_reqs
) has been made.
library(httr2)
# Template request for the first page of results.
# The endpoint rejects GET, so the JSON search parameters are attached with
# req_body_raw(), which also switches the request method to POST.
# `size` is increased so that each response carries more items.
req <-
  request("https://www.supralift.com/api/search/item/summary") |>
  req_url_query(size = "2000", page = "0") |>
  req_body_raw(
    body = '{"searchType":null,"bundleId":null,"identification":{"slNumber":null,"serialNumber":null,"supplierProductNumber":null,"slOrSupplierProductNumber":null},"configuration":{"buildClass":null,"manufacturer":null,"buildSeries":null,"acDriven":null,"powerUnit":null,"fuelType":null,"mastType":null,"gearBox":null,"tyres":null,"typeSearch":null},"buildDates":{"month":null,"year":{"from":null,"to":null}},"dimensions":{"overallHeight":{"from":null,"to":null},"workingHours":{"from":null,"to":null},"loadCentreOfGravity":{"from":null,"to":null},"capacity":{"from":null,"to":null},"forkLength":{"from":null,"to":null},"towingCapacity":{"from":null,"to":null}},"price":{"price":{"from":null,"to":null,"currency":"GBP"}},"cabin":{"cabin":null,"height":{"from":null,"to":null},"platformHeight":{"from":null,"to":null}},"engine":{"manufacturer":null,"power":null},"battery":{"exists":null,"manufacturer":null,"batteryType":null,"voltage":{"from":null,"to":null},"capacity":{"from":null,"to":null},"buildDates":null},"batteryCharger":{"exists":null,"manufacturer":null,"voltage":{"from":null,"to":null},"current":{"from":null,"to":null},"buildDates":{"month":null,"year":{"from":null,"to":null}}},"location":{"distance":100,"postCode":null,"region":null,"countryState":null,"country":null,"countryOrNull":null},"container":{"containerType":null,"hubhoehe8Z3":{"from":null,"to":null},"hubhoehe8Z4":{"from":null,"to":null},"hubhoehe8Z5":{"from":null,"to":null},"hubhoehe8Z6":{"from":null,"to":null},"hubhoehe8Z7":{"from":null,"to":null},"hubhoehe8Z8":{"from":null,"to":null},"hubhoehe8Z6I3":{"from":null,"to":null},"hubhoehe8Z6I4":{"from":null,"to":null},"hubhoehe8Z6I5":{"from":null,"to":null},"hubhoehe8Z6I6":{"from":null,"to":null},"hubhoehe8Z6I7":{"from":null,"to":null},"hubhoehe8Z6I8":{"from":null,"to":null},"hubhoehe9Z6I3":{"from":null,"to":null},"hubhoehe9Z6I4":{"from":null,"to":null},"hubhoehe9Z6I5":{"from":null,"to":null},"hubhoehe9Z6I6":{"from":null,"to":null},"hubhoehe9Z6I7":{"from":null
,"to":null},"hubhoehe9Z6I8":{"from":null,"to":null}},"offerDetails":{"offerBegin":null,"maxOfferAge":null,"activationDate":null,"offerFormat":null,"dealsOnly":null,"imagesOnly":null,"offerType":"SALE"},"additionalHydraulic":{"toValve":null,"complete":null},"liftAttributes":{"initialLift":null,"liftHeight":{"from":null,"to":null},"freeLift":{"from":null,"to":null},"liftPower":null},"isLicensedDealerOnly":null,"warranty":{"from":null,"to":null},"qualityRating":null,"attachments":null,"accessories":null,"customFields":null,"specialAttributes":{"explosionProof":null,"stainlessSteel":null,"autonomousMobileRobot":null},"freightTerm":null,"itemStatus":[],"backendSearch":false}',
    type = "application/json"
  )
# Fetch pages one after another: every follow-up request is derived from
# `req` by bumping the `page` query parameter. Iteration ends as soon as
# `resp_complete` yields TRUE or `max_reqs` requests have been made.
resps <- req |>
  req_perform_iterative(
    next_req = iterate_with_offset(
      "page",
      start = 0,
      # page count as reported in the JSON body
      resp_pages = function(resp) {
        resp_body_json(resp)$totalPages
      },
      # `last` flag as reported in the JSON body
      resp_complete = function(resp) {
        resp_body_json(resp)$last
      }
    ),
    # only the first 2 requests are issued here, as an example
    max_reqs = 2
  )
#> Iterating ■■■■■■■■■■■■■■■■ 50% | ETA: 3s
# Auto-print the list of httr2 response objects returned by
# req_perform_iterative(), one element per request that was made:
resps
#> [[1]]
#> <httr2_response>
#> POST https://www.supralift.com/api/search/item/summary?size=2000&page=0
#> Status: 200 OK
#> Content-Type: application/json
#> Body: In memory (2043726 bytes)
#>
#> [[2]]
#> <httr2_response>
#> POST https://www.supralift.com/api/search/item/summary?size=2000&page=1
#> Status: 200 OK
#> Content-Type: application/json
#> Body: In memory (1933115 bytes)
All data in a single data frame: 2000 (size) * 2 (max_reqs) = 4000 rows:
# Parse each response body as JSON (simplified to vectors / data frames),
# keep its `content` element, combine the pieces across all responses and
# convert the result to a tibble.
resps |>
  resps_data(function(resp) {
    parsed <- resp_body_json(resp, simplifyVector = TRUE)
    parsed$content
  }) |>
  tibble::as_tibble()
#> # A tibble: 4,000 × 18
#> id slNo type manufacturer powerUnit buildClass mastType liftHeight
#> <chr> <int> <chr> <chr> <chr> <chr> <chr> <int>
#> 1 66f8d101f… 1.29e7 R14S LINDE ELEKTRO SCHUBMAST… DREIFACH 6250
#> 2 66f8ccfaf… 1.29e7 EPL1… WEITERE ELEKTRO NIEDERHUB… <NA> 115
#> 3 66f8c894f… 1.29e7 27823 JUNGHEINRICH ELEKTRO WEITERE DREIFACH 5500
#> 4 66f8c886f… 1.29e7 27799 JUNGHEINRICH ELEKTRO WEITERE DREIFACH 5000
#> 5 66f8c595f… 1.29e7 T401… BOBCAT DIESEL TELESKOPA… <NA> 17000
#> 6 66f8c4e7f… 1.29e7 TFG … JUNGHEINRICH GAS VIERRADFR… DREIFACH 5000
#> 7 66f8b525f… 1.29e7 EFG … JUNGHEINRICH ELEKTRO VIERRADFR… DREIFACH 6000
#> 8 66f81676f… 1.28e7 R16G LINDE ELEKTRO SCHUBMAST… <NA> 6210
#> 9 66f80016f… 1.29e7 C500… COMBILIFT ELEKTRO SEITENSTA… DREIFACH 6000
#> 10 66f80015f… 1.29e7 H30D… LINDE DIESEL VIERRADFR… ZWEIFACH 3760
#> # ℹ 3,990 more rows
#> # ℹ 10 more variables: workingHours <int>, capacity <int>, yearOfBuild <int>,
#> # company <df[,6]>, offerType <chr>, isNew <lgl>, price <df[,4]>,
#> # qualityRating <int>, images <list>, specialRating <df[,4]>
Created on 2024-09-29 with reprex v2.1.1