I'm trying to retrieve data from here. Basically, it is game data returned by a GET
request that I've managed to identify here:
In the headers tab I can find the request details, so I do the following:
# Polling URL as captured from the browser; note that it carries fixed `t` and
# `sid` query parameters.
url<-"https://sports-eu-west-3.winamax.fr/uof-sports-server/socket.io/?language=FR&version=3.3.0&embed=false&EIO=3&transport=polling&t=PMkczty&sid=KTgYbJPFwJJvgPH2ELsN"
# Value for the Cookie request header (the braces only wrap a single string).
cookies<- {'PHPSESSIONID=2e8oodpcdd738jnbba7f86n6n3; PHPSESSID=f3fu2f2u256pjfar39d53ccmdm; AWSALB=5dUyh9AdOtQ4+5qn8gOVduPdhBZGKGsiyHHuLinQcqGy82blgtkB7u3Zd+oTyi6Qrr1GSNua5ZoAGnvZ6t/SpvnodZYjivX9DCdkZG7GuYeSysZryg2mNv7ec6Fh; AWSALBCORS=5dUyh9AdOtQ4+5qn8gOVduPdhBZGKGsiyHHuLinQcqGy82blgtkB7u3Zd+oTyi6Qrr1GSNua5ZoAGnvZ6t/SpvnodZYjivX9DCdkZG7GuYeSysZryg2mNv7ec6Fh; io=KTgYbJPFwJJvgPH2ELsN'
}
# GET the URL with User-Agent, Cookie and sec-ch-ua-platform headers, asking
# for JSON.
# NOTE(review): add_headers() and accept_json() are called unqualified, so
# presumably library(httr) is attached -- confirm.
t<-httr::GET(url,add_headers('User-Agent' = 'Mozilla/5.0', 'Cookie'= cookies,
'sec-ch-ua-platform' = "Windows"), accept_json())
This doesn't work: I get a 400 status code when running status_code(t).
However, I have noticed that when I remove everything after `transport=polling` in the URL, I receive a 200 status code and I am able to get a sid, which I believe must be used in a follow-up request, but I can't find how to do that.
For instance, using:
# Same endpoint as `url`, but without the `t` and `sid` query parameters.
url2<- 'https://sports-eu-west-3.winamax.fr/uof-sports-server/socket.io/?language=FR&version=3.3.0&embed=false&EIO=3&transport=polling'
# Same headers and cookies as in the first attempt.
t<-httr::GET(url2,add_headers('User-Agent' = 'Mozilla/5.0', 'Cookie'= cookies,
'sec-ch-ua-platform' = "Windows"), accept_json())
print(status_code(t))
> [1] 200
# Response body as text; the console output below shows that it contains a
# freshly issued `sid`.
get2json<- content(t, as = "text")
> [1] "97:0{\"sid\":\"fLiZq05y2HrayUG6EmxO\",\"upgrades\":[\"websocket\"],\"pingInterval\":25000,\"pingTimeout\":20000}2:40"
Any hints appreciated.
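As this is part of a Socket.IO message exchange, it takes a few more requests to get a similar response. Keeping the protocol descriptions for both Socket.IO and the underlying Engine.IO at hand helps us to put the captured requests and responses into context and to rebuild a polling session without using WebSocket for transport. It goes something like this:
1. init: a GET request without `sid` opens the session; the response payload carries the `sid`
2. post: a POST request with that `sid` sends our message; the server answers with `ok`
3. get: a GET request with the same `sid` polls for the response to our message
4. close: a final POST request with the same `sid` closes the session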
Here I'm using httr2 instead of httr to build a naive client that sends/receives a single message while hoping nothing else gets ahead in the message queue -- it only extracts the first packet from received payloads and does not check whether the requestId set by the client matches the one in the response.
Let's start with some helper functions to extract data from a payload and to build one. iterate_with_eio_stages() is somewhat specific to httr2: it is a function that creates a function for httr2::req_perform_iterative(), which is used to build request sequences where the next request is built or altered based on the previous response (here: the previously described stages init-post-get, and also close).
library(httr2)
library(stringr)
library(jsonlite)
library(uuid)
# Extract the 1st packet from a response payload that could potentially pack
# many packets.
# https://github.com/socketio/engine.io-protocol/tree/v3?tab=readme-ov-file#payload
#
# Payload encoding: <length1>:<packet1>[<length2>:<packet2>[...]]
# ex: '97:0{"sid":"AiFNZ0L0JU6g4Ui4D-4P","upgrades":["websocket"],"pingInterval":25000,"pingTimeout":20000}2:40'
#
# Arguments:
#   resp_string - payload as a character vector, e.g. resp |> resp_body_string()
#
# Returns a character vector with the content of the first packet of each
# payload, with the leading single-digit packet type removed.
#
# Stops with an error when a payload does not start with a "<length>:" prefix,
# instead of letting NA leak into the JSON parser downstream.
eio_1st_packet <- function(resp_string) {
  prefix_match <- regexpr("^[0-9]+(?=:)", resp_string, perl = TRUE)
  if (anyNA(prefix_match) || any(prefix_match < 0L)) {
    stop(
      "Engine.IO payload does not start with a '<length>:' prefix.",
      call. = FALSE
    )
  }

  len_s <- regmatches(resp_string, prefix_match)
  len <- strtoi(len_s, base = 10L)
  if (anyNA(len)) {
    stop("Engine.IO payload declares an unusable packet length.", call. = FALSE)
  }

  # the packet starts right after "<length>:" and is `len` characters long
  start_ <- nchar(len_s) + 2L
  packet <- substr(resp_string, start_, start_ + len - 1L)

  # drop the single leading packet type digit
  sub("^[0-9]", "", packet)
}
# test: only the JSON of the first packet is returned, without the length
# prefix and without the leading packet type digit
eio_1st_packet('97:0{"sid":"AiFNZ0L0JU6g4Ui4D-4P","upgrades":["websocket"],"pingInterval":25000,"pingTimeout":20000}2:40') |> str_view()
#> [1] │ {"sid":"AiFNZ0L0JU6g4Ui4D-4P","upgrades":["websocket"],"pingInterval":25000,"pingTimeout":20000}
# Encode a list as a single-packet payload: "<length>:42<json>".
#
# Arguments:
#   msg   - list with the message content,
#           e.g. list(requestId = UUIDgenerate(), route = "tournament:4")
#   event - event name, placed in front of `msg` in the JSON array
#
# Returns the payload string; the length prefix counts the JSON string plus
# the two characters of the "42" packet code.
eio_encode <- function(msg, event = "m") {
  packet_json <- list(event, msg) |> toJSON(auto_unbox = TRUE)
  packet_len <- str_length(packet_json) + 2
  str_glue("{packet_len}:42{packet_json}")
}
# test: a message list becomes a length-prefixed "42" packet holding a JSON
# array of the event name and the message (requestId differs on every run)
eio_encode(list(requestId = UUIDgenerate(), route = "tournament:4")) |> str_view()
#> [1] │ 83:42["m",{"requestId":"0b428121-f804-4fef-ab06-8644fae49af5","route":"tournament:4"}]
# Iterate through Engine.IO stages to make a single request over polling,
# inspired by httr2::iterate_with_offset() & other iteration helpers.
#
# Arguments:
#   msg   - list with the message content, encoded with eio_encode()
#   event - event name passed on to eio_encode()
#
# Returns a function(resp, req) to be used as the iterator of
# httr2::req_perform_iterative(). Each call receives the previous(!) response
# and its request and returns the next request, or NULL to stop iterating
# once the response to the close request has been received.
#
# Stops with an error when the open response carries no session id.
iterate_with_eio_stages <- function(msg, event = "m") {
  # names refer to the request whose response is being handled
  PREV_RESP_STAGES <- c("OPEN", "MSG_POST", "MSG_GET", "CLOSE")
  stage <- 0L
  eio_sid <- NA

  # previous(!) response and its request, returns next request
  function(resp, req) {
    # `<<-` is deliberate: stage and sid are state kept between calls
    stage <<- stage + 1L

    # update t, regardless of the stage
    req <- req_url_query(req, t = sprintf("%f", as.numeric(Sys.time())))

    # modify previous(!) request according to current stage
    switch(
      PREV_RESP_STAGES[stage],
      OPEN = {
        # transition to MSG_POST
        # store sid from the open response
        handshake <- resp |>
          resp_body_string() |>
          eio_1st_packet() |>
          fromJSON()
        sid <- handshake[["sid"]]
        if (!is.character(sid) || length(sid) != 1L || is.na(sid)) {
          stop(
            "Engine.IO open response does not contain a session id (sid).",
            call. = FALSE
          )
        }
        eio_sid <<- sid

        # include sid,
        # store encoded message in body, switches to POST request
        req |>
          req_url_query(sid = eio_sid) |>
          req_body_raw(eio_encode(msg, event))
      },
      MSG_POST = {
        # transition to MSG_GET
        # switch from previous POST back to GET by dropping message body
        req$body <- NULL
        req
      },
      MSG_GET = {
        # transition to CLOSE
        # Engine.IO "close" packet (type 1), framed as <length>:<packet> like
        # the message payload built by eio_encode()
        req_body_raw(req, "1:1")
      },
      # default (CLOSE) action, set output to NULL to stop request iterator
      NULL
    )
  }
}
Let's test with a "route" : "tournament:4" message; it's similar to the request triggered by the link in question and, thanks to the defaults in eio_encode(), the generated event is also the same, "m".
# Endpoint without transport, t and sid; those query parameters are added
# below and by the iterator.
url_ <- "https://sports-eu-west-3.winamax.fr/uof-sports-server/socket.io/?language=FR&version=3.4.0&embed=false&EIO=3"
# Message to send: a freshly generated requestId plus the requested route.
msg_ <- list(requestId = UUIDgenerate(), route = "tournament:4")

# First request: opens the connection to get a session id; cookies are
# preserved between requests in a temporary file.
open_req <- request(url_) |>
  req_cookie_preserve(path = tempfile()) |>
  req_user_agent("Mozilla/5.0") |>
  req_url_query(
    transport = "polling",
    t = sprintf("%f", as.numeric(Sys.time()))
  )

# Walk through the Engine.IO stages, collecting every response on the way.
responses <- req_perform_iterative(open_req, iterate_with_eio_stages(msg_))
We get back all generated responses (& requests):
# print the list of responses collected by req_perform_iterative()
responses
#> [[1]]
#> <httr2_response>
#> GET
#> https://sports-eu-west-3.winamax.fr/uof-sports-server/socket.io/?language=FR&version=3.4.0&embed=false&EIO=3&transport=polling&t=1742738623.699882
#> Status: 200 OK
#> Content-Type: text/plain
#> Body: In memory (104 bytes)
#>
#> [[2]]
#> <httr2_response>
#> POST
#> https://sports-eu-west-3.winamax.fr/uof-sports-server/socket.io/?language=FR&version=3.4.0&embed=false&EIO=3&transport=polling&t=1742738624.132516&sid=pw3Lq55I2a09qcI3CA58
#> Status: 200 OK
#> Content-Type: text/html
#> Body: In memory (2 bytes)
#>
#> [[3]]
#> <httr2_response>
#> GET
#> https://sports-eu-west-3.winamax.fr/uof-sports-server/socket.io/?language=FR&version=3.4.0&embed=false&EIO=3&transport=polling&sid=pw3Lq55I2a09qcI3CA58&t=1742738624.367167
#> Status: 200 OK
#> Content-Type: text/plain
#> Body: In memory (32948 bytes)
#>
#> [[4]]
#> <httr2_response>
#> POST
#> https://sports-eu-west-3.winamax.fr/uof-sports-server/socket.io/?language=FR&version=3.4.0&embed=false&EIO=3&transport=polling&sid=pw3Lq55I2a09qcI3CA58&t=1742738624.576419
#> Status: 200 OK
#> Content-Type: text/html
#> Body: In memory (2 bytes)
The response to our message is in the 3rd item (the last one is for closing the session). Assuming that our packet of interest is the 1st one in the payload, we can extract it from the response content string with eio_1st_packet(), remove the remaining packet type digit(s) from the beginning (these are packet codes, not an HTTP status code) and parse the resulting JSON string:
# Parse the first packet of the 3rd response: after eio_1st_packet() the
# string still starts with digits, which are dropped before JSON parsing.
event_payload <- responses[[3]] |>
  resp_body_string() |>
  eio_1st_packet() |>
  str_remove("^\\d+") |>
  fromJSON()

# The 2nd element of the parsed array holds "matches"; collect each entry's
# title into a name/value tibble and print it.
match_titles <- purrr::map_chr(purrr::pluck(event_payload, 2, "matches"), "title")
tibble::enframe(match_titles)
#> # A tibble: 19 × 2
#> name value
#> <chr> <chr>
#> 1 50955813 Angers - Rennes
#> 2 50955815 Auxerre - Montpellier
#> 3 50955817 Le Havre - Nantes
#> 4 50955819 Strasbourg - Lyon
#> 5 50955821 Toulouse - Brest
#> 6 50955823 Saint-Étienne - Paris SG
#> 7 50955825 Reims - Marseille
#> 8 50955827 Lille - Lens
#> 9 50955829 Monaco - Nice
#> 10 50955831 Marseille - Toulouse
#> 11 50955833 Nice - Nantes
#> 12 50955835 Brest - Monaco
#> 13 50955837 Montpellier - Le Havre
#> 14 50955839 Lyon - Lille
#> 15 50955841 Lens - Saint-Étienne
#> 16 50955843 Paris SG - Angers
#> 17 50955845 Reims - Strasbourg
#> 18 50955847 Rennes - Auxerre
#> 19 1000058305 Ligue 1 McDonald's® 2024/25