r schema apache-arrow arrows

In arrow, how to create a schema from the original variable types and avoid typing it in manually


In arrow, I can create a schema with map, as in my_schema_1 below. I want to create my_schema_2, where each field's type is double or string according to the variable's type, but the order of the fields in my_schema_2 does not match the order of the column names in the CSV file sample.csv, so arrow_data_2 %>% as.data.frame() fails. How can I fix it? Thanks!

library(tidyverse)
library(arrow)
library(lubridate)
data(lakers)

Create the file sample.csv:

# Write `lakers` to sample.csv without a row-name column.
write.csv(x = lakers, file = "sample.csv", row.names = FALSE)

Create my_schema_1; the following process succeeds:

# Declare every column of `lakers` as a string field, one field per column
# name and in the same order as the columns appear in the data frame.
my_schema_1 <- schema(
  lapply(
    names(lakers),
    function(col) Field$create(name = col, type = string())
  )
)

# The column names come from the schema, so the first line of the CSV is
# skipped.
arrow_data_1 <- arrow::open_csv_dataset(
  "sample.csv",
  schema = my_schema_1,
  skip = 1
)
as.data.frame(arrow_data_1)

Create my_schema_2; arrow_data_2 %>% as.data.frame() fails because the order of the fields in my_schema_2 does not match the order of the column names in sample.csv:

# All numeric columns are collected first and all character columns second,
# so the resulting field order differs from the column order of `lakers`
# (and therefore from the column order in sample.csv).
my_schema_2 <- schema(
  c(
    purrr::map(
      names(Filter(is.numeric, lakers)),
      function(col) Field$create(name = col, type = double())
    ),
    purrr::map(
      names(Filter(is.character, lakers)),
      function(col) Field$create(name = col, type = string())
    )
  )
)

# Opening the dataset with the misordered schema; collecting it into a data
# frame is the step reported to fail.
arrow_data_2 <- arrow::open_csv_dataset(
  "sample.csv",
  schema = my_schema_2,
  skip = 1
)
as.data.frame(arrow_data_2)

Solution

  • Another way to create my_schema_2 is to iterate over the column names in their original order, which keeps the schema fields aligned with the columns of sample.csv.

    # Build one field per column, walking the columns in the data frame's own
    # order so the schema lines up with the columns of sample.csv.
    my_schema_2 <- schema(lapply(names(lakers), \(col_name) {
      column <- lakers[[col_name]]
      if (is.numeric(column)) {
        # is.numeric() is TRUE for both integer and double columns.
        field(name = col_name, type = double())
      } else if (is.character(column)) {
        field(name = col_name, type = string())
      } else {
        # Fail loudly rather than silently putting a NULL into the schema.
        stop(
          "Unsupported column type for '", col_name, "': ", typeof(column),
          call. = FALSE
        )
      }
    }))

    # The column names come from the schema, so the first line of the CSV is
    # skipped.
    arrow_data_2 <- arrow::open_csv_dataset(
      "sample.csv",
      schema = my_schema_2,
      skip = 1
    )
    arrow_data_2 %>% as.data.frame()
    

    # A tibble: 34,624 × 13
           date opponent game_type time  period etype      team  player              result points type      x     y
          <dbl> <chr>    <chr>     <chr>  <dbl> <chr>      <chr> <chr>               <chr>   <dbl> <chr> <dbl> <dbl>
     1 20081028 POR      home      12:00      1 jump ball  OFF   NA                  NA          0 NA       NA    NA
     2 20081028 POR      home      11:39      1 shot       LAL   Pau Gasol           missed      0 hook     23    13
     3 20081028 POR      home      11:37      1 rebound    LAL   Vladimir Radmanovic NA          0 off      NA    NA
     4 20081028 POR      home      11:25      1 shot       LAL   Derek Fisher        missed      0 layup    25     6
     5 20081028 POR      home      11:23      1 rebound    LAL   Pau Gasol           NA          0 off      NA    NA
     6 20081028 POR      home      11:22      1 shot       LAL   Pau Gasol           made        2 hook     25    10
     7 20081028 POR      home      11:22      1 foul       POR   Greg Oden           NA          0 shoo…    NA    NA
     8 20081028 POR      home      11:22      1 free throw LAL   Pau Gasol           made        1 NA       NA    NA
     9 20081028 POR      home      11:00      1 foul       LAL   Vladimir Radmanovic NA          0 pers…    NA    NA
    10 20081028 POR      home      10:53      1 shot       POR   LaMarcus Aldridge   made        2 jump     36    21