Tags: r, pandas, dplyr, pyarrow, reticulate

Passing data between R and Python - How to convert arrow table to tibble/dataframe?


The goal is:

  1. Take Tidy data from R, convert it to Arrow data
  2. Pass arrow data to python, convert to Pandas df
  3. Manipulate Pandas df, convert it back to arrow
  4. Retrieve arrow data from python, convert back to Tidy data

I can't seem to get past that last step.

# R
library(reticulate)
library(dplyr)
library(arrow)

# Python
import pyarrow 
import pandas

Make a tibble, convert it to Arrow table, pass it to Python

arrow_dat <- arrow::as_arrow_table(tibble(col = c(1,2,3)))
py_taxa_arrow <- r_to_py(arrow_dat)

Convert data to Pandas Dataframe, edit it, convert to Arrow

r.py_taxa_arrow

pyarrow.Table
col: double
----
col: [[1,2,3]]

py_taxa_arrow_to_pd = r.py_taxa_arrow.to_pandas()

py_taxa_arrow_edited = pandas.DataFrame(py_taxa_arrow_to_pd) + 1
py_taxa_arrow_edited

   col
0  2.0
1  3.0
2  4.0

py_taxa_arrow_edited_converted = pyarrow.Table.from_pandas(py_taxa_arrow_edited)
py_taxa_arrow_edited_converted

pyarrow.Table
col: double
----
col: [[2,3,4]]

Retrieve the edited data

py_taxa_arrow_edited <- r_to_py(py$py_taxa_arrow_edited_converted)
py_taxa_arrow_edited

pyarrow.Table
col: double
----
col: [[2,3,4]]

Convert to Tibble

tibble(py_taxa_arrow_edited)

Error in `tibble()`:
! All columns in a tibble must be vectors.
✖ Column `py_taxa_arrow_edited` is a
  `pyarrow.lib.Table/pyarrow.lib._Tabular/pyarrow.lib._PandasConvertible/pyarrow.lib._Weakrefable/python.builtin.object`
  object.
Backtrace:
 1. tibble::tibble(py_taxa_arrow_edited)

 Error in tibble(py_taxa_arrow_edited) : 
✖ Column `py_taxa_arrow_edited` is a
  `pyarrow.lib.Table/pyarrow.lib._Tabular/pyarrow.lib._PandasConvertible/pyarrow.lib._Weakrefable/python.builtin.object`
  object.
 
9.
    
stop(fallback)
 
8.
    
signal_abort(cnd, .file)
 
7.
    
abort(x, class, ..., call = call, parent = parent, use_cli_format = TRUE)
 
6.
    
tibble_abort(call = call, problems("All columns in a tibble must be vectors:", 
    x = paste0("Column ", name_or_pos(names, positions), " is ", 
        classes)), names = names)
 
5.
    
abort_column_scalar_type(names_x[is_xd], pos[is_xd], classes, 
    call)
 
4.
    
check_valid_cols(set_names(list(x), name), call = call)
 
3.
    
check_valid_col(res, col_names[[j]], j, call)
 
2.
    
tibble_quos(xs, .rows, .name_repair)
 
1.
    
tibble(py_taxa_arrow_edited)

Edit: Final Solution

# R
arrow_dat <- arrow::as_arrow_table(tibble(col = c(1,2,3)))
py_taxa_arrow <- r_to_py(arrow_dat)
# Python
py_taxa_arrow_to_pd = r.py_taxa_arrow.to_pandas()
py_taxa_arrow_edited = py_taxa_arrow_to_pd + 1
py_taxa_arrow_edited_converted = pyarrow.Table.from_pandas(py_taxa_arrow_edited)
# R
as_tibble(py$py_taxa_arrow_edited_converted)

col
<dbl>   
2               
3               
4   

Solution

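  • There are two issues with your reprex:

    1. In the last step you call `r_to_py()` on `py$py_taxa_arrow_edited_converted`, which turns the table back into a Python object (it still prints as `pyarrow.Table`). Access it with `py$py_taxa_arrow_edited_converted` alone.
    2. `tibble()` builds a new tibble that uses its argument as a column, which is why it complains that the column is not a vector. Use `as_tibble()` to convert the table instead.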

    I have recreated the fixed reprex so it can be run in one go:

    # Round trip: tibble -> Arrow table -> pandas DataFrame -> Arrow table -> tibble,
    # driven entirely from one R session via reticulate.
    library(reticulate)
    library(dplyr)
    library(arrow)
    
    # Import the Python modules so they can be called from R with `$`.
    pa <- import("pyarrow")
    pd <- import("pandas")
    
    arrow_dat <- arrow::as_arrow_table(tibble(col = c(1, 2, 3)))
    
    # Convert to python
    # r_to_py() hands the Arrow table to Python; it prints as a pyarrow.Table.
    py_taxa_arrow <- r_to_py(arrow_dat)
    py_taxa_arrow
    
    # Do stuff
    # Turn the pyarrow table into a pandas DataFrame and edit it.
    py_taxa_arrow_to_pd <- py_taxa_arrow$to_pandas()
    py_taxa_arrow_to_pd
    
    py_taxa_arrow_edited <- py_taxa_arrow_to_pd + 1
    py_taxa_arrow_edited
    
    # Convert back
    # Two options for bringing the edited pandas DataFrame back:
    #   * py_to_r() converts it to a data.frame;
    #   * pa$Table$from_pandas() converts it to an Arrow table, so you want to
    #     use that if you have larger (than memory) data.
    py_taxa_arrow_edited_converted <- pa$Table$from_pandas(py_taxa_arrow_edited)
    py_taxa_arrow_edited_converted
    
    # as_tibble() (not tibble()) converts the table to a tibble; tibble() would
    # try to use the whole object as a single column and fail.
    as_tibble(py_taxa_arrow_edited_converted)