rdatabasedataframemonetdbmonetdblite

Unable to combine large survey dataframes in R using MonetDB and MonetDBLite


I am trying to concatenate or rbind 12 years of survey data, each with several million observations and a hundred or so variables so as to analyze annual trends while accounting for the survey structure. The recent release of MonetDBLite for R seems like a great opportunity to implement column-based DBMS for this large dataset. I am, though, having difficulty in combining the data sets.

In brief, using plyr::r.bind.fill to DBI:dbWriteTable returns a truncated table. r.bind.fill is ideal because the data sets differ in variables from year to year. Using dplyr::rbind_all does not seem to work at all. A less useful approach (but still acceptable) involves removing columns to make the tables identical and using a straight-ahead UNION ALL SQL operation. But this returns syntax errors.

I am very new to dplyr and MonetDB so this is surely due to my own ignorance, but I've spent a couple of days scouring the web without luck, so any help would be greatly appreciated.

Following is a reproducible example.

Cheers

Charles

# install.packages("MonetDB.R", repos="http://dev.monetdb.org/Assets/R/")
# install.packages("MonetDBLite", repos="http://dev.monetdb.org/Assets/R/")
library(MonetDBLite)
library(MonetDB.R)
library(dplyr)
library(plyr)
library(zoo)


mtcars1<-mtcars[,-11] # create 2 slightly different versions of mtcars
mtcars2<-mtcars[,-10]

mtcars1<-coredata(mtcars1)[rep(seq(nrow(mtcars1)),200000),] # create large versions of those dataframes
mtcars2<-coredata(mtcars2)[rep(seq(nrow(mtcars2)),200000),]

dbdir <- tempdir() # create a temporary directory
con <- dbConnect(MonetDB.R(), embedded=dbdir) # use DBI to connect to MonetDB

dbWriteTable(con, "mtcars1", mtcars1) #  write the dataframes to column-based MonetDB tables
dbWriteTable(con, "mtcars2", mtcars2) 
dbListTables(con)

ms <- src_monetdb(embedded=dbdir) # create a dplyr::tbl version tables
mt1 <- tbl(ms, "mtcars1") 
mt2<-tbl(ms, "mtcars2")

# try plyr::rbind.fill to concatenate tables
dbWriteTable(con, "mt_1_2", rbind.fill(as.data.frame(mt1, mt2)))

# Warning message:
# Only first 6,400,000 results retrieved. Use n = -1 to retrieve all.

dbGetQuery(con, "SELECT COUNT(*) FROM mt_1_2 " )
#      L1
# 1 1e+05

dbRemoveTable(con, "mt_1_2") # remove table to re-try

# try dbFetch(res, n=-1) to retrieve all results
dbFetch(dbWriteTable(con, "mt_1_2", rbind.fill(as.data.frame(mt1, mt2))), n=-1)

# Error in (function (classes, fdef, mtable)  :
#   unable to find an inherited method for function ‘dbFetch’ for signature ‘"logical", "numeric"’
# In addition: Warning message:
# Only first 6,400,000 results retrieved. Use n = -1 to retrieve all.

dbRemoveTable(con, "mt_1_2") # remove table to re-try

dbListFields(con, "mtcars1") # remove fields to make table columns identical
dbListFields(con, "mtcars2")

dbGetQuery(con, "
ALTER TABLE mtcars1
DROP COLUMN gear
")

dbGetQuery(con, "
ALTER TABLE mtcars2
DROP COLUMN carb
")

dbGetQuery(con,
    "CREATE TABLE mt_1_2 WITH 
    Select * FROM mtcars1
    UNION ALL
    Select * FROM mtcars2")


# Error in .local(conn, statement, ...) :
#   Unable to execute statement 'CREATE TABLE mt_1_2 AS
#   Select * FROM mtcars1
#   UNION ALL
#   Select * FROM mtcars2'.
# Server says 'syntax error, unexpected SCOLON, expecting WITH in: "create table mt_1_2 as
#   select * from mtcars1
#   union all
#   select * from mtcars2"
# ' [#42000].

Solution

  • You can stick with dplyr and use rbind_list

    library(MonetDB.R)
    library(MonetDBLite)
    library(dplyr)
    
    mtcars1 <- mtcars[, -11] # create 2 slightly different versions of mtcars
    mtcars2 <- mtcars[, -10]
    
    ## Reduce size
    mtcars1 <- mtcars1[rep(seq(nrow(mtcars1)), 10000), ]
    mtcars2 <- mtcars2[rep(seq(nrow(mtcars2)), 10000), ]
    
    ### Check size
    nrow(mtcars1)
    ## [1] 320000
    
    nrow(mtcars2)
    ## [1] 320000
    
    ###
    dbdir <- tempdir() ## create a temporary directory
    con <- dbConnect(MonetDB.R(), embedded = dbdir)
    
    ###
    dbWriteTable(con, name = "mtcars1", value = mtcars1)
    dbWriteTable(con, name = "mtcars2", value = mtcars2)
    
    dbListTables(con)
    
    ###
    ms <- src_monetdb(embedded = dbdir) # create a dplyr::tbl version tables
    mt1 <- tbl(ms, "mtcars1") 
    mt2 <- tbl(ms, "mtcars2")
    
    ### You need to add `n = -1` to `as.data.frame` to retrieve all rows
    dbWriteTable(con, "mt_1_2", rbind_list(as.data.frame(mt1, n = -1),
                                           as.data.frame(mt2, n = -1)))
    
    ###
    dbGetQuery(con, "SELECT COUNT(*) FROM mt_1_2")
    ##       L1
    ## 1 640000