rdataframedplyrrmysql

How to query number of Ids in batches in R


I have below mentioned dataframe in R.

ID       Amount     Date
IK-1     100        2020-01-01
IK-2     110        2020-01-02
IK-3     120        2020-01-03
IK-4     109        2020-01-03
IK-5     104        2020-01-03

I'm using ID to fetch some details from MySQL using the following code.

library(RMySQL)

conn<- connection

query<-paste0("SELECT c.ID,e.Parameters, d.status
FROM Table1 c
left outer join Table2 d ON d.seq_id=c.ID
LEFT outer JOIN Table3 e ON e.role_id=d.role
           where c.ID IN (", paste(shQuote(dataframe$ID, type = "sh"),
                                      collapse = ', '),") 
and e.Parameters in
           ('Section1',
           'Section2','Section3',
           'Section4');")

res1 <- dbGetQuery(conn,query)

res2<-res1[res1$Parameters=="Section1",4:5]
colnames(res2)[colnames(res2)=="status"] <- "Section1_Status"

The above code is working fine, If i pass ~1000 ID but it throws R termination error when passing 10000 or more ID at a time.

How can I create a loop and pass the Id in batched to get the one final output for 10000 ID.

Error Message:

Warning message:
In dbFetch(rs, n = n, ...) : error while fetching rows

Solution

  • # Load Packages
    library(dplyr) # only needed to create the initial dataframe
    library(RMySQL)
    
    # create the initial dataframe
    df <- tribble(
        ~ID,       ~Amount,     ~Date
        , "IK-1"    , 100       , 2020-01-01
        , "IK-2"    , 110       , 2020-01-02
        , "IK-3"    , 120       , 2020-01-03
        , "IK-4"    , 109       , 2020-01-03
        , "IK-5"    , 104       , 2020-01-03
    )
    
    # first helper function
    createIDBatchVector <- function(x, batchSize){
        paste0(
            "'"
            , sapply(
                split(x, ceiling(seq_along(x) / batchSize))
                , paste
                , collapse = "','"
            )
            , "'"
        )
    }
    
    # second helper function
    createQueries <- function(IDbatches){
        paste0("
    SELECT c.ID,e.Parameters, d.status
    FROM Table1 c
        LEFT OUTER JOIN Table2 d ON d.seq_id =c.ID
        LEFT OUTER JOIN Table3 e ON e.role_id = d.role
    WHERE c.ID IN (", IDbatches,") 
    AND e.Parameters in ('Section1','Section2','Section3','Section4');
    ")
    }
    
    # ------------------------------------------------------------------
    
    # and now the actual script
    
    # first we create a vector that contains one batch per element
    IDbatches <- createIDBatchVector(df$ID, 2)
    
    # It looks like this:
    # [1] "'IK-1','IK-2'" "'IK-3','IK-4'" "'IK-5'" 
    
    # now we create a vector of SQL-queries out of that
    queries <- createQueries(IDbatches)
    
    cat(queries) # use cat to show what they look like
    
    # it looks like this:
    
    # SELECT c.ID,e.Parameters, d.status
    # FROM Table1 c
    #     LEFT OUTER JOIN Table2 d ON d.seq_id =c.ID
    #     LEFT OUTER JOIN Table3 e ON e.role_id = d.role
    # WHERE c.ID IN ('IK-1','IK-2') 
    # AND e.Parameters in ('Section1','Section2','Section3','Section4');
    #  
    # SELECT c.ID,e.Parameters, d.status
    # FROM Table1 c
    #     LEFT OUTER JOIN Table2 d ON d.seq_id =c.ID
    #     LEFT OUTER JOIN Table3 e ON e.role_id = d.role
    # WHERE c.ID IN ('IK-3','IK-4') 
    # AND e.Parameters in ('Section1','Section2','Section3','Section4');
    #  
    # SELECT c.ID,e.Parameters, d.status
    # FROM Table1 c
    #     LEFT OUTER JOIN Table2 d ON d.seq_id =c.ID
    #     LEFT OUTER JOIN Table3 e ON e.role_id = d.role
    # WHERE c.ID IN ('IK-5') 
    # AND e.Parameters in ('Section1','Section2','Section3','Section4');
    
    # and now the loop
    df_final <- data.frame() # initialize a dataframe
    
    conn <- connection # open a connection
    for (query in queries){ # iterate over the queries
        df_final <- rbind(df_final, dbGetQuery(conn,query))
    }
    
    # And here the connection should be closed. (I don't know the function call for this.)