Tags: r, performance, loops, data-science, xirr

Improving loop performance with a function call inside


library(plyr)
library(sqldf)
library(data.table)
library(stringi)
library(RODBC)

dbhandle <- odbcDriverConnect('driver={SQL Server};server=.;database=TEST_DB;trusted_connection=true')
res <- sqlQuery(dbhandle, 'Select Company_ID,
       AsOfDate,
       CashFlow FROM dbo.Accounts')

resdatatable <- as.data.table(res)

odbcCloseAll()


# Single payment present value: the discount factor for rate i (in percent) over n periods
sppv <- function(i, n) {
    return((1 + i / 100) ^ (-n))
}


# Net present value of cashflows x for each rate in i (rates in percent)
npv <- function(x, i) {
    npv = c()
    for (k in 1:length(i)) {
        pvs = x * sppv(i[k], 1:length(x))
        npv = c(npv, sum(pvs))
    }
    return(npv)
}
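
As an aside, the rate loop in npv grows its result one element at a time; a behaviour-equivalent sketch using sapply avoids that (same inputs, same outputs):

npv <- function(x, i) {
    # For each rate in i, discount every cashflow in x and sum the present values
    sapply(i, function(r) sum(x * sppv(r, seq_along(x))))
}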


# xirr: expand the cashflows onto a daily grid between consecutive dates,
# then solve for the daily internal rate of return by bisection
xirr <- function(cashflow, dates) {
    if (length(cashflow) != length(dates)) {
        stop("length(cashflow) != length(dates)")
    }

    cashflow_adj <- c(cashflow[1])
    for (i in 1:(length(cashflow) - 1)) {
        d1 <- as.Date(dates[i], "%d-%m-%Y", origin = "1970-01-01")
        d2 <- as.Date(dates[i + 1], "%d-%m-%Y", origin = "1970-01-01")

        # Note: there is no check that the dates are monotonically increasing;
        # add one here if the interval can be negative

        interval <- as.integer(d2 - d1)

        if (length(interval) > 0 && !is.na(interval)) {
            cashflow_adj <- c(cashflow_adj, rep(0, interval - 1), cashflow[i + 1])
        }
    }

    # Bisection: find the daily rate (in percent) at which the NPV crosses zero
    # (this assumes the NPV changes sign somewhere in [-10, 10])
    left = -10
    right = 10
    epsilon = 1e-8
    while (abs(right - left) > 2 * epsilon) {
        midpoint = (right + left) / 2
        if (npv(cashflow_adj, left) * npv(cashflow_adj, midpoint) > 0) {
            left = midpoint
        } else {
            right = midpoint
        }
    }


    # The midpoint of the final bracket is the daily rate in percent; convert to a fraction
    irr <- (right + left) / 2 / 100
    # Annualised yield reflecting the compounding effect of daily returns
    irr <- (1 + irr) ^ 365 - 1

    irr
}
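
For reference, a minimal usage sketch with made-up numbers; note that xirr parses the dates with the "%d-%m-%Y" format, so character dates need to be day-first:

# Hypothetical example: invest 100, receive 110 one year later
xirr(c(-100, 110), c("01-01-2016", "01-01-2017"))
# roughly 0.10, i.e. about a 10% annualised return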




# Unique company ids to iterate over
groupedCompanyNames <- unique(as.character(resdatatable$Company_ID))

# Dates and cashflows split into per-company lists, named by company id
groupedDatesPerCompany <- split(resdatatable$AsOfDate, resdatatable$Company_ID)
groupedCashFlowsPerCompany <- split(resdatatable$CashFlow, resdatatable$Company_ID)


# Preallocate the results table with a known number of rows
resultsDataFrame <- data.table(Company_ID = character(length(groupedCompanyNames)),
                               XIRR = numeric(length(groupedCompanyNames)))

# Preallocate the list that will hold one result row per company
datalist <- vector("list", length(groupedCompanyNames))

for (i in groupedCompanyNames) {

    # Extract this company's dates and cashflows from the named split lists
    dates <- groupedDatesPerCompany[[i]]
    cashFlows <- groupedCashFlowsPerCompany[[i]]

    # xirr can fail to converge, so fall back to 0 on error
    xirrResult <- tryCatch(xirr(cashFlows, dates),
                           error = function(e) 0)

    newRow <- data.frame(Company_ID = i, XIRR = format(round(xirrResult, 2), nsmall = 2))
    datalist[[i]] <- newRow
}

resultsDataFrame <- data.table::rbindlist(datalist)
finalDataFrame <- as.data.frame(resultsDataFrame)

print(finalDataFrame)

So to provide context, I am trying to do the following:

  1. Get data out of the database using an RODBC connection
  2. Get the unique company names
  3. Split the cashflows and dates per company
  4. Initialize a data table with a known number of rows so that it doesn't need to incrementally grow.
  5. Loop through the unique company names and call the xirr function on that company's list of cashflows and dates.
  6. Add each row with the company name and the XIRR value to a new datatable.
  7. Use rbindlist to combine the rows (see the lapply sketch after this list).
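
Steps 5 to 7 can also be written without an explicit for loop. Here is a minimal sketch of the same per-company work using lapply over the split lists (same names and the same error fallback as above):

datalist <- lapply(groupedCompanyNames, function(id) {
    # Look up this company's cashflows and dates by name, compute the XIRR,
    # and fall back to 0 when the iteration fails to converge
    xirrResult <- tryCatch(xirr(groupedCashFlowsPerCompany[[id]],
                                groupedDatesPerCompany[[id]]),
                           error = function(e) 0)
    data.frame(Company_ID = id, XIRR = format(round(xirrResult, 2), nsmall = 2))
})
resultsDataFrame <- data.table::rbindlist(datalist)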

Here is a sample of the source data I'm using:

Company_ID  CashFlow    AsOfDate
3F68D729-D69D-E711-9C98-5065F34B3E7D    368608.0000 2004-11-30 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D    366999.0000 2004-12-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D    326174.0000 2005-01-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D    345666.0000 2005-02-28 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D    -1529180.0000   2005-03-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D    -65259.0000 2005-04-30 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D    514005.0000 2005-05-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D    512951.0000 2005-06-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D    -6792.0000  2011-06-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D    -6792.0000  2011-07-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D    -6572.0000  2011-08-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D    -6792.0000  2011-09-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D    -6572.0000  2011-10-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D    -6792.0000  2011-11-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D    -6791.0000  2011-12-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D    -187375.0000    2012-01-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D    -215902.0000    2012-02-29 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D    -6572.0000  2012-03-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D    -217409.0000    2012-04-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D    -191830.0000    2012-05-31 00:00:00.000

I'm new to R, and with circa 2,000 unique company names and on average 50 date/cashflow combinations each (around 100,000 records), the loop takes about 28 seconds to process.

I've looked at using the doParallel library with foreach, but that didn't seem to make any difference to the speed. If I take out the call to the xirr function, the loop finishes almost instantly.

The xirr function needs the exception handling because it is sometimes not possible to calculate an XIRR value iteratively.

I know that looping is not really best practice in R - any suggestions on how to vectorise this for better performance?


Solution

  • In order to improve the performance, I used the doParallel library.

    library(doParallel)
    cl <- makeCluster(detectCores() - 1, type = 'PSOCK')
    registerDoParallel(cl)
    

    And instead of the for loop, I put the logic into a foreach:

    resultsDataFrame <- foreach(n = 1:length(groupedCompanyNames), .combine = rbind) %dopar% {

        company_id <- groupedCompanyNames[n]

        # Look the dates and cashflows up by company name so the pairing
        # stays correct even if the split order differs from the unique order
        dates <- groupedDatesPerCompany[[company_id]]
        cashFlows <- groupedCashFlowsPerCompany[[company_id]]

        # Now calculate the xirr for the values
        xirrResult <- tryCatch(xirr(cashFlows, dates),
                               error = function(e) 0)

        data.frame(Company_ID = company_id, XIRR = format(round(xirrResult, 2), nsmall = 2))
    }
    
    stopCluster(cl)
    registerDoSEQ()
    

    When I ran my full data set through it (4,000 companies with their dates and cashflows, around 400,000 rows in total), the original loop took around 10 minutes. With the foreach loop utilising the extra cores in the machine, the operation took 60 seconds.

    I hope that someone will be able to suggest a further performance improvement on top of this, but I think that is a good gain already.
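
    One further avenue worth exploring (not benchmarked here) is to skip the split/recombine steps entirely and let data.table do the grouping itself. A minimal sketch, assuming resdatatable and the xirr function as defined in the question:

    resultsDataFrame <- resdatatable[, .(
        # data.table hands each company's CashFlow and AsOfDate vectors to xirr
        XIRR = tryCatch(xirr(CashFlow, AsOfDate), error = function(e) 0)
    ), by = Company_ID]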