c++multithreadingcilk-plus

Cilk Plus code result depends on number of workers


I have a small piece of code that I would like to parallelize as I upscale. I've been using cilk_for from Cilk Plus to run the multithreading. The trouble is that I get a different result depending on the number of workers.

I've read that this might be due to a race condition, but I'm not sure what specifically about the code causes that or how to ameliorate it. Also, I realize that long and __float128 are overkill for this problem, but might be necessary in the upscaling.

Code:

#include <assert.h>
#include "cilk/cilk.h"
#include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <string>
#include <vector>

using namespace std;

__float128 direct(const vector<double>& Rpct, const vector<unsigned>& values,     double Rbase, double toWin) {
    unsigned count = Rpct.size();
    __float128 sumProb = 0.0;
    __float128 rProb = 0.0;
    long nCombo = static_cast<long>(pow(2, count));

//  for (long j = 0; j < nCombo; ++j) { //over every combination
    cilk_for (long j = 0; j < nCombo; ++j) { //over every combination
        vector<unsigned> binary;

        __float128 prob = 1.0;
        unsigned point = Rbase;
        
        for (unsigned i = 0; i < count; ++i) { //over all the individual events
            long exp = static_cast<long>(pow(2, count-i-1));
            bool odd = (j/exp) %  2;  
            if (odd) {
                binary.push_back(1);
                point += values[i];
                prob *= static_cast<__float128>(Rpct[i]); 
            } else {
                binary.push_back(0);
                prob *= static_cast<__float128>(1.0 - Rpct[i]);  
            }            
        }

        sumProb += prob;
        if (point >= toWin)         rProb += prob;
        assert(sumProb >= rProb);
    }

    //print sumProb
    cout << " sumProb = " << (double)sumProb << endl;
    assert( fabs(1.0 - sumProb) < 0.01);

    return rProb;
}

int main(int argc, char *argv[]) {
    vector<double> Rpct;
    vector<unsigned> value;

    value.assign(20,1);
    Rpct.assign(20,0.25);

    unsigned Rbase  = 22;
    unsigned win = 30;

    __float128 rProb = direct(Rpct, value, Rbase, win);

    cout << (double)rProb << endl;

    return 0;
}

Sample output for export CILK_NWORKERS=1 && ./code.exe:

sumProb = 1

0.101812

Sample output for export CILK_NWORKERS=4 && ./code.exe:

sumProb = 0.948159

Assertion failed: (fabs(1.0 - sumProb) < 0.01), function direct, file code.c, line 61.

Abort trap: 6


Solution

  • It is because of a race condition. cilk_for is implementation of parallel for algorithm. If you want to use parallel for you must use independent iteration (independent data). It`is very important. You have to use cilk reducers for your case: https://www.cilkplus.org/tutorial-cilk-plus-reducers