performance, algorithm, recursion, cilk

How Should I Implement Cilk Parallelism with a Recursive Scan Algorithm?


I implemented a recursive scan (prefix sum) algorithm, which I've included below. Here, I simply generate random lists whose sizes are powers of two up to 2^27, checking each result against a simple sequential scan for correctness. It works.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <mkl.h>

int *pscan(int *x, int n, int z, int chunk_size);
int reduce(int *x, int n);

int main(int argc, char **argv)
{
  int n;
  int i, j, k;
  int *x, *seq, *r;
  double begin, end;

  srand48(time(0));

  /* Randomly generate array of size n. */

  for (k = 2; k < 28; k++) {
    n = (int) pow(2, k);

    seq = (int *) malloc(sizeof(int) * n);
    x = (int *) malloc(sizeof(int) * n);

    for (i = 0; i < n; i++) {
      x[i] = lrand48() % 100 - 50;
      seq[i] = x[i];
    }

    /* Parallel scan. */

    begin = dsecnd();

    r = pscan(x, n, 0, 2);

    end = dsecnd();

    printf("%d %lf\n", n, end - begin);

    /* Sequential check. */

    for (i = 1; i < n; i++) {
      seq[i] = seq[i - 1] + seq[i];
    }

    for (i = 0; i < n; i++) {
      if (r[i] != seq[i]) {
        fprintf(stderr, "AGGGHHH!!! ERROR.  Found with vector: \n");
        for (j = 0; j < n; j++) {
          printf("%d ", x[i]);
        }
        printf("\n");
        exit(1);
      }
    }
    free(r);
    free(x);
    free(seq);
  }

  return 0;
}

/* Perform parallel scan. */
int *pscan(int *x, int n, int z, int chunk_size)
{
  int i, j;
  int *sums, *sumscan, *scan, **fsum, *rv;

  /* Base case, serially scan a chunk. */
  if (n <= chunk_size) {
    scan = (int *) malloc(sizeof(int) * n);

    scan[0] = x[0] + z;
    for (i = 1; i < n; i++) {
      scan[i] = x[i] + scan[i - 1];
    }

    return scan;
  }

  sums = (int *) malloc(sizeof(int) * (n / chunk_size));

  /* Reduce each chunk of the array. */
  for (i = 0; i < n / chunk_size; i++) {
    sums[i] = reduce(&x[i * chunk_size], chunk_size);
  }

  /* Perform a scan on the sums. */
  sumscan = pscan(sums, n / chunk_size, 0, chunk_size);

  free(sums);

  fsum = (int **) malloc(sizeof(int *) * (n / chunk_size));

  /* Perform a recursive scan on each chunk, using
     the appropriate offset from the sums scan. */
  for (i = 0; i < n / chunk_size; i++) {
    if (i > 0) {
      fsum[i] = pscan(&x[i * chunk_size], chunk_size, sumscan[i - 1], chunk_size);
    } else {
      fsum[i] = pscan(&x[i * chunk_size], chunk_size, 0, chunk_size);
    }
  }

  free(sumscan);

  rv = (int *) malloc(sizeof(int) * n);

  /* Join the arrays. */
  for (i = 0; i < n / chunk_size; i++) {
    for (j = 0; j < chunk_size; j++) {
      rv[i * chunk_size + j] = fsum[i][j];
    }
  }

  for (i = 0; i < n / chunk_size; i++) {
    free(fsum[i]);
  }

  free(fsum);

  return rv;
}

/* Serial reduction. */
int reduce(int *x, int n)
{
  int i;
  int sum;

  sum = 0;

  for (i = 0; i < n; i++) {
    sum += x[i];
  }

  return sum;
}

Now, I'd like to parallelize it. Because I'm feeling a little hipster-ish, I've hacked up a Cilk implementation. I just replaced the two main for loops with cilk_for to parallelize 1) the reduction of each chunk and 2) the recursive scan of each chunk, using the appropriate entry of the scanned chunk reductions as an offset. It looks like this.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <cilk/cilk.h>
#include <mkl.h>

int *pscan(int *x, int n, int z, int chunk_size);
int reduce(int *x, int n);

int main(int argc, char **argv)
{
  int n;
  int i, j, k;
  int *x, *seq, *r;
  double begin, end;

  srand48(time(0));

  /* Randomly generate array of size n. */

  for (k = 2; k < 28; k++) {
    n = (int) pow(2, k);

    seq = (int *) malloc(sizeof(int) * n);
    x = (int *) malloc(sizeof(int) * n);

    for (i = 0; i < n; i++) {
      x[i] = lrand48() % 100 - 50;
      seq[i] = x[i];
    }

    /* Parallel scan. */

    begin = dsecnd();

    r = pscan(x, n, 0, 2);

    end = dsecnd();

    printf("%d %lf\n", n, end - begin);

    /* Sequential check. */

    for (i = 1; i < n; i++) {
      seq[i] = seq[i - 1] + seq[i];
    }

    for (i = 0; i < n; i++) {
      if (r[i] != seq[i]) {
        fprintf(stderr, "AGGGHHH!!! ERROR.  Found with vector: \n");
        for (j = 0; j < n; j++) {
          printf("%d ", x[i]);
        }
        printf("\n");
        exit(1);
      }
    }
    free(r);
    free(x);
    free(seq);
  }

  return 0;
}

/* Perform parallel scan. */
int *pscan(int *x, int n, int z, int chunk_size)
{
  int i, j;
  int *sums, *sumscan, *scan, **fsum, *rv;

  /* Base case, serially scan a chunk. */
  if (n <= chunk_size) {
    scan = (int *) malloc(sizeof(int) * n);

    scan[0] = x[0] + z;
    for (i = 1; i < n; i++) {
      scan[i] = x[i] + scan[i - 1];
    }

    return scan;
  }

  sums = (int *) malloc(sizeof(int) * (n / chunk_size));

  /* Reduce each chunk of the array. */
  cilk_for (i = 0; i < n / chunk_size; i++) {
    sums[i] = reduce(&x[i * chunk_size], chunk_size);
  }

  /* Perform a scan on the sums. */
  sumscan = pscan(sums, n / chunk_size, 0, chunk_size);

  free(sums);

  fsum = (int **) malloc(sizeof(int *) * (n / chunk_size));

  /* Perform a recursive scan on each chunk, using
     the appropriate offset from the sums scan. */
  cilk_for (i = 0; i < n / chunk_size; i++) {
    if (i > 0) {
      fsum[i] = pscan(&x[i * chunk_size], chunk_size, sumscan[i - 1], chunk_size);
    } else {
      fsum[i] = pscan(&x[i * chunk_size], chunk_size, 0, chunk_size);
    }
  }

  free(sumscan);

  rv = (int *) malloc(sizeof(int) * n);

  /* Join the arrays. */
  for (i = 0; i < n / chunk_size; i++) {
    for (j = 0; j < chunk_size; j++) {
      rv[i * chunk_size + j] = fsum[i][j];
    }
  }

  for (i = 0; i < n / chunk_size; i++) {
    free(fsum[i]);
  }

  free(fsum);

  return rv;
}

/* Serial reduction. */
int reduce(int *x, int n)
{
  int i;
  int sum;

  sum = 0;

  for (i = 0; i < n; i++) {
    sum += x[i];
  }

  return sum;
}

And it works! Well, it returns correct results. It just doesn't achieve the performance I had hoped for. The original performance was

4 0.000004
8 0.000001
16 0.000002
32 0.000003
64 0.000005
128 0.000011
256 0.000019
512 0.000035
1024 0.000068
2048 0.000130
4096 0.000257
8192 0.000512
16384 0.001129
32768 0.002262
65536 0.004519
131072 0.009065
262144 0.018297
524288 0.037416
1048576 0.078307
2097152 0.157448
4194304 0.313855
8388608 0.625689
16777216 1.251949
33554432 2.589439
67108864 5.084731
134217728 10.402186

for the single-threaded application, but the Cilk version performed worse, with the following runtimes

4 0.005383
8 0.000011
16 0.000009
32 0.000111
64 0.000055
128 0.000579
256 0.000339
512 0.000544
1024 0.000701
2048 0.001086
4096 0.001265
8192 0.001742
16384 0.002283
32768 0.003891
65536 0.005398
131072 0.009255
262144 0.020736
524288 0.058156
1048576 0.103893
2097152 0.215460
4194304 0.419988
8388608 0.749368
16777216 1.650938
33554432 2.960451
67108864 5.799836
134217728 11.294398

I have a 24-core machine, so we're obviously not seeing the speed-up we would hope for here. My first thought was that Cilk was mishandling the recursion and causing oversubscription, but Cilk is specifically supposed to handle recursion well. Any tips on how to implement this properly? I tried adding cilk_for to the bottom for loop (the one freeing everything) and to the inner loop of the join step, but that slowed performance down even more.
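For what it's worth, the worker count itself can be sanity-checked with __cilkrts_get_nworkers() from cilk/cilk_api.h; a minimal sketch (just an assumption on my part that this helps diagnose oversubscription):

#include <stdio.h>
#include <cilk/cilk_api.h>

int main(void)
{
  /* Report the number of workers the Cilk runtime will use; this
     can be overridden with the CILK_NWORKERS environment variable. */
  printf("Cilk workers: %d\n", __cilkrts_get_nworkers());

  return 0;
}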

Any advice is well-appreciated.

However, please don't tell me to switch to Blelloch's parallel scan algorithm discussed here. I already implemented that in Cilk, and it worked quite well. I'd like to see if I can match its performance with this recursive solution.


Solution

  • I fixed my performance problems by finding the optimal chunk size for each problem size. At that chunk size, the (otherwise identical) parallel version performs better than the sequential version.

    In summary, a few things were wrong with my general approach, and with the chunk size of two in particular:

    1. My benchmarking approach. In code with a tuning parameter, it doesn't make much sense to plot runtime vs. problem size with the tuning parameter held fixed, because the optimal value depends on the problem size (see the sketch after this list).
    2. A chunk size of two was likely problematic because, while it maximizes parallelism, it also maximizes the depth of the recursion and, with it, the overhead: with chunk_size = 2 and n = 2^27, there are 26 levels of recursion, each doing its own allocations and spawns.
    3. A chunk size of two prevents the compiler from vectorizing the base-case and reduction loops.
    4. As Leeor suggested, a chunk size of two probably also leads to false sharing, since adjacent chunks written by different workers fall on the same cache lines.

    Props to Leeor for leading me in the right direction.
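
    To illustrate the tuning, here's a minimal sketch (not my exact harness) that sweeps power-of-two chunk sizes for a fixed n and keeps the fastest, assuming the x, n, pscan(), and dsecnd() from the code above:

    int cs, best_cs;
    int *r;
    double t0, t1, best_time;

    best_cs = 2;
    best_time = -1.0;

    /* Try each power-of-two chunk size and keep the fastest. */
    for (cs = 2; cs <= n; cs *= 2) {
      t0 = dsecnd();
      r = pscan(x, n, 0, cs);
      t1 = dsecnd();

      if (best_time < 0.0 || t1 - t0 < best_time) {
        best_time = t1 - t0;
        best_cs = cs;
      }

      free(r);
    }

    printf("n = %d: best chunk size %d (%lf s)\n", n, best_cs, best_time);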