scalavectorsimdjmh

Problems with Java Vector API to sum a list of doubles


I implemented three naive sum methods (in scala) which act on an Array[Double] (or double[])

    inline def sum2 =
      var sum: Double = 0.0
      var i: Int = 0
      inline val species = DoubleVector.SPECIES_256

      while i < species.loopBound(vec.length) do        
        val vec2: DoubleVector = DoubleVector.fromArray(species, vec, i)        
        sum = sum + vec2.reduceLanes(VectorOperators.ADD)
        i += species.length()
      end while      
      while i < vec.length do
        sum += vec(i)
        i += 1
      end while
      sum
    end sum2

    inline def sum: Double =
      var sum = 0.0
      var i = 0;
      while i < vec.length do
        sum = sum + vec(i)
        i = i + 1
      end while
      sum


    inline def sum3 =
      var i: Int = 0
      val species = DoubleVector.SPECIES_256

      var acc = DoubleVector.zero(species)

      while i < species.loopBound(vec.length) do
        val vec2: DoubleVector = DoubleVector.fromArray(species, vec, i)
        acc =
          acc.add(vec2) // This line, appears to break JMH. It's not clear why as it passes a unit test and compiles.
        i += species.length()
      end while

      var temp = acc.reduceLanes(VectorOperators.ADD)
      // var temp = 0.0
      while i < vec.length do
        temp += vec(i)
        i += 1
      end while
      temp
    end sum3

Then setup a benchmark to see if there was a difference.


@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.SECONDS)
@State(Scope.Thread)
@Fork(value = 1)
@Warmup(iterations = 3)
@Measurement(iterations = 3)
abstract class BLASBenchmark:


  private final val rand: Random = new Random(0);

  protected def randomDouble(): Double =
    return rand.nextDouble();

  protected def randomDoubleArray(n: Int): Array[Double] =
    val res = new Array[Double](n);

    for i <- 0 until n do res(i) = rand.nextDouble();
    end for
    return res;
  end randomDoubleArray

  protected def randomFloat(): Float =
    return rand.nextFloat();

  protected def randomFloatArray(n: Int): Array[Float] =
    val res = new Array[Float](n);
    for i <- 0 until n do res(i) = rand.nextFloat();
    end for
    return res;
  end randomFloatArray
end BLASBenchmark

@State(Scope.Thread)
class SumBenchmark extends BLASBenchmark:

  @Param(Array("3", "10", "100", "10000", "100000"))
  var len: String = uninitialized;

  var arr: Array[Double] = uninitialized

  @Setup(Level.Trial)
  def setup: Unit =
    arr = randomDoubleArray(len.toInt);
    ()
  end setup

  @Benchmark
  def sum_loop(bh: Blackhole) =
    val r = arr.sum
    bh.consume(r);
  end sum_loop

  @Benchmark
  def sum_vec(bh: Blackhole) =
    val r = arr.sum2
    bh.consume(r);
  end sum_vec

  /**
This doesn't work. I don't understand why.
   */
  @Benchmark
  def sum_vec_alt(bh: Blackhole) =
    val r = arr.sum3
    bh.consume(r);
  end sum_vec_alt

end SumBenchmark

The result of which were as follows (locally)

Benchmark               (len)   Mode  Cnt          Score           Error  Units
SumBenchmark.sum_loop       3  thrpt    3  650060202.104 ± 176059544.674  ops/s
SumBenchmark.sum_loop      10  thrpt    3  422309397.044 ±  19968584.370  ops/s
SumBenchmark.sum_loop     100  thrpt    3   30901788.878 ±    354896.502  ops/s
SumBenchmark.sum_loop   10000  thrpt    3     116283.523 ±      4388.696  ops/s
SumBenchmark.sum_loop  100000  thrpt    3      11487.085 ±       586.552  ops/s
SumBenchmark.sum_vec        3  thrpt    3  607547613.267 ±  25376258.940  ops/s
SumBenchmark.sum_vec       10  thrpt    3   48259596.149 ±   1205351.697  ops/s
SumBenchmark.sum_vec      100  thrpt    3    4604684.325 ±    122538.788  ops/s
SumBenchmark.sum_vec    10000  thrpt    3      45689.056 ±      1872.688  ops/s
SumBenchmark.sum_vec   100000  thrpt    3       4729.203 ±        83.013  ops/s

The way I'm reading this, is that my vectorised version is in fact, significantly slower that the simple while loop.

My questions;


Solution

  • I believe the algothim I used, was indeed naive.

        inline def sum: Double =
          var i: Int = 0
    
          var acc = DoubleVector.zero(Matrix.doubleSpecies)
          val sp = Matrix.doubleSpecies
          val l = sp.length()
    
          while i < sp.loopBound(vec.length) do
            acc = acc.add(DoubleVector.fromArray(Matrix.doubleSpecies, vec, i))
            i += l
          end while
          var temp = acc.reduceLanes(VectorOperators.ADD)
          // var temp = 0.0
          while i < vec.length do
            temp += vec(i)
            i += 1
          end while
          temp
        end sum
    

    With the key changes being;

    This provides a more satisfactory result.

    umBenchmark.sum_loop                  3     N/A  thrpt    3  454987372.518 ±  4974554.176  ops/s
    SumBenchmark.sum_loop                100     N/A  thrpt    3   22881036.167 ±   429259.728  ops/s
    SumBenchmark.sum_loop             100000     N/A  thrpt    3      10733.807 ±       65.014  ops/s
    SumBenchmark.sum_vec                   3     N/A  thrpt    3  454811646.671 ± 12166349.908  ops/s
    SumBenchmark.sum_vec                 100     N/A  thrpt    3   40353921.106 ±   547651.178  ops/s
    SumBenchmark.sum_vec              100000     N/A  thrpt    3      40141.621 ±      714.686  ops/s
    SumBenchmark.sum_vec_alt               3     N/A  thrpt    3  505591602.643 ± 36682920.662  ops/s
    SumBenchmark.sum_vec_alt             100     N/A  thrpt    3  110847968.173 ±  1363183.523  ops/s
    SumBenchmark.sum_vec_alt          100000     N/A  thrpt    3      42821.901 ±      244.293  ops/s
    
    

    For very large vectors, a factor of 4x (or so).