Tags: c, performance, simd, altivec

SIMD with Altivec: why is multiplying two vectors faster than adding two vectors?


I've been implementing basic math operations using AltiVec as a way to learn SIMD for an upcoming project. To see the performance benefit, I also track how long the operations take to perform, and I came across something odd.

The first thing I did was add two vectors together and subtract two vectors. This works fine. The next thing I did was multiply two vectors together. However, multiplying is faster than adding, even though adding takes fewer clock cycles than multiplying according to what my particular CPU's datasheet says about the instructions being used.

I have two arrays of 10,485,760 elements each and run them through these two routines:

void av_AddValues(int32_t* intArrayA, int32_t* intArrayB, int32_t* outputBuffer, int size)
{
  int iterations = size / (sizeof(__vector int32_t) / sizeof(int32_t));

  __vector int32_t* tempA = (__vector int32_t *) intArrayA;
  __vector int32_t* tempB = (__vector int32_t *) intArrayB;
  __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;
  for(int i = 0; i < iterations; i++)
  {
    __vector int32_t sum = vec_add(*tempA, *tempB);
    vec_st(sum, 0, tempOut);

    tempA++;
    tempB++;
    tempOut++;
  }
}

void av_MultiplyValues(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size)
{
  int iterations = size / (sizeof(__vector int16_t) / sizeof(int16_t));

  __vector int16_t* tempA = (__vector int16_t *) intArrayA;
  __vector int16_t* tempB = (__vector int16_t *) intArrayB;
  __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;

  for(int i = 0; i < iterations; i++)
  {
    __vector int32_t productEven = vec_mule(*tempA, *tempB);
    __vector int32_t productOdd = vec_mulo(*tempA, *tempB);

    __vector int32_t mergedProductHigh = vec_mergeh(productEven, productOdd);
    __vector int32_t mergedProductLow = vec_mergel(productEven, productOdd);

    vec_st(mergedProductHigh, 0, tempOut);
    tempOut++;
    vec_st(mergedProductLow, 0, tempOut);

    tempA++;
    tempB++;
    tempOut++;
  }
}
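
For reference, here is what the even/odd multiply-and-merge pattern produces for one pair of input vectors (a sketch assuming big-endian AltiVec element numbering; the ordering differs on little-endian targets):

// a = [a0 a1 a2 a3 a4 a5 a6 a7], b = [b0 b1 b2 b3 b4 b5 b6 b7]   (8 x int16_t each)
// productEven       = [a0*b0  a2*b2  a4*b4  a6*b6]                (4 x int32_t)
// productOdd        = [a1*b1  a3*b3  a5*b5  a7*b7]
// mergedProductHigh = [a0*b0  a1*b1  a2*b2  a3*b3]                (results for elements 0-3)
// mergedProductLow  = [a4*b4  a5*b5  a6*b6  a7*b7]                (results for elements 4-7)

Each iteration therefore reads 16 bytes from each input and writes 32 bytes of results in the original element order.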

On my particular platform, av_AddValues takes 81ms to process and av_MultiplyValues takes 48ms to process. (Times recorded using std::chrono::high_resolution_clock)

Why does multiplying take less time to process than adding?

I don't think adding 32-bit values versus multiplying 16-bit values makes a difference, considering the __vector type always processes 16 bytes of data.

My first thought was that, since adding numbers together is such a trivial task, the CPU finishes the operation faster than it can fetch data from memory, whereas with multiplying this fetch latency is hidden because the CPU is kept busy doing work and never has to wait as long.

Is this a correct assumption to make?

Full code:

#include <chrono>
#include <random>
#include <limits>

#include <iostream>
#include <cassert>
#include <cstring>
#include <cstdint>
#include <malloc.h>

#include <altivec.h>
#undef vector

void GenerateRandom16bitValues(int16_t* inputABuffer, int16_t* inputBBuffer, int32_t* outputBuffer, int size);
void GenerateRandom32bitValues(int32_t* inputABuffer, int32_t* inputBBuffer, int32_t* outputBuffer, int size);
void TestAdd();
void TestMultiply();
void av_AddValues(int32_t* intArrayA, int32_t* intArrayB, int32_t* outputBuffer, int size);
void av_MultiplyValues(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size);

int main()
{
  TestAdd();
  TestMultiply();
}

void GenerateRandom16bitValues(int16_t* inputABuffer, int16_t* inputBBuffer, int32_t* outputBuffer, int size)
{
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<> dis(std::numeric_limits<int16_t>::min(), std::numeric_limits<int16_t>::max());

  for(int i = 0; i < size; i++)
  {
    inputABuffer[i] = dis(gen);
    inputBBuffer[i] = dis(gen);
    outputBuffer[i] = 0;
  }
}

void GenerateRandom32bitValues(int32_t* inputABuffer, int32_t* inputBBuffer, int32_t* outputBuffer, int size)
{
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<> dis(std::numeric_limits<int32_t>::min(), std::numeric_limits<int32_t>::max());

  for(int i = 0; i < size; i++)
  {
    inputABuffer[i] = dis(gen);
    inputBBuffer[i] = dis(gen);
    outputBuffer[i] = 0;
  }
}

void TestAdd()
{
    int size = 10'485'760;
    int bytes = size * sizeof(int32_t);

    int32_t* inputABuffer = (int32_t*) memalign(64, bytes);
    int32_t* inputBBuffer = (int32_t*) memalign(64, bytes);
    int32_t* outputBuffer = (int32_t*) memalign(64, bytes);
    assert(inputABuffer != nullptr);
    assert(inputBBuffer != nullptr);
    assert(outputBuffer != nullptr);

    GenerateRandom32bitValues(inputABuffer, inputBBuffer, outputBuffer, size);

    for(int i = 0; i < 20; i++)
    {
      auto start = std::chrono::high_resolution_clock::now();
      av_AddValues(inputABuffer, inputBBuffer, outputBuffer, size);
      auto end = std::chrono::high_resolution_clock::now();
      auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

      for(int k = 0; k < size; k++)
      {
        assert(outputBuffer[k] == (inputABuffer[k] + inputBBuffer[k]));
      }

      std::cout << "Vector Sum - " << diff.count() << "ms\n";
      memset(outputBuffer, 0, bytes);
    }
}

void TestMultiply()
{
    int size = 10'485'760;
    int16_t* inputABuffer = (int16_t*) memalign(64, size * sizeof(int16_t));
    int16_t* inputBBuffer = (int16_t*) memalign(64, size * sizeof(int16_t));
    int32_t* outputBuffer = (int32_t*) memalign(64, size * sizeof(int32_t));
    assert(inputABuffer != nullptr);
    assert(inputBBuffer != nullptr);
    assert(outputBuffer != nullptr);

    GenerateRandom16bitValues(inputABuffer, inputBBuffer, outputBuffer, size);

    for(int i = 0; i < 20; i++)
    {
      auto start = std::chrono::high_resolution_clock::now();
      av_MultiplyValues(inputABuffer, inputBBuffer, outputBuffer, size);
      auto end = std::chrono::high_resolution_clock::now();
      auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

      for(int k = 0; k < size; k++)
      {
        assert(outputBuffer[k] == (inputABuffer[k] * inputBBuffer[k]));
      }

      std::cout << "Vector product - " << diff.count() << "ms\n";
      memset(outputBuffer, 0, size * sizeof(int32_t));
    }
}

void av_AddValues(int32_t* intArrayA, int32_t* intArrayB, int32_t* outputBuffer, int size)
{
  int iterations = size / (sizeof(__vector int32_t) / sizeof(int32_t));

  __vector int32_t* tempA = (__vector int32_t *) intArrayA;
  __vector int32_t* tempB = (__vector int32_t *) intArrayB;
  __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;

  for(int i = 0; i < iterations; i++)
  {
    __vector int32_t sum = vec_add(*tempA, *tempB);
    vec_st(sum, 0, tempOut);

    tempA++;
    tempB++;
    tempOut++;
  }
}

void av_MultiplyValues(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size)
{
  int iterations = size / (sizeof(__vector int16_t) / sizeof(int16_t));
  __vector int16_t* tempA = (__vector int16_t *) intArrayA;
  __vector int16_t* tempB = (__vector int16_t *) intArrayB;
  __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;
  for(int i = 0; i < iterations; i++)
  {
    __vector int32_t productEven = vec_mule(*tempA, *tempB);
    __vector int32_t productOdd = vec_mulo(*tempA, *tempB);

    __vector int32_t mergedProductHigh = vec_mergeh(productEven, productOdd);
    __vector int32_t mergedProductLow = vec_mergel(productEven, productOdd);

    vec_st(mergedProductHigh, 0, tempOut);
    tempOut++;
    vec_st(mergedProductLow, 0, tempOut);

    tempA++;
    tempB++;
    tempOut++;
  }
}
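
For anyone reproducing this: the exact build command isn't shown, but something along these lines should work with GCC on a PowerPC target (these flags are an assumption, not the actual command used):

g++ -std=c++14 -O2 -maltivec alti.cpp -o alti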

Output of perf stat and perf record:

  Adding
   Performance counter stats for './alti':

         2151.146080      task-clock (msec)         #    0.999 CPUs utilized          
                   9      context-switches          #    0.004 K/sec                  
                   0      cpu-migrations            #    0.000 K/sec                  
               30957      page-faults               #    0.014 M/sec                  
          3871497132      cycles                    #    1.800 GHz                    
     <not supported>      stalled-cycles-frontend  
     <not supported>      stalled-cycles-backend   
          1504538891      instructions              #    0.39  insns per cycle        
           234038234      branches                  #  108.797 M/sec                  
              687912      branch-misses             #    0.29% of all branches        
           270305159      L1-dcache-loads           #  125.656 M/sec                  
            79819113      L1-dcache-load-misses     #   29.53% of all L1-dcache hits  
     <not supported>      LLC-loads                
     <not supported>      LLC-load-misses          

         2.152697186 seconds time elapsed


  CPU Utilization
    76.04%  alti     alti                 [.] av_AddValues    

  Multiply

  Performance counter stats for './alti':

         1583.016640      task-clock (msec)         #    0.999 CPUs utilized          
                   4      context-switches          #    0.003 K/sec                  
                   0      cpu-migrations            #    0.000 K/sec                  
               20717      page-faults               #    0.013 M/sec                  
          2849050875      cycles                    #    1.800 GHz                    
     <not supported>      stalled-cycles-frontend  
     <not supported>      stalled-cycles-backend   
          1520409634      instructions              #    0.53  insns per cycle        
           179185029      branches                  #  113.192 M/sec                  
              535437      branch-misses             #    0.30% of all branches        
           205341530      L1-dcache-loads           #  129.715 M/sec                  
            27124936      L1-dcache-load-misses     #   13.21% of all L1-dcache hits  
     <not supported>      LLC-loads                
     <not supported>      LLC-load-misses          

         1.584145737 seconds time elapsed


  CPU Utilization
    60.35%  alti     alti               [.] av_MultiplyValues       

Solution

  • It's related to the sizes of your input buffers.

    In one case (TestAdd):

    int size = 10'485'760;
    int bytes = size * sizeof(int32_t);
    
    int32_t* inputABuffer = (int32_t*) memalign(64, bytes);
    int32_t* inputBBuffer = (int32_t*) memalign(64, bytes);
    int32_t* outputBuffer = (int32_t*) memalign(64, bytes);
    

    you allocate 3 * size * 4 bytes (sizeof(int32_t) = 4)

    In the other (TestMultiply):

    int size = 10'485'760;
    int16_t* inputABuffer = (int16_t*) memalign(64, size * sizeof(int16_t));
    int16_t* inputBBuffer = (int16_t*) memalign(64, size * sizeof(int16_t));
    int32_t* outputBuffer = (int32_t*) memalign(64, size * sizeof(int32_t));
    

    you allocate size*4 + 2*size*2 bytes (sizeof(int16_t) = 2)

    Since this code is entirely memory bound, the second version is (3*4) / (4 + 2*2) = 1.5x faster.
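
    To put numbers on it, here is the traffic per call together with the effective bandwidth implied by the question's 81 ms and 48 ms timings:

    Add:      10'485'760 x (4 + 4 + 4) bytes = 125.8 MB / 81 ms ≈ 1.55 GB/s
    Multiply: 10'485'760 x (2 + 2 + 4) bytes =  83.9 MB / 48 ms ≈ 1.75 GB/s

    Both kernels sustain a similar effective bandwidth, which is what you'd expect when memory traffic rather than arithmetic sets the pace.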

    The 1.5x ratio is also in line with your perf measurements: 2.15 s / 1.5 = 1.43 s, which is close to the 1.58 s measured for the multiply run.
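
    One way to confirm it on the hardware: time an add kernel with the same memory footprint as av_MultiplyValues (two int16_t inputs, one int32_t output). The sketch below is hypothetical, not the poster's code; it widens each half of the 16-bit inputs with vec_unpackh/vec_unpackl before adding. If bandwidth is the limit, it should land near the multiply version's 48 ms even though it only uses the cheap add.

    void av_AddValuesNarrow(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size)
    {
      int iterations = size / (sizeof(__vector int16_t) / sizeof(int16_t));

      __vector int16_t* tempA = (__vector int16_t *) intArrayA;
      __vector int16_t* tempB = (__vector int16_t *) intArrayB;
      __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;

      for(int i = 0; i < iterations; i++)
      {
        // Sign-extend the first and second halves of each 16-bit vector
        // to 32 bits (big-endian element numbering assumed), then add.
        __vector int32_t sumHigh = vec_add(vec_unpackh(*tempA), vec_unpackh(*tempB));
        __vector int32_t sumLow  = vec_add(vec_unpackl(*tempA), vec_unpackl(*tempB));

        vec_st(sumHigh, 0, tempOut);
        tempOut++;
        vec_st(sumLow, 0, tempOut);
        tempOut++;

        tempA++;
        tempB++;
      }
    }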