c avx fma

incompatible types when assigning to type ‘__m256d’ from type ‘int’


I'm working on a project to optimize Matrix Multiplication and I'm trying to use intrinsics.

Here's a bit of the code I'm using :

#include <immintrin.h>

/*
 * Register-tiled kernel for C += A * B on double-precision matrices.
 *
 * Each (i, j) iteration keeps a 4-row x 12-column tile of C in twelve
 * __m256d accumulators (4 doubles each) and sweeps k over 0..K-1,
 * accumulating with fused multiply-add.
 *
 * lda      stride, in doubles, between consecutive rows; the same stride
 *          is used to index A, B and C.
 * M, N, K  tile extents: the loops cover M / 4 row groups of C,
 *          N / 12 column groups of C, and K terms per dot product.
 * A, B, C  element (r, c) is read at base + r * lda + c.
 *
 * NOTE(review): M / 4 and N / 12 are integer divisions, so rows beyond
 * 4 * (M / 4) and columns beyond 12 * (N / 12) are not touched by these loops.
 *
 * NOTE(review): this excerpt never writes the accumulators back to C, and the
 * function's closing brace is not shown -- presumably the _mm256_storeu_pd
 * calls and the rest of the function were cut from the snippet; confirm
 * against the full file.
 *
 * NOTE(review): _mm256_fmadd_pd is an FMA intrinsic; it needs a CPU with FMA
 * and FMA enabled at compile time (e.g. -mfma with GCC/Clang).
 */
static void do_block(int lda, int M, int N, int K, double* A, double* B, double* C) {
  /* i selects a group of 4 consecutive rows (4*i .. 4*i+3) of A and C */
  int i, j, k;
  for (i = 0; i < M / 4; ++i) {
    /* j selects a group of 12 consecutive columns (12*j .. 12*j+11) of B and C */
    for (j = 0; j < N / 12; ++j) {
      /* Load the 4x12 tile of C: c_R0_R3_V is row 4*i+R, vector V (columns 12*j + 4*V .. +3) */
      register __m256d c_00_03_0 = _mm256_loadu_pd(C + (4 * i) * lda + j * 12);
      register __m256d c_00_03_1 = _mm256_loadu_pd(C + (4 * i) * lda + j * 12 + 4);
      register __m256d c_00_03_2 = _mm256_loadu_pd(C + (4 * i) * lda + j * 12 + 8);
      register __m256d c_10_13_0 = _mm256_loadu_pd(C + (4 * i + 1) * lda + j * 12);
      register __m256d c_10_13_1 = _mm256_loadu_pd(C + (4 * i + 1) * lda + j * 12+4);
      register __m256d c_10_13_2 = _mm256_loadu_pd(C + (4 * i + 1) * lda + j * 12+8);
      register __m256d c_20_23_0 = _mm256_loadu_pd(C + (4 * i + 2) * lda + j * 12);
      register __m256d c_20_23_1 = _mm256_loadu_pd(C + (4 * i + 2) * lda + j * 12+4);
      register __m256d c_20_23_2 = _mm256_loadu_pd(C + (4 * i + 2) * lda + j * 12+8);
      register __m256d c_30_33_0 = _mm256_loadu_pd(C + (4 * i + 3) * lda + j * 12);
      register __m256d c_30_33_1 = _mm256_loadu_pd(C + (4 * i + 3) * lda + j * 12+4);
      register __m256d c_30_33_2 = _mm256_loadu_pd(C + (4 * i + 3) * lda + j * 12+8);

      /* One k per iteration; the body is written out by hand over the 4 rows x 3 vectors of the tile */
      for (k = 0; k < K; k += 1) {
    /* 12 consecutive doubles of row k of B, starting at column 12*j (shared by all 4 rows below) */
    register __m256d b_00_03 = _mm256_loadu_pd(B + k * lda + j * 12);
    register __m256d b_10_03 = _mm256_loadu_pd(B + k * lda + j * 12+4);
    register __m256d b_20_03 = _mm256_loadu_pd(B + k * lda + j * 12+8);
    /* Broadcast the scalar A[4*i][k] to all four lanes */
    register __m256d a00 = _mm256_broadcast_sd(A + (4 * i) * lda + k);

    /* Row 4*i of the tile: c += a * b */
    c_00_03_0 = _mm256_fmadd_pd(a00, b_00_03, c_00_03_0);
    c_00_03_1 = _mm256_fmadd_pd(a00, b_10_03, c_00_03_1);
    c_00_03_2 = _mm256_fmadd_pd(a00, b_20_03, c_00_03_2);

    /* Row 4*i+1: a00 is reused to hold the broadcast of A[4*i+1][k] */
    a00 = _mm256_broadcast_sd(A + (4 * i + 1) * lda + k);
    c_10_13_0 = _mm256_fmadd_pd(a00, b_00_03, c_10_13_0);
    c_10_13_1 = _mm256_fmadd_pd(a00, b_10_03, c_10_13_1);
    c_10_13_2 = _mm256_fmadd_pd(a00, b_20_03, c_10_13_2);

    /* Row 4*i+2 */
    a00 = _mm256_broadcast_sd(A + (4 * i + 2) * lda + k);
    c_20_23_0 = _mm256_fmadd_pd(a00, b_00_03, c_20_23_0);
    c_20_23_1 = _mm256_fmadd_pd(a00, b_10_03, c_20_23_1);
    c_20_23_2 = _mm256_fmadd_pd(a00, b_20_03, c_20_23_2);

    /* Row 4*i+3 */
    a00 = _mm256_broadcast_sd(A + (4 * i + 3) * lda + k);
    c_30_33_0 = _mm256_fmadd_pd(a00, b_00_03, c_30_33_0);
    c_30_33_1 = _mm256_fmadd_pd(a00, b_10_03, c_30_33_1);
    c_30_33_2 = _mm256_fmadd_pd(a00, b_20_03, c_30_33_2);
      }
}
}

All the lines that call _mm256_fmadd_pd produce the following error:

incompatible types when assigning to type ‘__m256d’ from type ‘int’

I'm not sure where the int comes from, because all the arguments of _mm256_fmadd_pd are "__m256d" and the return value is also "__m256d" (__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c)).


Solution

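  • As mentioned by chtz, the processor I'm using does not support FMA instructions, so intrinsics such as _mm256_fmadd_pd are not available. I used the workaround he proposed, which works just fine: _mm256_add_pd(_mm256_mul_pd(aXX, bYY), cZZ). The ‘int’ in the error message most likely comes from C's implicit-declaration rule: when FMA is not enabled at compile time, the header may not declare _mm256_fmadd_pd, so the compiler treats the call as an undeclared function returning int.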