c++segmentation-fault simd avx google-benchmark

_mm256_load_ps cause segmentation fault with google/benchmark in debug mode

The following code can run in both release and debug mode.

#include <immintrin.h>

constexpr int n_batch = 10240;
constexpr int n = n_batch * 8;
#pragma pack(32)
float a[n];
float b[n];
float c[n];
#pragma pack()

int main() {
    for(int i = 0; i < n; ++i)
        c[i] = a[i] * b[i];

    for(int i = 0; i < n; i += 4) {
        __m128 av = _mm_load_ps(a + i);
        __m128 bv = _mm_load_ps(b + i);
        __m128 cv = _mm_mul_ps(av, bv);
        _mm_store_ps(c + i, cv);
    }

    for(int i = 0; i < n; i += 8) {
        __m256 av = _mm256_load_ps(a + i);
        __m256 bv = _mm256_load_ps(b + i);
        __m256 cv = _mm256_mul_ps(av, bv);
        _mm256_store_ps(c + i, cv);
    }
}

The following code can run only in release mode, and get segmentation fault in debug mode.

#include <immintrin.h>

#include "benchmark/benchmark.h"

constexpr int n_batch = 10240;
constexpr int n = n_batch * 8;
#pragma pack(32)
float a[n];
float b[n];
float c[n];
#pragma pack()

static void BM_Scalar(benchmark::State &state) {
    for(auto _: state)
        for(int i = 0; i < n; ++i)
            c[i] = a[i] * b[i];
}
BENCHMARK(BM_Scalar);

static void BM_Packet_4(benchmark::State &state) {
    for(auto _: state) {
        for(int i = 0; i < n; i += 4) {
            __m128 av = _mm_load_ps(a + i);
            __m128 bv = _mm_load_ps(b + i);
            __m128 cv = _mm_mul_ps(av, bv);
            _mm_store_ps(c + i, cv);
        }
    }
}
BENCHMARK(BM_Packet_4);

static void BM_Packet_8(benchmark::State &state) {
    for(auto _: state) {
        for(int i = 0; i < n; i += 8) {
            __m256 av = _mm256_load_ps(a + i); // Signal: SIGSEGV (signal SIGSEGV: invalid address (fault address: 0x0))
            __m256 bv = _mm256_load_ps(b + i);
            __m256 cv = _mm256_mul_ps(av, bv);
            _mm256_store_ps(c + i, cv);
        }
    }
}
BENCHMARK(BM_Packet_8);

BENCHMARK_MAIN();

Solution

Your arrays aren't aligned by 32. You could check this with a debugger.

#pragma pack(32) only aligns struct/union/class members, as documented by MS. C++ arrays are a different kind of object and aren't affected at all by that MSVC pragma. (I think you're actually using GCC's or clang's version of it, though, because MSVC generally uses vmovups not vmovaps)

For arrays in static or automatic storage (not dynamically allocated), the easiest way to align arrays in C++11 and later is alignas(32). That's fully portable, unlike GNU C __attribute__((aligned(32))) or whatever MSVC's equivalent is.

alignas(32) float a[n];
alignas(32) float b[n];
alignas(32) float c[n];

AVX: data alignment: store crash, storeu, load, loadu doesn't explains why there's a difference depending on optimization level: optimized code will fold one load into a memory source operand for vmulps which (unlike SSE) doesn't require alignment. (Presumably the first array happens to be sufficiently aligned.)

Un-optimized code will do the _mm256_load_ps separately with a vmovaps alignment-required load.

(_mm256_loadu_ps will always avoid using alignment-required loads, so use that if you can't guarantee your data is aligned.)