#include <immintrin.h>
constexpr int n_batch = 10240;
constexpr int n = n_batch * 8;
#pragma pack(32)
float a[n];
float b[n];
float c[n];
#pragma pack()
int main() {
for(int i = 0; i < n; ++i)
c[i] = a[i] * b[i];
for(int i = 0; i < n; i += 4) {
__m128 av = _mm_load_ps(a + i);
__m128 bv = _mm_load_ps(b + i);
__m128 cv = _mm_mul_ps(av, bv);
_mm_store_ps(c + i, cv);
}
for(int i = 0; i < n; i += 8) {
__m256 av = _mm256_load_ps(a + i);
__m256 bv = _mm256_load_ps(b + i);
__m256 cv = _mm256_mul_ps(av, bv);
_mm256_store_ps(c + i, cv);
}
}
#include <immintrin.h>
#include "benchmark/benchmark.h"
constexpr int n_batch = 10240;
constexpr int n = n_batch * 8;
#pragma pack(32)
float a[n];
float b[n];
float c[n];
#pragma pack()
static void BM_Scalar(benchmark::State &state) {
for(auto _: state)
for(int i = 0; i < n; ++i)
c[i] = a[i] * b[i];
}
BENCHMARK(BM_Scalar);
static void BM_Packet_4(benchmark::State &state) {
for(auto _: state) {
for(int i = 0; i < n; i += 4) {
__m128 av = _mm_load_ps(a + i);
__m128 bv = _mm_load_ps(b + i);
__m128 cv = _mm_mul_ps(av, bv);
_mm_store_ps(c + i, cv);
}
}
}
BENCHMARK(BM_Packet_4);
static void BM_Packet_8(benchmark::State &state) {
for(auto _: state) {
for(int i = 0; i < n; i += 8) {
__m256 av = _mm256_load_ps(a + i); // Signal: SIGSEGV (signal SIGSEGV: invalid address (fault address: 0x0))
__m256 bv = _mm256_load_ps(b + i);
__m256 cv = _mm256_mul_ps(av, bv);
_mm256_store_ps(c + i, cv);
}
}
}
BENCHMARK(BM_Packet_8);
BENCHMARK_MAIN();
Your arrays aren't aligned by 32. You could check this with a debugger.
#pragma pack(32)
only aligns struct/union/class members, as documented by MS. C++ arrays are a different kind of object and aren't affected at all by that MSVC pragma. (I think you're actually using GCC's or clang's version of it, though, because MSVC generally uses vmovups
not vmovaps
)
For arrays in static or automatic storage (not dynamically allocated), the easiest way to align arrays in C++11 and later is alignas(32)
. That's fully portable, unlike GNU C __attribute__((aligned(32)))
or whatever MSVC's equivalent is.
alignas(32) float a[n];
alignas(32) float b[n];
alignas(32) float c[n];
AVX: data alignment: store crash, storeu, load, loadu doesn't explains why there's a difference depending on optimization level: optimized code will fold one load into a memory source operand for vmulps
which (unlike SSE) doesn't require alignment. (Presumably the first array happens to be sufficiently aligned.)
Un-optimized code will do the _mm256_load_ps
separately with a vmovaps
alignment-required load.
(_mm256_loadu_ps
will always avoid using alignment-required loads, so use that if you can't guarantee your data is aligned.)