I am writing a double-double arithmetic library for AVX/AVX2. One of the issues I encountered is that the non-SIMD and SIMD versions have different memory layouts.
// Pair-wise
struct Float64x2 {
    double hi;
    double lo;
};
// Component-wise
struct __m256dx2 {
    __m256d hi; // double hi[4]
    __m256d lo; // double lo[4]
};
This means I will have to convert from the pair-wise layout {x0,y0}, {x1,y1}, {x2,y2}, {x3,y3} to the component-wise layout {x0,x1,x2,x3}, {y0,y1,y2,y3}, and vice versa.
__m256dx2 _mm256x2_loadu_pdx2(const double* mem_addr) {
    __m256dx2 val;
    val.hi = _mm256_loadu_pd(mem_addr);
    mem_addr += sizeof(__m256d) / sizeof(double);
    val.lo = _mm256_loadu_pd(mem_addr);
    // convert to component-wise
    return val;
}
void _mm256x2_storeu_pdx2(double* mem_addr, __m256dx2 val) {
    // convert to pair-wise
    _mm256_storeu_pd(mem_addr, val.hi);
    mem_addr += sizeof(__m256d) / sizeof(double);
    _mm256_storeu_pd(mem_addr, val.lo);
}
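For reference, a plain scalar version of the conversion would look something like this (just a sketch to show the intended layout change; the function names are only for illustration):
// Deinterleave 4 pair-wise structs into separate hi/lo component arrays
void deinterleave4(const Float64x2* src, double hi[4], double lo[4]) {
    for (int i = 0; i < 4; i++) {
        hi[i] = src[i].hi;
        lo[i] = src[i].lo;
    }
}
// Interleave separate hi/lo component arrays back into 4 pair-wise structs
void interleave4(const double hi[4], const double lo[4], Float64x2* dst) {
    for (int i = 0; i < 4; i++) {
        dst[i].hi = hi[i];
        dst[i].lo = lo[i];
    }
}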
How can I convert between the two memory layouts in AVX (or AVX2)?
I would do it like this. Loading 4 structures:
// Load 16 bytes from the pointer, upcasting to 32 bytes vector
inline __m256d loadLow( const double* rsi )
{
    return _mm256_castpd128_pd256( _mm_loadu_pd( rsi ) );
}
// Load 16 bytes into the high half of the vector
inline __m256d loadHigh( __m256d low, const double* rsi )
{
    // That insert instruction can insert directly from memory,
    // this function should compile into a single vinsertf128 instruction
    __m128d tmp = _mm_loadu_pd( rsi );
    return _mm256_insertf128_pd( low, tmp, 1 );
}
// Load 8 FP64 numbers from memory, and deinterleave into 2 AVX vectors
__m256dx2 _mm256x2_loadu_pdx2( const double* rsi )
{
    // After the 4 loads: v0 = { hi0, lo0, hi2, lo2 }, v1 = { hi1, lo1, hi3, lo3 }
    __m256d v0 = loadLow( rsi );
    __m256d v1 = loadLow( rsi + 2 );
    v0 = loadHigh( v0, rsi + 4 );
    v1 = loadHigh( v1, rsi + 6 );
    __m256dx2 val;
    // The unpack instructions interleave within 128-bit lanes, producing
    // hi = { hi0, hi1, hi2, hi3 } and lo = { lo0, lo1, lo2, lo3 }
    val.hi = _mm256_unpacklo_pd( v0, v1 );
    val.lo = _mm256_unpackhi_pd( v0, v1 );
    return val;
}
Storing 4 structures:
// Store 8 interleaved numbers to memory
void _mm256x2_storeu_pdx2( double* rdi, __m256dx2 val )
{
    // hi.x, lo.x, hi.z, lo.z
    __m256d v0 = _mm256_unpacklo_pd( val.hi, val.lo );
    // hi.y, lo.y, hi.w, lo.w
    __m256d v1 = _mm256_unpackhi_pd( val.hi, val.lo );
    __m256d tmp;
    // hi.z, lo.z, hi.w, lo.w
    tmp = _mm256_permute2f128_pd( v0, v1, 0x31 );
    _mm256_storeu_pd( rdi + 4, tmp );
    // hi.x, lo.x, hi.y, lo.y
    tmp = _mm256_insertf128_pd( v0, _mm256_castpd256_pd128( v1 ), 1 );
    _mm256_storeu_pd( rdi, tmp );
}
I have used a slightly different tactic for loads and stores because many modern CPUs can do twice as many loads per cycle compared to stores. Still, if the rest of your code does a lot of shuffles, consider changing the store function to remove the relatively expensive _mm256_permute2f128_pd and _mm256_insertf128_pd shuffles, and instead store your 8 numbers with 4 instructions: _mm_storeu_pd( ptr, _mm256_castpd256_pd128( v ) ) for the low half of a vector, and _mm_storeu_pd( ptr, _mm256_extractf128_pd( v, 1 ) ) for the high half.
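For completeness, here is a sketch of that variant; the _mm256x2_storeu_pdx2_v2 name is just for illustration. It keeps the same two in-lane unpacks and then writes each 128-bit half directly:
// Store 8 interleaved numbers using 4 narrow stores, no cross-lane shuffles
void _mm256x2_storeu_pdx2_v2( double* rdi, __m256dx2 val )
{
    // hi.x, lo.x, hi.z, lo.z
    __m256d v0 = _mm256_unpacklo_pd( val.hi, val.lo );
    // hi.y, lo.y, hi.w, lo.w
    __m256d v1 = _mm256_unpackhi_pd( val.hi, val.lo );
    // Low halves of v0 and v1 hold the first two structures
    _mm_storeu_pd( rdi, _mm256_castpd256_pd128( v0 ) );
    _mm_storeu_pd( rdi + 2, _mm256_castpd256_pd128( v1 ) );
    // High halves hold the last two structures
    _mm_storeu_pd( rdi + 4, _mm256_extractf128_pd( v0, 1 ) );
    _mm_storeu_pd( rdi + 6, _mm256_extractf128_pd( v1, 1 ) );
}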