I've been optimizing some code and stumbled across a peculiar case. Here are the two assembly listings:
; FAST
lea rcx,[rsp+50h]
call qword ptr [Random_get_float3] ;this function only writes 3 components
movaps xmm0,xmmword ptr [rsp+50h]
lea rbx,[rbx+0Ch]
mulps xmm0,xmm6
movlps qword ptr [rbx-0Ch],xmm0
movaps xmmword ptr [rsp+50h],xmm0
extractps eax,xmm0,2
mov dword ptr [rbx-4],eax
; SLOW
lea rcx,[rsp+50h]
call qword ptr [Random_get_float3] ;this function only writes 3 components
movaps xmm0,xmmword ptr [rsp+50h]
lea rbx,[rbx+0Ch]
mulps xmm0,xmm6
movlps qword ptr [rbx-0Ch],xmm0
extractps eax,xmm0,2
mov dword ptr [rbx-4],eax
Both versions are executed 10000 times in a tight loop (the loop code is identical and omitted). As you can see, the two listings are exactly the same except for one extra movaps xmmword ptr [rsp+50h],xmm0 instruction in the fast version.
Logically it's a dead store, because rsp+50h is overwritten at the start of the next iteration:
lea rcx,[rsp+50h]
call qword ptr [Random_get_float3]
What's interesting is that the slow version is about twice as slow as the fast one, even though it is the one missing the extra, seemingly useless instruction.
Can someone explain why?
The C++ code (compiled with the MSVC v140 toolset in VS 2022):
#include <immintrin.h>
#include <cstdlib>
#include <cmath>

__declspec(noinline) void random_get_float3(float* vec3) {
    // Reinterpret the raw bits of rand() as a float. MSVC's rand() returns
    // 0..32767, so these bit patterns are zero or subnormal floats.
    int v = rand();
    vec3[0] = *(float*)&v;
    v = rand();
    vec3[1] = *(float*)&v;
    v = rand();
    vec3[2] = *(float*)&v;
    vec3[0] = powf(vec3[0], 1.0f / 3.0f);
    vec3[1] = powf(vec3[1], 1.0f / 3.0f);
    vec3[2] = powf(vec3[2], 1.0f / 3.0f);
}
void* randomGetFuncPtr = (void*)&random_get_float3;
// Not aligned by 16.
struct Vector3 {
    float x, y, z;
};
struct Vector3Array {
    size_t length;
    Vector3* m_Items;
};
static bool inited = false;
Vector3 scaledRandomPosExtern = Vector3{ 0.5f, 0.5f, 0.5f };
Vector3Array randomPositions;
#define __SLOW // comment this out to get the fast version.
int numObjectsExtern = 10000;

void TestFunc()
{
    int numObjects = numObjectsExtern;
    if (!inited) {
        randomPositions = {
            10000,
            new Vector3[10000]
        };
        inited = true;
    }
    typedef void (*Random_get_float3_fptr) (__m128* __restrict);
    Random_get_float3_fptr _il2cpp_icall_func = (Random_get_float3_fptr)randomGetFuncPtr;
    Vector3 scaledRandomPos = scaledRandomPosExtern;
    __m128 scaledRandomPosVec = _mm_setr_ps(scaledRandomPos.x, scaledRandomPos.y, scaledRandomPos.z, 0.0f);
    Vector3Array* outputArray = &randomPositions;
    int* items = (int*)&outputArray->m_Items[0];
    for (int i = 0; i < numObjects; i++) {
        __m128 v1;
        _il2cpp_icall_func(&v1); // writes only the first 3 floats; the 4th lane keeps stale stack data
#ifdef __SLOW
        __m128 v3;
        v3 = _mm_mul_ps(v1, scaledRandomPosVec);
#define RESVEC v3
#else
        v1 = _mm_mul_ps(v1, scaledRandomPosVec);
#define RESVEC v1
#endif
        _mm_storel_pi((__m64*)(items), RESVEC);
        items[2] = _mm_extract_ps(RESVEC, 2);
        items += 3;
    }
}
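For reference, the outer timing driver is just a loop around TestFunc. A minimal sketch (added here for completeness; the repeat count and std::chrono timing are illustrative, not the exact harness):

#include <chrono>
#include <cstdio>

int main() {
    TestFunc(); // warm-up; also allocates randomPositions on the first call
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < 1000; i++)
        TestFunc();
    auto t1 = std::chrono::steady_clock::now();
    std::printf("%lld us\n", (long long)
        std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count());
}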
Reproducible on:
CPU: AMD Ryzen 7 3700X (Windows 10 19045.3930), as well as other Ryzen CPUs.
Not reproducible on Intel CPUs.
Thanks to @chtz and @fuz!
It turns out the extra movaps stores the result of the multiplication back to [rsp+50h]. Since Random_get_float3 writes only the first 3 components, the 4th component loaded from that slot on the next iteration is the stored one, a normal float. Without the extra instruction, the 4th component is never initialized; it happened to hold a denormal float, which makes the computation slower.
If you manually set the 4th component to a denormal float, each mulps operation becomes around 20% slower, while initializing the 4th component to zero removes that overhead.
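Here is a minimal way to check this in isolation (my own sketch, not the original benchmark; the iteration count and the 1e-40f subnormal constant are arbitrary choices, and the exact penalty will depend on the CPU):

#include <immintrin.h>
#include <chrono>
#include <cstdio>

static long long time_mulps(float fourthLane) {
    __m128 v     = _mm_setr_ps(1.0f, 2.0f, 3.0f, fourthLane);
    __m128 scale = _mm_set1_ps(1.0f); // multiplying by 1.0 keeps each lane's value (and class) unchanged
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < 100000000; i++)
        v = _mm_mul_ps(v, scale);     // dependent chain: a denormal 4th lane stays denormal
    auto t1 = std::chrono::steady_clock::now();
    volatile float sink = _mm_cvtss_f32(v); (void)sink; // keep the loop from being optimized away
    return std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
}

int main() {
    std::printf("denormal 4th lane: %lld ms\n", time_mulps(1e-40f)); // 1e-40f is subnormal
    std::printf("zero 4th lane:     %lld ms\n", time_mulps(0.0f));
}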
On Intel CPUs it doesn't matter whether the number is normal or denormal; the computation speed is unaffected.
This extra instruction is most likely an MSVC optimizer bug, because it shouldn't be there, but it accidentally sped up the code.
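A more robust fix than relying on the stray store (my suggestion, not part of the original findings) is to make denormals irrelevant altogether: either zero the 4th lane explicitly before the multiply, or enable flush-to-zero and denormals-are-zero for the thread via the standard MXCSR intrinsics:

#include <xmmintrin.h> // _MM_SET_FLUSH_ZERO_MODE
#include <pmmintrin.h> // _MM_SET_DENORMALS_ZERO_MODE

void disableDenormals() {
    // FTZ: denormal results are flushed to zero.
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    // DAZ: denormal inputs are treated as zero.
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
}

Note that this changes the calling thread's MXCSR, so floating-point results involving denormals will no longer be IEEE-conformant.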