directx, directx-11, directxmath

What's the difference between XMVectorSetByIndex when index = 0 and XMVectorSetX?


I read the source code of the DirectXMath library and found that the implementations of XMVectorSetByIndex and XMVectorSetX are completely different. Why doesn't XMVectorSetX simply return XMVectorSetByIndex with index = 0?


Solution

  • XMVectorSetX is actually able to use SSE or ARM-NEON intrinsics, while XMVectorSetByIndex has to 'spill to memory' (i.e. it's not SIMD at all).

    // Returns a copy of V in which the component selected by i holds f.
    //   V - source vector; it is not modified.
    //   f - value to store.
    //   i - component index; must be less than 4 (checked with assert).
    // Because i is a run-time value, every path below writes through the
    // vector's float-array view instead of calling a per-lane intrinsic.
    inline XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i)
    {
        assert( i < 4 );
        _Analysis_assume_( i < 4 );
    #if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vResult = V;
        vResult.vector4_f32[i] = f;
        return vResult;
    #elif defined(_XM_ARM_NEON_INTRINSICS_)
        XMVECTOR vResult = V;
        vResult.n128_f32[i] = f;
        return vResult;
    #elif defined(_XM_SSE_INTRINSICS_)
        XMVECTOR vResult = V;
        vResult.m128_f32[i] = f;
        return vResult;
    #endif
    }
    

    vs.

    // Returns a copy of V whose X component (element 0) is replaced by x.
    // The remaining three components are carried over from V unchanged.
    inline XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x)
    {
    #if defined(_XM_NO_INTRINSICS_)
        // Scalar path: assemble the result one component at a time.
        XMVECTOR vResult;
        vResult.vector4_f32[0] = x;
        vResult.vector4_f32[1] = V.vector4_f32[1];
        vResult.vector4_f32[2] = V.vector4_f32[2];
        vResult.vector4_f32[3] = V.vector4_f32[3];
        return vResult;
    #elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Write x into lane 0 of V.
        return vsetq_lane_f32(x,V,0);
    #elif defined(_XM_SSE_INTRINSICS_)
        // Load x into the low lane of a temporary, then merge that low lane
        // with the upper three lanes of V.
        return _mm_move_ss(V,_mm_set_ss(x));
    #endif
    }
    

    It's informative to look at the XMVectorSetY case as well: with /arch:AVX or /arch:AVX2 it is able to use the SSE4 instruction _mm_insert_ps; otherwise it has to do a fair bit of work to get SIMD code-gen instead of having to 'spill to memory'.

    // Returns a copy of V whose Y component (element 1) is replaced by y.
    // The remaining three components are carried over from V unchanged.
    inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
    {
    #if defined(_XM_NO_INTRINSICS_)
        // Scalar path: assemble the result one component at a time.
        XMVECTOR vResult;
        vResult.vector4_f32[0] = V.vector4_f32[0];
        vResult.vector4_f32[1] = y;
        vResult.vector4_f32[2] = V.vector4_f32[2];
        vResult.vector4_f32[3] = V.vector4_f32[3];
        return vResult;
    #elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Write y into lane 1 of V.
        return vsetq_lane_f32(y,V,1);
    #elif defined(_XM_SSE4_INTRINSICS_)
        // Load y into the low lane of a temporary, then insert that lane
        // into element 1 of V (immediate 0x10: source lane 0, destination
        // lane 1, nothing zeroed).
        const XMVECTOR vScalar = _mm_set_ss(y);
        return _mm_insert_ps( V, vScalar, 0x10 );
    #elif defined(_XM_SSE_INTRINSICS_)
        // Only the low lane can be replaced with _mm_move_ss, so rotate Y
        // into the low lane, overwrite it there, then rotate it back.
        const XMVECTOR vScalar = _mm_set_ss(y);
        // Exchange the x and y lanes
        XMVECTOR vSwapped = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
        // Overwrite the low lane (which now holds the old y) with the new value
        vSwapped = _mm_move_ss(vSwapped,vScalar);
        // Exchange the x and y lanes back into their original positions
        return XM_PERMUTE_PS(vSwapped,_MM_SHUFFLE(3,2,0,1));
    #endif
    }
    

    Note that DirectXMath is now available on GitHub.