unity-game-engine hlsl compute-shader

Unpack/Pack SNorm16 in HLSL compute shader


I'm trying to unpack SNorm16 values in an HLSL compute shader. A SNorm16x4 value is 8 bytes in total, and the byte-address buffer Load/Store functions work in 4-byte units, so I load the 8 bytes as two packed 4-byte values, unpack them into four floats, work with the result, then pack the floats back and store them as two packed 4-byte values.

The code is:

float2 UnpackFromSNorm16x2(uint v)
{
    uint2 tempU = asuint(uint2(v, v >> 16) & 0xFFFF);
    int2 tempI = int2(tempU.x - 32767, tempU.y - 32767);
    return float2( tempI * float(1.0 / 32767.0));
}

int FloatToSNorm16(float v)
{
    //According to D3D10 rules, the value "-1.0f" has two representations:
    //  0x8000 and 0x8001
    //This allows everyone to convert by just multiplying by 32767 instead
    //of multiplying negative values by 32768 and positive values by 32767.
    return int(clamp(v >= 0.0f ? (v * 32767.0f + 0.5f) : (v * 32767.0f - 0.5f), -32768.0f, 32767.0f));
}

uint PackToSNorm16x2(float2 v)
{
    int intX = int(FloatToSNorm16(v.x));
    int intY = int(FloatToSNorm16(v.y));
    uint2 uintXY = uint2(clamp(intX + 32767, 0, 65535), clamp(intY + 32767, 0, 65535));
    uint x = (uintXY.x << 0) & 0x0000FFFF;
    uint y = (uintXY.y << 16) & 0xFFFF0000;
    return x | y;
}

uint2 inputTangentUInt = asuint(vertices.Load2(baseOffset + tangentOffset));
float4 qTangentUnpacked = float4(UnpackFromSNorm16x2(inputTangentUInt.x), UnpackFromSNorm16x2(inputTangentUInt.y));

//Do some work with qTangentUnpacked

uint2 qTangentPacked = uint2(PackToSNorm16x2(qTangentUnpacked.xy), PackToSNorm16x2(qTangentUnpacked.zw));
vertices.Store2(baseOffset + tangentOffset, asuint(qTangentPacked));

But the final result is wrong; it looks like some data is lost. What am I doing wrong?


Solution

  • There is a collection of HLSL routines for doing these kinds of conversions called D3DX_DXGIFormatConvert.inl. It is documented on Microsoft Learn, and it used to ship in the legacy DirectX SDK. Note that these routines unpack SNorm16 by sign-extending the two's-complement 16-bit value instead of subtracting a 32767 bias the way your code does; that mismatch with how the data is actually encoded is what loses data in your round trip.

    typedef int INT;
    typedef uint UINT;
    typedef float FLOAT;
    typedef int2 XMINT2;
    typedef float2 XMFLOAT2;
    
    #define D3DX11INLINE
    #define D3DX_Truncate_FLOAT(_V) trunc(_V)
    #define hlsl_precise precise
    
    D3DX11INLINE FLOAT D3DX_INT_to_FLOAT(INT _V,
                                         FLOAT _Scale)
    {
        FLOAT Scaled = (FLOAT)_V / _Scale;
        // The integer is a two's-complement signed
        // number so the negative range is slightly
        // larger than the positive range, meaning
        // the scaled value can be slightly less than -1.
        // Clamp to keep the float range [-1, 1].
        return max(Scaled, -1.0f);
    }
    
    D3DX11INLINE INT D3DX_FLOAT_to_INT(FLOAT _V,
                                       FLOAT _Scale)
    {
        return (INT)D3DX_Truncate_FLOAT(_V * _Scale + (_V >= 0 ? 0.5f : -0.5f));
    }
    
    D3DX11INLINE FLOAT D3DX_SaturateSigned_FLOAT(FLOAT _V)
    {
        // Clamp to [-1, 1]; NaN maps to 0, matching the D3DX reference.
        if (isnan(_V))
            return 0;
        return min(max(_V, -1), 1);
    }
    
    D3DX11INLINE XMFLOAT2 D3DX_R16G16_SNORM_to_FLOAT2(UINT packedInput)
    {
        hlsl_precise XMFLOAT2 unpackedOutput;
        XMINT2 signExtendedBits;
        signExtendedBits.x =  (INT)(packedInput << 16) >> 16;
        signExtendedBits.y =  (INT)(packedInput & 0xffff0000) >> 16;
        unpackedOutput.x = D3DX_INT_to_FLOAT(signExtendedBits.x, 32767);
        unpackedOutput.y = D3DX_INT_to_FLOAT(signExtendedBits.y, 32767);
        return unpackedOutput;
    }
    
    D3DX11INLINE UINT D3DX_FLOAT2_to_R16G16_SNORM(hlsl_precise XMFLOAT2 unpackedInput)
    {
        UINT packedOutput;
        packedOutput = ( (D3DX_FLOAT_to_INT(D3DX_SaturateSigned_FLOAT(unpackedInput.x), 32767) & 0x0000ffff)      |
                         (D3DX_FLOAT_to_INT(D3DX_SaturateSigned_FLOAT(unpackedInput.y), 32767)              <<16) );
        return packedOutput;
    }
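
    Applied to the round trip from the question, usage would look roughly like this (a sketch, assuming the same vertices RWByteAddressBuffer and baseOffset/tangentOffset as above):

    uint2 packedTangent = vertices.Load2(baseOffset + tangentOffset);
    // Load2 already returns uint2, so no asuint() is needed.
    float4 qTangentUnpacked = float4(D3DX_R16G16_SNORM_to_FLOAT2(packedTangent.x),
                                     D3DX_R16G16_SNORM_to_FLOAT2(packedTangent.y));
    
    //Do some work with qTangentUnpacked
    
    packedTangent = uint2(D3DX_FLOAT2_to_R16G16_SNORM(qTangentUnpacked.xy),
                          D3DX_FLOAT2_to_R16G16_SNORM(qTangentUnpacked.zw));
    vertices.Store2(baseOffset + tangentOffset, packedTangent);

    As a sanity check, packing 0.5 gives trunc(0.5 * 32767 + 0.5) = 16384 (0x4000), and unpacking 0x4000 sign-extends to 16384 and yields 16384 / 32767 ≈ 0.50002, so values survive the round trip up to 16-bit quantization.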
    

    The latest version is on GitHub.

    See this blog post for more information.