hlsl

Computations non-deterministic


I'm trying to compute the roots of a polynomial with the Durand Kerner algorithm.

I thought if the inputs are the same then the computation would be deterministic. But each thread group converges at a different rate, and sometimes the algorithm diverges (nan I imagine).

I thought at first I was a synchronisation bug (but there is only ever one warp in a thread group (7 < 32)), but have decided it's actually because some operations are non deterministic. Does anyone know which operations are non-deterministic?

// https://ieeexplore.ieee.org/document/6969057/
groupshared Complex roots[7];
groupshared float a[8];
groupshared float2 uv_sum[8];
groupshared float2 uv_max[8];
static const float PI = 3.14159265f;

[numthreads(7, 1, 1)]
void main( uint3 tid : SV_DispatchThreadID )
{
    uint index = tid.x;
    // z^7 + 3z^6 - 2z^5 + 10z^4 - 2z^3 + 8z^2 - z - 13
    float a_[8] = { 1.f, 3.f, -2.f, 10.f, -2.f, 8.f, -1.f, -13.f };
    a[index] = a_[index];
    if (index == 0)
    {
        a[7] = a_[7];
    }
    
    GroupMemoryBarrierWithGroupSync();
    
    float ui = 2 * pow(abs(a[index + 1]), 1.f / (index+1));
    float vi = 0.5f * pow(abs(a[7] / a[index]), 1.f / (7-index));
    uv_sum[index] = float2(ui, vi);
    uv_max[index] = float2(ui, vi);
    GroupMemoryBarrierWithGroupSync();
    
    for (unsigned int s = 8 / 2; s > 0; s >>= 1)
    {
        if (index < s)
        {
            uv_sum[index] += uv_sum[index + s];
            uv_max[index] = max(uv_max[index + s], uv_max[index]);
        }
        GroupMemoryBarrierWithGroupSync();
    }

    float2 cs;
    sincos(index * 2.f * PI / 7.f, cs.y, cs.x);
    float2 uv = uv_sum[0] / uv_max[0];
    cs *= (uv.x + uv.y) / 2;
    //Initial[index] = cs;
    roots[index] = Complex_(cs.x, cs.y);
    
    
    for (uint loop = 0; loop < 15; loop++)
    {
        GroupMemoryBarrierWithGroupSync();
        Complex zi = roots[index];
    
        Complex zipow[8];
        zipow[1] = zi;
        for (uint i = 2; i < 8; i++)
        {
            zipow[i] = zipow[i - 1] * zi;
        }
    
        //Complex Pzi = zip7 * a[0] + zip6 * a[1] + zip5 * a[2] + zip4 * a[3] + zip3 * a[4] + zip2 * a[5] + zi * a[6] + Complex_(a[7], 0);
        Complex Pzi = zipow[7] * a[0] + zipow[6] * a[1] + zipow[5] * a[2] + zipow[4] * a[3] + zipow[3] * a[4] + zipow[2] * a[5] + zi * a[6] + Complex_(a[7], 0);
        Complex d1 = (zi - roots[(index + 1) % 7]);
        Complex d2 = (zi - roots[(index + 2) % 7]);
        Complex d3 = (zi - roots[(index + 3) % 7]);
        Complex d4 = (zi - roots[(index + 4) % 7]);
        Complex d5 = (zi - roots[(index + 5) % 7]);
        Complex d6 = (zi - roots[(index + 6) % 7]);
        Complex product = d1 * d2 * d3 * d4 * d5 * d6;
        GroupMemoryBarrierWithGroupSync();
    
        Complex div = Pzi / product;
        Complex rooti = zi - div;
        roots[index] = rooti;
    }
    
    Complex root = roots[index];
    float theta = atan2(abs(root.imag), root.real) / PI;
    Output[index + 7 * tid.y] =
        int(theta * 1024);
}

Solution

  • Turns out I was not initializing some groupshared memory (uv_sum and uv_max) (assuming it would be zero) which are used to calculate the initial condition.

    if (index == 0)
    {
        a[7] = a_[7];
        uv_sum[7] = float2(0, 0);
        uv_max[7] = float2(0, 0);
    }