I'm trying to compute the roots of a polynomial with the Durand Kerner algorithm.
I thought if the inputs are the same then the computation would be deterministic. But each thread group converges at a different rate, and sometimes the algorithm diverges (nan
I imagine).
I thought at first I was a synchronisation bug (but there is only ever one warp in a thread group (7 < 32)), but have decided it's actually because some operations are non deterministic. Does anyone know which operations are non-deterministic?
// https://ieeexplore.ieee.org/document/6969057/
groupshared Complex roots[7];
groupshared float a[8];
groupshared float2 uv_sum[8];
groupshared float2 uv_max[8];
static const float PI = 3.14159265f;
[numthreads(7, 1, 1)]
void main( uint3 tid : SV_DispatchThreadID )
{
uint index = tid.x;
// z^7 + 3z^6 - 2z^5 + 10z^4 - 2z^3 + 8z^2 - z - 13
float a_[8] = { 1.f, 3.f, -2.f, 10.f, -2.f, 8.f, -1.f, -13.f };
a[index] = a_[index];
if (index == 0)
{
a[7] = a_[7];
}
GroupMemoryBarrierWithGroupSync();
float ui = 2 * pow(abs(a[index + 1]), 1.f / (index+1));
float vi = 0.5f * pow(abs(a[7] / a[index]), 1.f / (7-index));
uv_sum[index] = float2(ui, vi);
uv_max[index] = float2(ui, vi);
GroupMemoryBarrierWithGroupSync();
for (unsigned int s = 8 / 2; s > 0; s >>= 1)
{
if (index < s)
{
uv_sum[index] += uv_sum[index + s];
uv_max[index] = max(uv_max[index + s], uv_max[index]);
}
GroupMemoryBarrierWithGroupSync();
}
float2 cs;
sincos(index * 2.f * PI / 7.f, cs.y, cs.x);
float2 uv = uv_sum[0] / uv_max[0];
cs *= (uv.x + uv.y) / 2;
//Initial[index] = cs;
roots[index] = Complex_(cs.x, cs.y);
for (uint loop = 0; loop < 15; loop++)
{
GroupMemoryBarrierWithGroupSync();
Complex zi = roots[index];
Complex zipow[8];
zipow[1] = zi;
for (uint i = 2; i < 8; i++)
{
zipow[i] = zipow[i - 1] * zi;
}
//Complex Pzi = zip7 * a[0] + zip6 * a[1] + zip5 * a[2] + zip4 * a[3] + zip3 * a[4] + zip2 * a[5] + zi * a[6] + Complex_(a[7], 0);
Complex Pzi = zipow[7] * a[0] + zipow[6] * a[1] + zipow[5] * a[2] + zipow[4] * a[3] + zipow[3] * a[4] + zipow[2] * a[5] + zi * a[6] + Complex_(a[7], 0);
Complex d1 = (zi - roots[(index + 1) % 7]);
Complex d2 = (zi - roots[(index + 2) % 7]);
Complex d3 = (zi - roots[(index + 3) % 7]);
Complex d4 = (zi - roots[(index + 4) % 7]);
Complex d5 = (zi - roots[(index + 5) % 7]);
Complex d6 = (zi - roots[(index + 6) % 7]);
Complex product = d1 * d2 * d3 * d4 * d5 * d6;
GroupMemoryBarrierWithGroupSync();
Complex div = Pzi / product;
Complex rooti = zi - div;
roots[index] = rooti;
}
Complex root = roots[index];
float theta = atan2(abs(root.imag), root.real) / PI;
Output[index + 7 * tid.y] =
int(theta * 1024);
}
Turns out I was not initializing some groupshared memory (uv_sum
and uv_max
) (assuming it would be zero) which are used to calculate the initial condition.
if (index == 0)
{
a[7] = a_[7];
uv_sum[7] = float2(0, 0);
uv_max[7] = float2(0, 0);
}