directx-11hlsldirectcompute

HLSL Compiler omitting crucial statements


I have this compute shader that traverses a binary tree. It used to work fine with the separately installed DirectX SDK (June), with compiler #43.

The compilers #46 and #47 (from Windows SDKs 8.0 and 8.1, respectively) however seem to omit two really crucial lines of code, that have the shader running in circles, checking the same tree nodes over and over again until Windows restarts the graphics driver (Verified by looking at disassembly).

Here's a minimal code sample exhibiting this behavior:

#define LEFT_PROCESSED  1
#define RIGHT_PROCESSED 2

struct Node
{
  float4 min;
  float4 max;
  int left;
  int right;
  int parent;
  int flags;
};

RWStructuredBuffer<Node> tree: register(u0);

bool TreeSearch()
{
  Node node = tree[0];

  int nodeId = 0;

  int statusStack[40];
  int stackSize = 0;
  statusStack[0] = 0;

  while (true)
  {
    if (!(statusStack[stackSize] & LEFT_PROCESSED))
    {
      statusStack[stackSize] |= LEFT_PROCESSED;
      ++stackSize;
      statusStack[stackSize] = 0;
      nodeId = node.left;
      node = tree[nodeId];
      continue;
    }

    if (!(statusStack[stackSize] & RIGHT_PROCESSED))
    {
      statusStack[stackSize] |= RIGHT_PROCESSED; // this line
      ++stackSize;
      statusStack[stackSize] = 0;                // and this line
      nodeId = node.right;
      node = tree[nodeId];
      continue;
    }

    if (node.parent != -1)
    {
      --stackSize;
      nodeId = node.parent;
      node = tree[nodeId];
    }
    else
      return false;
  }
  return false;
}

[numthreads(32, 1, 1)]
void CSSearch(uint2 dispatchThreadId: SV_DispatchThreadID)
{
  TreeSearch();
}

And the corresponding assembly:

cs_5_0
dcl_globalFlags refactoringAllowed
dcl_uav_structured u0, 48
dcl_temps 3
dcl_indexableTemp x0[40], 4
dcl_thread_group 32, 1, 1
ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r0.xyz, l(0), l(32), u0.xyzx
mov x0[0].x, l(0)
mov r1.xyz, r0.yzxy
mov r0.w, l(0)
loop 
  mov r1.w, x0[r0.w + 0].x
  and r2.x, r1.w, l(1)
  if_z r2.x
    or r1.w, r1.w, l(1)       // here's the first one in the LEFT branch
    mov x0[r0.w + 0].x, r1.w  //
    iadd r1.w, r0.w, l(1)
    mov x0[r1.w + 0].x, l(0)  // and the second one
    ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r2.xyz, r1.z, l(32), u0.yzxx
    mov r1.xyz, r2.xyzx
    mov r0.w, r1.w
    continue 
  endif 
  mov r1.w, x0[r0.w + 0].x   // why is there nothing in the RIGHT branch?
  and r1.w, r1.w, l(2)
  if_z r1.w
    iadd r1.w, r0.w, l(1)
    ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r2.xyz, r1.x, l(32), u0.yzxx
    mov r1.xyz, r2.xyzx
    mov r0.w, r1.w
    continue 
  endif 
  ine r1.w, r1.y, l(-1)
  if_nz r1.w
    iadd r0.w, r0.w, l(-1)
    ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r1.xyz, r1.y, l(32), u0.yzxx
  else 
    break 
  endif 
endloop 
ret 

When I omit the first continue, it generates the code for those two lines, but then its also broken.

Any idea how to get the newer compiler to generate that code?


Solution

  • Please note: I'm not familiar with GPU programming, and am not sure whether this is the compiler's fault or the code's fault. The following is just a workaround.

    You can mimic the continue behavior by using an explicit variable, in hope that the compiler does not get in the way:

    bool TreeSearch()
    {
      Node node = tree[0];
    
      int nodeId = 0;
    
      int statusStack[40];
      int stackSize = 0;
      statusStack[0] = 0; 
    
      while (stackSize < 10) // Changed to make it compile.
      {
        int shouldContinue = 1;
        if (!(statusStack[stackSize] & LEFT_PROCESSED))
        {
          statusStack[stackSize] |= LEFT_PROCESSED;
          ++stackSize;
          statusStack[stackSize] = 0;
          nodeId = node.left;
          node = tree[nodeId];
          shouldContinue = 0;
        }
    
        if (shouldContinue && 
            !(statusStack[stackSize] & RIGHT_PROCESSED))
        {
          statusStack[stackSize] |= RIGHT_PROCESSED; // this line
          ++stackSize;
          statusStack[stackSize] = 0;                // and this line
          nodeId = node.right;
          node = tree[nodeId];
          shouldContinue = 0;
        }
    
        if (shouldContinue)
        { 
            if (node.parent != -1)
            {
              --stackSize;
              nodeId = node.parent;
              node = tree[nodeId];
            }
            else
              return false;
        }
    
      }
      return false;
    }
    

    The disassembly output doesn't seem to lack any operation that is missing from the original snippet. This may have a overhead, though.

    Link: http://shader-playground.timjones.io/6abdc64cdf98e1840a3b38c629b4e217