SOLVED: Faster HLSL code? Wondering about lower CPU overhead when rendering quads in 3-space

!!!UPDATE!!! Using the vertex shader to generate quads via DrawInstanced() calls definitely reduced CPU overhead and increased quads drawn per second. But there was much more performance to be found by using a combination of instanced drawing via a vertex shader that generates a point list, and a geometry shader that generates quads based on those points.

Thanks to @Soonts for not only recommending a faster way, but also for reminding me of conditional moves and unrolling loops.

Here is the geometry shader I created for sprites with 2D rotation:

// Constants consumed by the sprite geometry shader.
cbuffer CB_PROJ {
    matrix camera;  // multiplied with every generated corner to produce SV_Position
};

/*  Reduced packet size -- 256x256 max atlas segments
     -------------------
FLOAT3  Sprite location                     // 12 bytes
FLOAT   Rotation                            // 16 bytes
FLOAT2  Scale                               // 24 bytes
UINT                                        // 28 bytes
    Fixed8p00  Texture X segment
    Fixed8p00  Texture X total segments
    Fixed8p00  Texture Y segment
    Fixed8p00  Texture Y total segments
.Following vertex data is only processed by the vertex shader.
UINT                                        // 32 bytes
    Fixed3p00  Squadron generation method
    Fixed7p00  Sprite stride
    Fixed8p14  X/Y distance between sprites
*/

// One point per sprite, as handed from the vertex shader to the geometry shader.
struct VOut {
    float3 position : POSITION;     // sprite centre; the quad corners are offsets from this point
    float3 r_s : NORMAL;            // .x = 2D rotation angle (fed to sin/cos), .y/.z = X/Y half-extent of the quad
    uint   bits : BLENDINDICES;     // four packed bytes: X segment, X segment count, Y segment, Y segment count (low to high)
};

// Per-corner output of the geometry shader.
struct GOut {
    float4 pos : SV_Position;       // corner position after the camera matrix
    float3 position : POSITION;     // corner position before the camera matrix
    float3 n : NORMAL;              // the shader writes a constant (0, 0, -1) here
    float2 tex : TEXCOORD;          // atlas texture coordinate of the corner
    uint   pID : SV_PrimitiveID;    // primitive ID of the source point, forwarded unchanged
};

// Expands one sprite point into a rotated, textured quad (4-vertex triangle strip).
//
//  gin[0].position  centre of the sprite
//  gin[0].r_s       .x = rotation angle, .y/.z = X/Y half-extent
//  gin[0].bits      packed atlas cell: byte0 = X segment, byte1 = X segment count,
//                   byte2 = Y segment, byte3 = Y segment count
//
// Corners are emitted in the order (-X,-Y), (-X,+Y), (+X,-Y), (+X,+Y).
// A segment count of zero divides by zero; callers must pack counts >= 1.
[maxvertexcount(4)]
void main(point VOut gin[1], uint pID : SV_PrimitiveID, inout TriangleStream<GOut> triStream) {
    GOut output;

    // Unpack the atlas cell description.
    const uint   bits   = gin[0].bits;
    const uint   ySegs  = (bits >> 24u) & 0xFFu;
    const uint   yCell  = (bits >> 16u) & 0xFFu;
    const uint   xSegs  = (bits >>  8u) & 0xFFu;
    const uint   xCell  =  bits         & 0xFFu;

    // Texture origin of the cell and the size of one cell in UV space.
    // V is flipped: cell 0 starts at v = 1 and cells advance towards v = 0.
    const float  xOS    = float(xCell) / float(xSegs);
    const float  xOSd   = rcp(float(xSegs));
    const float  yOS    = 1.0f - float(yCell) / float(ySegs);
    const float  yOSd   = rcp(float(ySegs));

    // Rotation terms are identical for all four corners, so evaluate them once.
    const float  sinR   = sin(gin[0].r_s.x);
    const float  cosR   = cos(gin[0].r_s.x);

    output.pID = pID;
    output.n   = float3( 0.0f, 0.0f, -1.0f );

    // One loop instead of four hand-copied blocks: the copies had drifted and the
    // two +X corners used v.y where v.x belongs in the rotated Y coordinate.
    [unroll]
    for (uint corner = 0u; corner < 4u; corner++) {
        const bool plusX = (corner & 2u) != 0u;
        const bool plusY = (corner & 1u) != 0u;

        // Scale: signed half-extent of this corner in sprite space.
        const float2 v = float2(plusX ? gin[0].r_s.y : -gin[0].r_s.y,
                                plusY ? gin[0].r_s.z : -gin[0].r_s.z);

        output.position    = gin[0].position;               // Translate
        output.position.x += v.x * cosR - v.y * sinR;       // Rotate
        output.position.y += v.x * sinR + v.y * cosR;

        output.tex = float2(plusX ? xOS + xOSd : xOS,
                            plusY ? yOS - yOSd : yOS);

        output.pos = mul(float4(output.position, 1.0f), camera);    // Transform
        triStream.Append(output);
    }
}

!!!ORIGINAL TEXT!!!

Last time I was coding, I had barely started learning Direct3D9c. Currently I'm hitting about 30K single-texture quads lit with 15 lights at about 450fps. I haven't learned instancing or geometry shading at all yet, and I'm trying to prioritise the order I learn things in for my needs, so I've only taken glances at them.

My first thought was to reduce the amount of vertex data being shunted to the GPU, so I changed the vertex structure to a FLOAT2 (for texture coords) and an UINT (for indexing), relying on 4x float3 constants in the vertex shader to define the corners of the quads.

I figured I could reduce the size of the vertex data further, and reduced each vertex unit to a single UINT containing a 2-bit index (to reference the real vertexes of the quad), and 2x 15-bit fixed-point numbers (yes, I'm showing my age but fixed-point still has its value) representing offsets into atlas textures.

So far, so good, but I know bugger all about Direct3D11 and HLSL so I've been wondering if there's a faster way.

Here's the current state of my vertex shader:

// Transform constants for the single-UINT vertex shader.
cbuffer CB_PROJ
{
    matrix model;           // applied to the quad corner and to the quad normal
    // NOTE(review): the shader multiplies the already model-transformed position by this
    // matrix, so it presumably holds view * projection only -- confirm against the CPU side.
    matrix modelViewProj;
};

// Vertex shader output.
struct VOut
{
    float3 position : POSITION;     // corner after the model matrix
    float3 n : NORMAL;              // (0, 1, 0) taken through the upper 3x3 of the model matrix
    float2 texcoord : TEXCOORD;     // atlas coordinate decoded from the packed vertex
    float4 pos : SV_Position;       // position after modelViewProj
};

// Corners of a unit quad lying in the XZ plane (y = 0), indexed by the low two bits
// of the packed vertex.
static const float3 position[4] = {
    float3(-0.5f, 0.0f, -0.5f),
    float3(-0.5f, 0.0f,  0.5f),
    float3( 0.5f, 0.0f, -0.5f),
    float3( 0.5f, 0.0f,  0.5f)
};

// Packed vertex layout (bit 31 on the left): YYYYYYYYYYYYYYYXXXXXXXXXXXXXXXVV
//
// bits 00-01 . uint2b    == corner index (0-3)
// bits 02-16 . fixed1p14 == X offset into atlas texture(s)
// bits 17-31 . fixed1p14 == Y offset into atlas texture(s)
//
// Decodes the packed vertex, places the selected corner with the model matrix and
// converts the two fixed-point fields to texture coordinates.
VOut main(uint bitField : BLENDINDICES) {
    // 2^-14: one step of a 1.14 fixed-point value.
    const float fixedStep = 0.00006103515625f;

    const uint  corner = bitField & 0x03u;
    const float u      = float((bitField >> 2) & 0x7FFFu) * fixedStep;
    const float v      = float(bitField >> 17) * fixedStep;

    const float4 placed = mul(float4(position[corner], 1.0f), model);

    VOut output;
    output.position = placed.xyz;
    output.n        = mul(float3(0.0f, 1.0f, 0.0f), (float3x3) model);
    output.texcoord = float2(u, v);
    // w is reset to 1 before the second multiply.
    output.pos      = mul(float4(output.position, 1.0f), modelViewProj);
    return output;
}

My pixel shader for completeness:

// Atlas texture sampled by the pixel shader.
Texture2D Texture : register(t0);

// Sampler used for the atlas lookup.
SamplerState Sampler : register(s0);

// One light, two float4 registers (32 bytes).
struct LIGHT {
    float4 lightPos; // .w == range
    float4 lightCol; // .a == flags
};

// Light table walked by the pixel shader (up to 16 entries).
// NOTE(review): the register binding sits on the array member rather than on the
// cbuffer itself -- confirm the compiler honours it as intended.
cbuffer cbLight {
    LIGHT l[16] : register(b0); // 16 lights * 32 bytes = 512 bytes
}

// Constant ambient term multiplied with the texel colour.
static const float3 ambient = { 0.15f, 0.15f, 0.15f };

// Alpha-tested sprite shading: ambient plus up to 16 range-limited diffuse lights.
//
//  position  interpolated surface position, in the same space as the light positions
//  n         interpolated surface normal
//  TexCoord  atlas texture coordinate
float4 main(float3 position : POSITION, float3 n : NORMAL, float2 TexCoord : TEXCOORD) : SV_Target
{
    const float4 Texel = Texture.Sample(Sampler, TexCoord);

    // Alpha test. The author's source images store alpha inverted.
    if (Texel.a < 0.707106f) discard;

    float3 lit = float3(0.0f, 0.0f, 0.0f);

    for (uint i = 0; i < 16; i++)
    {
        // Sentinel entry ends the list early.
        // NOTE(review): this compares a float against an integer literal; if the CPU side
        // writes the raw bit pattern 0xFFFFFFFF (a NaN) the test can never match -- confirm
        // whether asuint() was intended.
        if (l[i].lightCol.a == 0xFFFFFFFF) break;

        const float3 toLight = l[i].lightPos.xyz - position;
        const float  dist    = length(toLight);
        const float  reach   = l[i].lightPos.w;

        // Lights further away than twice their range contribute nothing.
        if (dist < reach * 2.0f)
        {
            const float  falloff = min(1.0f, (dist / reach + dist / (reach * reach)) * 0.5f);
            const float3 diffuse = Texel.rgb * saturate(dot(toLight / dist, n)) * l[i].lightCol.rgb;
            lit += diffuse * (1.0f - falloff);
        }
    }

    return float4(ambient * Texel.rgb + lit, Texel.a);
}

And the rather busy looking C function to generate the vertex data (all non-relevant functions removed):

al16 struct CLASS_PRIMITIVES {
    // Vertex / index buffers owned by this object. Each is a single pointer, so it takes a
    // single initializer (a two-element brace list is ill-formed for a scalar).
    ID3D11Buffer* pVB = NULL, * pIB = NULL;
    // Byte stride of one packed vertex, passed to IASetVertexBuffers.
    const UINT strideV1 = sizeof(VERTEX1);

    // Builds the immutable vertex buffer holding one 4-vertex quad per atlas cell.
    //
    // xSegs / ySegs: number of atlas cells along X and Y.
    // Quads are stored row by row, starting with yy == ySegs and walking down to 1;
    // within a row xx runs from 0 to xSegs - 1. Each vertex is a single packed ui32:
    //   bits 0-1   corner index (0-3)
    //   bits 2-16  X atlas offset, 1.14 fixed point
    //   bits 17-31 Y atlas offset, 1.14 fixed point
    // Does nothing when the buffer already exists. Reports failure through ThrowError().
    void CreateQuadSet1(ui32 xSegs, ui32 ySegs) {
        if (pVB) return;

        const ui32 totalVerts = xSegs * ySegs * 4;

        VERTEX1* vBuf = (VERTEX1*)_aligned_malloc(strideV1 * totalVerts, 16);
        if (!vBuf) {            // allocation failed: never write through a null pointer
            ThrowError();
            return;
        }

        ui32 index = 0;
        for (ui32 yy = ySegs; yy; yy--) {
            // The Y fields depend only on the row, so pack them once per row.
            const double yScale = 16384.0 / double(ySegs);
            const ui32   yHi    = ui32(yScale * double(yy)) << 17;
            const ui32   yLo    = ui32(yScale * double(yy - 1)) << 17;

            for (ui32 xx = 0; xx < xSegs; xx++) {
                const double xScale = 16384.0 / double(xSegs);
                const ui32   xLo    = ui32(xScale * double(xx)) << 2;
                const ui32   xHi    = ui32(xScale * double(xx + 1)) << 2;

                // The three fields occupy disjoint bits, so OR-ing them is exact.
                vBuf[index++].b = 0u | xLo | yHi;
                vBuf[index++].b = 1u | xLo | yLo;
                vBuf[index++].b = 2u | xHi | yHi;
                vBuf[index++].b = 3u | xHi | yLo;
            }
        }

        al16 D3D11_BUFFER_DESC bd = {};
        bd.Usage = D3D11_USAGE_IMMUTABLE;
        bd.BindFlags = D3D11_BIND_VERTEX_BUFFER;
        bd.CPUAccessFlags = 0;
        bd.ByteWidth = strideV1 * totalVerts;
        bd.StructureByteStride = strideV1;

        D3D11_SUBRESOURCE_DATA srd = {};
        srd.pSysMem = vBuf;

        hr = dev->CreateBuffer(&bd, &srd, &pVB);

        // The initial data has been consumed by CreateBuffer; release the staging copy
        // before reporting errors so it cannot leak if ThrowError() does not return.
        _aligned_free(vBuf);

        if (hr != S_OK) ThrowError();
    };

    // Draws one quad from the set built by CreateQuadSet1.
    // offset: zero-based quad index; converted to a byte offset of four vertices per quad.
    void DrawQuadFromSet1(ui32 offset) {
        const ui32 byteOffset = offset * (sizeof(VERTEX1) * 4);

        devcon->IASetVertexBuffers(0, 1, &pVB, &strideV1, &byteOffset);
        devcon->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
        devcon->Draw(4, 0);     // 4 strip vertices starting at the bound offset
    };

    // Releases the quad vertex buffer, if any. Safe to call repeatedly.
    void DestroyQuadSet() {
        if (pVB) {
            pVB->Release();
            // Clear the pointer: CreateQuadSet1 treats a non-null pVB as "already built",
            // and a second DestroyQuadSet would otherwise release a dead object.
            pVB = nullptr;
        }
    };

It's all functioning as it should, but it just seems like I'm resorting to hacks to achieve my goal. Surely there's a faster way? Using DrawIndexed() consistently dropped the frame-rate by 1% so I switched back to non-indexed Draw calls.

Solution

I think your code is CPU bound. While your approach has very small vertices, you have non-trivial API overhead.

A better approach is rendering all quads with a single draw call. I would probably use instancing for that.

Assuming you want arbitrary per-quad size, position, and orientation in 3D space, here’s one possible approach. Untested.

Vertex buffer elements:

// Per-instance record for one quad; one of these is consumed per drawn instance.
struct sInstanceData
{
    // Quad centre in 3D space.
    XMFLOAT3 center;

    // Column and row of the sprite inside the atlas.
    uint16_t spriteX;
    uint16_t spriteY;

    // Local X and Y axes of the quad in 3D space.
    // Their lengths are half the width and half the height of the quad.
    XMFLOAT3 plusX;
    XMFLOAT3 plusY;
};

Input layout:

// Input layout matching sInstanceData. Every element is per-instance data read from slot 0.
// The last field is InstanceDataStepRate: it must be 1 so the buffer advances by one element
// for each drawn instance. With 0 the per-instance data is never stepped and every quad
// would read the first record.
D3D11_INPUT_ELEMENT_DESC desc[ 4 ];
desc[ 0 ] = D3D11_INPUT_ELEMENT_DESC{ "QuadCenter", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 1 };
desc[ 1 ] = D3D11_INPUT_ELEMENT_DESC{ "SpriteIndex", 0, DXGI_FORMAT_R16G16_UINT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 1 };
desc[ 2 ] = D3D11_INPUT_ELEMENT_DESC{ "QuadPlusX", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 1 };
desc[ 3 ] = D3D11_INPUT_ELEMENT_DESC{ "QuadPlusY", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 1 };

Vertex shader:

// Constants for the instanced quad vertex shader.
cbuffer Constants
{
    matrix viewProj;        // applied to the finished corner position to produce SV_Position
    // Pass [ 1.0 / xSegs, 1.0 / ySegs ] in that field
    float2 texcoordMul;     // scales integer atlas cell coordinates into texture coordinates
};

// Output of the instanced quad vertex shader.
struct VOut
{
    float3 position : POSITION;     // corner position before the view-projection matrix
    float3 n : NORMAL;              // unit normal of the quad: normalize( cross( plusX, plusY ) )
    float2 texcoord : TEXCOORD;     // atlas cell corner multiplied by texcoordMul
    float4 pos : SV_Position;       // corner position after viewProj
};

// Generates one corner of an instanced quad.
//
//  index      SV_VertexID, 0-3: bit 0 picks the +X side, bit 1 picks the +Y side
//  center     quad centre
//  texcoords  atlas cell of the sprite (column, row)
//  plusX/Y    half-extent vectors spanning the quad
VOut main( uint index: SV_VertexID,
    float3 center : QuadCenter, uint2 texcoords : SpriteIndex,
    float3 plusX : QuadPlusX, float3 plusY : QuadPlusY )
{
    const bool onPlusX = ( index & 1 ) != 0;
    const bool onPlusY = ( index & 2 ) != 0;

    // Selects rather than branches: step from the centre along each axis, in the same
    // order as before (X first, then Y).
    float3 corner = center;
    corner += onPlusX ? plusX : -plusX;
    corner += onPlusY ? plusY : -plusY;

    // The +X / +Y corners address the far edge of the atlas cell.
    const int2 cell = ( int2 )texcoords + int2( onPlusX ? 1 : 0, onPlusY ? 1 : 0 );

    VOut result;
    result.position = corner;
    result.n = normalize( cross( plusX, plusY ) );
    result.texcoord = ( ( float2 )cell ) * texcoordMul;
    result.pos = mul( float4( corner, 1.0f ), viewProj );
    return result;
}

Rendering:

// Bind the instance buffer and draw every quad with a single call:
// 4 strip vertices per instance, countQuads instances.
const UINT stride = sizeof( sInstanceData );
const UINT off = 0;
context->IASetPrimitiveTopology( D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP );
context->IASetVertexBuffers( 0, 1, &vb, &stride, &off );
context->DrawInstanced( 4, countQuads, 0, 0 );