c++ x86-64 simd avx2

How to improve performance of a packed yuv to planar yuv conversion using avx2?


I'm converting frames delivered in v210 format (packed 10-bit YUV) to p210 (planar 10-bit YUV carried as 16-bit values). Input is typically 4k (3840x2160) at up to 60Hz. The scalar implementation is quite simple and probably fast enough, but more headroom would be handy, so I thought I'd attempt to get a speedup using AVX2.

Unfortunately my current implementation is ~30% slower on my test box (Intel Alder Lake, MSVC 19.44 compiler), so I'm looking for tips on how to improve it. It would be particularly useful to know which bits of this are "slow", as well as example alternative instructions/techniques that can perform this type of interleaving.

For reference

v210 is a format that packs 12 10-bit unsigned components into four 32-bit little-endian words, where the top 2 bits of each 32-bit word are padding.

[v210 layout diagram]

p210 is a planar format with a Y plane followed by an interleaved, horizontally downsampled UV plane.
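
Spelled out as a comment block (this assumes the standard v210 word order, so worth double-checking against your source):

    // v210: 4 little-endian dwords = 12 components = 6 pixels,
    // bits 31..30 of each dword are zero padding
    //        bits 29..20   bits 19..10   bits 9..0
    // w0:       Cr0           Y0           Cb0
    // w1:       Y2            Cb1          Y1
    // w2:       Cb2           Y3           Cr1
    // w3:       Y5            Cr2          Y4
    //
    // p210: 16 bits per value, 10 significant bits at the top (hence the << 6)
    // Y plane : Y0 Y1 Y2 Y3 ...      (width values per line)
    // UV plane: Cb0 Cr0 Cb1 Cr1 ...  (width values per line, i.e. width/2 pairs)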

Conceptually the current implementation is:

 1. load 8 dwords (two v210 groups, i.e. 12 pixels)
 2. mask and shift to extract each 10-bit component into its own 32-bit slot, spread across 3 registers
 3. pack down to 16-bit values, filling the remainder with zeroes
 4. shuffle and blend to collect the Y components and the interleaved UV components into separate registers
 5. shift left by 6 to scale each value to the top of its 16-bit word
 6. store 96 bits from each lane to each plane

which can be represented visually like this (light yellow marks irrelevant zero values, bright yellow marks the separation between the lanes):

[implementation diagram]

The actual code:

#include <immintrin.h>

#include <algorithm>
#include <cstdint>
#include <cstring>

void convert(const uint8_t* src, int srcStride, uint8_t* dstY, uint8_t* dstUV, int width, int height)
{
    const int groupsPerLine = width / 12;

    // Pre-compute constants once outside the loops
    const __m256i mask10 = _mm256_set1_epi32(0x3FF);
    const __m256i zeroes = _mm256_setzero_si256();
    const __m256i s2_shuffleMask = _mm256_setr_epi8(
        -1, -1, 0, 1, 2, 3, -1, -1, 4, 5, 6, 7, -1, -1, -1, -1,
        -1, -1, 0, 1, 2, 3, -1, -1, 4, 5, 6, 7, -1, -1, -1, -1
    );
    const __m256i s1_shuffleMask = _mm256_setr_epi8(
        0, 1, -1, -1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, -1, -1,
        0, 1, -1, -1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, -1, -1
    );
    const __m256i s0_shuffleMask = _mm256_setr_epi8(
        0, 1, 2, 3, -1, -1, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1,
        0, 1, 2, 3, -1, -1, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1
    );
    const uint8_t y_blend_mask_1 = 0b00001001;
    const uint8_t y_blend_mask_2 = 0b00011011;
    const uint8_t uv_blend_mask_1 = 0b00110110;
    const uint8_t uv_blend_mask_2 = 0b00101101;

    // Process all lines with a single loop implementation
    for (int lineNo = 0; lineNo < height; ++lineNo)
    {
        const uint32_t* srcLine = reinterpret_cast<const uint32_t*>(src + lineNo * srcStride);
        uint16_t* dstLineY = reinterpret_cast<uint16_t*>(dstY + lineNo * width * 2);
        uint16_t* dstLineUV = reinterpret_cast<uint16_t*>(dstUV + lineNo * width * 2);

        // Process all complete groups
        int g = 0;
        for (; g < groupsPerLine - (lineNo == height - 1 ? 1 : 0); ++g)
        {
            // Load 8 dwords
            __m256i dwords = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(srcLine));

            // transpose to extract a 10-bit component into a 32bit slot spread across 3 registers
            __m256i s0_32 = _mm256_and_si256(dwords, mask10);                         // bits 0–9
            __m256i s1_32 = _mm256_and_si256(_mm256_srli_epi32(dwords, 10), mask10);  // bits 10–19
            __m256i s2_32 = _mm256_and_si256(_mm256_srli_epi32(dwords, 20), mask10);  // bits 20–29

            // pack down to 16bit and fill the remainder with zeroes
            __m256i s0_16 = _mm256_packs_epi32(s0_32, zeroes);
            __m256i s1_16 = _mm256_packs_epi32(s1_32, zeroes);
            __m256i s2_16 = _mm256_packs_epi32(s2_32, zeroes);

            // shuffle to prepare for blending
            __m256i s0_16_shuffled = _mm256_shuffle_epi8(s0_16, s0_shuffleMask);
            __m256i s1_16_shuffled = _mm256_shuffle_epi8(s1_16, s1_shuffleMask);
            __m256i s2_16_shuffled = _mm256_shuffle_epi8(s2_16, s2_shuffleMask);

            // blend to y and uv
            __m256i y_tmp = _mm256_blend_epi16(s0_16_shuffled, s1_16_shuffled, y_blend_mask_1);
            __m256i uv_tmp = _mm256_blend_epi16(s0_16_shuffled, s1_16_shuffled, uv_blend_mask_1);
            __m256i y = _mm256_blend_epi16(s2_16_shuffled, y_tmp, y_blend_mask_2);
            __m256i uv = _mm256_blend_epi16(s2_16_shuffled, uv_tmp, uv_blend_mask_2);

            // scale
            __m256i y_scaled = _mm256_slli_epi16(y, 6);
            // write 96 bits from each lane
            __m128i y_lo = _mm256_extracti128_si256(y_scaled, 0);
            __m128i y_hi = _mm256_extracti128_si256(y_scaled, 1);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dstLineY), y_lo);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dstLineY + 6), y_hi);

            // scale
            __m256i uv_scaled = _mm256_slli_epi16(uv, 6);
            // write 96 bits from each lane
            __m128i uv_lo = _mm256_extracti128_si256(uv_scaled, 0);
            __m128i uv_hi = _mm256_extracti128_si256(uv_scaled, 1);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dstLineUV), uv_lo);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dstLineUV + 6), uv_hi);
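            // NB: each pair of stores covers 28 bytes but only the first 24 are
            // valid; the 4-byte spill is overwritten by the next group, which is
            // why the very last group of the last line needs special handling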

            dstLineY += 12;
            dstLineUV += 12;
            srcLine += 8;
        }

        // Handle the last group of the last line separately, as the 128-bit stores would write past the end of the buffers
        if (lineNo == height - 1 && g < groupsPerLine)
        {
            // Load 8 dwords
            __m256i dwords = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(srcLine));

            // transpose to extract a 10-bit component into a 32bit slot spread across 3 registers
            __m256i s0_32 = _mm256_and_si256(dwords, mask10);                         // bits 0–9
            __m256i s1_32 = _mm256_and_si256(_mm256_srli_epi32(dwords, 10), mask10);  // bits 10–19
            __m256i s2_32 = _mm256_and_si256(_mm256_srli_epi32(dwords, 20), mask10);  // bits 20–29

            // pack down to 16bit and fill the remainder with zeroes
            __m256i s0_16 = _mm256_packs_epi32(s0_32, zeroes);
            __m256i s1_16 = _mm256_packs_epi32(s1_32, zeroes);
            __m256i s2_16 = _mm256_packs_epi32(s2_32, zeroes);

            // shuffle to prepare for blending
            __m256i s0_16_shuffled = _mm256_shuffle_epi8(s0_16, s0_shuffleMask);
            __m256i s1_16_shuffled = _mm256_shuffle_epi8(s1_16, s1_shuffleMask);
            __m256i s2_16_shuffled = _mm256_shuffle_epi8(s2_16, s2_shuffleMask);

            // blend to y and uv
            __m256i y_tmp = _mm256_blend_epi16(s0_16_shuffled, s1_16_shuffled, y_blend_mask_1);
            __m256i uv_tmp = _mm256_blend_epi16(s0_16_shuffled, s1_16_shuffled, uv_blend_mask_1);
            __m256i y = _mm256_blend_epi16(s2_16_shuffled, y_tmp, y_blend_mask_2);
            __m256i uv = _mm256_blend_epi16(s2_16_shuffled, uv_tmp, uv_blend_mask_2);

            // scale
            __m256i y_scaled = _mm256_slli_epi16(y, 6);
            // write 96 bits from each lane
            __m128i y_lo = _mm256_extracti128_si256(y_scaled, 0);
            __m128i y_hi = _mm256_extracti128_si256(y_scaled, 1);

            alignas(32) uint16_t tmpY[16] = { 0 };
            _mm_storeu_si128(reinterpret_cast<__m128i*>(tmpY), y_lo);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(tmpY + 6), y_hi);

            // scale
            __m256i uv_scaled = _mm256_slli_epi16(uv, 6);
            // write 96 bits from each lane
            __m128i uv_lo = _mm256_extracti128_si256(uv_scaled, 0);
            __m128i uv_hi = _mm256_extracti128_si256(uv_scaled, 1);
            alignas(32) uint16_t tmpUV[16] = { 0 };
            _mm_storeu_si128(reinterpret_cast<__m128i*>(tmpUV), uv_lo);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(tmpUV + 6), uv_hi);

            // Calculate remaining pixels to avoid writing past the end of the buffer
            const int remainingPixels = width - g * 12;
            const size_t bytesToCopy = std::min(24, remainingPixels * 2); // 2 bytes per pixel

            std::memcpy(dstLineY, tmpY, bytesToCopy);
            std::memcpy(dstLineUV, tmpUV, bytesToCopy);

            dstLineY += 12;
            dstLineUV += 12;
            srcLine += 8;
        }
    }
}

Solution

  • As far as I counted, there is no single specific thing that is the problem. The code is not even especially p5-heavy despite the shuffles. vextracti128-to-mem doesn't count as a shuffle, and Alder Lake can shuffle on p1 as well. With no specific culprit, the goal becomes more to reduce operations in general. uiCA doesn't have Alder Lake yet but you can see an analysis for Rocket Lake.

    It should be possible to remove the vpackssdw-with-zeroes. The saturation does nothing in this context, and both the data movement and the zero-insertion can be done by the vpshufb from the next step; working the masks out is mechanical, since word w of the packed result came from bytes 4w and 4w+1 of the unpacked dwords (see the sketch below). In any case it's not particularly promising, it would only be around a 15% improvement. So really, we need something else.
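
    Concretely, the fused masks would look something like this (untested; no saturation is lost because the values are already masked to 10 bits):

    __m256i s0_fusedMask = _mm256_setr_epi8(
        0, 1, 4, 5, -1, -1, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1,
        0, 1, 4, 5, -1, -1, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1);
    __m256i s1_fusedMask = _mm256_setr_epi8(
        0, 1, -1, -1, 4, 5, 8, 9, -1, -1, 12, 13, -1, -1, -1, -1,
        0, 1, -1, -1, 4, 5, 8, 9, -1, -1, 12, 13, -1, -1, -1, -1);
    __m256i s2_fusedMask = _mm256_setr_epi8(
        -1, -1, 0, 1, 4, 5, -1, -1, 8, 9, 12, 13, -1, -1, -1, -1,
        -1, -1, 0, 1, 4, 5, -1, -1, 8, 9, 12, 13, -1, -1, -1, -1);
    // applied straight to the 32-bit extractions, skipping the vpackssdw
    __m256i s0_16_shuffled = _mm256_shuffle_epi8(s0_32, s0_fusedMask);
    __m256i s1_16_shuffled = _mm256_shuffle_epi8(s1_32, s1_fusedMask);
    __m256i s2_16_shuffled = _mm256_shuffle_epi8(s2_32, s2_fusedMask);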

    Another approach could be based on VPMULLW, using it to emulate VPSLLVW and shift different words by different amounts. That would make it possible to handle the first and last 10-bit chunks in each dword together; the middle chunk can then be extracted separately.

    __m256i maskfirstlast = _mm256_set1_epi32(0x3FF003FF);
    __m256i shiftconstant = _mm256_set1_epi32(0x00040040);
    __m256i maskmiddle    = _mm256_set1_epi32(0x000FFC00);
    __m256i firstlast = _mm256_mullo_epi16(_mm256_and_si256(dwords, maskfirstlast), shiftconstant);
    __m256i middle = _mm256_srli_epi32(_mm256_and_si256(dwords, maskmiddle), 4);
    

    This also took care of aligning chunks to the top of the word in which they end up, so that's no longer a separate step.
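
    Tracing one dword makes this concrete (c0/c1/c2 are the three 10-bit chunks, bits 31:30 are padding):

    // after AND 0x3FF003FF: low word = c0 in bits 0-9, high word = c2 in bits 4-13
    //   * 0x0040 (low word)  : c0 << 6 -> bits 6-15 of the low word
    //   * 0x0004 (high word) : c2 << 2 -> bits 6-15 of the high word
    // after AND 0x000FFC00, >> 4: c1 lands in bits 6-15 of the low word
    // i.e. every chunk ends up pre-scaled by << 6 within its 16-bit word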

    The odd-index dwords in firstlast together with the even-index dwords in middle contain all the Ys, a bit shuffled but a vpshufb can take care of that.

    __m256i Ys = _mm256_blend_epi32(firstlast, middle, 0x55);
    __m256i Y_shuffle = _mm256_setr_epi8(
        0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1,
        0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1); // double-check this shufmask
    Ys = _mm256_shuffle_epi8(Ys, Y_shuffle);
    

    At this point there's a choice between 2 partially overlapping stores (overlapping by 4 bytes) as you did originally, or a cross-lane shuffle to get rid of the "hole" and then only 1 store. I think we should go for the shuffle-and-1-store, but you can try it both ways and see which wins (the overlapping-stores variant is sketched after the next snippet).

    __m256i skip3perm = _mm256_setr_epi32(0, 1, 2, 4, 5, 6, 7, 7);
    Ys = _mm256_permutevar8x32_epi32(Ys, skip3perm);
    _mm256_storeu_si256(somewhere, Ys);
    
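    For comparison, the overlapping-stores variant would drop the vpermd and store the two lanes of the pre-permute Ys directly, much like the original code does (untested):

    // store each lane of Ys as it is after the vpshufb, before the vpermd;
    // the stores overlap by 4 bytes so each lane's hole gets overwritten
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dstLineY), _mm256_castsi256_si128(Ys));
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dstLineY + 6), _mm256_extracti128_si256(Ys, 1));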

    The chroma components are in the opposite even/odd blend, but otherwise something similar can be done:

    __m256i UVs = _mm256_blend_epi32(firstlast, middle, 0xAA);
    __m256i UV_shuffle = _mm256_setr_epi8(
        0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1,
        0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1); // also double-check this shufmask
    UVs = _mm256_shuffle_epi8(UVs, UV_shuffle);
    UVs = _mm256_permutevar8x32_epi32(UVs, skip3perm);
    _mm256_storeu_si256(somewhere, UVs);
    

    This is all untested but the µop count looks good.
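
    Putting those pieces together, one iteration of the main loop would look something like this (equally untested; the constants and shuffle masks are assumed to be hoisted out of the loop as in the original):

    __m256i dwords = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(srcLine));

    // split into first+last and middle chunks, already aligned to the word tops
    __m256i firstlast = _mm256_mullo_epi16(_mm256_and_si256(dwords, maskfirstlast), shiftconstant);
    __m256i middle = _mm256_srli_epi32(_mm256_and_si256(dwords, maskmiddle), 4);

    // Ys: middle chunks of even dwords, first+last chunks of odd dwords
    __m256i Ys = _mm256_blend_epi32(firstlast, middle, 0x55);
    Ys = _mm256_shuffle_epi8(Ys, Y_shuffle);
    Ys = _mm256_permutevar8x32_epi32(Ys, skip3perm);
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dstLineY), Ys);

    // UVs: the opposite blend
    __m256i UVs = _mm256_blend_epi32(firstlast, middle, 0xAA);
    UVs = _mm256_shuffle_epi8(UVs, UV_shuffle);
    UVs = _mm256_permutevar8x32_epi32(UVs, skip3perm);
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dstLineUV), UVs);

    dstLineY += 12;
    dstLineUV += 12;
    srcLine += 8;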

    If I understood chtz's suggestion, that would look something like this (also untested)

    __m256i dwords = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(srcLine));
    
    __m256i UV_mask = _mm256_set1_epi64x(0x000FFC003FF003FF);
    __m256i UV_shuf = _mm256_setr_epi8(
        0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 13, 14, -1, -1, -1, -1,
        0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 13, 14, -1, -1, -1, -1);
    __m256i UV_shift = _mm256_setr_epi16(
        64, 4, 16, 64, 4, 16, 0, 0,
        64, 4, 16, 64, 4, 16, 0, 0);
    __m256i skip3perm = _mm256_setr_epi32(0, 1, 2, 4, 5, 6, 7, 7);
    __m256i Y_mask = _mm256_set1_epi64x(0x3FF003FF000FFC00);
    __m256i Y_shuf = _mm256_setr_epi8(
        1, 2, 4, 5, 6, 7, 9, 10, 12, 13, 14, 15, -1, -1, -1, -1,
        1, 2, 4, 5, 6, 7, 9, 10, 12, 13, 14, 15, -1, -1, -1, -1);
    __m256i Y_shift = _mm256_setr_epi16(
        16, 64, 4, 16, 64, 4, 0, 0,
        16, 64, 4, 16, 64, 4, 0, 0);
    
    __m256i UVs = _mm256_and_si256(dwords, UV_mask);
    UVs = _mm256_shuffle_epi8(UVs, UV_shuf);
    UVs = _mm256_mullo_epi16(UVs, UV_shift);
    UVs = _mm256_permutevar8x32_epi32(UVs, skip3perm);
    __m256i Ys = _mm256_and_si256(dwords, Y_mask);
    Ys = _mm256_shuffle_epi8(Ys, Y_shuf);
    Ys = _mm256_mullo_epi16(Ys, Y_shift);
    Ys = _mm256_permutevar8x32_epi32(Ys, skip3perm);
    
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dstLineY), Ys);
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dstLineUV), UVs);
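
    As with the original, each iteration consumes 8 source dwords and produces 12 Y and 12 UV values (srcLine += 8, dstLineY += 12, dstLineUV += 12), and since each 32-byte store covers 8 bytes beyond the 24 valid ones, the last group of the last line still needs the buffered memcpy tail from the original code.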