How to reorder interleaved 8-bit values across AVX2 lanes efficiently?

I'm working on grayscale-to-BGRA conversion using AVX2. The grayscale format is slightly unusual: each pixel is stored as a 16-bit word, but the grayscale value occupies only the lower 8 bits, i.e. it's 8-bit data stored in 16-bit words.

The vanilla C++ is of course quite simple:

 // Source rows hold one 16-bit word per pixel; destination rows take 4 bytes per pixel.
 auto * s = reinterpret_cast<uint16_t*>(input_data);
 auto * d = reinterpret_cast<uint8_t *>(output_data);

 for (auto y = 0; y < Height; y++)
 {
     auto * px = d; // walks the current output row one 4-byte pixel at a time

     for (auto x = 0; x < Width; x++, px += 4)
     {
         // Keep only the low byte of the 16-bit word.
         const auto v = static_cast<uint8_t>(s[x]);

         px[0] = v;
         px[1] = v;
         px[2] = v;
         px[3] = 255; // Alpha channel
     }

     // Step both row pointers to the start of the next row.
     s += input_pitch;
     d += output_pitch;
 }

I would like to make an AVX2 version, but I'm struggling with cross-lane permutation. My idea was to fill two __m256i registers with 16 consecutive words each, i.e. take 32 pixels at a time, and then narrow them to bytes (discarding the top 8 bits) as follows:

// Load 32 grayscale pixels from source (2 bytes per pixel) as two vectors of 16 words.
// _mm256_load_si256 is the aligned load, so s + x has to be 32-byte aligned.
const __m256i * src = reinterpret_cast<const __m256i*>(s + x);
__m256i a = _mm256_load_si256(src);     // words x .. x + 15
__m256i b = _mm256_load_si256(src + 1); // words x + 16 .. x + 31

// Narrow 16-bit grayscale to 8-bit grayscale.
// Note: the pack saturates to 0..255 (it does not truncate) and works per 128-bit lane.
__m256i packed = _mm256_packus_epi16(a, b);

However, although _mm256_packus_epi16 does do the narrowing, it operates on each 128-bit lane separately, so the result ends up in the order a0 to a7, b0 to b7, a8 to a15, b8 to b15. I need to reorder after the narrowing so that I end up with a0 to a15, b0 to b15.

I don't think AVX2 offers cross-lane permutations, so I suspect a multi-instruction solution is needed, but I'm not sure what it might be. My test program (a console app), given below, sets up the problem. I'm failing at the reordering step and don't have much of an idea how to do it with AVX2, as I'm not very familiar with the instruction set.

Can anyone assist?

#include <immintrin.h>
#include <iostream>
#include <vector>
#include <iomanip>

// Prints `label` followed by the 32 bytes of a 256-bit register, lowest byte
// first, as width-3 decimal numbers on one line of std::cout.

void print_m256i(char const * label, __m256i reg)
{
    // Spill the register to memory so its bytes can be read one at a time;
    // the unaligned store places no alignment requirement on the buffer.
    uint8_t bytes[32];
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(bytes), reg);

    std::cout << label << ": ";

    for (const uint8_t byte : bytes)
    {
        // Widen to int so the value is printed as a number, not a character.
        std::cout << std::setw(3) << static_cast<int>(byte) << " ";
    }

    std::cout << "\n";
}

int main(void)
{
    // Simulated 16-bit grayscale values (s_1 and s_2).

    alignas(32) uint16_t s1_vals[16] = { 21, 19, 24, 27, 27, 30, 41, 38, 32, 41, 35, 38, 41, 38, 38, 30 };
    alignas(32) uint16_t s2_vals[16] = { 30, 30, 30, 30, 24, 24, 24, 24, 24, 32, 32, 32, 35, 35, 32, 32 };

    // Load the source data into AVX2 registers (aligned loads; both arrays are alignas(32)).

    const __m256i s1 = _mm256_load_si256(reinterpret_cast<const __m256i*>(s1_vals));
    const __m256i s2 = _mm256_load_si256(reinterpret_cast<const __m256i*>(s2_vals));

    // Step 1: Pack 16-bit values into 8-bit values.
    //
    // The pack works on each 128-bit lane independently, so viewed as four
    // 64-bit quarters the result is laid out as
    //   [ s1[0..7] | s2[0..7] | s1[8..15] | s2[8..15] ].

    const __m256i packed = _mm256_packus_epi16(s1, s2);

    print_m256i("Packed", packed);

    // Step 2: Restore linear order by swapping the two middle 64-bit quarters.
    //
    // _mm256_permute4x64_epi64 (vpermq) can move 64-bit elements across lanes.
    // _MM_SHUFFLE(3, 1, 2, 0) selects source quarters 0, 2, 1, 3 for destination
    // quarters 0, 1, 2, 3, giving
    //   [ s1[0..7] | s1[8..15] | s2[0..7] | s2[8..15] ].
    //
    // The earlier attempt used _mm_unpacklo_epi8 / _mm_unpackhi_epi8 on the two
    // lanes; those interleave individual bytes (s1[0], s1[8], s1[1], s1[9], ...)
    // and therefore cannot produce this order.

    const __m256i reordered = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));

    print_m256i("Reordered", reordered);

    return 0;
}

Solution

Following chtz's comment, I now process 8 pixels at a time instead. The repro below loads 8 pixels into a 128-bit register, broadcasts them to both lanes of a 256-bit register, then shuffles and ORs in an alpha mask, giving a single 256-bit register that holds 8 BGRA pixels.

#include <immintrin.h>
#include <iostream>
#include <vector>
#include <iomanip>

// Debug helper: writes `label`, then each of the register's 32 bytes (in
// memory order) as a width-3 decimal number, then a newline, to std::cout.

void print_m256i(char const * label, __m256i reg)
{
    constexpr int byte_count = 32;

    // Copy the register out to a byte buffer; storeu accepts any alignment.
    uint8_t vals[byte_count];
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(vals), reg);

    std::cout << label << ": ";

    for (const uint8_t * p = vals; p != vals + byte_count; ++p)
    {
        // Convert to int so the byte is shown numerically rather than as a char.
        const int value = *p;
        std::cout << std::setw(3) << value << " ";
    }

    std::cout << "\n";
}

int main(void)
{
    // Eight simulated grayscale pixels, one per 16-bit word (16 bytes in total).
    alignas(16) uint16_t grayscale_input[8] = { 21, 19, 24, 27, 27, 30, 41, 38 };

    // Steps 1 + 2: load the 8 pixels (aligned 128-bit load) and replicate them
    // into both 128-bit lanes, so each lane can pick its own four pixels below.
    const __m256i both_lanes = _mm256_broadcastsi128_si256(
        _mm_load_si128(reinterpret_cast<const __m128i*>(grayscale_input)));
    print_m256i("Grayscale (16-bit)", both_lanes);

    // Step 3: byte shuffle that expands each pixel to 4 bytes.
    // The shuffle indexes within its own lane: the low lane takes the low bytes
    // of words 0..3 (byte offsets 0, 2, 4, 6) and the high lane those of words
    // 4..7 (byte offsets 8, 10, 12, 14). An index of -1 yields a zero byte,
    // leaving the fourth byte of every pixel clear for the alpha value.
    const __m256i expand_control = _mm256_setr_epi8(
         0,  0,  0, -1,
         2,  2,  2, -1,
         4,  4,  4, -1,
         6,  6,  6, -1,
         8,  8,  8, -1,
        10, 10, 10, -1,
        12, 12, 12, -1,
        14, 14, 14, -1
    );
    const __m256i colour_only = _mm256_shuffle_epi8(both_lanes, expand_control);
    print_m256i("Shuffled RGBA (no alpha)", colour_only);

    // Step 4: OR 0xFF into the top byte of every 32-bit pixel (opaque alpha).
    const __m256i opaque_alpha = _mm256_set1_epi32(0xFF000000);
    const __m256i with_alpha = _mm256_or_si256(colour_only, opaque_alpha);
    print_m256i("RGBA with Alpha", with_alpha);


    return 0;
}