c++rustsimdavx2

6-bit lookup using SIMD AVX2


I am trying to get the 6-bit lookup on SIMD AVX2 correct. I am splitting the 6 bits into lower 4 bits and high 2 bits, the lower 4 are used for the shuffle operation, and subsequently blending the results with the appropriate masks on. The logic seems fine to me and need help understanding what I am doing wrong. The values are kinda close compared to the scalar equivalent, but incorrect.

Edit: The lookup table has 64 entries, so I am doing 16 byte register loads four times.


pub fn senary_weighted_wrapper(data: &[u8]) -> u64 {
    // Initialize lookup table
    let mut lookup = [0u8; 64];
    for i in 0..64 {
        lookup[i] = i.count_ones() as u8;
    }

    unsafe { senary_weighted_simd_avx2(data.as_ptr(), data.len(), &lookup) }
}


unsafe fn senary_weighted_simd_avx2(data: *const u8, n: usize, lookup: &[u8; 64]) -> u64 {
    let mut i = 0;
    let lookup_vec0 = _mm256_loadu_si256(lookup.as_ptr() as *const __m256i);
    let lookup_vec1 = _mm256_loadu_si256(lookup.as_ptr().add(16) as *const __m256i);
    let lookup_vec2 = _mm256_loadu_si256(lookup.as_ptr().add(32) as *const __m256i);
    let lookup_vec3 = _mm256_loadu_si256(lookup.as_ptr().add(48) as *const __m256i);
    let low_mask = _mm256_set1_epi8(0x0f); // 4 bits mask
    let mut acc = _mm256_setzero_si256();

    while i + 32 < n {
        let mut local = _mm256_setzero_si256();
        for _ in 0..255 / 8 {
            if i + 32 >= n {
                break;
            }
            let vec = _mm256_loadu_si256(data.add(i) as *const __m256i);
            let vec_masked = _mm256_and_si256(vec, _mm256_set1_epi8(0x3F)); // Mask to lower 6 bits

            let lo = _mm256_and_si256(vec_masked, low_mask);
            let hi = _mm256_srli_epi16(vec_masked, 4);

            let result0 = _mm256_shuffle_epi8(lookup_vec0, lo);
            let result1 = _mm256_shuffle_epi8(lookup_vec1, lo);
            let result2 = _mm256_shuffle_epi8(lookup_vec2, lo);
            let result3 = _mm256_shuffle_epi8(lookup_vec3, lo);

            let blend01 = _mm256_blendv_epi8(result0, result1, _mm256_slli_epi16(hi, 7));
            let blend23 = _mm256_blendv_epi8(result2, result3, _mm256_slli_epi16(hi, 7));
            let popcnt = _mm256_blendv_epi8(blend01, blend23, _mm256_slli_epi16(hi, 6));

            local = _mm256_add_epi8(local, popcnt);
            i += 32;
        }
        acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256()));
    }

    let mut result = 0u64;
    result += _mm256_extract_epi64(acc, 0) as u64;
    result += _mm256_extract_epi64(acc, 1) as u64;
    result += _mm256_extract_epi64(acc, 2) as u64;
    result += _mm256_extract_epi64(acc, 3) as u64;

    // Handle remaining bytes
    while i < n {
        let byte = *data.add(i) & 0x3F; // Mask to lower 6 bits
        result += lookup[byte as usize] as u64;
        i += 1;
    }

    result
}

The values are kinda close compared to the scalar equivalent, but incorrect.


Solution

  • let lookup_vec0 = _mm256_loadu_si256(lookup.as_ptr() as *const __m256i);

    That's not right, this shouldn't have been a contiguous 32 byte load, but you wanted the same 16 bytes in both the lower and upper half, for each of the 4 lookup registers.