I am trying to get a 6-bit lookup correct with AVX2 SIMD. I split each 6-bit value into its low 4 bits and high 2 bits: the low 4 bits are the shuffle indices, and I then blend the four shuffle results together using masks built from the high 2 bits. The logic looks fine to me, so I need help understanding what I am doing wrong. The values are close to the scalar equivalent, but incorrect.
Edit: The lookup table has 64 entries, so I load it as four 16-byte registers.
use std::arch::x86_64::*;

pub fn senary_weighted_wrapper(data: &[u8]) -> u64 {
    // Initialize the 64-entry lookup table: popcount of each 6-bit value
    let mut lookup = [0u8; 64];
    for i in 0..64 {
        lookup[i] = i.count_ones() as u8;
    }
    unsafe { senary_weighted_simd_avx2(data.as_ptr(), data.len(), &lookup) }
}

#[target_feature(enable = "avx2")]
unsafe fn senary_weighted_simd_avx2(data: *const u8, n: usize, lookup: &[u8; 64]) -> u64 {
    let mut i = 0;
    // Load the 64-entry table as four 16-byte pieces
    let lookup_vec0 = _mm256_loadu_si256(lookup.as_ptr() as *const __m256i);
    let lookup_vec1 = _mm256_loadu_si256(lookup.as_ptr().add(16) as *const __m256i);
    let lookup_vec2 = _mm256_loadu_si256(lookup.as_ptr().add(32) as *const __m256i);
    let lookup_vec3 = _mm256_loadu_si256(lookup.as_ptr().add(48) as *const __m256i);
    let low_mask = _mm256_set1_epi8(0x0f); // 4-bit mask
    let mut acc = _mm256_setzero_si256();
    while i + 32 < n {
        let mut local = _mm256_setzero_si256();
        // 255/8 = 31 inner iterations keep the per-byte counters from
        // overflowing (each step adds at most 6 per byte)
        for _ in 0..255 / 8 {
            if i + 32 >= n {
                break;
            }
            let vec = _mm256_loadu_si256(data.add(i) as *const __m256i);
            let vec_masked = _mm256_and_si256(vec, _mm256_set1_epi8(0x3F)); // Mask to lower 6 bits
            let lo = _mm256_and_si256(vec_masked, low_mask);
            let hi = _mm256_srli_epi16(vec_masked, 4);
            let result0 = _mm256_shuffle_epi8(lookup_vec0, lo);
            let result1 = _mm256_shuffle_epi8(lookup_vec1, lo);
            let result2 = _mm256_shuffle_epi8(lookup_vec2, lo);
            let result3 = _mm256_shuffle_epi8(lookup_vec3, lo);
            // Select among the four tables with the two high bits:
            // bit 0 of `hi` (shifted up to bit 7) picks within each pair,
            // bit 1 of `hi` (shifted up to bit 7) picks between the pairs
            let blend01 = _mm256_blendv_epi8(result0, result1, _mm256_slli_epi16(hi, 7));
            let blend23 = _mm256_blendv_epi8(result2, result3, _mm256_slli_epi16(hi, 7));
            let popcnt = _mm256_blendv_epi8(blend01, blend23, _mm256_slli_epi16(hi, 6));
            local = _mm256_add_epi8(local, popcnt);
            i += 32;
        }
        // Horizontal-sum the byte counters into four u64 lanes
        acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256()));
    }
    let mut result = 0u64;
    result += _mm256_extract_epi64::<0>(acc) as u64;
    result += _mm256_extract_epi64::<1>(acc) as u64;
    result += _mm256_extract_epi64::<2>(acc) as u64;
    result += _mm256_extract_epi64::<3>(acc) as u64;
    // Handle remaining bytes
    while i < n {
        let byte = *data.add(i) & 0x3F; // Mask to lower 6 bits
        result += lookup[byte as usize] as u64;
        i += 1;
    }
    result
}
let lookup_vec0 = _mm256_loadu_si256(lookup.as_ptr() as *const __m256i);
That's not right: this shouldn't be a contiguous 32-byte load. You want the same 16 bytes in both the lower and upper half of each of the four lookup registers, because _mm256_shuffle_epi8 does two independent 16-byte lookups, one per 128-bit lane. With contiguous loads, bytes in the upper lane of your input index into table entries 16-31 (or 48-63) instead of the 16 entries you intended.
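A minimal sketch of the corrected loads, assuming an AVX2 target; it uses _mm_loadu_si128 plus _mm256_broadcastsi128_si256 (vbroadcasti128), though _mm256_set_m128i(x, x) would work equally well:

// Broadcast each 16-byte slice of the table into both 128-bit lanes,
// since _mm256_shuffle_epi8 looks up within each lane independently.
let lookup_vec0 = _mm256_broadcastsi128_si256(_mm_loadu_si128(lookup.as_ptr() as *const __m128i));
let lookup_vec1 = _mm256_broadcastsi128_si256(_mm_loadu_si128(lookup.as_ptr().add(16) as *const __m128i));
let lookup_vec2 = _mm256_broadcastsi128_si256(_mm_loadu_si128(lookup.as_ptr().add(32) as *const __m128i));
let lookup_vec3 = _mm256_broadcastsi128_si256(_mm_loadu_si128(lookup.as_ptr().add(48) as *const __m128i));

With the table slices duplicated per lane, the rest of your code can stay as it is: the shuffle then indexes the same 16 entries in both lanes, and your blend masks are already correct, since _mm256_blendv_epi8 only reads bit 7 of each mask byte.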