
Fast counting the number of set bits in __m128i register

I need to count the number of set bits in a __m128i register. In particular, I need to write two functions that count the set bits of the register in the following ways:

  1. The total number of set bits of the register.
  2. The number of set bits for each byte of the register.

Are there intrinsic functions that can perform, wholly or partially, the above operations?


  • Here is some code I used in an old project (there is a research paper about it). The function popcnt8 below computes the number of bits set in each byte.

    SSE2-only version (based on Algorithm 3 in Hacker's Delight book):

    // Per-byte population count using only SSE2.
    //
    // Takes a 128-bit vector x and returns a vector in which every byte holds
    // the number of set bits (0..8) of the corresponding input byte.
    //
    // Method: for each 4-bit field v, compute v - v/2 - v/4 - v/8 (integer
    // division), which equals the number of set bits in v; then add the two
    // nibble counts of every byte together.
    static inline __m128i popcnt8(__m128i x) {
        // The masks are built inside the function: an intrinsic call is not a
        // constant expression, so it cannot initialise a file-scope object in C.
        const __m128i popcount_mask1 = _mm_set1_epi8(0x77);
        const __m128i popcount_mask2 = _mm_set1_epi8(0x0F);
        __m128i n;
        // Count bits in each 4-bit field.
        // The 0x77 mask drops the bit that the 64-bit-wide shift carried in
        // from the neighbouring nibble, so each nibble is halved independently.
        n = _mm_srli_epi64(x, 1);
        n = _mm_and_si128(popcount_mask1, n);
        x = _mm_sub_epi8(x, n);
        n = _mm_srli_epi64(n, 1);
        n = _mm_and_si128(popcount_mask1, n);
        x = _mm_sub_epi8(x, n);
        n = _mm_srli_epi64(n, 1);
        n = _mm_and_si128(popcount_mask1, n);
        x = _mm_sub_epi8(x, n);
        // Add the high-nibble count onto the low-nibble count of each byte,
        // then keep only the low nibble (the sum is at most 8).
        x = _mm_add_epi8(x, _mm_srli_epi16(x, 4));
        x = _mm_and_si128(popcount_mask2, x);
        return x;
    }

    SSSE3 version (due to Wojciech Mula):

    // Per-byte population count using the SSSE3 byte shuffle as a 16-entry
    // lookup table.
    //
    // Takes a 128-bit vector n and returns a vector in which every byte holds
    // the number of set bits (0..8) of the corresponding input byte.
    static inline __m128i popcnt8(__m128i n) {
        // The constants are built inside the function: an intrinsic call is not
        // a constant expression, so it cannot initialise a file-scope object in C.
        const __m128i popcount_mask = _mm_set1_epi8(0x0F);
        // popcount_table[i] is the number of set bits in the 4-bit value i.
        const __m128i popcount_table = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
        // Look up the count of the low nibble of every byte.
        const __m128i pcnt0 = _mm_shuffle_epi8(popcount_table, _mm_and_si128(n, popcount_mask));
        // Look up the count of the high nibble; the mask removes the bits that
        // the 16-bit-wide shift moved in from the adjacent byte.
        const __m128i pcnt1 = _mm_shuffle_epi8(popcount_table, _mm_and_si128(_mm_srli_epi16(n, 4), popcount_mask));
        return _mm_add_epi8(pcnt0, pcnt1);
    }

    XOP version (equivalent to SSSE3, but uses XOP instructions, which are faster on AMD Bulldozer):

    // Per-byte population count using AMD XOP instructions (same nibble
    // lookup-table idea as the SSSE3 variant).
    //
    // Takes a 128-bit vector n and returns a vector whose bytes are the sums of
    // the table entries selected by the low and high nibble of each input byte.
    //
    // NOTE(review): this variant is named popcount8, while the popcnt64 helpers
    // in this document call popcnt8 -- rename or alias it when combining them.
    static inline __m128i popcount8(__m128i n) {
        // The constants are built inside the function: an intrinsic call is not
        // a constant expression, so it cannot initialise a file-scope object in C.
        const __m128i popcount_mask = _mm_set1_epi8(0x0F);
        // popcount_table[i] is the number of set bits in the 4-bit value i.
        const __m128i popcount_table = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
        // Per-byte shift count of -4; XOP treats a negative count as a right
        // shift, so this extracts the high nibble of every byte.
        const __m128i popcount_shift = _mm_set1_epi8(-4);
        // The same table is passed as both permute sources, so it does not
        // matter which of the two sources a selector byte refers to.
        const __m128i pcnt0 = _mm_perm_epi8(popcount_table, popcount_table, _mm_and_si128(n, popcount_mask));
        const __m128i pcnt1 = _mm_perm_epi8(popcount_table, popcount_table, _mm_shl_epi8(n, popcount_shift));
        return _mm_add_epi8(pcnt0, pcnt1);
    }

    Function popcnt64 below counts the number of bits in the low and high 64-bit parts of the SSE register:

    SSE2 version:

    // Population count of each 64-bit half of n (SSE2).
    //
    // Returns a vector whose low 64-bit lane holds the number of set bits in
    // the low half of n and whose high 64-bit lane holds the count for the
    // high half.
    static inline __m128i popcnt64(__m128i n) {
        const __m128i cnt8 = popcnt8(n);
        // Sum of absolute differences against zero adds up the eight byte
        // counts of each 64-bit half.
        return _mm_sad_epu8(cnt8, _mm_setzero_si128());
    }

    XOP version:

    // Population count of each 64-bit half of n (XOP).
    //
    // Returns a vector whose two 64-bit lanes hold the sums of the per-byte
    // counts of the corresponding half of n.
    static inline __m128i popcnt64(__m128i n) {
        const __m128i cnt8 = popcnt8(n);
        // Horizontal add of the eight bytes in each 64-bit half.
        return _mm_haddq_epi8(cnt8);
    }

    Finally, the function popcnt128 below counts the number of bits in the whole 128-bit register:

    // Population count of the whole 128-bit register.
    //
    // Returns the total number of set bits in n as an int.
    static inline int popcnt128(__m128i n) {
        const __m128i cnt64 = popcnt64(n);
        // Bring the high-half count down into the low lane ...
        const __m128i cnt64_hi = _mm_unpackhi_epi64(cnt64, cnt64);
        // ... add it to the low-half count, and read the low 32 bits.
        const __m128i cnt128 = _mm_add_epi32(cnt64, cnt64_hi);
        return _mm_cvtsi128_si32(cnt128);
    }

    However, a more efficient way to implement popcnt128 is to use the hardware POPCNT instruction (on processors that support it):

    static inline int popcnt128(__m128i n) {
        const __m128i n_hi = _mm_unpackhi_epi64(n, n);
        #ifdef _MSC_VER
            return __popcnt64(_mm_cvtsi128_si64(n)) + __popcnt64(_mm_cvtsi128_si64(n_hi));
            return __popcntq(_mm_cvtsi128_si64(n)) + __popcntq(_mm_cvtsi128_si64(n_hi));