I'm trying to validate a string that must contain only visible ASCII characters plus space and tab (\t).
But it seems that an ASCII table lookup is faster than the _mm_cmpestri instruction with _SIDD_CMP_RANGES on most CPUs. I've tested it on an i5-2410M, an i7-3720QM, an i7-5600U and a KVM-virtualized Xeon of unknown type; only on the last one is the vectorized version faster.
My test code is here:
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <immintrin.h>
#include <stdalign.h>
#include <stdlib.h>
#define MIN(a,b) (((a)<(b))?(a):(b))
#define ALIGNED16 alignas(16)
#define MEASURE(msg,stmt) { \
struct timeval tv; \
gettimeofday(&tv, NULL); \
uint64_t us1 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
stmt; \
gettimeofday(&tv, NULL); \
uint64_t us2 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
printf("%-20s - %.4fms\n", msg, ((double)us2 - us1) / 1000); \
}
// Character table
#define VWSCHAR(c) (vis_ws_chars[(unsigned char)(c)]) // Visible characters and white space
#define YES 1,
#define NO 0,
#define YES16 YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES
#define NO16 NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO
#define NO128 NO16 NO16 NO16 NO16 NO16 NO16 NO16 NO16
// Visible ASCII characters with space and tab
ALIGNED16 static const int vis_ws_chars[256] = {
// NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
NO NO NO NO NO NO NO NO NO YES NO NO NO NO NO NO
// DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
NO16
// SP ! " # $ % & ' ( ) * + , - . /
// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
// @ A B C D E F G H I J K L M N O
// P Q R S T U V W X Y Z [ \ ] ^ _
// ` a b c d e f g h i j k l m n o
YES16 YES16 YES16 YES16 YES16
// p q r s t u v w x y z { | } ~ DEL
YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES NO
// Non-ASCII characters
NO128
};
size_t search_logic(const char* data, size_t len) {
__m128i ht = _mm_set1_epi8('\t');
//__m128i del = _mm_set1_epi8(0x7f);
__m128i td = _mm_set1_epi8('~');
__m128i sp_m1 = _mm_set1_epi8(' ' - 1);
size_t i = 0;
while (len - i >= 16) {
__m128i c = _mm_loadu_si128((const __m128i *) (data + i));
// (!((c < del) && (c >= sp)) && (c != ht)) == 0
//if(!_mm_testc_si128(_mm_and_si128(_mm_cmpgt_epi8(c, sp_m1), _mm_cmplt_epi8(c, del)), _mm_xor_si128(c, ht)))
//break;
// !(c == del) && ((c == ht) || (c >= sp)) == 1
//if(!_mm_test_all_ones(_mm_andnot_si128(_mm_cmpeq_epi8(c, del), _mm_or_si128(_mm_cmpeq_epi8(c, ht), _mm_cmpgt_epi8(c, sp_m1)))))
//break;
// (((c != ht) && (c >= sp)) && (c > td)) == 0
if(!_mm_test_all_zeros(_mm_and_si128(_mm_xor_si128(c, ht), _mm_cmpgt_epi8(c, sp_m1)), _mm_cmpgt_epi8(c, td)))
break;
i += 16;
}
// Check last 15 bytes
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
break;
}
}
return i;
}
size_t search_table(const char* data, size_t len)
{
// Search non-matching character via table lookups
size_t i = 0;
while (len - i >= 16) {
if (!VWSCHAR(data[i + 0])) break;
if (!VWSCHAR(data[i + 1])) break;
if (!VWSCHAR(data[i + 2])) break;
if (!VWSCHAR(data[i + 3])) break;
if (!VWSCHAR(data[i + 4])) break;
if (!VWSCHAR(data[i + 5])) break;
if (!VWSCHAR(data[i + 6])) break;
if (!VWSCHAR(data[i + 7])) break;
if (!VWSCHAR(data[i + 8])) break;
if (!VWSCHAR(data[i + 9])) break;
if (!VWSCHAR(data[i + 10])) break;
if (!VWSCHAR(data[i + 11])) break;
if (!VWSCHAR(data[i + 12])) break;
if (!VWSCHAR(data[i + 13])) break;
if (!VWSCHAR(data[i + 14])) break;
if (!VWSCHAR(data[i + 15])) break;
i += 16;
}
// Check last 15 bytes
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
break;
}
}
return i;
}
size_t search_sse4cmpstr(const char* data, size_t len)
{
static const char legal_ranges[16] = {
'\t', '\t',
' ', '~',
};
__m128i v1 = _mm_loadu_si128((const __m128i*)legal_ranges);
size_t i = 0;
while (len - i >= 16) {
__m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
unsigned consumed = _mm_cmpestri(v1, 4, v2, 16, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
i += consumed;
if (consumed < 16) {
return i;
}
}
// Check last 15 bytes
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
break;
}
}
return i;
}
size_t search_sse4cmpstr_implicit(const char* data, size_t len)
{
static const char legal_ranges[16] = {
'\t', '\t',
' ', '~',
};
__m128i v1 = _mm_loadu_si128((const __m128i*)legal_ranges);
size_t i = 0;
while (len - i >= 16) {
__m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
unsigned consumed = _mm_cmpistri(v1, v2, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
i += consumed;
if (consumed < 16) {
return i;
}
}
// Check last 15 bytes
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
break;
}
}
return i;
}
int main()
{
printf("Setting up 1GB of data...\n");
size_t len = 1024 * 1024 * 1024 + 3;
char* data = (char*)mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0); // Aligned
srand(0);
for (size_t i = 0; i < len; ++i) {
const char v = rand() % 96;
data[i] = v == 95 ? '\t' : ' ' + v;
}
size_t end = len - 2;
data[end] = '\n'; // Illegal character to be found
MEASURE("table lookup", {
size_t i = search_table(data, len);
if (i != end) printf("INCORRECT RESULT: %zu instead of %zu", i, end);
});
MEASURE("cmpestr ranges", {
size_t i = search_sse4cmpstr(data, len);
if (i != end) printf("INCORRECT RESULT: %zu instead of %zu", i, end);
});
MEASURE("cmpistr ranges", {
size_t i = search_sse4cmpstr_implicit(data, len);
if (i != end) printf("INCORRECT RESULT: %zu instead of %zu", i, end);
});
MEASURE("logic ranges", {
size_t i = search_logic(data, len);
if (i != end) printf("INCORRECT RESULT: %zu instead of %zu", i, end);
});
}
Compiled with gcc -O3 -march=native -pedantic -Wall -Wextra main2.cpp
it gives me these results:
Setting up 1GB of data...
table lookup - 476.4820ms
cmpestr ranges - 519.3350ms
cmpistr ranges - 497.5770ms
logic ranges - 153.2650ms
I've also checked the assembly output and search_sse4cmpstr uses vpcmpestri while search_table is non-vectorized.
Am I using it wrong? Or why does this instruction exist at all?
EDIT: As pointed out in the comments, cmpistr (the implicit-length instruction with fewer parameters) is slightly faster than cmpestr and sometimes faster than the table lookup.
However, SSE2 bitwise and integer operations seem to be even faster.
EDIT2: Peter Cordes found the right answer. I've added the revised program in a new answer, so please look at that one if you are interested in cmpstr.
DO NOT USE THE CODE ABOVE!
The code has an unnecessary dependency of i on the previous vector, bottlenecking on pcmpestri + L1d load-use latency of about 12 + 5 cycles (https://agner.org/optimize/ and https://uops.info/). So yes, you are using it wrong, unfortunately.
If you wrote it similarly to your scalar loop, doing i += 16 and just checking the pcmpestri result as a loop-exit condition, you'd bottleneck on its throughput of 1 vector per 4 clocks on your Sandybridge-family CPUs (SnB and IvB specifically).
Or if your input can use pcmpistri, that's somewhat less bad and can go at 1 vector per 3 clocks on Sandybridge-family.
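For example, something along these lines (an untested sketch; the name search_sse4cmpstr_tput is mine, and it reuses the question's legal_ranges setup, VWSCHAR tail loop, and function signature):
size_t search_sse4cmpstr_tput(const char* data, size_t len)
{
    static const char legal_ranges[16] = {
        '\t', '\t',
        ' ', '~',
    };
    __m128i v1 = _mm_loadu_si128((const __m128i*)legal_ranges);
    size_t i = 0;
    while (len - i >= 16) {
        __m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
        unsigned idx = _mm_cmpistri(v1, v2, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
        if (idx < 16)
            return i + idx;   // index of the first non-matching byte in this vector
        i += 16;              // constant increment: the next load address doesn't depend on the compare result
    }
    // Check last 15 bytes
    for (; i < len; ++i) {
        if (!VWSCHAR(data[i])) {
            break;
        }
    }
    return i;
}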
I didn't notice this problem at first because I wasn't expecting the loop to be written that way, and there was other clutter in the asm loop. :/ I spent a bunch of time profiling with perf to be sure it wasn't a front-end bottleneck from the microcoded (8 uop) instruction on my Skylake CPU. See the now-archived comments.
A throughput bottleneck would let you go at about 4 bytes / cycle, vs. about 1 for the other way (2 loads per input byte, and Intel since SnB can do 2 loads per clock). So a factor of 4 speedup. Or a factor of 8 on Nehalem with 1/clock load throughput.
The latency bottleneck is just about 1 cycle per input byte, about the same as the table lookup, by coincidence.
Also, don't use len - i >= 16 as the loop condition; gcc actually calculates len - i inside the loop, costing extra uops. Use i < len - 15 once you know that len >= 16. (Unsigned types make this tricky because they wrap at zero; what you want it to compile to is a cmp/jcc to skip the loop, then a do{}while asm loop structure. So the initial len >= 16 check really is separate from the normal loop condition.)
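Putting both points together, the loop could be structured like this (again a sketch, not benchmarked; search_sse4cmpstr_tput2 is just an illustrative name):
size_t search_sse4cmpstr_tput2(const char* data, size_t len)
{
    static const char legal_ranges[16] = {
        '\t', '\t',
        ' ', '~',
    };
    __m128i v1 = _mm_loadu_si128((const __m128i*)legal_ranges);
    size_t i = 0;
    if (len >= 16) {                       // checked once, outside the loop
        do {
            __m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
            unsigned idx = _mm_cmpistri(v1, v2, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
            if (idx < 16)
                return i + idx;
            i += 16;
        } while (i < len - 15);            // len - 15 is loop-invariant; compiles to cmp/jcc at the bottom
    }
    // Check last 15 bytes
    for (; i < len; ++i) {
        if (!VWSCHAR(data[i])) {
            break;
        }
    }
    return i;
}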
Other fun facts about pcmpestri:
- Its speed apparently doesn't depend on whether there's a 0 byte in the existing inputs.
- My first test still had i dependent on the result, so changing the immediate led to cache-line splits, making loop latency even worse. Re-testing with an i+=16 loop shows no effect.
"Or why does this instruction exist at all?"
It can be good for strspn and similar, checking each byte (or word) of one vector against every element of another vector. It can also be good for strstr / memmem. But generally not for things that you can do easily with pcmpeqb (the "EqualEach" aggregation option).
These instructions were introduced in Nehalem. Intel might have had plans to make them faster if they "caught on" and became widely used, e.g. for short-string strcmp. But without fault-suppression (for unaligned loads that potentially cross into a new page) you still have to check stuff about the pointer unless you know your string is allocated in a larger buffer so it's definitely safe to read up to 15 bytes past the end. If you're going to do checks anyway, you might as well use an efficient pcmpeqb / pmovmskb, which is fewer uops. And maybe find the first zero in either string with pcmpeqb / pmovmskb -> bsf. (For long strings, glibc strlen uses pminub to get a vector that has a zero if any of the input vectors do, more cheaply than pcmpeqb/pand.) Maybe there's a use-case for SSE4.2 for the initial startup of a strcmp, but once you get going, not so much.
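The zero-byte search mentioned above looks something like this (an illustrative helper, not glibc's actual code; the caller must guarantee the 16-byte load can't fault):
static inline unsigned first_zero_byte16(const char* p)
{
    __m128i v = _mm_loadu_si128((const __m128i*)p);
    unsigned mask = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_setzero_si128()));
    return mask ? (unsigned)__builtin_ctz(mask) : 16;   // __builtin_ctz compiles to bsf/tzcnt; 16 means no zero byte found
}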
And most of the world cares about UTF-8, not 8-bit character sets. And with UTF-16 not being fixed-width anymore (thanks to 32-bit Unicode), even wide-character stuff is harder to accelerate with these.
Using the ranges feature basically requires hand-vectorization, which is a lot of work for something that only handles ASCII. It could be worth it for libc str[c]spn / strpbrk for short-enough accept/reject sets.
And as you found, for simple cases you can go even faster with pcmpgtb and boolean logic. With AVX2 you could process 32 bytes at once instead of 16, but there's no AVX2 version of vpcmpistri, only the AVX1 VEX encoding of the 16-byte instruction.
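An AVX2 version of that compare-and-boolean-logic idea could look like the following (a sketch assuming -mavx2; the name search_logic_avx2 is mine, and it keeps the question's scalar tail loop):
size_t search_logic_avx2(const char* data, size_t len)
{
    const __m256i ht    = _mm256_set1_epi8('\t');
    const __m256i sp_m1 = _mm256_set1_epi8(' ' - 1);
    const __m256i td    = _mm256_set1_epi8('~');
    size_t i = 0;
    if (len >= 32) {
        do {
            __m256i c = _mm256_loadu_si256((const __m256i*)(data + i));
            // valid if c == '\t', or 0x20 <= c <= 0x7e; the signed compares also reject 0x80..0xff since those bytes are negative
            __m256i in_range = _mm256_andnot_si256(_mm256_cmpgt_epi8(c, td),
                                                   _mm256_cmpgt_epi8(c, sp_m1));
            __m256i ok = _mm256_or_si256(in_range, _mm256_cmpeq_epi8(c, ht));
            unsigned mask = (unsigned)_mm256_movemask_epi8(ok);
            if (mask != 0xffffffffu)
                return i + (unsigned)__builtin_ctz(~mask);   // position of the first invalid byte
            i += 32;
        } while (i < len - 31);
    }
    // Check the remaining bytes with the table
    for (; i < len; ++i) {
        if (!VWSCHAR(data[i])) {
            break;
        }
    }
    return i;
}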