c++assemblyintrinsics

optimization of STRCMP


I have set myself the task of optimizing the strcmp function in C. I have approached this task in two ways:

  1. Creating a new string comparison function in assembly language.
  2. Implementing the strcmp functionality using "intrinsics", where strings are represented by 256-bit integer registers. After compiling the code, I observed that the execution time was not as expected.

strcmp works 0.000002 seconds my_strcmp works 0.000851 seconds my_strcmp2 works 0.017444 seconds

I would appreciate your advice on how the strcmp function can be optimized to achieve faster execution times. The string length is limited to 32 characters in this context.

Compilation: nasm -g -f elf64 my_strcmp2.asm -o my_strcmp2.o g++ my_strcmp2.o main.cpp -march=znver3 -O3 -o app

main.cpp:

#include <stdio.h>
#include <string.h>
#include <time.h>
#include <immintrin.h>

int my_strcmp(const char* word1, const char* word2);
extern "C" int my_strcmp2(const char* word1, const char* word2);
double timer(int (*func)(const char*, const char*));

int main()
{
    int j = 0;
    const char* word1 = "my name is";
    const char* word2 = "my name are";

    clock_t begin = clock();
    for (int i = 0; i < 1000000; i++)
    {
        j = strcmp(word1, word2);
    }
    clock_t end = clock();
    fprintf(stderr, "strcmp     : %lf seconds\n", (double)(end - begin) / CLOCKS_PER_SEC);

    begin = clock();
    for (int i = 0; i < 1000000; i++)
    {
        j = my_strcmp(word1, word2);
    }
    end = clock();
    fprintf(stderr, "my_strcmp  : %lf seconds\n", (double)(end - begin) / CLOCKS_PER_SEC);

    begin = clock();
    for (int i = 0; i < 1000000; i++)
    {
        j = my_strcmp2(word1, word2);
    }
    end = clock();
    fprintf(stderr, "my_strcmp2 : %lf seconds\n", (double)(end - begin) / CLOCKS_PER_SEC);

    return 0;
}

int my_strcmp(const char* word1, const char* word2)
{

    __m256i str_reg_1 = _mm256_lddqu_si256((__m256i*)word1);
    __m256i str_reg_2 = _mm256_lddqu_si256((__m256i*)word2);
    __m256i res_cmp = _mm256_cmpeq_epi8(str_reg_1, str_reg_2);
    int mask = ~_mm256_movemask_epi8(res_cmp);

    return mask;
}

my_strcmp2.asm:

global my_strcmp2
section .text

my_strcmp2:
    cycle:
        cmp byte [rel rdi], 0x00
        je check_equal
        mov dl, byte [rel rsi]
        cmp byte [rel rdi], dl
        jne not_equal
        add rdi, 1
        add rsi, 1
    jmp cycle

    not_equal:
        mov rax, 0x01
        ret

    check_equal:
        cmp byte [rel rsi], 0x00
        jne not_equal
        xor rax, rax
        ret

Solution

  • GCC knows what strcmp does and can see that this code:

    for (int i = 0; i < 1000000; i++)
        {
            j = strcmp(word1, word2);
        }
    

    is equivalent to:

    j = strcmp(word1, word2);
    

    Therefore, it optimizes the loop away and evaluates strcmp during compilation, replacing it with a constant, and the time per iteration is effectively zero. It is impossible for you to beat this with your own implementation of strcmp.

    You might measure the time of the library strcmp by adding -fnobuiltin-strcmp to the compilation switches and declaring j volatile. You should examine the assembly generated by the compiler to see whether it is actually calling strcmp and is doing so in a loop.