I have set myself the task of optimizing the strcmp function in C. I have approached this task in two ways:
strcmp works 0.000002 seconds my_strcmp works 0.000851 seconds my_strcmp2 works 0.017444 seconds
I would appreciate your advice on how the strcmp function can be optimized to achieve faster execution times. The string length is limited to 32 characters in this context.
Compilation: nasm -g -f elf64 my_strcmp2.asm -o my_strcmp2.o g++ my_strcmp2.o main.cpp -march=znver3 -O3 -o app
main.cpp:
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <immintrin.h>
int my_strcmp(const char* word1, const char* word2);
extern "C" int my_strcmp2(const char* word1, const char* word2);
double timer(int (*func)(const char*, const char*));
int main()
{
int j = 0;
const char* word1 = "my name is";
const char* word2 = "my name are";
clock_t begin = clock();
for (int i = 0; i < 1000000; i++)
{
j = strcmp(word1, word2);
}
clock_t end = clock();
fprintf(stderr, "strcmp : %lf seconds\n", (double)(end - begin) / CLOCKS_PER_SEC);
begin = clock();
for (int i = 0; i < 1000000; i++)
{
j = my_strcmp(word1, word2);
}
end = clock();
fprintf(stderr, "my_strcmp : %lf seconds\n", (double)(end - begin) / CLOCKS_PER_SEC);
begin = clock();
for (int i = 0; i < 1000000; i++)
{
j = my_strcmp2(word1, word2);
}
end = clock();
fprintf(stderr, "my_strcmp2 : %lf seconds\n", (double)(end - begin) / CLOCKS_PER_SEC);
return 0;
}
int my_strcmp(const char* word1, const char* word2)
{
__m256i str_reg_1 = _mm256_lddqu_si256((__m256i*)word1);
__m256i str_reg_2 = _mm256_lddqu_si256((__m256i*)word2);
__m256i res_cmp = _mm256_cmpeq_epi8(str_reg_1, str_reg_2);
int mask = ~_mm256_movemask_epi8(res_cmp);
return mask;
}
my_strcmp2.asm:
global my_strcmp2
section .text
my_strcmp2:
cycle:
cmp byte [rel rdi], 0x00
je check_equal
mov dl, byte [rel rsi]
cmp byte [rel rdi], dl
jne not_equal
add rdi, 1
add rsi, 1
jmp cycle
not_equal:
mov rax, 0x01
ret
check_equal:
cmp byte [rel rsi], 0x00
jne not_equal
xor rax, rax
ret
GCC knows what strcmp
does and can see that this code:
for (int i = 0; i < 1000000; i++)
{
j = strcmp(word1, word2);
}
is equivalent to:
j = strcmp(word1, word2);
Therefore, it optimizes the loop away and evaluates strcmp
during compilation, replacing it with a constant, and the time per iteration is effectively zero. It is impossible for you to beat this with your own implementation of strcmp
.
You might measure the time of the library strcmp
by adding -fnobuiltin-strcmp
to the compilation switches and declaring j
volatile. You should examine the assembly generated by the compiler to see whether it is actually calling strcmp
and is doing so in a loop.