c++ gcc compiler-optimization

Why does __restrict often not give the expected optimization?


#include <iostream>
#include <chrono>

// Increments *a, then *b, then *a again, one step at a time.
// The two pointers carry no aliasing promise: if a and b point at the same
// int, that int simply ends up increased by 3.
void foo(int* a, int* b) {
  ++*a;
  ++*b;
  ++*a;
}

// Performs the same three increments as foo (*a, *b, *a), but `a` is
// qualified with __restrict: the caller promises that the int reached
// through `a` is not also accessed through `b`.
void goo(int* __restrict a, int* b) {
  ++*a;
  ++*b;
  ++*a;
}

// Times 10,000,000 back-to-back calls of foo, then the same number of calls
// of goo, on the same pair of local ints. Prints each duration in seconds
// and finally the ratio foo-time / goo-time, followed by a blank line.
void measure() {
  using hr_clock = std::chrono::high_resolution_clock;
  using fp_seconds = std::chrono::duration<double>;
  constexpr int kCalls = 10000000;

  int x = 1;
  int y = 2;

  // --- foo: plain pointers ---
  const auto foo_begin = hr_clock::now();
  for (int n = 0; n < kCalls; ++n) {
    foo(&x, &y);
  }
  const auto foo_end = hr_clock::now();

  const fp_seconds foo_elapsed = foo_end - foo_begin;
  std::cout << "foo Runtime(secs): " << foo_elapsed.count() << std::endl;

  // --- goo: first pointer is __restrict ---
  const auto goo_begin = hr_clock::now();
  for (int n = 0; n < kCalls; ++n) {
    goo(&x, &y);
  }
  const auto goo_end = hr_clock::now();

  const fp_seconds goo_elapsed = goo_end - goo_begin;
  std::cout << "goo Runtime(secs): " << goo_elapsed.count() << std::endl;

  // A ratio above 1 means the foo loop took longer than the goo loop.
  std::cout << "Without/With: " << foo_elapsed.count() / goo_elapsed.count()
            << std::endl
            << std::endl;
}

// Runs the foo/goo timing comparison ten times in a row so that the
// run-to-run variation of the printed numbers is visible.
int main() {
  int remaining = 10;
  while (remaining-- > 0) {
    measure();
  }
  return 0;
}

I see that goo is optimized as expected thanks to __restrict, but the output confuses me a lot, because there are cases where goo performs much worse than foo. In 3 of 10 runs they perform similarly, in 3 of 10 runs foo performs better, and only in 4 of 10 runs do we get the expected result.

Could you please explain why?

foo(int*, int*):
        add     DWORD PTR [rdi], 1
        add     DWORD PTR [rsi], 1
        add     DWORD PTR [rdi], 1
        ret
goo(int*, int*):
        add     DWORD PTR [rsi], 1
        add     DWORD PTR [rdi], 2
        ret
.LC1:
        .string "foo Runtime(secs): "
.LC2:
        .string "goo Runtime(secs): "
.LC3:
        .string "Without/With: "
measure():
        push    rbp
        push    rbx
        sub     rsp, 24
        call    std::chrono::_V2::system_clock::now()
        mov     rbx, rax
        call    std::chrono::_V2::system_clock::now()
        mov     edx, 19
        mov     esi, OFFSET FLAT:.LC1
        mov     edi, OFFSET FLAT:std::cout
        sub     rax, rbx
        pxor    xmm0, xmm0
        cvtsi2sdq       xmm0, rax
        divsd   xmm0, QWORD PTR .LC0[rip]
        movsd   QWORD PTR [rsp], xmm0
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        movsd   xmm0, QWORD PTR [rsp]
        mov     edi, OFFSET FLAT:std::cout
        call    std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
        mov     rbp, rax
        mov     rax, QWORD PTR [rax]
        mov     rax, QWORD PTR [rax-24]
        mov     rbx, QWORD PTR [rbp+240+rax]
        test    rbx, rbx
        je      .L9
        cmp     BYTE PTR [rbx+56], 0
        je      .L7
        movsx   esi, BYTE PTR [rbx+67]
.L8:
        mov     rdi, rbp
        call    std::basic_ostream<char, std::char_traits<char> >::put(char)
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >::flush()
        call    std::chrono::_V2::system_clock::now()
        mov     rbx, rax
        call    std::chrono::_V2::system_clock::now()
        mov     edx, 19
        mov     esi, OFFSET FLAT:.LC2
        mov     edi, OFFSET FLAT:std::cout
        sub     rax, rbx
        pxor    xmm0, xmm0
        cvtsi2sdq       xmm0, rax
        divsd   xmm0, QWORD PTR .LC0[rip]
        movsd   QWORD PTR [rsp+8], xmm0
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        movsd   xmm0, QWORD PTR [rsp+8]
        mov     edi, OFFSET FLAT:std::cout
        call    std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
        mov     rbp, rax
        mov     rax, QWORD PTR [rax]
        mov     rax, QWORD PTR [rax-24]
        mov     rbx, QWORD PTR [rbp+240+rax]
        test    rbx, rbx
        je      .L9
        cmp     BYTE PTR [rbx+56], 0
        je      .L10
        movsx   esi, BYTE PTR [rbx+67]
.L11:
        mov     rdi, rbp
        call    std::basic_ostream<char, std::char_traits<char> >::put(char)
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >::flush()
        mov     edx, 14
        mov     esi, OFFSET FLAT:.LC3
        mov     edi, OFFSET FLAT:std::cout
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        mov     edi, OFFSET FLAT:std::cout
        movsd   xmm0, QWORD PTR [rsp]
        divsd   xmm0, QWORD PTR [rsp+8]
        call    std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
        mov     rbp, rax
        mov     rax, QWORD PTR [rax]
        mov     rax, QWORD PTR [rax-24]
        mov     rbx, QWORD PTR [rbp+240+rax]
        test    rbx, rbx
        je      .L9
        cmp     BYTE PTR [rbx+56], 0
        je      .L12
        movsx   esi, BYTE PTR [rbx+67]
.L13:
        mov     rdi, rbp
        call    std::basic_ostream<char, std::char_traits<char> >::put(char)
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >::flush()
        mov     rbp, rax
        mov     rax, QWORD PTR [rax]
        mov     rax, QWORD PTR [rax-24]
        mov     rbx, QWORD PTR [rbp+240+rax]
        test    rbx, rbx
        je      .L9
        cmp     BYTE PTR [rbx+56], 0
        je      .L14
        movsx   esi, BYTE PTR [rbx+67]
.L15:
        mov     rdi, rbp
        call    std::basic_ostream<char, std::char_traits<char> >::put(char)
        add     rsp, 24
        pop     rbx
        mov     rdi, rax
        pop     rbp
        jmp     std::basic_ostream<char, std::char_traits<char> >::flush()
.L7:
        mov     rdi, rbx
        call    std::ctype<char>::_M_widen_init() const
        mov     rax, QWORD PTR [rbx]
        mov     esi, 10
        mov     rax, QWORD PTR [rax+48]
        cmp     rax, OFFSET FLAT:_ZNKSt5ctypeIcE8do_widenEc
        je      .L8
        mov     rdi, rbx
        call    rax
        movsx   esi, al
        jmp     .L8
.L10:
        mov     rdi, rbx
        call    std::ctype<char>::_M_widen_init() const
        mov     rax, QWORD PTR [rbx]
        mov     esi, 10
        mov     rax, QWORD PTR [rax+48]
        cmp     rax, OFFSET FLAT:_ZNKSt5ctypeIcE8do_widenEc
        je      .L11
        mov     rdi, rbx
        call    rax
        movsx   esi, al
        jmp     .L11
.L12:
        mov     rdi, rbx
        call    std::ctype<char>::_M_widen_init() const
        mov     rax, QWORD PTR [rbx]
        mov     esi, 10
        mov     rax, QWORD PTR [rax+48]
        cmp     rax, OFFSET FLAT:_ZNKSt5ctypeIcE8do_widenEc
        je      .L13
        mov     rdi, rbx
        call    rax
        movsx   esi, al
        jmp     .L13
.L14:
        mov     rdi, rbx
        call    std::ctype<char>::_M_widen_init() const
        mov     rax, QWORD PTR [rbx]
        mov     esi, 10
        mov     rax, QWORD PTR [rax+48]
        cmp     rax, OFFSET FLAT:_ZNKSt5ctypeIcE8do_widenEc
        je      .L15
        mov     rdi, rbx
        call    rax
        movsx   esi, al
        jmp     .L15
.L9:
        call    std::__throw_bad_cast()
main:
        push    rbx
        mov     ebx, 10
.L35:
        call    measure()
        sub     ebx, 1
        jne     .L35
        xor     eax, eax
        pop     rbx
        ret
_GLOBAL__sub_I_foo(int*, int*):
        sub     rsp, 8
        mov     edi, OFFSET FLAT:std::__ioinit
        call    std::ios_base::Init::Init() [complete object constructor]
        mov     edx, OFFSET FLAT:__dso_handle
        mov     esi, OFFSET FLAT:std::__ioinit
        mov     edi, OFFSET FLAT:std::ios_base::Init::~Init() [complete object destructor]
        add     rsp, 8
        jmp     __cxa_atexit
.LC0:
        .long   0
        .long   1104006501

foo Runtime(secs): 1.5e-07
goo Runtime(secs): 7e-08
Without/With: 2.14286

foo Runtime(secs): 4e-08
goo Runtime(secs): 4e-08
Without/With: 1

foo Runtime(secs): 4e-08
goo Runtime(secs): 3e-08
Without/With: 1.33333

foo Runtime(secs): 4e-08
goo Runtime(secs): 3e-08
Without/With: 1.33333

foo Runtime(secs): 4e-08
goo Runtime(secs): 4e-08
Without/With: 1

foo Runtime(secs): 4e-08
goo Runtime(secs): 9e-08
Without/With: 0.444444

foo Runtime(secs): 3e-08
goo Runtime(secs): 4e-08
Without/With: 0.75

foo Runtime(secs): 4e-08
goo Runtime(secs): 4e-08
Without/With: 1

foo Runtime(secs): 3e-08
goo Runtime(secs): 4e-08
Without/With: 0.75

foo Runtime(secs): 4e-08
goo Runtime(secs): 3e-08
Without/With: 1.33333

Solution

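  • As the disassembly of measure() shows, each pair of high_resolution_clock::now() calls (which appear in the listing as std::chrono::_V2::system_clock::now()) is separated by only a single mov, and that mov merely stores the result of the first now(). Both loops have been removed entirely, so nothing is being measured; the printed times of a few tens of nanoseconds are just the cost of calling now() twice, plus noise.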

    The problem is that at -O3 the optimizer can see that your code does nothing: x and y are never read after the loops, so all the calls to foo and goo have no observable effect and can be deleted. At lower optimization levels you would not be measuring anything meaningful either, because __restrict is explicitly a hint for the optimizer. That also explains why it is so rarely useful: often the optimizer can already optimize without it (the hint is superfluous), or it still cannot optimize with it (the hint is insufficient).