#include <iostream>
#include <chrono>
void foo(int* a, int* b) {
*a+=1;
*b+=1;
*a+=1;
}
void goo(int* __restrict a, int* b) {
*a+=1;
*b+=1;
*a+=1;
}
void measure() {
int x = 1;
int y = 2;
auto start_foo = std::chrono::high_resolution_clock::now();
for(int i = 0; i < 10000000; ++i) {
foo(&x, &y);
}
auto end_foo = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> duration_foo = end_foo - start_foo;
std::cout << "foo Runtime(secs): " << duration_foo.count() << std::endl;
auto start_goo = std::chrono::high_resolution_clock::now();
for(int i = 0; i < 10000000; ++i) {
goo(&x, &y);
}
auto end_goo = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> duration_goo = end_goo - start_goo;
std::cout << "goo Runtime(secs): " << duration_goo.count() << std::endl;
std::cout << "Without/With: " << duration_foo.count() / duration_goo.count() << std::endl << std::endl;
}
int main() {
for (int i = 0; i < 10; ++i) {
measure();
}
}
I see that goo
is optimized as expected via __restrict
, but the output confused me a lot since there are cases when goo
performs much more worse than foo
. In 3/10 cases they perform similiarly, in 3/10 cases foo
performs better, and only in 4/10 cases we have expected result.
Please, explain why?
foo(int*, int*):
add DWORD PTR [rdi], 1
add DWORD PTR [rsi], 1
add DWORD PTR [rdi], 1
ret
goo(int*, int*):
add DWORD PTR [rsi], 1
add DWORD PTR [rdi], 2
ret
.LC1:
.string "foo Runtime(secs): "
.LC2:
.string "goo Runtime(secs): "
.LC3:
.string "Without/With: "
measure():
push rbp
push rbx
sub rsp, 24
call std::chrono::_V2::system_clock::now()
mov rbx, rax
call std::chrono::_V2::system_clock::now()
mov edx, 19
mov esi, OFFSET FLAT:.LC1
mov edi, OFFSET FLAT:std::cout
sub rax, rbx
pxor xmm0, xmm0
cvtsi2sdq xmm0, rax
divsd xmm0, QWORD PTR .LC0[rip]
movsd QWORD PTR [rsp], xmm0
call std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
movsd xmm0, QWORD PTR [rsp]
mov edi, OFFSET FLAT:std::cout
call std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
mov rbp, rax
mov rax, QWORD PTR [rax]
mov rax, QWORD PTR [rax-24]
mov rbx, QWORD PTR [rbp+240+rax]
test rbx, rbx
je .L9
cmp BYTE PTR [rbx+56], 0
je .L7
movsx esi, BYTE PTR [rbx+67]
.L8:
mov rdi, rbp
call std::basic_ostream<char, std::char_traits<char> >::put(char)
mov rdi, rax
call std::basic_ostream<char, std::char_traits<char> >::flush()
call std::chrono::_V2::system_clock::now()
mov rbx, rax
call std::chrono::_V2::system_clock::now()
mov edx, 19
mov esi, OFFSET FLAT:.LC2
mov edi, OFFSET FLAT:std::cout
sub rax, rbx
pxor xmm0, xmm0
cvtsi2sdq xmm0, rax
divsd xmm0, QWORD PTR .LC0[rip]
movsd QWORD PTR [rsp+8], xmm0
call std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
movsd xmm0, QWORD PTR [rsp+8]
mov edi, OFFSET FLAT:std::cout
call std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
mov rbp, rax
mov rax, QWORD PTR [rax]
mov rax, QWORD PTR [rax-24]
mov rbx, QWORD PTR [rbp+240+rax]
test rbx, rbx
je .L9
cmp BYTE PTR [rbx+56], 0
je .L10
movsx esi, BYTE PTR [rbx+67]
.L11:
mov rdi, rbp
call std::basic_ostream<char, std::char_traits<char> >::put(char)
mov rdi, rax
call std::basic_ostream<char, std::char_traits<char> >::flush()
mov edx, 14
mov esi, OFFSET FLAT:.LC3
mov edi, OFFSET FLAT:std::cout
call std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
mov edi, OFFSET FLAT:std::cout
movsd xmm0, QWORD PTR [rsp]
divsd xmm0, QWORD PTR [rsp+8]
call std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
mov rbp, rax
mov rax, QWORD PTR [rax]
mov rax, QWORD PTR [rax-24]
mov rbx, QWORD PTR [rbp+240+rax]
test rbx, rbx
je .L9
cmp BYTE PTR [rbx+56], 0
je .L12
movsx esi, BYTE PTR [rbx+67]
.L13:
mov rdi, rbp
call std::basic_ostream<char, std::char_traits<char> >::put(char)
mov rdi, rax
call std::basic_ostream<char, std::char_traits<char> >::flush()
mov rbp, rax
mov rax, QWORD PTR [rax]
mov rax, QWORD PTR [rax-24]
mov rbx, QWORD PTR [rbp+240+rax]
test rbx, rbx
je .L9
cmp BYTE PTR [rbx+56], 0
je .L14
movsx esi, BYTE PTR [rbx+67]
.L15:
mov rdi, rbp
call std::basic_ostream<char, std::char_traits<char> >::put(char)
add rsp, 24
pop rbx
mov rdi, rax
pop rbp
jmp std::basic_ostream<char, std::char_traits<char> >::flush()
.L7:
mov rdi, rbx
call std::ctype<char>::_M_widen_init() const
mov rax, QWORD PTR [rbx]
mov esi, 10
mov rax, QWORD PTR [rax+48]
cmp rax, OFFSET FLAT:_ZNKSt5ctypeIcE8do_widenEc
je .L8
mov rdi, rbx
call rax
movsx esi, al
jmp .L8
.L10:
mov rdi, rbx
call std::ctype<char>::_M_widen_init() const
mov rax, QWORD PTR [rbx]
mov esi, 10
mov rax, QWORD PTR [rax+48]
cmp rax, OFFSET FLAT:_ZNKSt5ctypeIcE8do_widenEc
je .L11
mov rdi, rbx
call rax
movsx esi, al
jmp .L11
.L12:
mov rdi, rbx
call std::ctype<char>::_M_widen_init() const
mov rax, QWORD PTR [rbx]
mov esi, 10
mov rax, QWORD PTR [rax+48]
cmp rax, OFFSET FLAT:_ZNKSt5ctypeIcE8do_widenEc
je .L13
mov rdi, rbx
call rax
movsx esi, al
jmp .L13
.L14:
mov rdi, rbx
call std::ctype<char>::_M_widen_init() const
mov rax, QWORD PTR [rbx]
mov esi, 10
mov rax, QWORD PTR [rax+48]
cmp rax, OFFSET FLAT:_ZNKSt5ctypeIcE8do_widenEc
je .L15
mov rdi, rbx
call rax
movsx esi, al
jmp .L15
.L9:
call std::__throw_bad_cast()
main:
push rbx
mov ebx, 10
.L35:
call measure()
sub ebx, 1
jne .L35
xor eax, eax
pop rbx
ret
_GLOBAL__sub_I_foo(int*, int*):
sub rsp, 8
mov edi, OFFSET FLAT:std::__ioinit
call std::ios_base::Init::Init() [complete object constructor]
mov edx, OFFSET FLAT:__dso_handle
mov esi, OFFSET FLAT:std::__ioinit
mov edi, OFFSET FLAT:std::ios_base::Init::~Init() [complete object destructor]
add rsp, 8
jmp __cxa_atexit
.LC0:
.long 0
.long 1104006501
goo Runtime(secs): 7e-08
Without/With: 2.14286
foo Runtime(secs): 4e-08
goo Runtime(secs): 4e-08
Without/With: 1
foo Runtime(secs): 4e-08
goo Runtime(secs): 3e-08
Without/With: 1.33333
foo Runtime(secs): 4e-08
goo Runtime(secs): 3e-08
Without/With: 1.33333
foo Runtime(secs): 4e-08
goo Runtime(secs): 4e-08
Without/With: 1
foo Runtime(secs): 4e-08
goo Runtime(secs): 9e-08
Without/With: 0.444444
foo Runtime(secs): 3e-08
goo Runtime(secs): 4e-08
Without/With: 0.75
foo Runtime(secs): 4e-08
goo Runtime(secs): 4e-08
Without/With: 1
foo Runtime(secs): 3e-08
goo Runtime(secs): 4e-08
Without/With: 0.75
foo Runtime(secs): 4e-08
goo Runtime(secs): 3e-08
Without/With: 1.33333
As the disassembly shows, the two high_resolution_clock::now()
calls are separated by only a single mov
, and that's to store the result of the first now()
. Nothing is being measured.
The problem is that at -O3
, the optimizer can see that your code does nothing. And at lower optimization levels, you would not be measuring anything meaningful. __restrict
is explicitly intended for the optimizer. That explains why it's so rarely useful: Often the optimizer can optimize without it (superfluous) or the optimizer still can't optimize with it (insufficient).