I try to see the effects of individual compiler optimizations on the execution time of programs. For this, I use a simple matrix addition program.
When I compile the program with certain individual compiler optimizations, the resulting executable file is the same as the one produced when I compile with no optimizations at all. That is,
gcc -funroll-loops code.c -o code_unroll
gcc -floop-unroll-and-jam code.c -o code_unroll_jam
gcc -finline-functions code.c -o code_inline
result in the same executable files. I have tried it with LLVM frontend Clang as well but there was no difference with it either.
When I searched online, I found that the compiler may simply choose not to apply a given optimization flag. Is there a way to force a single optimization flag to be applied to a program?
The code is as following:
/* Element-wise vector addition: c[i] = a[i] + b[i] for 0 <= i < n. */
void sum_array(int *a, int *b, int *c, int n) {
for (int i = 0; i < n; i++) {
c[i] = a[i] + b[i]; // Simple loop for GCC to unroll
}
}
I put your code into Godbolt and added the minimum -O1 optimisation needed to allow further custom optimisations to be applied. Unless you really know what you are doing and have very specific requirements or unusual code you are best off leaving optimisation to the compiler and merely specifying -O2 or -O3 and then whether you have a preference for size over speed. You can play with the Godbolt version and see the immediate effect of various compiler flags at assembly level here.
Starting with your original source code (you should take a look at the unoptimised asm output too)
/* Adds two int arrays element by element into c; n is the element count. */
void sum_array(int *a, int *b, int *c, int n) {
for (int i = 0; i < n; i++) {
c[i] = a[i] + b[i]; // Simple loop for GCC to unroll
}
}
Compiled -O1
# GCC -O1 output (Intel syntax), System V AMD64 ABI:
#   rdi = a, rsi = b, rdx = c, ecx = n.
sum_array(int*, int*, int*, int):
test ecx, ecx                           # n <= 0 ?
jle .L1                                 # yes: nothing to do
movsx rcx, ecx                          # sign-extend n to 64 bits
lea r8, [0+rcx*4]                       # r8 = n * sizeof(int) = end byte offset
mov eax, 0                              # rax = running byte offset (i * 4), start at 0
.L3:
mov ecx, DWORD PTR [rsi+rax]            # ecx = b[i]
add ecx, DWORD PTR [rdi+rax]            # ecx += a[i]
mov DWORD PTR [rdx+rax], ecx            # c[i] = ecx
add rax, 4                              # advance one int
cmp rax, r8                             # reached end offset?
jne .L3
.L1:
ret
Compiled -O1 -funroll-loops
# GCC -O1 -funroll-loops output (Intel syntax), System V AMD64 ABI.
# Entry: rdi = a, rsi = b, rdx = c, ecx = n.
# Strategy: run (n mod 8) single-element iterations first, then an
# 8-elements-per-iteration main loop for the remaining multiple of 8.
sum_array(int*, int*, int*, int):
mov r8, rsi                             # r8 = b
mov rsi, rdx                            # rsi = c (frees rdx for scratch)
test ecx, ecx
jle .L1                                 # n <= 0: nothing to do
movsx rcx, ecx
sal rcx, 2                              # rcx = n * 4 = total byte count
mov eax, 0                              # rax = current byte offset
lea rdx, [rcx-4]
shr rdx, 2
add rdx, 1                              # rdx = (n*4 - 4)/4 + 1 = n
and edx, 7                              # edx = n mod 8 = leftover iteration count
je .L3                                  # multiple of 8: straight to unrolled loop
# Dispatch on the remainder: jump into the chain of single-element
# blocks below so exactly (n mod 8) elements are processed before
# the 8-at-a-time main loop.
cmp rdx, 1
je .L26
cmp rdx, 2
je .L27
cmp rdx, 3
je .L28
cmp rdx, 4
je .L29
cmp rdx, 5
je .L30
cmp rdx, 6
je .L31
# remainder == 7 falls through; each block below does one c[i] = a[i] + b[i]
mov r9d, DWORD PTR [r8+rax]             # r9d = b[i]
add r9d, DWORD PTR [rdi+rax]            # r9d += a[i]
mov DWORD PTR [rsi+rax], r9d            # c[i] = r9d
add rax, 4
.L31:
mov r10d, DWORD PTR [r8+rax]
add r10d, DWORD PTR [rdi+rax]
mov DWORD PTR [rsi+rax], r10d
add rax, 4
.L30:
mov r11d, DWORD PTR [r8+rax]
add r11d, DWORD PTR [rdi+rax]
mov DWORD PTR [rsi+rax], r11d
add rax, 4
.L29:
mov edx, DWORD PTR [r8+rax]
add edx, DWORD PTR [rdi+rax]
mov DWORD PTR [rsi+rax], edx
add rax, 4
.L28:
mov r9d, DWORD PTR [r8+rax]
add r9d, DWORD PTR [rdi+rax]
mov DWORD PTR [rsi+rax], r9d
add rax, 4
.L27:
mov r10d, DWORD PTR [r8+rax]
add r10d, DWORD PTR [rdi+rax]
mov DWORD PTR [rsi+rax], r10d
add rax, 4
.L26:
mov r11d, DWORD PTR [r8+rax]
add r11d, DWORD PTR [rdi+rax]
mov DWORD PTR [rsi+rax], r11d
add rax, 4
cmp rax, rcx                            # remainder may have consumed everything
je .L33
.L3:
# Main loop: 8 elements (32 bytes) per iteration, rotating scratch
# registers to reduce dependencies between the 8 copies of the body.
mov edx, DWORD PTR [r8+rax]
add edx, DWORD PTR [rdi+rax]
mov DWORD PTR [rsi+rax], edx
lea r10, [rax+4]                        # r10 = offset of element 1 of this group
mov r9d, DWORD PTR [r8+4+rax]
add r9d, DWORD PTR [rdi+4+rax]
mov DWORD PTR [rsi+4+rax], r9d
mov eax, DWORD PTR [r8+4+r10]
add eax, DWORD PTR [rdi+4+r10]
mov DWORD PTR [rsi+4+r10], eax
mov r11d, DWORD PTR [r8+8+r10]
add r11d, DWORD PTR [rdi+8+r10]
mov DWORD PTR [rsi+8+r10], r11d
mov edx, DWORD PTR [r8+12+r10]
add edx, DWORD PTR [rdi+12+r10]
mov DWORD PTR [rsi+12+r10], edx
mov r9d, DWORD PTR [r8+16+r10]
add r9d, DWORD PTR [rdi+16+r10]
mov DWORD PTR [rsi+16+r10], r9d
mov eax, DWORD PTR [r8+20+r10]
add eax, DWORD PTR [rdi+20+r10]
mov DWORD PTR [rsi+20+r10], eax
mov r11d, DWORD PTR [r8+24+r10]
add r11d, DWORD PTR [rdi+24+r10]
mov DWORD PTR [rsi+24+r10], r11d
lea rax, [r10+28]                       # advance offset by 32 bytes (8 ints)
cmp rax, rcx                            # processed all n*4 bytes?
jne .L3
.L1:
ret
.L33:
ret
Be careful with loop unrolling — always measure that it actually improves performance! On modern architectures it can sometimes be better to have a very compact inner loop that fits entirely into the instruction cache.
FWIW Intel ICX 2025 at -O3 makes the best fist of it of any of the compilers I use regularly.