cassemblygccoptimization

How is the function get called with -O2?


I had below C program:

__int64_t fib_tr_const (__int64_t n, __int64_t n_1, __int64_t n_2)
{
    if (n == 0x42 || n == 0x43 ) {
        return n_1 + n_2;
    }

    return 0x31 + fib_tr_const (n-1, n_2, n_1 + n_2);
}


void main (int argc, char ** argv)
{
    __int64_t n = atoi (argv[1]);
    // printf("fib(%ld) = %ld\n", n, fib(n));
    // printf("fib_tr(%ld) = %ld\n", n, fib_tr(n, 0, 1));
    __int64_t volatile result = 0x4567;
    printf ("11111\n");
    printf ("11111\n");
    result = fib_tr_const(n, 0x123, 0x234);
    printf ("22222\n");
    printf ("22222\n");
    printf("fib_tr_const(%ld) = %ld\n", n, result);
}

And I built it with -O2.

gcc tail_call.c -O2 -o tail_call_const_O2

I disassembled the binary:

The main():

00000000000010a0 <main>:
    10a0:   f3 0f 1e fa             endbr64 
    10a4:   41 54                   push   %r12
    10a6:   ba 0a 00 00 00          mov    $0xa,%edx
    10ab:   48 83 ec 10             sub    $0x10,%rsp
    10af:   48 8b 7e 08             mov    0x8(%rsi),%rdi
    10b3:   31 f6                   xor    %esi,%esi
    10b5:   e8 c6 ff ff ff          callq  1080 <strtol@plt>
    10ba:   48 8d 3d 43 0f 00 00    lea    0xf43(%rip),%rdi        # 2004 <_IO_stdin_used+0x4>
    10c1:   48 c7 44 24 08 67 45    movq   $0x4567,0x8(%rsp)
    10c8:   00 00 
    10ca:   4c 63 e0                movslq %eax,%r12
    10cd:   e8 9e ff ff ff          callq  1070 <puts@plt>
    10d2:   48 8d 3d 2b 0f 00 00    lea    0xf2b(%rip),%rdi        # 2004 <_IO_stdin_used+0x4>
    10d9:   e8 92 ff ff ff          callq  1070 <puts@plt>


------------------START-------------------

    10de:   49 8d 44 24 ff          lea    -0x1(%r12),%rax
    10e3:   48 83 f8 01             cmp    $0x1,%rax
    10e7:   76 75                   jbe    115e <main+0xbe>
    10e9:   4c 89 e0                mov    %r12,%rax
    10ec:   31 c9                   xor    %ecx,%ecx
    10ee:   ba 01 00 00 00          mov    $0x1,%edx
    10f3:   31 ff                   xor    %edi,%edi
    10f5:   eb 0c                   jmp    1103 <main+0x63>
    10f7:   66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
    10fe:   00 00 
    1100:   48 89 f2                mov    %rsi,%rdx
    1103:   48 8d 34 17             lea    (%rdi,%rdx,1),%rsi
    1107:   48 89 c7                mov    %rax,%rdi
    110a:   48 83 e8 01             sub    $0x1,%rax
    110e:   48 01 f9                add    %rdi,%rcx
    1111:   48 89 d7                mov    %rdx,%rdi
    1114:   48 83 f8 02             cmp    $0x2,%rax
    1118:   75 e6                   jne    1100 <main+0x60>
    111a:   48 01 f2                add    %rsi,%rdx
    111d:   48 01 d1                add    %rdx,%rcx
    1120:   48 8d 3d e3 0e 00 00    lea    0xee3(%rip),%rdi        # 200a <_IO_stdin_used+0xa>
    1127:   48 89 4c 24 08          mov    %rcx,0x8(%rsp)

------------------END-------------------


    112c:   e8 3f ff ff ff          callq  1070 <puts@plt>
    1131:   48 8d 3d d2 0e 00 00    lea    0xed2(%rip),%rdi        # 200a <_IO_stdin_used+0xa>
    1138:   e8 33 ff ff ff          callq  1070 <puts@plt>
    113d:   48 8b 4c 24 08          mov    0x8(%rsp),%rcx
    1142:   48 83 c4 10             add    $0x10,%rsp
    1146:   31 c0                   xor    %eax,%eax
    1148:   4c 89 e2                mov    %r12,%rdx
    114b:   48 8d 35 be 0e 00 00    lea    0xebe(%rip),%rsi        # 2010 <_IO_stdin_used+0x10>
    1152:   bf 01 00 00 00          mov    $0x1,%edi
    1157:   41 5c                   pop    %r12
    1159:   e9 32 ff ff ff          jmpq   1090 <__printf_chk@plt>
    115e:   ba 01 00 00 00          mov    $0x1,%edx
    1163:   31 c9                   xor    %ecx,%ecx
    1165:   eb b6                   jmp    111d <main+0x7d>
    1167:   66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
    116e:   00 00 

The fib_tr_const():

00000000000012e0 <fib_tr_const>:
    12e0:   f3 0f 1e fa             endbr64 
    12e4:   48 8d 47 be             lea    -0x42(%rdi),%rax
    12e8:   48 89 f9                mov    %rdi,%rcx
    12eb:   48 83 f8 01             cmp    $0x1,%rax
    12ef:   77 0a                   ja     12fb <fib_tr_const+0x1b>
    12f1:   eb 35                   jmp    1328 <fib_tr_const+0x48>
    12f3:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
    12f8:   48 89 c2                mov    %rax,%rdx
    12fb:   48 83 ef 01             sub    $0x1,%rdi
    12ff:   48 8d 04 16             lea    (%rsi,%rdx,1),%rax
    1303:   48 89 d6                mov    %rdx,%rsi
    1306:   48 83 ff 43             cmp    $0x43,%rdi
    130a:   75 ec                   jne    12f8 <fib_tr_const+0x18>
    130c:   48 8d 34 49             lea    (%rcx,%rcx,2),%rsi
    1310:   48 01 c2                add    %rax,%rdx
    1313:   48 c1 e6 04             shl    $0x4,%rsi
    1317:   48 8d 8c 31 2d f3 ff    lea    -0xcd3(%rcx,%rsi,1),%rcx
    131e:   ff 
    131f:   48 8d 04 0a             lea    (%rdx,%rcx,1),%rax
    1323:   c3                      retq   
    1324:   0f 1f 40 00             nopl   0x0(%rax)
    1328:   48 89 d0                mov    %rdx,%rax
    132b:   48 89 f2                mov    %rsi,%rdx
    132e:   31 c9                   xor    %ecx,%ecx
    1330:   48 01 c2                add    %rax,%rdx
    1333:   48 8d 04 0a             lea    (%rdx,%rcx,1),%rax
    1337:   c3                      retq   
    1338:   0f 1f 84 00 00 00 00    nopl   0x0(%rax,%rax,1)
    133f:   00 

With the landmarks printf ("11111\n"); and printf ("22222\n");, and thus puts in assembly. I believe the part between START and END should be where the main() calls fib_tr_const().

I don't see any call or jmp instructions that branch from main() to the fib_tr_const(). So how is this function called?

I compared the assembly between the marked points with the assembly of the fib_tr_const(). The asm code looks to be similar. It seems the fib_tr_const() function is inlined.

Another question arises - if the function is inlined, why does the code still exist in the final binary? It would save space if it were removed, so why doesn't this happen?


Solution

  • TL;DR: It's inlined here, but the compiler doesn't know that it isn't used elsewhere.

    When you compile a program, it could be for a library, in which case it would be bad to remove a function just because it isn't used elsewhere internally. The compiler doesn't know what the full executable contains or uses so it can't skip generating fib_tr_const. For example:

    // foo.c
    float square_helper(float x){
        return x * x;
    }
    float hypot2(float x,float y){
        return square_helper(x) + square_helper(y);
    }
    //bar.c
    float square_helper(float);
    float hypot2(float,float);
    int main(void){
        printf("%.2f",hypot2(square_helper(3.4f),2.2f));
        return 0;
    }
    

    Here, the compiler can't remove square_helper from foo.c even though it might be inlined because bar.c depends on it. The compiler works on one TU(.c file) at a time, so it has no way of knowing whether such a bar.c exists.

    There are three solutions to the problem:

    1. mark it as static. static functions are local to the TU it's declared in. This means that nothing outside can see the function, so the compiler is free to remove it after it inlines all the usages everywhere it can be seen (which is within that .c file).
    2. Tell the linker to optimize it out. gcc has -flto -fuse-linker-plugin which tells the linker to do Link Time Optimization, including removing functions that are neither exported (if you are compiling a DLL) nor used anywhere in the full program. The linker sees that fib_tr_const is not used anywhere after the compiler has inlined its use, so the linker removes it.
    3. Use -fwhole-program. This will only work if the program is a single TU(one .c file), and it tells gcc that this file is the whole program; there's nothing elsewhere that could possibly use the functions here. This allows gcc to remove the inlined functions. Usually this is not the best solution because option #2 is better in most cases and it limits your program to a single file.