assembly gcc sse

Why does CSAPP say GCC does not use vcvtss2sd?


Computer Systems: A Programmer's Perspective (3rd edition), in section 3.11.1, says: "Suppose the low-order 4 bytes of %xmm0 hold a single-precision value; then it would seem straightforward to use the instruction vcvtss2sd %xmm0, %xmm0, %xmm0 to convert this to a double-precision value and store the result in the lower 8 bytes of register %xmm0. Instead, we find the following code generated by GCC: vunpcklps %xmm0, %xmm0, %xmm0 \n vcvtps2pd %xmm0, %xmm0." However, I wrote a demo, vcvtss2sd.c, and compiled it with gcc at the -O0, -O1, -O2, -O3 and -Og optimization levels. All of them use vcvtss2sd rather than vunpcklps and vcvtps2pd. So I want to know where the book's statement comes from.

vcvtss2sd.c

#include <stdio.h>
/*
 * Demo program: read one float from standard input, convert it to a
 * double, and print the result.
 *
 * Returns 0 on success, or 1 if no float could be parsed from the input.
 */
int main()
{
        float f;
        /* scanf returns the number of items converted; anything other
         * than 1 means no float was read. */
        if (scanf("%f", &f) != 1) {
                printf("Input Error!");
                return 1;
        }
        /* The float -> double conversion under study. */
        double d = (double) f;
        printf("%f", d);
        return 0;
}

gcc -O0 -o test0 vcvtss2sd.c && objdump -d test0 (portion)

11ba:       e8 d1 fe ff ff          call   1090 <__isoc99_scanf@plt>
    11bf:       83 f8 01                cmp    $0x1,%eax
    11c2:       74 1b                   je     11df <main+0x56>
    11c4:       48 8d 05 3c 0e 00 00    lea    0xe3c(%rip),%rax        # 2007 <_IO_stdin_used+0x7>
    11cb:       48 89 c7                mov    %rax,%rdi
    11ce:       b8 00 00 00 00          mov    $0x0,%eax
    11d3:       e8 a8 fe ff ff          call   1080 <printf@plt>
    11d8:       b8 01 00 00 00          mov    $0x1,%eax
    11dd:       eb 30                   jmp    120f <main+0x86>
    11df:       f3 0f 10 45 ec          movss  -0x14(%rbp),%xmm0
    11e4:       f3 0f 5a c0             cvtss2sd %xmm0,%xmm0
    11e8:       f2 0f 11 45 f0          movsd  %xmm0,-0x10(%rbp)
    11ed:       48 8b 45 f0             mov    -0x10(%rbp),%rax
    11f1:       66 48 0f 6e c0          movq   %rax,%xmm0
    11f6:       48 8d 05 07 0e 00 00    lea    0xe07(%rip),%rax        # 2004 <_IO_stdin_used+0x4>
    11fd:       48 89 c7                mov    %rax,%rdi
    1200:       b8 01 00 00 00          mov    $0x1,%eax
    1205:       e8 76 fe ff ff          call   1080 <printf@plt>

gcc -O1 -o test1 vcvtss2sd.c && objdump -d test1 (portion)

11a6:       48 8d 3d 57 0e 00 00    lea    0xe57(%rip),%rdi        # 2004 <_IO_stdin_used+0x4>
    11ad:       e8 de fe ff ff          call   1090 <__isoc99_scanf@plt>
    11b2:       83 f8 01                cmp    $0x1,%eax
    11b5:       74 30                   je     11e7 <main+0x5e>
    11b7:       48 8d 35 49 0e 00 00    lea    0xe49(%rip),%rsi        # 2007 <_IO_stdin_used+0x7>
    11be:       bf 01 00 00 00          mov    $0x1,%edi
    11c3:       b8 00 00 00 00          mov    $0x0,%eax
    11c8:       e8 b3 fe ff ff          call   1080 <__printf_chk@plt>
    11cd:       b8 01 00 00 00          mov    $0x1,%eax
    11d2:       48 8b 54 24 08          mov    0x8(%rsp),%rdx
    11d7:       64 48 2b 14 25 28 00    sub    %fs:0x28,%rdx
    11de:       00 00
    11e0:       75 2c                   jne    120e <main+0x85>
    11e2:       48 83 c4 18             add    $0x18,%rsp
    11e6:       c3                      ret
    11e7:       66 0f ef c0             pxor   %xmm0,%xmm0
    11eb:       f3 0f 5a 44 24 04       cvtss2sd 0x4(%rsp),%xmm0
    11f1:       48 8d 35 0c 0e 00 00    lea    0xe0c(%rip),%rsi        # 2004 <_IO_stdin_used+0x4>
    11f8:       bf 01 00 00 00          mov    $0x1,%edi
    11fd:       b8 01 00 00 00          mov    $0x1,%eax
    1202:       e8 79 fe ff ff          call   1080 <__printf_chk@plt>

gcc -O2 -o test2 vcvtss2sd.c && objdump -d test2 (portion)

10c8:       e8 c3 ff ff ff          call   1090 <__isoc99_scanf@plt>
    10cd:       83 f8 01                cmp    $0x1,%eax
    10d0:       74 2e                   je     1100 <main+0x60>
    10d2:       48 8d 35 2e 0f 00 00    lea    0xf2e(%rip),%rsi        # 2007 <_IO_stdin_used+0x7>
    10d9:       bf 01 00 00 00          mov    $0x1,%edi
    10de:       31 c0                   xor    %eax,%eax
    10e0:       e8 9b ff ff ff          call   1080 <__printf_chk@plt>
    10e5:       b8 01 00 00 00          mov    $0x1,%eax
    10ea:       48 8b 54 24 08          mov    0x8(%rsp),%rdx
    10ef:       64 48 2b 14 25 28 00    sub    %fs:0x28,%rdx
    10f6:       00 00
    10f8:       75 26                   jne    1120 <main+0x80>
    10fa:       48 83 c4 10             add    $0x10,%rsp
    10fe:       5d                      pop    %rbp
    10ff:       c3                      ret
    1100:       66 0f ef c0             pxor   %xmm0,%xmm0
    1104:       48 89 ee                mov    %rbp,%rsi
    1107:       bf 01 00 00 00          mov    $0x1,%edi
    110c:       b8 01 00 00 00          mov    $0x1,%eax
    1111:       f3 0f 5a 44 24 04       cvtss2sd 0x4(%rsp),%xmm0
    1117:       e8 64 ff ff ff          call   1080 <__printf_chk@plt>

gcc -O3 -o test3 vcvtss2sd.c && objdump -d test3 (portion)

10c8:       e8 c3 ff ff ff          call   1090 <__isoc99_scanf@plt>
    10cd:       83 f8 01                cmp    $0x1,%eax
    10d0:       74 2e                   je     1100 <main+0x60>
    10d2:       48 8d 35 2e 0f 00 00    lea    0xf2e(%rip),%rsi        # 2007 <_IO_stdin_used+0x7>
    10d9:       bf 01 00 00 00          mov    $0x1,%edi
    10de:       31 c0                   xor    %eax,%eax
    10e0:       e8 9b ff ff ff          call   1080 <__printf_chk@plt>
    10e5:       b8 01 00 00 00          mov    $0x1,%eax
    10ea:       48 8b 54 24 08          mov    0x8(%rsp),%rdx
    10ef:       64 48 2b 14 25 28 00    sub    %fs:0x28,%rdx
    10f6:       00 00
    10f8:       75 26                   jne    1120 <main+0x80>
    10fa:       48 83 c4 10             add    $0x10,%rsp
    10fe:       5d                      pop    %rbp
    10ff:       c3                      ret
    1100:       66 0f ef c0             pxor   %xmm0,%xmm0
    1104:       48 89 ee                mov    %rbp,%rsi
    1107:       bf 01 00 00 00          mov    $0x1,%edi
    110c:       b8 01 00 00 00          mov    $0x1,%eax
    1111:       f3 0f 5a 44 24 04       cvtss2sd 0x4(%rsp),%xmm0
    1117:       e8 64 ff ff ff          call   1080 <__printf_chk@plt>

gcc -Og -o testg vcvtss2sd.c && objdump -d testg (portion)

11ad:       e8 de fe ff ff          call   1090 <__isoc99_scanf@plt>
    11b2:       83 f8 01                cmp    $0x1,%eax
    11b5:       74 30                   je     11e7 <main+0x5e>
    11b7:       48 8d 35 49 0e 00 00    lea    0xe49(%rip),%rsi        # 2007 <_IO_stdin_used+0x7>
    11be:       bf 01 00 00 00          mov    $0x1,%edi
    11c3:       b8 00 00 00 00          mov    $0x0,%eax
    11c8:       e8 b3 fe ff ff          call   1080 <__printf_chk@plt>
    11cd:       b8 01 00 00 00          mov    $0x1,%eax
    11d2:       48 8b 54 24 08          mov    0x8(%rsp),%rdx
    11d7:       64 48 2b 14 25 28 00    sub    %fs:0x28,%rdx
    11de:       00 00
    11e0:       75 2c                   jne    120e <main+0x85>
    11e2:       48 83 c4 18             add    $0x18,%rsp
    11e6:       c3                      ret
    11e7:       66 0f ef c0             pxor   %xmm0,%xmm0
    11eb:       f3 0f 5a 44 24 04       cvtss2sd 0x4(%rsp),%xmm0
    11f1:       48 8d 35 0c 0e 00 00    lea    0xe0c(%rip),%rsi        # 2004 <_IO_stdin_used+0x4>
    11f8:       bf 01 00 00 00          mov    $0x1,%edi
    11fd:       b8 01 00 00 00          mov    $0x1,%eax
    1202:       e8 79 fe ff ff          call   1080 <__printf_chk@plt>

Solution

  • Older GCC versions were different. The default -mtune=generic choices (and those baked in to GCC, without alternatives for different uarches) depend on what's good on CPUs that are current at the time. At least hopefully. Two single-uop instructions can be faster than one 2-uop instruction because of front-end throughput with the complex decoder, especially on Intel without a uop cache (e.g. Core 2). https://agner.org/optimize/. And on AMD K10, cvtss2sd xmm, xmm is 3 uops for the front-end.

    You can reproduce this with GCC 4.8, for example:

    /* Widen a single-precision argument to double precision and return it. */
    double foo(float f)
    {
        return (double)f;
    }
    

    On Godbolt, GCC 4.8 at -O3 (with or without -march=core2) gives:

    foo(float):
            unpcklps        xmm0, xmm0
            cvtps2pd        xmm0, xmm0
            ret
    

    GCC 4.9 and later use cvtss2sd even with -march=core2, so I assume they just changed the pattern for how to convert a float in a register to a double in a register in GCC's machine-description file without adding alternatives for different -mtune settings.

    (Of course you only get the AVX encoding vcvtss2sd if you enable -mavx explicitly or with a -march= that includes it. cvtss2sd is the SSE2 encoding, which leaves the upper part of a YMM/ZMM unmodified instead of zeroing.)

    With the AVX version, it's possible to somewhat work around the bad design of the original, which leaves the upper half of the destination unmodified. That's why it's 2 uops even on some CPUs where 128-bit operations are single uop: it has to convert and merge. The AVX version has a second source, the register to merge into. You can pick a "cold" register (not recently touched, or loaded with something early on a dependency chain that led to your other input) to avoid a false output dependency on the register you want to write to, which can sometimes be a problem for vcvtss2sd. Or you can pick the source register twice, so vcvtss2sd xmm0, xmm1, xmm1 has the same result as movaps xmm0, xmm1 / cvtss2sd xmm0, xmm1, and also avoids a false dependency on XMM0. (This isn't an option when the source is in memory; in that case it is useful to pick a register you know won't make this depend on some other dep chain.)

    The AVX version still has to merge so is still 2 uops even on modern Intel such as Alder Lake. But with uop caches making decode bottlenecks less of a problem, smaller machine-code size from one 2-uop instruction has advantages.