cgcc

Why does gcc generate different code for s->a[i] and *(s->a+i)?


I compile this with gcc -O1:

#include <stdint.h>

struct mystruct{
    uint32_t arr[1];
    uint32_t something_after_arr;
};

uint32_t sum1(const struct mystruct *s,uintptr_t n){
    uint32_t result=0;
    for(uintptr_t i=0;i<n;++i)result+=s->arr[i];
    return result;
}

uint32_t sum2(const struct mystruct *s,uintptr_t n){
    uint32_t result=0;
    for(uintptr_t i=0;i<n;++i)result+=i[s->arr];
    return result;
}

uint32_t sum3(const struct mystruct *s,uintptr_t n){
    uint32_t result=0;
    for(uintptr_t i=0;i<n;++i)result+=*(s->arr+i);
    return result;
}

compiler (https://godbolt.org/#…) generated this x86_64 assembly (assembler directives removed):

sum1:
        mov     eax, 0
        test    rsi, rsi
        je      .L1
        mov     eax, DWORD PTR [rdi]
.L1:
        ret
sum2:
        mov     eax, 0
        test    rsi, rsi
        je      .L4
        mov     eax, DWORD PTR [rdi]
.L4:
        ret
sum3:
        test    rsi, rsi
        je      .L10
        mov     eax, 0
        mov     edx, 0
.L9:
        add     edx, DWORD PTR [rdi+rax*4]
        add     rax, 1
        cmp     rsi, rax
        jne     .L9
.L7:
        mov     eax, edx
        ret
.L10:
        mov     edx, 0
        jmp     .L7

Only sum3 has backwards jumps. sum1 and sum2 only have forward jump (which means no looping).

Why are sum3 and sum1 different and why no loop in sum1?

I expected sum1 and sum2 and sum3 have same code with loop (like clang (https://godbolt.org/#…)).


Solution

  • I would guess this is a side effect of all 3 versions having undefined behavior for any n larger than 1 and the only valid index for i is 0. When going beyond that, the compiler is free to generate any crap it wants.

    Contrary to popular belief, C never allowed wild & crazy pointer arithmetic using any type across any random memory space. C only ever allowed pointer arithmetic within the bounds of a declared array, using a type corresponding to the array item type.

    Meaning it is not allowed to do uint32_t pointer arithmetic outside the bounds of the array arr followed by dereferencing - that's undefined behavior regardless of what happens to exist in memory after the array. It's just how the additive operators are specified.

    We may however iterate across the struct using a character pointer, byte by byte. This is thanks to a special rule in C23 6.3.2.3:

    When a pointer to an object is converted to a pointer to a character type, the result points to the lowest addressed byte of the object. Successive increments of the result, up to the size of the object, yield pointers to the remaining bytes of the object.

    After fixing your code to remove undefined behavior by doing arithmetic on a character type instead (uint8_t is treated as a character type by the compiler here), I got the same machine code for all 3 versions:

    uint32_t sum1(const struct mystruct *s,uintptr_t n){
        uint32_t result=0;
        const uint8_t* u8 = (const uint8_t*)s;
        for(uintptr_t i=0; i<n*4; i+=4)
          result += *(uint32_t*) &u8[i];
        return result;
    }
    
    uint32_t sum2(const struct mystruct *s,uintptr_t n){
        uint32_t result=0;
        const uint8_t* u8 = (const uint8_t*)s;
        for(uintptr_t i=0; i<n*4; i+=4)
          result += *(uint32_t*) &i[u8];
        return result;
    }
    
    uint32_t sum3(const struct mystruct *s,uintptr_t n){
        uint32_t result=0;
        const uint8_t* u8 = (const uint8_t*)s;
        for(uintptr_t i=0; i<n*4; i+=4)
          result += *(uint32_t*) (u8+i);
        return result;
    }
    

    Messy diff views from Godbolt with gcc x86 -O3: https://godbolt.org/z/e6G5MGfxT (And yeah this turned into some far less efficient SIMD mess that I'm not even going to try to understand.)

    De-referencing is ok since there is no padding, the addresses are aligned and the effective type stored in the memory is uint32_t. Otherwise all manner of other UB could appear here.