Tags: c, pointers, memory, casting, cpu-cycles

Memory Size Load and Store penalty analysis?


Profiling the code with ocount shows more cycles when PENALTY_ON is defined and fewer cycles when it is not. I'm trying to understand why the build with PENALTY_ON defined incurs the extra penalty.

/* Buffer filled with 16-bit stores by func(); func() writes indices 0..999
 * and its 32-bit loads read up to index 1000. */
uint16_t arr[1010];
/* Destination of the 32-bit values func() loads from arr; func() writes one
 * element per loop iteration (indices 0..499). */
uint32_t r[500];


/*
 * Micro-benchmark body: fills arr with 16-bit stores and copies 32-bit words
 * read back from arr into r.
 *
 * PENALTY_ON defined:     each 32-bit load sits in the same loop iteration
 *                         as the 16-bit stores, directly after them.
 * PENALTY_ON not defined: all stores complete in a first loop and the loads
 *                         run afterwards in a separate second loop.
 *
 * NOTE(review): the two builds do not leave the same values in r. Each load
 * covers arr[i+1] and arr[i+2]; with PENALTY_ON, arr[i+2] has not yet been
 * written by this call when it is read, whereas without PENALTY_ON the whole
 * range 0..999 has already been stored.
 *
 * NOTE(review): the load reads uint16_t objects through a uint32_t lvalue,
 * which C's effective-type (strict aliasing) rule does not permit, and the
 * address &arr[i+1] is only 4-byte aligned if arr itself is not -- confirm
 * this is intentional for the experiment and acceptable for the target.
 */
void func()
{     
            uint32_t i = 0;


            /* i takes the even values 0, 2, ..., 998 (500 iterations). */
            for (i = 0; i < 1000; i+=2)
            {
                /* Two adjacent 16-bit stores; the values (at most 1008) fit in uint16_t. */
                arr[i] = i;
                arr[i+1] = i+10;
        #ifdef PENALTY_ON

               /* 4-byte load starting at arr[i+1], i.e. at the address of the
                * 2-byte store just above; it also covers arr[i+2]. */
               r[i/2] = *(uint32_t *)((uint16_t *)&arr[i+1]);
        #endif
            }
        #ifndef PENALTY_ON
            /* Same loads as in the PENALTY_ON build, but issued only after
             * the store loop above has finished. */
            for (i = 0; i < 1000; i+=2)
            {
                /* 4-byte load starting at arr[i+1]; it also covers arr[i+2]. */
                r[i/2] = *(uint32_t *)((uint16_t *)&arr[i+1]);
            }
        #endif
 }

Solution

  • Compiling both with gcc on a 32-bit machine with -O3

    With PENALTY_ON

    00000000 <func>:
    0:  31 c0                   xor    %eax,%eax
    2:  8d b6 00 00 00 00       lea    0x0(%esi),%esi
    8:  8d 50 0a                lea    0xa(%eax),%edx
    b:  66 89 94 00 02 00 00    mov    %dx,0x2(%eax,%eax,1)
    12: 00 
    13: 8b 8c 00 02 00 00 00    mov    0x2(%eax,%eax,1),%ecx
    1a: 89 c2                   mov    %eax,%edx
    1c: 66 89 84 00 00 00 00    mov    %ax,0x0(%eax,%eax,1)
    23: 00 
    24: 83 c0 02                add    $0x2,%eax
    27: d1 ea                   shr    %edx
    29: 3d e8 03 00 00          cmp    $0x3e8,%eax
    2e: 89 0c 95 00 00 00 00    mov    %ecx,0x0(,%edx,4)
    35: 75 d1                   jne    8 <func+0x8>
    37: f3 c3                   repz ret  
    

    Without PENALTY_ON

    00000000 <func>:
    0:  31 c0                   xor    %eax,%eax
    2:  8d b6 00 00 00 00       lea    0x0(%esi),%esi
    8:  8d 50 0a                lea    0xa(%eax),%edx
    b:  66 89 84 00 00 00 00    mov    %ax,0x0(%eax,%eax,1)
    12: 00 
    13: 66 89 94 00 02 00 00    mov    %dx,0x2(%eax,%eax,1)
    1a: 00 
    1b: 83 c0 02                add    $0x2,%eax
    1e: 3d e8 03 00 00          cmp    $0x3e8,%eax
    23: 75 e3                   jne    8 <func+0x8>
    25: 66 31 c0                xor    %ax,%ax
    28: 8b 8c 00 02 00 00 00    mov    0x2(%eax,%eax,1),%ecx
    2f: 89 c2                   mov    %eax,%edx
    31: 83 c0 02                add    $0x2,%eax
    34: d1 ea                   shr    %edx
    36: 3d e8 03 00 00          cmp    $0x3e8,%eax
    3b: 89 0c 95 00 00 00 00    mov    %ecx,0x0(,%edx,4)
    42: 75 e4                   jne    28 <func+0x28>
    44: f3 c3                   repz ret 
    

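    I think the reason is that a read-after-write stall occurs with PENALTY_ON: the 32-bit load at offset 0x13 immediately follows the 16-bit store at offset 0xb and reads from the same address. Because the load is wider than the store that precedes it, it most likely cannot be satisfied by store-to-load forwarding and has to wait until the store has been committed. Without PENALTY_ON the stores and the loads are in separate loops, so no load directly follows the store it depends on.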

    b:  66 89 94 00 02 00 00    mov    %dx,0x2(%eax,%eax,1)
    12: 00 
    13: 8b 8c 00 02 00 00 00    mov    0x2(%eax,%eax,1),%ecx