java, assembly, x86-64, jit, auto-vectorization

Understanding JIT's rewrite of a for loop


I have the following Java code (all arrays are initialized before "arrays" is called, and all are of size "arraySize"):

int arraySize = 64;

float[] a;
float[] b;
float[] result;

public void arrays() {
    for (int i = 0; i < arraySize; i++) {
        result[i] = ((a[i] * b[i] + b[i] - b[i]) / b[i]) +
                     a[i] + a[i] + a[i] + a[i];
    }
}

The JIT's output for it is:

# {method} {0x00000001034751a8} 'arrays' '()V' in 'main/ComplexExpression'
#           [sp+0x30]  (sp of caller)
[Entry Point]
0x000000010c4c55a0: mov 0x8(%rsi),%r10d
0x000000010c4c55a4: movabs $0x800000000,%r11
0x000000010c4c55ae: add %r11,%r10
0x000000010c4c55b1: cmp %r10,%rax
0x000000010c4c55b4: jne 0x000000010c44b780  ;   {runtime_call ic_miss_stub}
0x000000010c4c55ba: xchg %ax,%ax
0x000000010c4c55bc: nopl 0x0(%rax)
[Verified Entry Point]
0x000000010c4c55c0: mov %eax,-0x14000(%rsp)
0x000000010c4c55c7: push %rbp
0x000000010c4c55c8: sub $0x20,%rsp  ;*synchronization entry
                                    ; - main.ComplexExpression::arrays@-1 (line 51)
0x000000010c4c55cc: mov %rsi,%rcx
0x000000010c4c55cf: mov 0xc(%rsi),%ebp  ;*getfield arraySize {reexecute=0 rethrow=0 return_oop=0}
                                        ; - main.ComplexExpression::arrays@4 (line 51)
0x000000010c4c55d2: test %ebp,%ebp
0x000000010c4c55d4: jle L0006  ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
                               ; - main.ComplexExpression::arrays@7 (line 51)
0x000000010c4c55da: mov 0x10(%rsi),%r11d  ;*getfield a {reexecute=0 rethrow=0 return_oop=0}
                                          ; - main.ComplexExpression::arrays@16 (line 52)
0x000000010c4c55de: xchg %ax,%ax
0x000000010c4c55e0: mov 0xc(%r12,%r11,8),%r10d  ; implicit exception: dispatches to 0x000000010c4c58a3
                                                ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c55e5: test %r10d,%r10d
0x000000010c4c55e8: jbe L0007
0x000000010c4c55ee: movslq %ebp,%r9
0x000000010c4c55f1: movslq %r10d,%r10
0x000000010c4c55f4: dec %r9
0x000000010c4c55f7: cmp %r10,%r9
0x000000010c4c55fa: nopw 0x0(%rax,%rax,1)
0x000000010c4c5600: jae L0007
0x000000010c4c5606: mov 0x14(%rsi),%ebx  ;*getfield b {reexecute=0 rethrow=0 return_oop=0}
                                         ; - main.ComplexExpression::arrays@22 (line 52)
0x000000010c4c5609: mov 0xc(%r12,%rbx,8),%r10d  ; implicit exception: dispatches to 0x000000010c4c58a3
                                                ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c560e: test %r10d,%r10d
0x000000010c4c5611: jbe L0007
0x000000010c4c5617: movslq %r10d,%r10
0x000000010c4c561a: nopw 0x0(%rax,%rax,1)
0x000000010c4c5620: cmp %r10,%r9
0x000000010c4c5623: jae L0007
0x000000010c4c5629: mov 0x18(%rsi),%r8d  ;*getfield result {reexecute=0 rethrow=0 return_oop=0}
                                         ; - main.ComplexExpression::arrays@11 (line 52)
0x000000010c4c562d: mov 0xc(%r12,%r8,8),%r10d  ; implicit exception: dispatches to 0x000000010c4c58a3
                                               ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                               ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c5632: test %r10d,%r10d
0x000000010c4c5635: jbe L0007
0x000000010c4c563b: movslq %r10d,%r10
0x000000010c4c563e: xchg %ax,%ax
0x000000010c4c5640: cmp %r10,%r9
0x000000010c4c5643: jae L0007
0x000000010c4c5649: lea (%r12,%r8,8),%rdx
0x000000010c4c564d: lea (%r12,%r11,8),%rdi
0x000000010c4c5651: mov %edx,%r11d
0x000000010c4c5654: lea (%r12,%rbx,8),%rax
0x000000010c4c5658: shr $0x2,%r11d
0x000000010c4c565c: and $0x7,%r11d
0x000000010c4c5660: mov $0x3,%r9d
0x000000010c4c5666: sub %r11d,%r9d
0x000000010c4c5669: and $0x7,%r9d
0x000000010c4c566d: inc %r9d
0x000000010c4c5670: cmp %ebp,%r9d
0x000000010c4c5673: cmovg %ebp,%r9d
0x000000010c4c5677: xor %r10d,%r10d
0x000000010c4c567a: xor %r11d,%r11d  ;*aload_0 {reexecute=0 rethrow=0 return_oop=0}
                                     ; - main.ComplexExpression::arrays@10 (line 52)
             L0000: vmovss 0x10(%rax,%r11,4),%xmm1  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                    ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c5684: vmovss 0x10(%rdi,%r11,4),%xmm0  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                    ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c568b: vmulss %xmm1,%xmm0,%xmm3
0x000000010c4c568f: vaddss %xmm1,%xmm3,%xmm2
0x000000010c4c5693: vsubss %xmm1,%xmm2,%xmm3
0x000000010c4c5697: vdivss %xmm1,%xmm3,%xmm1
0x000000010c4c569b: vaddss %xmm0,%xmm1,%xmm2
0x000000010c4c569f: vaddss %xmm0,%xmm2,%xmm1
0x000000010c4c56a3: vaddss %xmm0,%xmm1,%xmm2
0x000000010c4c56a7: vaddss %xmm0,%xmm2,%xmm0
0x000000010c4c56ab: vmovss %xmm0,0x10(%rdx,%r11,4)  ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                                    ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c56b2: inc %r11d  ;*iinc {reexecute=0 rethrow=0 return_oop=0}
                               ; - main.ComplexExpression::arrays@78 (line 51)
0x000000010c4c56b5: cmp %r9d,%r11d
0x000000010c4c56b8: jl L0000  ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
                              ; - main.ComplexExpression::arrays@7 (line 51)
0x000000010c4c56ba: mov %ebp,%r9d
0x000000010c4c56bd: add $0xffffffe1,%r9d
0x000000010c4c56c1: mov $0x80000000,%r8d
0x000000010c4c56c7: cmp %r9d,%ebp
0x000000010c4c56ca: cmovl %r8d,%r9d
0x000000010c4c56ce: cmp %r9d,%r11d
0x000000010c4c56d1: jge L0004
0x000000010c4c56d7: mov $0x7d00,%ebx
             L0001: mov %r9d,%esi
0x000000010c4c56df: sub %r11d,%esi
0x000000010c4c56e2: cmp %r11d,%r9d
0x000000010c4c56e5: cmovl %r10d,%esi
0x000000010c4c56e9: cmp $0x7d00,%esi
0x000000010c4c56ef: cmova %ebx,%esi
0x000000010c4c56f2: add %r11d,%esi
0x000000010c4c56f5: data16 data16 nopw 0x0(%rax,%rax,1)  ;*aload_0 {reexecute=0 rethrow=0 return_oop=0}
                                                         ; - main.ComplexExpression::arrays@10 (line 52)
             L0002: vmovdqu 0x10(%rax,%r11,4),%ymm0  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c5707: vmovdqu 0x10(%rdi,%r11,4),%ymm1  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c570e: vmulps %ymm0,%ymm1,%ymm2
0x000000010c4c5712: vaddps %ymm0,%ymm2,%ymm2
0x000000010c4c5716: vsubps %ymm0,%ymm2,%ymm2
0x000000010c4c571a: vdivps %ymm0,%ymm2,%ymm0
0x000000010c4c571e: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c5722: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c5726: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c572a: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c572e: vmovdqu %ymm0,0x10(%rdx,%r11,4)  ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c5735: vmovdqu 0x30(%rdi,%r11,4),%ymm0  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c573c: vmovdqu 0x30(%rax,%r11,4),%ymm1  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c5743: vmulps %ymm1,%ymm0,%ymm2
0x000000010c4c5747: vaddps %ymm1,%ymm2,%ymm2
0x000000010c4c574b: vsubps %ymm1,%ymm2,%ymm2
0x000000010c4c574f: vdivps %ymm1,%ymm2,%ymm1
0x000000010c4c5753: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c5757: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c575b: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c575f: vaddps %ymm0,%ymm1,%ymm0
0x000000010c4c5763: vmovdqu %ymm0,0x30(%rdx,%r11,4)  ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c576a: vmovdqu 0x50(%rdi,%r11,4),%ymm0  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c5771: vmovdqu 0x50(%rax,%r11,4),%ymm1  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c5778: vmulps %ymm1,%ymm0,%ymm2
0x000000010c4c577c: vaddps %ymm1,%ymm2,%ymm2
0x000000010c4c5780: vsubps %ymm1,%ymm2,%ymm2
0x000000010c4c5784: vdivps %ymm1,%ymm2,%ymm1
0x000000010c4c5788: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c578c: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c5790: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c5794: vaddps %ymm0,%ymm1,%ymm0
0x000000010c4c5798: vmovdqu %ymm0,0x50(%rdx,%r11,4)  ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c579f: vmovdqu 0x70(%rdi,%r11,4),%ymm0  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c57a6: vmovdqu 0x70(%rax,%r11,4),%ymm1  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c57ad: vmulps %ymm1,%ymm0,%ymm2
0x000000010c4c57b1: vaddps %ymm1,%ymm2,%ymm2
0x000000010c4c57b5: vsubps %ymm1,%ymm2,%ymm2
0x000000010c4c57b9: vdivps %ymm1,%ymm2,%ymm1
0x000000010c4c57bd: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c57c1: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c57c5: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c57c9: vaddps %ymm0,%ymm1,%ymm0
0x000000010c4c57cd: vmovdqu %ymm0,0x70(%rdx,%r11,4)  ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c57d4: add $0x20,%r11d  ;*iinc {reexecute=0 rethrow=0 return_oop=0}
                                     ; - main.ComplexExpression::arrays@78 (line 51)
0x000000010c4c57d8: cmp %esi,%r11d
0x000000010c4c57db: nopl 0x0(%rax,%rax,1)
0x000000010c4c57e0: jl L0002  ;*goto {reexecute=0 rethrow=0 return_oop=0}
                              ; - main.ComplexExpression::arrays@81 (line 51)
0x000000010c4c57e6: mov 0x348(%r15),%rsi  ; ImmutableOopMap {rcx=Oop rdi=Oop rdx=Oop rax=Oop }
                                          ;*goto {reexecute=1 rethrow=0 return_oop=0}
                                          ; - (reexecute) main.ComplexExpression::arrays@81 (line 51)
0x000000010c4c57ed: test %eax,(%rsi)  ;*goto {reexecute=0 rethrow=0 return_oop=0}
                                      ; - main.ComplexExpression::arrays@81 (line 51)
                                      ;   {poll} *** SAFEPOINT POLL ***
0x000000010c4c57ef: cmp %r9d,%r11d
0x000000010c4c57f2: jl L0001
0x000000010c4c57f8: mov %ebp,%r10d
0x000000010c4c57fb: add $0xfffffff9,%r10d
0x000000010c4c57ff: cmp %r10d,%ebp
0x000000010c4c5802: cmovl %r8d,%r10d
0x000000010c4c5806: cmp %r10d,%r11d
0x000000010c4c5809: jge L0004
0x000000010c4c580b: nop  ;*aload_0 {reexecute=0 rethrow=0 return_oop=0}
                         ; - main.ComplexExpression::arrays@10 (line 52)
             L0003: vmovdqu 0x10(%rax,%r11,4),%ymm0  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c5813: vmovdqu 0x10(%rdi,%r11,4),%ymm1  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c581a: vmulps %ymm0,%ymm1,%ymm2
0x000000010c4c581e: vaddps %ymm0,%ymm2,%ymm2
0x000000010c4c5822: vsubps %ymm0,%ymm2,%ymm2
0x000000010c4c5826: vdivps %ymm0,%ymm2,%ymm0
0x000000010c4c582a: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c582e: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c5832: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c5836: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c583a: vmovdqu %ymm0,0x10(%rdx,%r11,4)  ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c5841: add $0x8,%r11d  ;*iinc {reexecute=0 rethrow=0 return_oop=0}
                                    ; - main.ComplexExpression::arrays@78 (line 51)
0x000000010c4c5845: cmp %r10d,%r11d
0x000000010c4c5848: jl L0003  ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
                              ; - main.ComplexExpression::arrays@7 (line 51)
             L0004: cmp %ebp,%r11d
0x000000010c4c584d: jge L0006
0x000000010c4c584f: nop  ;*aload_0 {reexecute=0 rethrow=0 return_oop=0}
                         ; - main.ComplexExpression::arrays@10 (line 52)
             L0005: vmovss 0x10(%rax,%r11,4),%xmm1  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                    ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c5857: vmovss 0x10(%rdi,%r11,4),%xmm0  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                    ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c585e: vmulss %xmm1,%xmm0,%xmm3
0x000000010c4c5862: vaddss %xmm1,%xmm3,%xmm2
0x000000010c4c5866: vsubss %xmm1,%xmm2,%xmm3
0x000000010c4c586a: vdivss %xmm1,%xmm3,%xmm1
0x000000010c4c586e: vaddss %xmm0,%xmm1,%xmm2
0x000000010c4c5872: vaddss %xmm0,%xmm2,%xmm1
0x000000010c4c5876: vaddss %xmm0,%xmm1,%xmm2
0x000000010c4c587a: vaddss %xmm0,%xmm2,%xmm0
0x000000010c4c587e: vmovss %xmm0,0x10(%rdx,%r11,4)  ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                                    ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c5885: inc %r11d  ;*iinc {reexecute=0 rethrow=0 return_oop=0}
                               ; - main.ComplexExpression::arrays@78 (line 51)
0x000000010c4c5888: cmp %ebp,%r11d
0x000000010c4c588b: jl L0005
             L0006: vzeroupper
0x000000010c4c5890: add $0x20,%rsp
0x000000010c4c5894: pop %rbp
0x000000010c4c5895: cmp 0x340(%r15),%rsp  ;   {poll_return} *** SAFEPOINT POLL ***
0x000000010c4c589c: ja L0008
0x000000010c4c58a2: ret  ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
                         ; - main.ComplexExpression::arrays@7 (line 51)
             L0007: mov $0xffffff76,%esi
0x000000010c4c58a8: mov %rcx,(%rsp)
0x000000010c4c58ac: vzeroupper
0x000000010c4c58af: call 0x000000010c451000  ; ImmutableOopMap {[0]=Oop }
                                             ;*if_icmpge {reexecute=1 rethrow=0 return_oop=0}
                                             ; - (reexecute) main.ComplexExpression::arrays@7 (line 51)
                                             ;   {runtime_call UncommonTrapBlob}
             L0008: movabs $0x10c4c5895,%r10  ;   {internal_word}
0x000000010c4c58be: mov %r10,0x358(%r15)
0x000000010c4c58c5: jmp 0x000000010c452100  ;   {runtime_call SafepointBlob}
0x000000010c4c58ca: hlt
0x000000010c4c58cb: hlt
0x000000010c4c58cc: hlt
0x000000010c4c58cd: hlt
0x000000010c4c58ce: hlt
0x000000010c4c58cf: hlt
0x000000010c4c58d0: hlt
0x000000010c4c58d1: hlt
0x000000010c4c58d2: hlt
0x000000010c4c58d3: hlt
0x000000010c4c58d4: hlt
0x000000010c4c58d5: hlt
0x000000010c4c58d6: hlt
0x000000010c4c58d7: hlt
0x000000010c4c58d8: hlt
0x000000010c4c58d9: hlt
0x000000010c4c58da: hlt
0x000000010c4c58db: hlt
0x000000010c4c58dc: hlt
0x000000010c4c58dd: hlt
0x000000010c4c58de: hlt
0x000000010c4c58df: hlt
[Exception Handler]
0x000000010c4c58e0: jmp 0x000000010c464a00  ;   {no_reloc}
[Deopt Handler Code]
0x000000010c4c58e5: call 0x000000010c4c58ea
0x000000010c4c58ea: subq $0x5,(%rsp)
0x000000010c4c58ef: jmp 0x000000010c4513a0  ;   {runtime_call DeoptimizationBlob}
0x000000010c4c58f4: hlt
0x000000010c4c58f5: hlt
0x000000010c4c58f6: hlt
0x000000010c4c58f7: hlt

It seems like the for loop was translated to multiple "assembly loops". The main part is in L0002, I guess, where you can see the loop unrolling, but there are also L0000, L0003 and L0005, which also seem to be part of the for loop, and I'm having a hard time understanding how they fit in.

Can someone explain how all the parts under all the labels compose the actual loop? I imagine it's some "known" pattern in the JIT that I just don't know.


Solution

  • Just discussing the main loops as indicated by their jump labels (a sketch of how they all fit together follows this list).

    L0000 loop: Align to a multiple of 8 elements (32 bytes). The and $0x7 is a dead giveaway. Vector alignment is discussed pretty much everywhere when it comes to vector instructions. I don't have one single go-to reference link that explains the topic, but you will find plenty of resources on SO and elsewhere. The Intel® 64 and IA-32 Architectures Optimization Reference Manual should also cover it.

    Even without a reference, it should be pretty self-evident that the most efficient memory access is one where the data is not split across cache lines or, worse, memory pages. Aligning to the natural vector size ensures that.

    AVX and AVX2 hardware generally handles misalignment gracefully, but older SSE and newer AVX-512 hardware require alignment for optimal performance. I assume the code generator just handles all those cases the same.

    Since we have two inputs and one output, we can only really align to one of those and then hope that the others are equally aligned. If I read the code correctly, the alignment is performed on the output, not the input, but I could be reading it wrong.

    L0002 loop: Main loop unrolled 4 times for 4 x 8 = 32 elements per iteration.

    L0003 loop: Same as the main loop but not unrolled to handle the last 0-3 full vectors.

    L0005 loop: Handle the last 0-7 elements as scalars. Interestingly, 4 iterations could be saved by doing one iteration with 16-byte XMM registers, but the compiler chose not to do so. I guess they decided it's not worth it. The same could be done in the L0000 loop.

    The compiler knows that the output does not overlap with the input. Otherwise you would expect another separate loop to handle the case where both overlap so closely that vectorization is impossible. That might just be a separate conditional jump to the L0005 loop.
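
    Putting that together, here is a rough C sketch of how the four labelled loops compose the original Java loop. This is not the JIT's literal output: the function and variable names are mine, the bounds are simplified, and the L0001 bookkeeping loop around the main loop (with its safepoint poll) is left out.

    // one scalar iteration of the Java loop body
    static void kernel(const float* a, const float* b, float* result, int i) {
        result[i] = ((a[i] * b[i] + b[i] - b[i]) / b[i])
                    + a[i] + a[i] + a[i] + a[i];
    }

    // simplified shape of the generated code; elements_to_alignment is the
    // value in %r9d computed before L0000 (already capped at arraySize)
    static void arrays_like_jit(const float* a, const float* b, float* result,
                                int arraySize, int elements_to_alignment) {
        int i = 0;
        // L0000: scalar pre-loop until the output is 32-byte aligned
        for (; i < elements_to_alignment; i++)
            kernel(a, b, result, i);
        // L0002: main loop, unrolled 4 times: 4 * 8 = 32 elements per iteration
        for (; i + 32 <= arraySize; i += 32)
            for (int j = 0; j < 32; j++)   // really 4 YMM-wide (8-float) steps
                kernel(a, b, result, i + j);
        // L0003: one 8-float YMM vector per iteration, not unrolled
        for (; i + 8 <= arraySize; i += 8)
            for (int j = 0; j < 8; j++)    // really a single vector step
                kernel(a, b, result, i + j);
        // L0005: scalar tail for the last 0-7 elements
        for (; i < arraySize; i++)
            kernel(a, b, result, i);
    }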

    Alignment loop counter

    Let's try to untangle the alignment loop L0000. Its loop counter is %r11d (set to 0 immediately before the loop), with %r9d as its loop limit. So let's trace %r9d back.

    0x000000010c4c5670: cmp %ebp,%r9d
    0x000000010c4c5673: cmovg %ebp,%r9d
    

    We previously established that %ebp holds arraySize, so this is basically r9d = min(r9d, arraySize). Makes sense. In a small, misaligned array, the L0000 loop handles the full content.

    Before that:

    0x000000010c4c5649: lea (%r12,%r8,8),%rdx
    0x000000010c4c5651: mov %edx,%r11d
    0x000000010c4c5658: shr $0x2,%r11d
    0x000000010c4c565c: and $0x7,%r11d
    0x000000010c4c5660: mov $0x3,%r9d
    0x000000010c4c5666: sub %r11d,%r9d
    0x000000010c4c5669: and $0x7,%r9d
    0x000000010c4c566d: inc %r9d
    

    r12 is never initialized in that code snippet. This question suggests that it holds the heap base. Further up we find the line 0x000000010c4c5629: mov 0x18(%rsi),%r8d ;*getfield result. So let's just assume that lea (%r12,%r8,8),%rdx gives us the start address of the result array.

    However, it doesn't seem like that address is the start of the data content. Later in the loop you find that all memory accesses follow the pattern vmovss %xmm0,0x10(%rdx,%r11,4). Notice the additional offset 0x10 = 16. We can guess that the first 16 bytes of the array hold metadata such as its size. This offset is folded into every following computation. Ugh.

    The shr $0x2 divides by 4, going from bytes to float indices. and $0x7 gives the offset from the last multiple of 8. In C code, this would be

    struct Array {
       int64_t meta1, meta2;   // 16-byte header, hence the 0x10 offset
       float content[];
    };
    // lea (%r12,%r8,8): heap base plus the scaled offset of result
    struct Array* result = (struct Array*) (heap_base + offset_of_result);
    // shr $0x2 then and $0x7: byte address -> float index, modulo 8
    unsigned r11d = (unsigned) (uintptr_t) result / sizeof(float) % 8;
    

    Then we compute %r9d:

    unsigned r9d = (3 - r11d) % 8 + 1;   // unsigned wraparound keeps the % 8 correct for r11d > 3
    

    We've established that %r11d is between 0 and 7 so we can just look at the numbers.

    | r11d |  r9d |
    |   0  |   4  |
    |   1  |   3  |
    |   2  |   2  |
    |   3  |   1  |
    |   4  |   8  |
    |   5  |   7  |
    |   6  |   6  |
    |   7  |   5  |
    

    This only makes sense when we keep in mind that all our memory accesses are offset by 4 additional floats. Let's say %r11d is 1, meaning the array header starts 1 float = 4 bytes after the last 32-byte boundary. The array content then starts 1 + 4 = 5 floats = 20 bytes after that boundary. Then we do 3 scalar iterations to get to the next 32-byte boundary.

    The one issue that I have with this code is that for r11d = 4 we should not do any scalar iterations; we could go straight to the vectorized code. An additional and $0x7,%r9d would fix that. But on the other hand, this saves a conditional branch instruction, since the loop is always executed at least once.
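
    Continuing the pseudo-C from above, the difference is just one more masking step (illustrative only):

    unsigned r9d_emitted  = (3 - r11d) % 8 + 1;        // what the JIT does: 8 for r11d == 4
    unsigned r9d_with_fix = ((3 - r11d) % 8 + 1) % 8;  // with the extra and $0x7: 0 for r11d == 4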

    Follow-up questions

    so the "loop alignment" is not so alignment of the "i" used for loop control, its alignment to cache line size?

    Not cache line size, natural vector size. This code uses AVX vectors (YMM registers). Their size is 32 bytes, so we try to get 32-byte alignment. A cache line would be 64 bytes. But notice how a 32-byte-aligned access of a 32-byte vector never crosses a cache line boundary. That's the main goal.
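
    If you want to convince yourself of that, here is a brute-force check in C (nothing JIT-specific, just offset arithmetic):

    #include <assert.h>

    int main(void) {
        // a 32-byte aligned, 32-byte wide access starts at offset 0 or 32
        // within its 64-byte cache line, so it never straddles two lines
        for (unsigned addr = 0; addr < 4096; addr += 32)
            assert(addr % 64 + 32 <= 64);
        return 0;
    }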

    If so, do you know the pseudo-algorithm to determine that number of iterations? I would assume just (address % 8), but it seems the "lea" instructions in the code before L0000 do not find their way into r9d or r11d, which are used for the loop control

    I think I've made it clear above. However, in general, and written in C, you could write:

    float* array = ...;
    size_t array_size = ...;
    const size_t vector_in_bytes = 32;
    size_t start_in_bytes = (size_t) array;
    size_t offset_from_alignment = start_in_bytes % vector_in_bytes;
    size_t bytes_to_alignment = vector_in_bytes - offset_from_alignment;
    // if offset_from_alignment is 0, make bytes_to_alignment 0 instead of 32
    bytes_to_alignment %= vector_in_bytes;
    size_t elements_to_alignment = min(array_size, bytes_to_alignment / sizeof(float));
    size_t i; // loop counter will be carried through all 3 loops
    for(i = 0; i < elements_to_alignment; ++i) {
        // scalar loop for alignment
    }
    size_t elements_per_vector = vector_in_bytes / sizeof(float);
    for(; i + elements_per_vector <= array_size; i += elements_per_vector) {
        // vectorized loop
    }
    for(; i < array_size; ++i) {
        // scalar loop for last few elements
    }
    

    There are shorter ways of writing this, but this should be the easiest to understand. The key point to note is that we have to reinterpret the array pointer as an integer for this to work, which is why I used C and not Java.

    A shorter, more elegant version could be

    float* array = ...;
    size_t array_size = ...;
    const size_t vector_in_bytes = 32;
    size_t start_in_bytes = (size_t) array;
    size_t next_aligned = (start_in_bytes + vector_in_bytes - 1) & -vector_in_bytes;
    size_t elements_to_alignment = min(array_size,
          (next_aligned - start_in_bytes) / sizeof(float));
    size_t i;
    for(i = 0; i < elements_to_alignment; ++i) {
       ...
    }