assemblyarmsimdarm64neon

How to exactly find the first matching zero in ARM using `shrn`, `fmov`, `rbit`, `clz`?


This blog talks about using ARM instructions that can make finding the first match faster. As much as I understand we use shrn, then fmov, rbit, clz and lsr since we do not have ctz in the ARM. I am not sure if my code correctly implements that logic.

I am trying to do:

I am trying to opimitize [>] loops in my Brainfuck programming language. [>] increments the pointer to find the cell that has the value 0. Instead of doing that one cell at a time I want to load 16 bytes at a time, find the first matching zero and update the index. If it is not found, I want to load the next 16 bytes, and so on.

  1. First load the 16 bytes
  2. if not found (cmp x1, #16 checks if the number of zeros are 64, 64 >> 2 = 16 so no zero found) load the next 16 bytes till found.).

I do not know if I am doing wrong and I have no idea what is the next step to look at? My exact question: Are the generated assembly code is correct? Am I using the registers and instructions correctly? Because there is little to none information about it, I had to look at the some patches

That is the an exmaple of the generated assembly code But I do not exactly how I can write a case and try if it finds the first matching zero. (The assembly code is generated from compiling the the brainfuck code [>]):

    .text
    .align 2
    .global _main
    .extern _putchar, _getchar, _malloc, _free, _memset
_main:
    stp x29, x30, [sp, #-16]!
    mov x29, sp
    stp x19, x20, [sp, #-16]!
    mov x0, #30000
    bl _malloc
    mov x19, x0
    mov x20, x0
    mov x1, x19
    eor w2, w2, w2
    mov x3, #30000
    bl _memset
loop_scan0:
    ld1.16b {v0}, [x19]
    cmeq.16b v1, v0, #0
    shrn v2.8b, v1.8h, #4
    fmov x4, d2
    rbit x4, x4
    clz x4, x4
    lsr x4, x4, #2
    mov x5, x4
    cmp x4, #16
    b.eq loop_scan2
    add x19, x19, x5
    b loop_scan1
loop_scan2:
    add x19, x19, #16
    b loop_scan0
loop_scan1:
    mov x0, x20
    bl _free
    ldp x19, x20, [sp], #16
    ldp x29, x30, [sp], #16
    eor w0, w0, w0
    ret

If this is correct how would I optimize [>>] or [>>>>] loops that moves the pointers to the right 2 and 4 times till to find a memory cell with value 0? Because Brainfuck benchmark has eamples that contain [>>>>] but not [>]; that is why, I am not sure if my implementation is correct.


Solution

  • I have successfully optimized memory scanning. I hope my code is clear: (For now I assume every brainfuck code is well-written):

                    TokenType::MemoryScan(n) => { 
                        let loop_label = format!("loop_memory_scan{}", label_counter);
                        label_counter += 1;
                        // After the zeros in different indexes are found the following ands
                        // keep only the ones in the exact posiitions we want so rbit+clz find
                        // only the ones in the positions we want.
                        let constant = match *n {
                            // [>]: if reegister is zero (not found (no xf)) jump back.
                            1 => &format!("    cbz x8, {}\n", loop_label), 
                            // For [>>] since it moves pointer two times, keep the ones in even position: 0, 2, 4 ... 
                            2 => "    ands x8, x8, #0x0f0f0f0f0f0f0f0f\n",
                            // For [>>>>] since it moves pointer four times, keep the ones in the positions divible by 4: 0, 4, 8, 12 
                            4 => "    ands x8, x8, #0x000f000f000f000f\n",
                            // The above logic is the same for [>>>>>>>>] as well.  
                            8 => "    ands x8, x8, #0x0000000f0000000f\n",  
                            _ => unreachable!(),
                        };
                        // Apply the finding the first matching logic
                        assembly.push_str(&format!("{}:\n", loop_label));  
                        assembly.push_str("    ld1.16b {v0}, [x19], #16\n");  
                        assembly.push_str("    cmeq.16b v0, v0, #0\n");        
                        assembly.push_str("    shrn.8b v0, v0, #4\n");
                        assembly.push_str("    fmov x8, d0\n");  
                        assembly.push_str(constant);  
                        if *n != 1 {
                            // If zero flag is set, it means ands found no matching 0s in the desires position
                            // So we simply jump to the loop again
                            assembly.push_str(&format!("    b.eq {}\n", loop_label));
                        }  
                        // Since we do not have ctz (count trailing zeros) in ARM we use rbit + clz (reverse and count leading zeros) instead
                        assembly.push_str("    rbit x9, x8\n");  
                        assembly.push_str("    clz x9, x9\n");  
                        // We have additional post increment +16. So we must decrement that firstly
                        assembly.push_str("    sub x19, x19, #16\n");  
                        // Divide by four and increment the pointer
                        assembly.push_str("    add x19, x19, x9, lsr #2\n"); 
                    },