assemblyoptimizationx86intelbootloader

Optimizing an incrementing ASCII decimal counter in video RAM on 7th gen Intel Core


I'm trying to optimize the following subroutine for a specific Kaby Lake CPU (i5-7300HQ), ideally to make the code at least 10 times faster compared to its original form. The code runs as a floppy-style bootloader in 16-bit real mode. It displays a ten digit decimal counter on screen, counting 0 - 9999999999 and then halting.

I have taken a look at Agner's Optimization Guides for Microarchitecture and Assembly, Instruction Performance Table and Intel's Optimization Reference Manual.

Only sensible optimization I've been able to do so far is swapping the loop instruction for dec + jnz, explanation here.

Another possible optimization might be swapping the lodsb for mov + dec, but the information I've found about that has been conflicting, with some saying it helps slightly and others that it might actually hurt the performance on modern CPUs.

I also tried switching to 32-bit mode and keeping the entire counter in an unused register pair to eliminate any memory access, but after reading into it a bit I realized that those ten bits will get cached immediately and the difference in latency between L1 cache and registers is only about a factor of three, so definitely not worth the added overhead of working with the counter in that format.

(editor's note: add reg latency is 1 cycle, add [mem] latency is about 6 cycles, including the 5 cycle store-forwarding latency. Or much worse if [mem] is uncacheable like video RAM.)

org 7c00h

pos equ 2*(2*80-2)  ;address on screen

;init
cli
mov ax,3
int 10h
mov ax,0b800h
mov es,ax
jmp 0:start

start:
    push cs
    pop ds
    std

    mov ah, 4Eh
    xor cx, cx
    mov bl,'9'

countloop:
    mov cl,10           ;number of digits to add to
    mov si,counter+9    ;start of counter
    mov di,pos          ;screen position

    stc                 ;set carry for first adc
next_digit:
    lodsb               ;load digit
    adc al,0
    cmp bl, al
    jnc print
    add al,-10          ;propagate carry if resulting digit > 9
print:
    mov [si+1],al       ;save new digit
    stosw               ;print

    ;replaced loop with a faster equivalent
    ;loop next_digit
    dec cl
    jnz next_digit

    jnc countloop

    jmp $

counter:
    times 10 db '0'

    times 510-($-$$) db 0
    dw 0aa55h

My question is - what can I do to achieve the desired increase in speed? What other materials can I study to gain more understanding of the underlying concepts?

Note: this is a school assignment. While a straight answer would definitely help, I'd much more appreciate explanations or pointers to relevant study material, as we have been given none.

EDIT: Changed code to a minimal reproducible example


Solution

  • Thanks to the feedback and discussion that took place here (especially thanks to Peter and his dedication), I was able to identify the main source of the slowdown - writing to VRAM, as that memory is uncacheable.

    The only two meaningful optimizations are thus breaking out of the loop as soon as we lose the carry while adding (so that we don't unnecessarily add zero to every single digit and spend time printing it to screen) and combining as many WORD-sized writes into DWORD-sized ones. These two combined were able to push me across the 10x speedup mark.

    My solution (speedup x10.3):

    org 7c00h
    bits 16             ;enables prefixes for 32bit instructions
    pos equ 2*(2*80-2)  ;address on screen
    
    ;init textmode and vram, fix CS
    cli
    mov ax, 3
    int 10h
    mov ax, 0B800h
    mov es, ax
    jmp 0:start
    
    start:
        ;fix segments and stack
        mov bp, 7C00h
        xor ax, ax
        mov ds, ax
        mov ss, ax
        mov sp, bp
    
        ;print initial zeroes
        std
        mov ax, (4Eh << 8) + '0'
        mov cx, 10
        mov di, pos
        sub di, 2
        rep stosw
    
        ;set color into upper byte of DX
        mov dh, 4Eh
    
    counter_loop:
        cmp cx, 5           ;check whether we are incrementing the first two digits
        je two_digit_loop   ;if so, assume values are set correctly
    
        ;reset values back to start
        mov bx, counter     ;set counter pointer to first two digits
        mov ax, [bx]        ;load first two digits
        mov di, pos         ;set destination index to the position of the rightmost digit on the screen
        mov cx, 5           ;set number of digit pairs to 5
    
    two_digit_loop:
        ;increment and adjust
        inc ax
        aaa
        jc carry
    
        ;no carry, update digits and return
        mov dl, al
        or dl, 30h              ;digit to ascii
        mov [es:di - 2], dx     ;write character to screen
        mov [bx], al            ;save value to memory
        jmp counter_loop
    
    carry:
        mov edx, 4E304E30h      ;load '00' in colour
        mov [bx], ax            ;save value to memory
        cmp ax, 0A00h           ;test second digit overflow
        jge continue
    
        ;no carry on second digit, write and return
        or dl, ah               ;digit to ASCII if not 0x0A
        mov [es:di - 4], edx    ;write both characters at once
        jmp counter_loop
    
    continue:
        ;propagate carry to next digit pair
        mov [es:di - 4], edx    ;write zero as both characters (double-sized write)
        mov [bx + 1], ch        ;save zero as upper value to memory
    
        ;continue to next digit pair
        add bx, 2           ;move memory to next digit pair
        mov ax, [bx]        ;load next digit pair
        sub di, 4           ;move display pointer by two char+colour pairs
        dec cx              ;and decrement counter
        jne two_digit_loop
    
        ;we ran out of digits to increment, display arrow and halt
        mov ax, 4E18h
        stosw
        jmp $
    
    ;counter, positioned at least 64B away from the code to prevent nuking the instruction pipeline
    align 128
    counter:
        times 10 db 0
    
    times 510 - ($-$$) db 0
    dw 0aa55h