recursionassemblyx86flickerflood-fill

Avoiding flicker while using a recursive flood fill algorithm


The wikipedia article about flood fill describes the following recursive algorithm:

Flood-fill (node):

  1. If node is not Inside return.
  2. Set the node
  3. Perform Flood-fill one step to the south of node.
  4. Perform Flood-fill one step to the north of node
  5. Perform Flood-fill one step to the west of node
  6. Perform Flood-fill one step to the east of node
  7. Return.

Because I couldn't find too much examples of flood-filling using the assembly language, I implemented this myself (using FASM) and made - I think - a rather decent version. There's much worse you can find here and here!

For those that are inquisitive, performance-wise the order that the neighboring pixels are visited does not change much. So it does not need to be South->North->West->East. Any of the 24 permutations will do, but better not pick an ordering where you don't immediately follow a direction by its opposite direction, because those will double the stack usage! The good-ones are SNWE, NSWE, SNEW, NSEW, WESN, EWSN, WENS, and EWNS.

The program works correctly but also produces flicker on screen. That's probably due to using a recursive solution for flood-filling... But is it?

; ------------------------------
; BL is OldColor
; BH is NewColor
; CX is X
; DX is Y
; IN (bx,cx,dx)
FloodWhile:
    pusha
    mov   al, bl                ; NewColor
    mov   ah, 0Ch
    mov   si, ax                ; CONST
    shr   bx, 8                 ; -> BL is OldColor, BH is DisplayPage

    cmp   al, bl                ; NewColor must be different from OldColor
    je    .z
    mov   ah, 0Dh               ; BIOS.ReadPixel
    int   10h                   ; -> AL
    cmp   al, bl                ; First pixel must have OldColor
    jne   .z
    call  .Flood                ; -> (AX)
.z: popa
    ret
; - - - - - - - - - - - - - - -
; IN (bx,cx,dx,si) OUT () MOD (ax)
.Flood:
    push  di
    mov   di, 12
    mov   ax, si                ; -> AL is NewColor, AH is FunctionID
    int   10h                   ; BIOS.WritePixel

.a: add   cx, [.z+di]           ; Try a neighboring pixel (SNWE)
    cmp   cx, 320
    jnb   .b
    add   dx, [.z+di+2]
    cmp   dx, 200
    jnb   .b
    mov   ah, 0Dh               ; BIOS.ReadPixel
    int   10h                   ; -> AL
    cmp   al, bl
    jne   .b
    call  .Flood
.b: sub   di, 4
    jns   .a
    dec   cx                    ; Return from East

    pop   di
    ret
;         East  West   North South  <-- SNWE
.z: dw    +2,0, -1,+1, 0,-2, 0,+1
; ------------------------------

Solution

  • That's probably due to using a recursive solution for flood-filling... But is it?

    It is true that recursion is infamous for its potential for overflowing the stack, but recursion need not be slow per se. The first code snippet that I'll show below will use a re-worked recursive algorithm and it will run about 62 times faster than yours. Its iterative friend comes in at just 10% faster.

    The program works correctly but also produces flicker on screen.

    Flicker can go away by relying on very fast graphics routines, or by applying graphics routines of any speed to an offscreen memory buffer followed by updating the real screen in one very fast copy operation. The three code snippets that I show below will be using a hybrid approach: simultaneously writing to an offscreen buffer and to the real screen.

    The major changes include:

    The tests ran on the legacy 320x200 256-color screen.

    Examples of flood-filled shapes

    Flood-filling a circle (R=75) producing a plain disc:

    asm stack heap reads writes pixels/second
    Question 90 35300 0 69925 17481 86865
    Good 103 17650 0 69925 17481 5323081 61x
    Better 150 0 1192 69925 17481 5943896 68x
    Best 206 0 12 35557 17481 9578630 110x

    Flood-filling a labyrinth of concentric circles:

    asm stack heap reads writes pixels/second
    Question 90 38156 0 58273 14568 86791
    Good 103 19078 0 58273 14568 5524459 63x
    Better 150 0 156 58273 14568 5924359 68x
    Best 206 0 168 38092 14568 9410852 108x

    62x faster than the original

    This is the recursive solution but with optimizations applied, eg. the code does but a single address calculation based on X and Y, and then applies suitable increments to the scanline address and to the offset within the scanline.

    ; ------------------------------
    ; AL is OldColor
    ; AH is NewColor
    ; BX is X
    ; CX is Y
    ; GS is DoubleBuffer (64KB)
    ; IN (ax,bx,cx,GS)
    GoodFloodWhile:
        push  es
        pusha
        mov   di, 0A000h
        mov   es, di
        mov   di, bx                ; X
        imul  bp, cx, 320           ; Y * 320
    
        cmp   al, ah                ; NewColor must be different from OldColor
        je    .z
        cmp   [GS:bp+di], ah        ; First pixel must have OldColor
        jne   .z
        call  .Flood
    .z: popa
        pop   es
        ret
    ; - - - - - - - - - - - - - - -
        ALIGN 16
    ; IN (ax,di,bp,es,GS) OUT ()
    .Flood:
        mov   [es:bp+di], al        ; Center pixel on Screen
        mov   [GS:bp+di], al        ; Center pixel in Double Buffer
    
        inc   di                    ; Try pixel to the right
        cmp   di, 320
        jnb   .NotR
        cmp   [GS:bp+di], ah        ; OldColor
        jne   .NotR
        call  .Flood
    .NotR:
        dec   di
    
        dec   di                    ; Try pixel to the left
        js    .NotL
        cmp   [GS:bp+di], ah        ; OldColor
        jne   .NotL
        call  .Flood
    .NotL:
        inc   di
    
        add   bp, 320               ; Try pixel downward
        cmp   bp, 200*320
        jnb   .NotD
        cmp   [GS:bp+di], ah        ; OldColor
        jne   .NotD
        call  .Flood
    .NotD:
    
        sub   bp, 640               ; Try pixel upward
        jb    .NotU
        cmp   [GS:bp+di], ah        ; OldColor
        jne   .NotU
        call  .Flood
    .NotU:
        add   bp, 320
    
        ret
    ; ------------------------------
    

    68x faster than the original

    This iterative solution uses a small (2KB) ringbuffer where it doesn't just record X and Y coordinates, but rather stores scanline addresses and offsets within the scanline. The motivation here was to avoid the many address calculations that require multiplication with the bytes-per-scanline value.
    Always choose a power-of-two for the size of the ringbuffer. That way, the wraparound for the read- and write pointers can be just one simple and operation. The ringbuffer works FirstInFirstOut (FIFO) as opposed to the stack that works LastInFirstOut (LIFO). This is an important fact because if you were to use a LIFO buffer for your iterative solution that buffer would also consume a lot of bytes (just like it was with the recursion's stack).
    Tip: provided that you insert a suitable delay, this can be a good code to animate a liquid running through pipes...

    ; ------------------------------
    ; AL is OldColor
    ; AH is NewColor
    ; BX is X
    ; CX is Y
    ; GS is DoubleBuffer (64KB)
    ; IN (ax,bx,cx,GS)
    BetterFloodWhile:
        push  es
        pusha
        mov   di, 0A000h
        mov   es, di
        mov   di, bx                ; X
        imul  bp, cx, 320           ; Y * 320
    
        cmp   al, ah                ; NewColor must be different from OldColor
        je    .z
        cmp   [GS:bp+di], ah        ; First pixel must have OldColor
        jne   .z
        xor   si, si                ; SI is OffsetReadPointer in FIFO buffer
        xor   bx, bx                ; BX is OffsetWritePointer in FIFO buffer
        call  .Store                ; -> BX
        call  .Flood                ; -> (BX SI DI BP)
    .z: popa
        pop   es
        ret
    ; - - - - - - - - - - - - - - -
        ALIGN 16
    ; IN (ax,bx,si,di,bp,es,GS) OUT () MOD (bx,si,di,bp)
    .Flood:
        mov   di, [Buffer+si]       ; Offset in scanline
        mov   bp, [Buffer+si+2]     ; Address of scanline
        add   si, 4
        and   si, 2047              ; 2KB Ringbuffer (FIFO)
    
        inc   di                    ; Try pixel to the right
        cmp   di, 320
        jnb   .NotR
        cmp   [GS:bp+di], ah        ; OldColor
        jne   .NotR
        call  .Store                ; -> BX
    .NotR:
        dec   di
    
        dec   di                    ; Try pixel to the left
        js    .NotL
        cmp   [GS:bp+di], ah        ; OldColor
        jne   .NotL
        call  .Store                ; -> BX
    .NotL:
        inc   di
    
        add   bp, 320               ; Try pixel downward
        cmp   bp, 200*320
        jnb   .NotD
        cmp   [GS:bp+di], ah        ; OldColor
        jne   .NotD
        call  .Store                ; -> BX
    .NotD:
    
        sub   bp, 640               ; Try pixel upward
        jb    .NotU
        cmp   [GS:bp+di], ah        ; OldColor
        jne   .NotU
        call  .Store                ; -> BX
    .NotU:
    
        cmp   si, bx
        jne   .Flood
        ret    
    ; - - - - - - - - - - - - - - -
        ALIGN 16
    ; IN (al,bx,di,bp,es,GS) OUT (bx)
    .Store:
        mov   [es:bp+di], al        ; Center pixel on Screen
        mov   [GS:bp+di], al        ; Center pixel in Double Buffer
        mov   [Buffer+bx], di
        mov   [Buffer+bx+2], bp
        add   bx, 4
        and   bx, 2047              ; 2KB Ringbuffer (FIFO)
        ret
    ; ------------------------------
    

    109x faster than the original

    The focus here is to find long runs of pixels (aka spans) that can be drawn with an efficient line-drawing method.
    Code alignment proved to be important in this code.
    It was the excellent work of Lode Vandevenne that inspired me to write this optimal solution.

    ; ------------------------------
    ; AL is OldColor
    ; AH is NewColor
    ; BX is X
    ; CX is Y
    ; GS is DoubleBuffer (64KB)
        ALIGN 16
        DB 4 dup 0
    ; IN (ax,bx,cx,GS)
    BestFloodWhile:
        push  es
        pusha
        mov   dx, cx                ; Y
        cmp   al, ah                ; NewColor must be different from OldColor
        je    .Done
        xor   si, si
        xor   cx, cx
    ; - - - - - - - - - - - - - - -
    .Again:
        imul  di, dx, 320
        cmp   [GS:di+bx+0], ah      ; OldColor
        jne   .z                    ; Got colored in the mean time
        lea   bp, [word bx+0]
        add   bx, cx                ; Avoids retesting CX pixels
        
    .a: add   bx, 1                 ; X++ Extend to the right
        cmp   bx, 320
        jnb   .b
        cmp   [GS:di+bx], ah        ; OldColor
        je    .a
    .b: xchg  bp, bx
    
    .c: dec   bx                    ; X-- Extend to the left
        js    .d
        cmp   [GS:di+bx], ah        ; OldColor
        je    .c
    .d: inc   bx                    ; X++
    
        sub   bp, bx                ; -> BP is length of this line
        mov   cx, 0A000h            ; NewColor line on screen
        call  .Line                 ; -> CX=0 (ES)
        mov   cx, GS                ; NewColor line in Double Buffer
        call  .Line                 ; -> CX=0 (ES)
    
        sub   di, 320
        dec   dx                    ; Y-- Check above
        js    .e
        push  bp                    ; (1)
        call  .Scan                 ; -> BX SI (BP=0)
        pop   bp                    ; (1)
        sub   bx, bp
    .e: inc   dx                    ; Y++
    
        add   di, 640
        inc   dx                    ; Y++ Check below
        cmp   dx, 200
        jnb   .f
        call  .Scan                 ; -> BX SI (BP=0)
    .f: dec   dx                    ; Y--
    ; - - - - - - - - - - - - - - -
    .z: sub   si, 6
        jb    .Done
        mov   bx, [Buffer+si]       ; X
        mov   dx, [Buffer+si+2]     ; Y
        mov   cx, [Buffer+si+4]     ; L-1
        jmp   .Again
    ; - - - - - - - - - - - - - - -
    .Done:
        popa
        pop   es
        ret
    ; - - - - - - - - - - - - - - -
        ALIGN 16
    ; IN (al,bx,cx,di,bp) OUT (cx=0) MOD (es)
    .Line:                          ; Draw BP long line at [cx:di+bx]
        mov   es, cx
        add   di, bx
        mov   cx, bp                ; BP is 1+
        test  di, 1                 ; Use aligned words
        jz    .y
        stosb
        dec   cx                    ; -> CX is 0+
    .y: shr   cx, 1                 ; -> CF
        push  ax
        mov   ah, al
        rep stosw
        pop   ax
        jnc   .x
        stosb
    .x: sub   di, bp
        sub   di, bx
        ret
    ; - - - - - - - - - - - - - - -
        ALIGN 16
    ; IN (ah,bx,cx=0,dx,si,di,bp,GS) OUT (bx,si) MOD (bp=0)
    .Scan:                          ; Scan BP long line at [GS:di+bx]
        cmp   [GS:di+bx], ah        ; OldColor
        je    .v                    ; Part of a span
    .w: add   bx, 1                 ; X++
        dec   bp
        jnz   .Scan
        ret
    .v: add   si, word 6            ; Begin a new span
        mov   [Buffer+si-6], bx     ; X
        mov   [Buffer+si-4], dx     ; Y
        mov   [Buffer+si-2], cx     ; L-1 Counts the additional pixels on the right
        inc   bx                    ; X++
        dec   bp
        jz    .t
    .u: cmp   [GS:di+bx], ah        ; OldColor
        jne   .w                    ; End current span
        inc   word [Buffer+si-2]    ; Cont current span, add to 'additional pixels'
        inc   bx                    ; X++
        dec   bp
        jnz   .u
    .t: ret
    ; ------------------------------