assembly arm multicore raspberry-pi-pico rp2040

Launching core 1 on the RP2040 with baremetal assembly


As I understand the documentation, 2.8.2, the process of launching core 1 is to send a sequence of values, with the final 3 being the vector table, stack pointer, and entry point, over the FIFO, while core 1 will echo the values back to you.

From the c code provided by the documentation, I wrote out this assembly:

    .cpu cortex-m0
    .thumb
ent:
    ldr r0, =0x20001000
    mov sp, r0              @init stack pointer

    ldr r0, =0xe000ed08
    ldr r3, [r0]            @vector table offset register
core:
    mov r7, pc
    b fifo_drain
    sev
    mov r1, #0
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, #0
    bne core

    mov r7, pc
    b fifo_drain
    sev
    mov r1, #0
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, #0
    bne core

    mov r1, #1
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, #1
    bne core

    mov r1, r3              @vector table
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, r3
    bne core

    mov r1, sp              @stack pointer
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, sp
    bne core

    mov r1, pc
    add r1, #2              @entry point
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read

    ldr r0, =0xd0000000
    ldr r1, [r0]
    cmp r1, #1
    beq led

The sequence of values sent over the FIFO is {0, 0, 1, vt, sp, ent}, and when the value isn't echoed back, the sequence starts over. The entry point is simply the last 4 lines, where the core reads the CPUID register from the SIO, and turns on the LED (GPIO25) if the cpu id is 1.

The sequence seems to get stuck in a loop at the vector table, which makes sense since I barely understand it, the FIFO just doesn't echo it back. Also, the documentation has a note next to the entry point that says "don't forget the thumb bit!", whatever that means.

Edit:

Updated code, same problem:

    .cpu cortex-m0
    .thumb
ent:
    ldr r0, =0x20001000
    mov sp, r0              @init stack pointer

    ldr r0, =0xe000ed08
    ldr r1, =0x20000000
    str r1, [r0]            @init vtor

    ldr r0, =0xd0000000
    ldr r1, [r0]
    cmp r1, #1
    beq led
    
    b core
    
.thumb_func
core:
    mov r7, pc
    b fifo_drain
    mov r1, #0
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, #0
    bne core
    
    mov r7, pc
    b fifo_drain
    mov r1, #0
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, #0
    bne core
    
    mov r1, #1
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, #1
    bne core
    
    ldr r3, =0x20000000
    mov r1, r3              @vector table
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, r3
    bne core
    
    mov r1, sp              @stack pointer
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, sp
    bne core
    
    ldr r3, =0x20000001
    mov r1, r3              @entry point
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, r3
    bne core
    
    b loop
    
.thumb_func
fifo_stat:
    ldr r0, =0xd0000050
    ldr r1, [r0]
    mov r2, #15
    and r1, r1, r2
    mov pc, r7

.thumb_func
fifo_writ:
    ldr r0, =0xd0000050
    ldr r3, [r0]
    mov r2, #2
    and r3, r3, r2
    beq fifo_writ
    
    ldr r0, =0xd0000054
    str r1, [r0]
    sev
    mov pc, r7

.thumb_func
fifo_read:
    ldr r0, =0xd0000050
    ldr r3, [r0]
    mov r2, #1
    and r3, r3, r2
    beq _wfe

    ldr r0, =0xd0000058
    ldr r1, [r0]
    mov pc, r7

.thumb_func
fifo_drain:
    ldr r0, =0xd0000058
    ldr r1, [r0]
    ldr r0, =0xd0000050
    ldr r1, [r0]
    mov r2, #1
    and r1, r1, r2
    bne fifo_drain
    sev
    mov pc, r7
    
.thumb_func
_wfe:
    wfe
    b fifo_read

.thumb_func
led:
    movs r1, #32            @io_bank
    ldr r0, =0x4000f000
    str r1, [r0]            @release reset on io_bank

    movs r1, #5             @sio
    ldr r0, =0x400140cc
    str r1, [r0]            @assign sio to gpio25_ctrl

    movs r1, #1
    lsl r1, r1, #25
    
    ldr r0, =0xd0000024
    str r1, [r0]            @enable output

    ldr r0, =0xd0000014
    str r1, [r0]            @turn on the led
    
.thumb_func
loop:
    nop
    b loop

Solution

  • My core zero code is a mixture of C and assembly language. I think we can sort your questions out though.

    My bootstrap looks like this

    .cpu cortex-m0
    .thumb
    
        ;@ Start-up code shared by both cores: read SIO_CPUID and split.
        ;@ A zero CPUID continues as core zero, anything else goes to core_one.
        ldr r1,=0xD0000000 ;@SIO_CPUID
        ldr r0,[r1]
        cmp r0,#0
        bne core_one
    
        ;@ core_zero
        ;@ give core zero its own stack, run the C launch sequence, then park
        ldr r0,=0x20002000
        mov sp,r0
        bl zero_entry
        b .
    
    core_one:
        ;@ core_one
        ;@ no stack setup here: sp is left as it was on arrival (zero_entry
        ;@ passes 0x20003000 as core one's stack pointer in the launch sequence)
        bl notmain
        b .
    
    .align
    .ltorg
    
    
    ;@ ----------------------------------
    .balign 0x100
    
    ;@ PUT32(address, value): store the word in r1 at the address in r0.
    .thumb_func
    .globl PUT32
    PUT32:
        str r1,[r0]
        bx lr
    
    ;@ GET32(address): load the word at the address in r0 and return it in r0.
    .thumb_func
    .globl GET32
    GET32:
        ldr r0,[r0]
        bx lr
    
    ;@ SEV(): execute a single sev instruction and return.
    .globl SEV
    .thumb_func
    SEV:
        sev
        bx lr
    
    ;@ WFE(): execute a single wfe instruction and return.
    .globl WFE
    .thumb_func
    WFE:
        wfe
        bx lr
    
    ;@ DELAY(count): busy-wait by counting r0 down to zero.
    ;@ The bne relies on the flags produced by the sub; a count of zero wraps
    ;@ around and takes the long way down.
    .globl DELAY
    .thumb_func
    DELAY:
        sub r0,#1
        bne DELAY
        bx lr
    

    And I link for 0x20000000 and build my uf2 file for sram/0x20000000 as the destination for the binary. It depends on circumstances, but you need to know where your code is running.

    My core zero code looks like this

    extern void PUT32 ( unsigned int, unsigned int );
    extern unsigned int GET32 ( unsigned int );
    
    extern void SEV ( void );
    extern void WFE ( void );
    
    #define SIO_BASE                    0xD0000000
    
    #define SIO_FIFO_ST                 (SIO_BASE+0x50)
    #define SIO_FIFO_WR                 (SIO_BASE+0x54)
    #define SIO_FIFO_RD                 (SIO_BASE+0x58)
    
    //Throw away every word waiting in the receive FIFO, then signal an event.
    static void fifo_flush ( void )
    {
        //FIFO_ST bit 0 stays set for as long as the receive FIFO is not empty
        while((GET32(SIO_FIFO_ST)&0x1) != 0)
        {
            GET32(SIO_FIFO_RD); //pop one word and discard it
        }
        SEV();
    }
    
    //Write one word to the FIFO and return the next word read back from it.
    static unsigned int fifo_send ( unsigned int cmd )
    {
        //spin until FIFO_ST bit 1 reports the write side is ready
        while((GET32(SIO_FIFO_ST)&0x2) == 0)
        {
        }
        PUT32(SIO_FIFO_WR,cmd);
        SEV(); //signal an event once the word is queued

        //sleep in WFE for as long as FIFO_ST bit 0 says nothing has arrived
        while((GET32(SIO_FIFO_ST)&0x1) == 0)
        {
            WFE();
        }
        return(GET32(SIO_FIFO_RD));
    }
    
    //Core zero: run the launch handshake until every word has been echoed.
    //Sequence is 0, 0, 1, vector table, stack pointer, entry; any mismatch
    //restarts it from the first word. Always returns 0.
    unsigned int zero_entry ( void )
    {
        const unsigned int core1_vector_table = 0x20000000;
        const unsigned int core1_stack_pointer = 0x20003000;
        const unsigned int core1_entry = 0x20000001; //lsbit set

        for(;;)
        {
            //each of the two leading zeros is preceded by a flush
            fifo_flush();
            if(fifo_send(0)!=0) continue;
            fifo_flush();
            if(fifo_send(0)!=0) continue;

            //&& evaluates left to right and stops at the first bad echo
            if(fifo_send(1)==1
            && fifo_send(core1_vector_table)==core1_vector_table
            && fifo_send(core1_stack_pointer)==core1_stack_pointer
            && fifo_send(core1_entry)==core1_entry)
            {
                break;
            }
        }
        return(0);
    }
    

    And if interested my core one code looks like this

    void PUT32 ( unsigned int, unsigned int );
    unsigned int GET32 ( unsigned int );
    void DELAY ( unsigned int );
    
    #define RESETS_BASE                 0x4000C000
    
    #define RESETS_RESET_RW             (RESETS_BASE+0x0+0x0000)
    #define RESETS_RESET_XOR            (RESETS_BASE+0x0+0x1000)
    #define RESETS_RESET_SET            (RESETS_BASE+0x0+0x2000)
    #define RESETS_RESET_CLR            (RESETS_BASE+0x0+0x3000)
    
    #define RESETS_WDSEL_RW             (RESETS_BASE+0x4+0x0000)
    #define RESETS_WDSEL_XOR            (RESETS_BASE+0x4+0x1000)
    #define RESETS_WDSEL_SET            (RESETS_BASE+0x4+0x2000)
    #define RESETS_WDSEL_CLR            (RESETS_BASE+0x4+0x3000)
    
    #define RESETS_RESET_DONE_RW        (RESETS_BASE+0x8+0x0000)
    #define RESETS_RESET_DONE_XOR       (RESETS_BASE+0x8+0x1000)
    #define RESETS_RESET_DONE_SET       (RESETS_BASE+0x8+0x2000)
    #define RESETS_RESET_DONE_CLR       (RESETS_BASE+0x8+0x3000)
    
    #define SIO_BASE                    0xD0000000
    
    #define SIO_GPIO_OUT_RW             (SIO_BASE+0x10)
    #define SIO_GPIO_OUT_SET            (SIO_BASE+0x14)
    #define SIO_GPIO_OUT_CLR            (SIO_BASE+0x18)
    #define SIO_GPIO_OUT_XOR            (SIO_BASE+0x1C)
    
    #define SIO_GPIO_OE_RW              (SIO_BASE+0x20)
    #define SIO_GPIO_OE_SET             (SIO_BASE+0x24)
    #define SIO_GPIO_OE_CLR             (SIO_BASE+0x28)
    #define SIO_GPIO_OE_XOR             (SIO_BASE+0x2C)
    
    #define IO_BANK0_BASE               0x40014000
    
    #define IO_BANK0_GPIO25_STATUS_RW   (IO_BANK0_BASE+0x0C8+0x0000)
    #define IO_BANK0_GPIO25_STATUS_XOR  (IO_BANK0_BASE+0x0C8+0x1000)
    #define IO_BANK0_GPIO25_STATUS_SET  (IO_BANK0_BASE+0x0C8+0x2000)
    #define IO_BANK0_GPIO25_STATUS_CLR  (IO_BANK0_BASE+0x0C8+0x3000)
    
    #define IO_BANK0_GPIO25_CTRL_RW     (IO_BANK0_BASE+0x0CC+0x0000)
    #define IO_BANK0_GPIO25_CTRL_XOR    (IO_BANK0_BASE+0x0CC+0x1000)
    #define IO_BANK0_GPIO25_CTRL_SET    (IO_BANK0_BASE+0x0CC+0x2000)
    #define IO_BANK0_GPIO25_CTRL_CLR    (IO_BANK0_BASE+0x0CC+0x3000)
    
    //Core one: take IO_BANK0 out of reset, hand GPIO25 to SIO, blink forever.
    int notmain ( void )
    {
        const unsigned int io_bank0_bit = 1<<5;
        const unsigned int led_bit = 1<<25;

        //release reset on IO_BANK0 and spin until the block reports done
        PUT32(RESETS_RESET_CLR,io_bank0_bit);
        while((GET32(RESETS_RESET_DONE_RW)&io_bank0_bit)==0)
        {
        }

        //start with the output disabled and pin 25 low
        PUT32(SIO_GPIO_OE_CLR,led_bit);
        PUT32(SIO_GPIO_OUT_CLR,led_bit);

        //function select 5 is SIO (software controlled I/O)
        PUT32(IO_BANK0_GPIO25_CTRL_RW,5);

        //output enable
        PUT32(SIO_GPIO_OE_SET,led_bit);
        for(;;)
        {
            PUT32(SIO_GPIO_OUT_SET,led_bit); //led on
            DELAY(0x100000);
            PUT32(SIO_GPIO_OUT_CLR,led_bit); //led off
            DELAY(0x100000);
        }
        return(0);
    }
    

    What does the thumb bit mean? Look at the bx instruction or other related information in the ARM documentation (armv6-m architectural reference manual). This goes back to the full sized cores that can run both arm and thumb code. Since instructions in both modes are aligned, the lsbit of an address is otherwise unused, so they chose to use it in branch by address instructions (originally only the bx instruction but later pop and others) to determine the mode to use at the branch destination. If the lsbit is set then it is branching to a thumb instruction, if clear then branching to an arm instruction.

    The cortex-ms they chose to go with a vector table (makes sense based on the target market for the product) instead of hardcoded addresses like the prior full sized cores (ARM7, ARM9, ARM10, ARM11). As documented in the architectural reference manual the first word is a value to put in the stack pointer to save that step in the boot process and the second is the reset vector.

    Now ARM chose to make it such that you had to put a thumb function pointer address in there meaning the lsbit is ORRed with one. I emphasize ORRed with one and not ADD one, because if you use your tools properly (IMO) then the tool will set the lsbit and ADDing one you will then break it.

    Letting the tools do the work

    .cpu cortex-m0
    .thumb
    
    .thumb_func
    .global _start
    _start:
    .word 0x20001000
    .word reset
    .word hang
    .word hang
    
    .word hang
    .word hang
    .word hang
    .word hang
    
    .word hang
    .word hang
    .word hang
    .word hang
    
    .word hang
    .word hang
    .word hang
    .word hang
    
    .thumb_func
    reset:
        bl notmain
        b hang
    .thumb_func
    hang:   b .
    

    (This does not work on a pico; it is only here to show what the thumb bit means.)

    .thumb_func causes the next label it finds in the code to be a thumb function address not just a plain old address.

    So this gives

    00200000 <_start>:
      200000:   20001000    andcs   r1, r0, r0
      200004:   00200041    eoreq   r0, r0, r1, asr #32
      200008:   00200047    eoreq   r0, r0, r7, asr #32
      20000c:   00200047    eoreq   r0, r0, r7, asr #32
      200010:   00200047    eoreq   r0, r0, r7, asr #32
      200014:   00200047    eoreq   r0, r0, r7, asr #32
      200018:   00200047    eoreq   r0, r0, r7, asr #32
      20001c:   00200047    eoreq   r0, r0, r7, asr #32
      200020:   00200047    eoreq   r0, r0, r7, asr #32
      200024:   00200047    eoreq   r0, r0, r7, asr #32
      200028:   00200047    eoreq   r0, r0, r7, asr #32
      20002c:   00200047    eoreq   r0, r0, r7, asr #32
      200030:   00200047    eoreq   r0, r0, r7, asr #32
      200034:   00200047    eoreq   r0, r0, r7, asr #32
      200038:   00200047    eoreq   r0, r0, r7, asr #32
      20003c:   00200047    eoreq   r0, r0, r7, asr #32
    
    00200040 <reset>:
      200040:   f000 f81a   bl  200078 <notmain>
      200044:   e7ff        b.n 200046 <hang>
    
    00200046 <hang>:
      200046:   e7fe        b.n 200046 <hang>
    

    Built and linked for a different mcu, not the pico. reset is at 0x00200040 and hang at 0x00200046. The tools have done the work for us: because we used .thumb_func, the vector table entries hold those addresses ORRed with one (0x00200041 and 0x00200047).

    And everything is happy and this mcu will boot, or at least it won't hang right after reset.

    The longer way to do this, there is no .arm_func so for ARM and thumb you can instead do

    .type reset,%function
    reset:
    

    It does not have to be immediately before the label, but you have to do the extra work to type in the label name.

    If I take your code and change it like this:

        ldr r1, =one_entry
        mov r7, pc
        b fifo_writ
        mov r7, pc
        b fifo_read
    
    .thumb_func
    one_entry:
        ldr r0, =0xd0000000
        ldr r1, [r0]
        cmp r1, #1
        beq led
    

    Then I get

    2000005a:   4907        ldr r1, [pc, #28]   ; (20000078 <one_entry+0x14>)
    2000005c:   467f        mov r7, pc
    2000005e:   e011        b.n 20000084 <fifo_writ>
    20000060:   467f        mov r7, pc
    20000062:   e00e        b.n 20000082 <fifo_read>
    
    20000064 <one_entry>:
    20000064:   4805        ldr r0, [pc, #20]   ; (2000007c <one_entry+0x18>)
    20000066:   6801        ldr r1, [r0, #0]
    20000068:   2901        cmp r1, #1
    2000006a:   d00c        beq.n   20000086 <led>
    2000006c:   e7fe        b.n 2000006c <one_entry+0x8>
    2000006e:   46c0        nop         ; (mov r8, r8)
    20000070:   20001000    andcs   r1, r0, r0
    20000074:   e000ed08    and lr, r0, r8, lsl #26
    20000078:   20000065    andcs   r0, r0, r5, rrx
    2000007c:   d0000000    andle   r0, r0, r0
    

    The tool has created the address to the entry point for core one with the lsbit set. 20000065

    Now the next problem you have is

    mov r1, sp              @stack pointer
    

    You are taking core zero's stack pointer address at this point in core zero's execution and setting that for core one. If you end core zero in an infinite loop after starting core one, then this can work. But if you want to keep doing things with core zero you need to give core one its own stack pointer. In my example you can see that I give core zero 0x20002000 and core one 0x20003000. This would have been very painful to debug, as core one would start but you would have random chaos that changes every time you change the code.

    And to your VTOR problem. I also tried just reading the VTOR and it did not work. Originally my code had a special vector table:

    .globl vector_table
    vector_table:
        b reset
        .balign 4
        .word reset ;@ has to be offset 4
        .word loop
        .word loop
        .word loop
    

    And I set the vector table, instead of read it

    ldr r1,=0xE000ED08 ;@ VTOR
    ldr r0,=vector_table
    str r0,[r1]
    

    For core zero which is probably borrowed from other pico code I wrote that might have actually used the table. The b reset because we don't actually get to use the reset vector for core zero so this was my kludge. Could have done alignment stuff and put the vector table somewhere else in memory (and yes for both cores I set the stack pointer myself, initially, but for the above example assumed that core one was doing it itself).

    And used that same address vector_table for core one. In this case I could have then read it and it would have worked. You have only provided a fraction so we do not know what you did with the VTOR for core zero before this code, but I assume you did not set it, since your code is not working.

    You/we are not using a vector table in these examples so just need to make it happy, so I just forced 0x20000000 and it then worked.

    I believe you need to fix all three addresses, the vector table, the entry point, and the stack pointer in order to have success.


    From your rewrite, I made these modifications.

        .cpu cortex-m0
        .thumb
    ent:
        ldr r0, =0x20001000
        mov sp, r0              @init stack pointer
    
        ldr r0, =0xe000ed08
        ldr r1, =0x20000000
        str r1, [r0]            @init vtor
    
        ldr r0, =0xd0000000
        ldr r1, [r0]
        cmp r1, #1
        beq led
    
        b core
    
    .thumb_func
    core:
        mov r7, pc
        b fifo_drain
        mov r1, #0
        mov r7, pc
        b fifo_writ
        mov r7, pc
        b fifo_read
        cmp r1, #0
        bne core
    
        mov r7, pc
        b fifo_drain
        mov r1, #0
        mov r7, pc
        b fifo_writ
        mov r7, pc
        b fifo_read
        cmp r1, #0
        bne core
    
        mov r1, #1
        mov r7, pc
        b fifo_writ
        mov r7, pc
        b fifo_read
        cmp r1, #1
        bne core
    
        ldr r4, =0x20000000
        mov r1, r4              @vector table
        mov r7, pc
        b fifo_writ
        mov r7, pc
        b fifo_read
        cmp r1, r4
        bne core
    
        mov r4, sp              @stack pointer
        mov r1, r4
        mov r7, pc
        b fifo_writ
        mov r7, pc
        b fifo_read
        cmp r1, r4
        bne core
    
        ldr r4, =0x20000001
        mov r1, r4              @entry point
        mov r7, pc
        b fifo_writ
        mov r7, pc
        b fifo_read
        cmp r1, r4
        bne core
    
        b loop
    
    .thumb_func
    fifo_stat:
        ldr r0, =0xd0000050
        ldr r1, [r0]
        mov r2, #15
        and r1, r1, r2
        mov pc, r7
    
    .thumb_func
    fifo_writ:
        ldr r0, =0xd0000050
        ldr r3, [r0]
        mov r2, #2
        and r3, r3, r2
        beq fifo_writ
    
        ldr r0, =0xd0000054
        str r1, [r0]
        sev
        mov pc, r7
    
    .thumb_func
    fifo_read:
        ldr r0, =0xd0000050
        ldr r3, [r0]
        mov r2, #1
        and r3, r3, r2
        beq _wfe
    
        ldr r0, =0xd0000058
        ldr r1, [r0]
        mov pc, r7
    
    .thumb_func
    fifo_drain:
        ldr r0, =0xd0000058
        ldr r1, [r0]
        ldr r0, =0xd0000050
        ldr r1, [r0]
        mov r2, #1
        and r1, r1, r2
        bne fifo_drain
        sev
        mov pc, r7
    
    .thumb_func
    _wfe:
        wfe
        b fifo_read
    
    ;@ ----------------------------------
    .balign 0x100
    
    .thumb_func
    led:
        movs r1, #32            @io_bank
        ldr r0, =0x4000f000
        str r1, [r0]            @release reset on io_bank
    
        movs r1, #5             @sio
        ldr r0, =0x400140cc
        str r1, [r0]            @assign sio to gpio25_ctrl
    
        movs r1, #1
        lsl r1, r1, #25
    
        ldr r0, =0xd0000024
        str r1, [r0]            @enable output
    
        ldr r0, =0xd0000014
        str r1, [r0]            @turn on the led
    
    .thumb_func
    loop:
        nop
        b loop
    

    First in a couple of places you used r3 to save the value you wanted to compare against after writing and reading back. But r3 is used both in the write and read so its contents are lost.

    Second, the program was larger than 0x100 bytes. There is something strange about that boundary (I would have to go back and work out how I originally figured it out), so I added the .balign 0x100 to avoid the boundary, and then it worked.

    As used above sp did not need to go to r4, but I did it to shotgun the problem.

    If I remove the items that are not needed (the write to VTOR and the b core up front) and use bl and bx lr to call and return, that saves enough instructions to make the binary less than 0x100 bytes, so it can be used without putting that boundary in.

        .cpu cortex-m0
        .thumb
    @ ent: start-up code. Sets the stack pointer, then sends the core whose
    @ CPUID register reads 1 to led; any other core falls through into core.
    ent:
        ldr r0, =0x20001000
        mov sp, r0              @init stack pointer
    
        ldr r0, =0xd0000000     @ SIO CPUID register
        ldr r1, [r0]
        cmp r1, #1
        beq led                 @ CPUID == 1: go and light the LED
    
    @ core: core 0 side of the core 1 launch handshake.
    @ Sends 0, 0, 1, vector table, stack pointer, entry point through the
    @ FIFO and restarts from the top whenever a word is not echoed back.
    @ Helpers are called with bl and return with bx lr, so every call here
    @ must be bl. The expected echo lives in r4 because fifo_writ and
    @ fifo_read overwrite r0, r2 and r3.
    core:
        bl fifo_drain
        mov r1, #0
        bl fifo_writ
        bl fifo_read
        cmp r1, #0
        bne core
    
        bl fifo_drain           @ bl, not b: fifo_drain returns through lr
        mov r1, #0
        bl fifo_writ
        bl fifo_read
        cmp r1, #0
        bne core
    
        mov r1, #1
        bl fifo_writ
        bl fifo_read
        cmp r1, #1
        bne core
    
        ldr r4, =0x20000000
        mov r1, r4              @vector table
        bl fifo_writ
        bl fifo_read
        cmp r1, r4
        bne core
    
        mov r1, sp              @stack pointer (core 0's own sp is handed over;
                                @ acceptable only because core 0 parks in loop below)
        bl fifo_writ
        bl fifo_read
        cmp r1, sp
        bne core
    
        ldr r4, =0x20000001     @ entry address with the lsbit (thumb bit) set
        mov r1, r4              @entry point
        bl fifo_writ
        bl fifo_read
        cmp r1, r4
        bne core
    
        b loop                  @ handshake complete: core 0 idles
    
    @ fifo_stat: return the low four bits of the FIFO status register in r1.
    @ Overwrites r0 and r2. Nothing in this listing calls it.
    fifo_stat:
        ldr r0, =0xd0000050     @ FIFO status register
        ldr r1, [r0]
        mov r2, #15
        and r1, r1, r2          @ keep bits 3..0 only
        bx lr
    
    @ fifo_writ: wait until the FIFO can take a word, then write r1 to it.
    @ In:  r1 = word to send (left unchanged)
    @ Overwrites r0, r2, r3 - callers must not keep live values there.
    fifo_writ:
        ldr r0, =0xd0000050     @ FIFO status register
        ldr r3, [r0]
        mov r2, #2
        and r3, r3, r2          @ isolate bit 1 (write side ready)
        beq fifo_writ           @ not ready yet: poll again
    
        ldr r0, =0xd0000054     @ FIFO write register
        str r1, [r0]
        sev                     @ signal an event after queuing the word
        bx lr
    
    @ fifo_read: fetch the next word from the FIFO into r1.
    @ Out: r1 = word read
    @ Overwrites r0, r2, r3. When the status shows nothing to read, control
    @ goes to _wfe instead of polling here.
    fifo_read:
        ldr r0, =0xd0000050     @ FIFO status register
        ldr r3, [r0]
        mov r2, #1
        and r3, r3, r2          @ isolate bit 0 (data available)
        beq _wfe                @ empty: go wait for an event
    
        ldr r0, =0xd0000058     @ FIFO read register
        ldr r1, [r0]
        bx lr
    
    @ fifo_drain: read and discard FIFO words until the status shows none left.
    @ Overwrites r0, r1, r2. Note the order: a word is popped first and the
    @ status is checked afterwards, so one read is issued even when the FIFO
    @ was already empty on entry.
    fifo_drain:
        ldr r0, =0xd0000058     @ FIFO read register
        ldr r1, [r0]            @ pop one word, value discarded
        ldr r0, =0xd0000050     @ FIFO status register
        ldr r1, [r0]
        mov r2, #1
        and r1, r1, r2          @ isolate bit 0 (data available)
        bne fifo_drain          @ more data waiting: pop again
        sev
        bx lr
    
    @ _wfe: reached by a conditional branch from fifo_read when there is
    @ nothing to read. Waits for an event, then retries the read.
    @ This is a continuation of fifo_read, not a call: lr must still hold
    @ fifo_read's return address when its bx lr finally executes, so the
    @ jump back has to be a plain b (a bl would overwrite lr and make the
    @ read "return" to whatever follows this label).
    _wfe:
        wfe
        b fifo_read
    
    @ led: switch on the LED on GPIO25, then fall through into loop.
    @ Overwrites r0 and r1. Does not return.
    @ Unlike the C notmain, there is no wait on the reset-done register
    @ between releasing the reset and touching the IO bank.
    led:
        movs r1, #32            @io_bank
        ldr r0, =0x4000f000     @ resets block, atomic-clear alias of RESET
        str r1, [r0]            @release reset on io_bank
    
        movs r1, #5             @sio
        ldr r0, =0x400140cc
        str r1, [r0]            @assign sio to gpio25_ctrl
    
        movs r1, #1
        lsl r1, r1, #25         @ r1 = 1 << 25, the GPIO25 bit mask
    
        ldr r0, =0xd0000024     @ SIO GPIO output-enable set register
        str r1, [r0]            @enable output
    
        ldr r0, =0xd0000014     @ SIO GPIO output set register
        str r1, [r0]            @turn on the led
    
    @ loop: idle forever. Entered by falling out of led and by "b loop" in core.
    loop:
        nop
        b loop
    

    Note the instruction set allows for things like this:

    @ fifo_drain, compact form: load the status register address once and
    @ reach the read register through an immediate offset from the same base,
    @ which saves the second literal load.
    @ Overwrites r0, r1, r2. A word is popped before the status is checked.
    fifo_drain:
        ldr r0, =0xd0000050
        ldr r1, [r0,#8] @0xd0000058
        ldr r1, [r0] @0xd0000050
        mov r2, #1
        and r1, r1, r2
        bne fifo_drain
        sev
        bx lr
    

    not as brute force and simple to read, but saves instructions.

    For someone just learning ARM assembly language (and, I presume, the rp2040 at the same time), I am quite impressed; keep up the excellent work. This particular mcu is very very cool, but also poorly documented. The ARM instruction set is well documented, but there is ARM vs thumb and then unified syntax vs not to trip over (fortunately you did not hit the difference). And then there is this 0x100 byte thing; I cannot remember how I figured it out. I think I looked at their code and figured it from that, but I would have to re-research the whole thing. If you want to confirm this for yourself, take a version that is just under 0x100 bytes and then add some nops in the body somewhere to stretch it past 0x100 bytes. Note that with the simple changes described, and removing unused/unneeded code, I got yours down to

    216 bytes read (0xD8)
    

    216 bytes...

    bottom line.

    You had the right idea on the three parameters but they needed some work. And then a simple oops on using a register outside a function call that got used within a function call. Then the crazy 0x100 byte thing. This is the thing with bare-metal, hard to debug, gotta grind your way through, do not give up.

    The mov r7,pc thing, I am actually impressed about, not critical of - a lot of folks would struggle with the two instructions ahead thing.