As I understand the documentation, 2.8.2, the process of launching core 1 is to send a sequence of values, with the final 3 being the vector table, stack pointer, and entry point, over the FIFO, while core 1 will echo the values back to you.
From the c code provided by the documentation, I wrote out this assembly:
@ Entry: point sp at 0x20001000 and load the current VTOR contents into r3,
@ which the launch sequence below sends as the vector table value.
.cpu cortex-m0
.thumb
ent:
ldr r0, =0x20001000
mov sp, r0 @init stack pointer
ldr r0, =0xe000ed08
ldr r3, [r0] @vector table offset register
@ Core-one launch handshake: sends {0, 0, 1, vector table, stack pointer, entry}
@ and restarts at `core` whenever the value read back differs from the one sent.
@ Call convention used here: `mov r7, pc` captures the address of the instruction
@ after the following `b`; the helper is expected to return with that address.
@ fifo_drain, fifo_writ, fifo_read and led are not defined in this listing.
core:
@ first 0 (FIFO drained beforehand)
mov r7, pc
b fifo_drain
sev
mov r1, #0
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, #0
bne core
@ second 0 (FIFO drained again)
mov r7, pc
b fifo_drain
sev
mov r1, #0
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, #0
bne core
@ 1
mov r1, #1
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, #1
bne core
@ vector table: the value read from VTOR into r3 by ent
mov r1, r3 @vector table
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, r3
bne core
@ stack pointer: core zero's own sp value is sent
mov r1, sp @stack pointer
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, sp
bne core
@ entry point: derived from pc; no thumb bit is ORRed into the value,
@ and the echoed value is not compared afterwards
mov r1, pc
add r1, #2 @entry point
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
@ intended core-one entry code: read SIO CPUID and go to led when it is 1
ldr r0, =0xd0000000
ldr r1, [r0]
cmp r1, #1
beq led
The sequence of values sent over the FIFO is {0, 0, 1, vt, sp, ent}, and when the value isn't echoed back, the sequence starts over. The entry point is simply the last 4 lines, where the core reads the CPUID register from the SIO, and turns on the LED (GPIO25) if the cpu id is 1.
The sequence seems to get stuck in a loop at the vector table, which makes sense since I barely understand it, the FIFO just doesn't echo it back. Also, the documentation has a note next to the entry point that says "don't forget the thumb bit!", whatever that means.
Edit:
Updated code, same problem:
@ Entry for both cores (0x20000001 is later sent as core one's entry point):
@ set sp to 0x20001000, write 0x20000000 to VTOR, then read SIO CPUID.
@ CPUID == 1 goes to led; otherwise continue with the launch sequence in core.
.cpu cortex-m0
.thumb
ent:
ldr r0, =0x20001000
mov sp, r0 @init stack pointer
ldr r0, =0xe000ed08
ldr r1, =0x20000000
str r1, [r0] @init vtor
ldr r0, =0xd0000000
ldr r1, [r0]
cmp r1, #1
beq led
b core
@ Core-one launch handshake: sends {0, 0, 1, 0x20000000, sp, 0x20000001} and
@ restarts at `core` whenever the value read back differs from the one sent.
@ Helpers are entered with `b` after `mov r7, pc` and return with `mov pc, r7`.
@ NOTE(review): fifo_writ and fifo_read both overwrite r3, so the `cmp r1, r3`
@ checks below compare against a clobbered value and not the value that was sent.
.thumb_func
core:
@ first 0 (FIFO drained beforehand)
mov r7, pc
b fifo_drain
mov r1, #0
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, #0
bne core
@ second 0 (FIFO drained again)
mov r7, pc
b fifo_drain
mov r1, #0
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, #0
bne core
@ 1
mov r1, #1
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, #1
bne core
@ vector table address; expected value kept in r3 (lost inside the helpers)
ldr r3, =0x20000000
mov r1, r3 @vector table
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, r3
bne core
@ stack pointer: core zero's own sp value is sent
mov r1, sp @stack pointer
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, sp
bne core
@ entry point with the thumb bit set; expected value kept in r3 (lost inside the helpers)
ldr r3, =0x20000001
mov r1, r3 @entry point
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, r3
bne core
b loop
@ fifo_stat: r1 = low four bits of the FIFO status register (0xd0000050).
@ Returns through r7; overwrites r0, r1, r2. Not referenced elsewhere in this listing.
.thumb_func
fifo_stat:
ldr r0, =0xd0000050
ldr r1, [r0]
mov r2, #15
and r1, r1, r2
mov pc, r7
@ fifo_writ: spin until status bit 1 is set, then store r1 to the FIFO write
@ register (0xd0000054) and issue sev.
@ In: r1 = value to send, r7 = return address. Overwrites r0, r2, r3.
.thumb_func
fifo_writ:
ldr r0, =0xd0000050
ldr r3, [r0]
mov r2, #2
and r3, r3, r2
beq fifo_writ
ldr r0, =0xd0000054
str r1, [r0]
sev
mov pc, r7
@ fifo_read: when status bit 0 is clear, go to _wfe (which waits and retries);
@ otherwise load the FIFO read register (0xd0000058) into r1.
@ In: r7 = return address. Out: r1 = value read. Overwrites r0, r2, r3.
.thumb_func
fifo_read:
ldr r0, =0xd0000050
ldr r3, [r0]
mov r2, #1
and r3, r3, r2
beq _wfe
ldr r0, =0xd0000058
ldr r1, [r0]
mov pc, r7
@ fifo_drain: read and discard the FIFO read register, repeating while status
@ bit 0 is still set, then issue sev. The read happens before the status check.
@ In: r7 = return address. Overwrites r0, r1, r2 (r1 is 0 on return).
.thumb_func
fifo_drain:
ldr r0, =0xd0000058
ldr r1, [r0]
ldr r0, =0xd0000050
ldr r1, [r0]
mov r2, #1
and r1, r1, r2
bne fifo_drain
sev
mov pc, r7
@ _wfe: wait for an event, then retry fifo_read. r7 is left untouched, so
@ fifo_read still returns to its original caller.
.thumb_func
_wfe:
wfe
b fifo_read
@ led: release IO_BANK0 from reset, select the SIO function for GPIO25, enable
@ the pin as an output and drive it high. Falls through into loop.
@ The reset-done status is not polled before the GPIO registers are written.
.thumb_func
led:
movs r1, #32 @io_bank
ldr r0, =0x4000f000
str r1, [r0] @release reset on io_bank
movs r1, #5 @sio
ldr r0, =0x400140cc
str r1, [r0] @assign sio to gpio25_ctrl
movs r1, #1
lsl r1, r1, #25
ldr r0, =0xd0000024
str r1, [r0] @enable output
ldr r0, =0xd0000014
str r1, [r0] @turn on the led
@ loop: park the core in an endless nop loop.
.thumb_func
loop:
nop
b loop
My core zero code is a mixture of C and assembly language. I think we can sort your questions out though.
My bootstrap looks like this
;@ Shared entry for both cores: read SIO CPUID and branch to core_one when it
;@ is non-zero. Core zero falls through, sets its stack pointer to 0x20002000,
;@ calls zero_entry, then spins in place.
.cpu cortex-m0
.thumb
ldr r1,=0xD0000000 ;@SIO_CPUID
ldr r0,[r1]
cmp r0,#0
bne core_one
;@ core_zero
ldr r0,=0x20002000
mov sp,r0
bl zero_entry
b .
;@ core_one: reached when CPUID is non-zero; calls notmain and spins if it returns.
;@ No stack pointer is loaded here; presumably it comes from the launch sequence
;@ that core zero sends (zero_entry passes 0x20003000).
core_one:
;@ core_one
bl notmain
b .
;@ literal pool for the ldr rX,= loads above
.align
.ltorg
;@ ----------------------------------
;@ void PUT32 ( unsigned int address, unsigned int value )
;@ Stores r1 to the word at address r0.
.balign 0x100
.thumb_func
.globl PUT32
PUT32:
str r1,[r0]
bx lr
;@ unsigned int GET32 ( unsigned int address )
;@ Returns in r0 the word loaded from address r0.
.thumb_func
.globl GET32
GET32:
ldr r0,[r0]
bx lr
;@ void SEV ( void ) -- executes the sev instruction and returns.
.globl SEV
.thumb_func
SEV:
sev
bx lr
;@ void WFE ( void ) -- executes the wfe instruction and returns.
.globl WFE
.thumb_func
WFE:
wfe
bx lr
;@ void DELAY ( unsigned int count )
;@ Busy-wait: decrements r0 and loops until the result is zero, then returns.
.globl DELAY
.thumb_func
DELAY:
sub r0,#1
bne DELAY
bx lr
And I link for 0x20000000 and build my uf2 file for sram/0x20000000 as the destination for the binary. It depends on circumstances, but you need to know where your code is running.
My core zero code looks like this
extern void PUT32 ( unsigned int, unsigned int );
extern unsigned int GET32 ( unsigned int );
extern void SEV ( void );
extern void WFE ( void );
#define SIO_BASE 0xD0000000
#define SIO_FIFO_ST (SIO_BASE+0x50)
#define SIO_FIFO_WR (SIO_BASE+0x54)
#define SIO_FIFO_RD (SIO_BASE+0x58)
/* Empty the inter-core receive FIFO, then signal an event.
   Bit 0 of SIO_FIFO_ST reads zero when the FIFO is empty; every entry
   still pending is read from SIO_FIFO_RD and thrown away. */
static void fifo_flush ( void )
{
    while((GET32(SIO_FIFO_ST)&0x1) != 0) //non-zero while data is pending
    {
        GET32(SIO_FIFO_RD);
    }
    SEV();
}
/* Send one word to the other core and return the word it answers with.
   cmd: value written to SIO_FIFO_WR.
   Returns: the next word read from SIO_FIFO_RD. */
static unsigned int fifo_send ( unsigned int cmd )
{
    //spin until bit 1 of the status register reports ready
    while((GET32(SIO_FIFO_ST)&0x2) == 0)
    {
    }
    PUT32(SIO_FIFO_WR,cmd);
    SEV();
    //wait for an event for as long as bit 0 reports an empty FIFO
    while((GET32(SIO_FIFO_ST)&0x1) == 0)
    {
        WFE();
    }
    return(GET32(SIO_FIFO_RD));
}
/* Core zero: run the core-one launch handshake until every word is echoed.
   The sequence is 0, 0, 1, vector table, stack pointer, entry point, with the
   FIFO flushed before each of the two zeros. Any mismatched echo restarts the
   whole sequence from the top. Always returns 0. */
unsigned int zero_entry ( void )
{
    const unsigned int vector_table = 0x20000000;
    const unsigned int stack_pointer = 0x20003000;
    const unsigned int entry = 0x20000001; //lsbit set: thumb address
    int launched;

    do
    {
        launched = 0;
        fifo_flush();
        if(fifo_send(0) == 0)
        {
            fifo_flush();
            //short-circuit evaluation stops sending at the first bad echo
            launched = (fifo_send(0) == 0)
                    && (fifo_send(1) == 1)
                    && (fifo_send(vector_table) == vector_table)
                    && (fifo_send(stack_pointer) == stack_pointer)
                    && (fifo_send(entry) == entry);
        }
    } while(!launched);
    return(0);
}
And if interested my core one code looks like this
void PUT32 ( unsigned int, unsigned int );
unsigned int GET32 ( unsigned int );
void DELAY ( unsigned int );
#define RESETS_BASE 0x4000C000
#define RESETS_RESET_RW (RESETS_BASE+0x0+0x0000)
#define RESETS_RESET_XOR (RESETS_BASE+0x0+0x1000)
#define RESETS_RESET_SET (RESETS_BASE+0x0+0x2000)
#define RESETS_RESET_CLR (RESETS_BASE+0x0+0x3000)
#define RESETS_WDSEL_RW (RESETS_BASE+0x4+0x0000)
#define RESETS_WDSEL_XOR (RESETS_BASE+0x4+0x1000)
#define RESETS_WDSEL_SET (RESETS_BASE+0x4+0x2000)
#define RESETS_WDSEL_CLR (RESETS_BASE+0x4+0x3000)
#define RESETS_RESET_DONE_RW (RESETS_BASE+0x8+0x0000)
#define RESETS_RESET_DONE_XOR (RESETS_BASE+0x8+0x1000)
#define RESETS_RESET_DONE_SET (RESETS_BASE+0x8+0x2000)
#define RESETS_RESET_DONE_CLR (RESETS_BASE+0x8+0x3000)
#define SIO_BASE 0xD0000000
#define SIO_GPIO_OUT_RW (SIO_BASE+0x10)
#define SIO_GPIO_OUT_SET (SIO_BASE+0x14)
#define SIO_GPIO_OUT_CLR (SIO_BASE+0x18)
#define SIO_GPIO_OUT_XOR (SIO_BASE+0x1C)
#define SIO_GPIO_OE_RW (SIO_BASE+0x20)
#define SIO_GPIO_OE_SET (SIO_BASE+0x24)
#define SIO_GPIO_OE_CLR (SIO_BASE+0x28)
#define SIO_GPIO_OE_XOR (SIO_BASE+0x2C)
#define IO_BANK0_BASE 0x40014000
#define IO_BANK0_GPIO25_STATUS_RW (IO_BANK0_BASE+0x0C8+0x0000)
#define IO_BANK0_GPIO25_STATUS_XOR (IO_BANK0_BASE+0x0C8+0x1000)
#define IO_BANK0_GPIO25_STATUS_SET (IO_BANK0_BASE+0x0C8+0x2000)
#define IO_BANK0_GPIO25_STATUS_CLR (IO_BANK0_BASE+0x0C8+0x3000)
#define IO_BANK0_GPIO25_CTRL_RW (IO_BANK0_BASE+0x0CC+0x0000)
#define IO_BANK0_GPIO25_CTRL_XOR (IO_BANK0_BASE+0x0CC+0x1000)
#define IO_BANK0_GPIO25_CTRL_SET (IO_BANK0_BASE+0x0CC+0x2000)
#define IO_BANK0_GPIO25_CTRL_CLR (IO_BANK0_BASE+0x0CC+0x3000)
/* Core one: bring IO_BANK0 out of reset, hand GPIO25 to the SIO block and
   blink it forever. The trailing return is never reached. */
int notmain ( void )
{
    const unsigned int io_bank0 = 1<<5;  //IO_BANK0 bit in the RESETS registers
    const unsigned int led_pin = 1<<25;  //GPIO25

    //release reset on IO_BANK0 and wait until the block reports done
    PUT32(RESETS_RESET_CLR,io_bank0);
    while((GET32(RESETS_RESET_DONE_RW)&io_bank0) == 0)
    {
    }

    //start from a known state: output disabled, pin low
    PUT32(SIO_GPIO_OE_CLR,led_pin);
    PUT32(SIO_GPIO_OUT_CLR,led_pin);
    //function select 5 = SIO (software controlled I/O)
    PUT32(IO_BANK0_GPIO25_CTRL_RW,5);
    //output enable
    PUT32(SIO_GPIO_OE_SET,led_pin);

    for(;;)
    {
        PUT32(SIO_GPIO_OUT_SET,led_pin); //led on
        DELAY(0x100000);
        PUT32(SIO_GPIO_OUT_CLR,led_pin); //led off
        DELAY(0x100000);
    }
    return(0);
}
What does the thumb bit mean? Look at the bx instruction and related information in the ARM documentation (the ARMv6-M Architecture Reference Manual). This goes back to the full-sized cores that can run both ARM and Thumb code. Since instructions in both modes are aligned, the least significant bit (lsbit) of an instruction address is otherwise unused, so they chose to use it in branch-by-address instructions to select the mode at the branch destination (originally only the bx instruction, but later pop and others). If the lsbit is set, it is branching to a Thumb instruction; if it is clear, it is branching to an ARM instruction.
The cortex-ms they chose to go with a vector table (makes sense based on the target market for the product) instead of hardcoded addresses like the prior full sized cores (ARM7, ARM9, ARM10, ARM11). As documented in the architectural reference manual the first word is a value to put in the stack pointer to save that step in the boot process and the second is the reset vector.
Now ARM chose to make it such that you had to put a thumb function pointer address in there meaning the lsbit is ORRed with one. I emphasize ORRed with one and not ADD one, because if you use your tools properly (IMO) then the tool will set the lsbit and ADDing one you will then break it.
Letting the tools do the work
@ Cortex-M vector table: word 0 is the initial stack pointer value, word 1 the
@ reset vector, followed by fourteen more vectors that all point at hang.
@ `reset` and `hang` are declared with .thumb_func, so the assembler emits
@ their addresses with the lsbit set.
@ NOTE(review): the .thumb_func below applies to _start, which labels data
@ rather than code; presumably harmless here -- confirm.
.cpu cortex-m0
.thumb
.thumb_func
.global _start
_start:
.word 0x20001000
.word reset
.word hang
.word hang
.word hang
.word hang
.word hang
.word hang
.word hang
.word hang
.word hang
.word hang
.word hang
.word hang
.word hang
.word hang
@ reset: target of the reset vector; calls notmain and hangs if it returns.
.thumb_func
reset:
bl notmain
b hang
@ hang: branch-to-self, used as the handler for every other vector.
.thumb_func
hang: b .
(This does not work on a pico; it is only an illustration of what the thumb bit means.)
.thumb_func causes the next label it finds in the code to be a thumb function address not just a plain old address.
So this gives
00200000 <_start>:
200000: 20001000 andcs r1, r0, r0
200004: 00200041 eoreq r0, r0, r1, asr #32
200008: 00200047 eoreq r0, r0, r7, asr #32
20000c: 00200047 eoreq r0, r0, r7, asr #32
200010: 00200047 eoreq r0, r0, r7, asr #32
200014: 00200047 eoreq r0, r0, r7, asr #32
200018: 00200047 eoreq r0, r0, r7, asr #32
20001c: 00200047 eoreq r0, r0, r7, asr #32
200020: 00200047 eoreq r0, r0, r7, asr #32
200024: 00200047 eoreq r0, r0, r7, asr #32
200028: 00200047 eoreq r0, r0, r7, asr #32
20002c: 00200047 eoreq r0, r0, r7, asr #32
200030: 00200047 eoreq r0, r0, r7, asr #32
200034: 00200047 eoreq r0, r0, r7, asr #32
200038: 00200047 eoreq r0, r0, r7, asr #32
20003c: 00200047 eoreq r0, r0, r7, asr #32
00200040 <reset>:
200040: f000 f81a bl 200078 <notmain>
200044: e7ff b.n 200046 <hang>
00200046 <hang>:
200046: e7fe b.n 200046 <hang>
Built and linked for a different mcu, not the pico. reset is at 0x00200040 and hang is at 0x00200046. The tools have done the work for us: because we used .thumb_func, they put each address ORRed with one (0x00200041 and 0x00200047) into the vector table.
And everything is happy and this mcu will boot, or at least it won't hang right after reset.
The longer way to do this, there is no .arm_func so for ARM and thumb you can instead do
.type reset,%function
reset:
It does not have to be immediately before the label, but you have to do the extra work to type in the label name.
If I take your code and change it like this:
@ Send core one's entry point as a label and let the assembler form the address:
@ one_entry is declared with .thumb_func, so `=one_entry` loads it with the
@ lsbit already set.
ldr r1, =one_entry
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
@ core-one entry: read SIO CPUID and go to led when it is 1
.thumb_func
one_entry:
ldr r0, =0xd0000000
ldr r1, [r0]
cmp r1, #1
beq led
Then I get
2000005a: 4907 ldr r1, [pc, #28] ; (20000078 <one_entry+0x14>)
2000005c: 467f mov r7, pc
2000005e: e011 b.n 20000084 <fifo_writ>
20000060: 467f mov r7, pc
20000062: e00e b.n 20000082 <fifo_read>
20000064 <one_entry>:
20000064: 4805 ldr r0, [pc, #20] ; (2000007c <one_entry+0x18>)
20000066: 6801 ldr r1, [r0, #0]
20000068: 2901 cmp r1, #1
2000006a: d00c beq.n 20000086 <led>
2000006c: e7fe b.n 2000006c <one_entry+0x8>
2000006e: 46c0 nop ; (mov r8, r8)
20000070: 20001000 andcs r1, r0, r0
20000074: e000ed08 and lr, r0, r8, lsl #26
20000078: 20000065 andcs r0, r0, r5, rrx
2000007c: d0000000 andle r0, r0, r0
The tool has created the address to the entry point for core one with the lsbit set. 20000065
Now the next problem you have is
mov r1, sp @stack pointer
You are taking core zero's stack pointer address at this point in core zero's execution and setting that for core one. If you end core zero in an infinite loop after starting core one, then this can work. But if you want to keep doing things with core zero, you need to give core one its own stack pointer. In my example you can see that I give core zero 0x20002000 and core one 0x20003000. This would have been very painful to debug, as core one would start but you would have random chaos that changes every time you change the code.
And to your VTOR problem. I also tried just reading the VTOR and it did not work. Originally my code had a special vector table:
;@ vector_table: the first slot holds a branch to reset and not a stack
;@ pointer value, padded to 4 bytes so the reset vector word lands at offset 4.
.globl vector_table
vector_table:
b reset
.balign 4
.word reset ;@ has to be offset 4
.word loop
.word loop
.word loop
And I set the vector table, instead of read it
;@ point VTOR at vector_table (the register is written, not read)
ldr r1,=0xE000ED08 ;@ VTOR
ldr r0,=vector_table
str r0,[r1]
For core zero which is probably borrowed from other pico code I wrote that might have actually used the table. The b reset because we don't actually get to use the reset vector for core zero so this was my kludge. Could have done alignment stuff and put the vector table somewhere else in memory (and yes for both cores I set the stack pointer myself, initially, but for the above example assumed that core one was doing it itself).
And used that same address vector_table for core one. In this case I could have then read it and it would have worked. You have only provided a fraction so we do not know what you did with the VTOR for core zero before this code, but I assume you did not set it, since your code is not working.
You/we are not using a vector table in these examples so just need to make it happy, so I just forced 0x20000000 and it then worked.
I believe you need to fix all three addresses, the vector table, the entry point, and the stack pointer in order to have success.
From your rewrite, I made these modifications.
@ Entry for both cores (0x20000001 is later sent as core one's entry point):
@ set sp to 0x20001000, write 0x20000000 to VTOR, then read SIO CPUID.
@ CPUID == 1 goes to led; otherwise continue with the launch sequence in core.
.cpu cortex-m0
.thumb
ent:
ldr r0, =0x20001000
mov sp, r0 @init stack pointer
ldr r0, =0xe000ed08
ldr r1, =0x20000000
str r1, [r0] @init vtor
ldr r0, =0xd0000000
ldr r1, [r0]
cmp r1, #1
beq led
b core
@ Core-one launch handshake: sends {0, 0, 1, 0x20000000, sp, 0x20000001} and
@ restarts at `core` whenever the value read back differs from the one sent.
@ Helpers are entered with `b` after `mov r7, pc` and return with `mov pc, r7`.
@ Expected values are held in r4, which none of the FIFO helpers modify.
.thumb_func
core:
@ first 0 (FIFO drained beforehand)
mov r7, pc
b fifo_drain
mov r1, #0
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, #0
bne core
@ second 0 (FIFO drained again)
mov r7, pc
b fifo_drain
mov r1, #0
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, #0
bne core
@ 1
mov r1, #1
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, #1
bne core
@ vector table address
ldr r4, =0x20000000
mov r1, r4 @vector table
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, r4
bne core
@ stack pointer: core zero's own sp value is sent
mov r4, sp @stack pointer
mov r1, r4
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, r4
bne core
@ entry point with the thumb bit set
ldr r4, =0x20000001
mov r1, r4 @entry point
mov r7, pc
b fifo_writ
mov r7, pc
b fifo_read
cmp r1, r4
bne core
b loop
@ fifo_stat: r1 = low four bits of the FIFO status register (0xd0000050).
@ Returns through r7; overwrites r0, r1, r2. Not referenced elsewhere in this listing.
.thumb_func
fifo_stat:
ldr r0, =0xd0000050
ldr r1, [r0]
mov r2, #15
and r1, r1, r2
mov pc, r7
@ fifo_writ: spin until status bit 1 is set, then store r1 to the FIFO write
@ register (0xd0000054) and issue sev.
@ In: r1 = value to send, r7 = return address. Overwrites r0, r2, r3.
.thumb_func
fifo_writ:
ldr r0, =0xd0000050
ldr r3, [r0]
mov r2, #2
and r3, r3, r2
beq fifo_writ
ldr r0, =0xd0000054
str r1, [r0]
sev
mov pc, r7
@ fifo_read: when status bit 0 is clear, go to _wfe (which waits and retries);
@ otherwise load the FIFO read register (0xd0000058) into r1.
@ In: r7 = return address. Out: r1 = value read. Overwrites r0, r2, r3.
.thumb_func
fifo_read:
ldr r0, =0xd0000050
ldr r3, [r0]
mov r2, #1
and r3, r3, r2
beq _wfe
ldr r0, =0xd0000058
ldr r1, [r0]
mov pc, r7
@ fifo_drain: read and discard the FIFO read register, repeating while status
@ bit 0 is still set, then issue sev. The read happens before the status check.
@ In: r7 = return address. Overwrites r0, r1, r2 (r1 is 0 on return).
.thumb_func
fifo_drain:
ldr r0, =0xd0000058
ldr r1, [r0]
ldr r0, =0xd0000050
ldr r1, [r0]
mov r2, #1
and r1, r1, r2
bne fifo_drain
sev
mov pc, r7
@ _wfe: wait for an event, then retry fifo_read. r7 is left untouched, so
@ fifo_read still returns to its original caller.
.thumb_func
_wfe:
wfe
b fifo_read
;@ ----------------------------------
@ led: release IO_BANK0 from reset, select the SIO function for GPIO25, enable
@ the pin as an output and drive it high. Falls through into loop.
@ The .balign places this routine at the next 0x100-byte boundary so the
@ preceding code is padded out to that boundary.
.balign 0x100
.thumb_func
led:
movs r1, #32 @io_bank
ldr r0, =0x4000f000
str r1, [r0] @release reset on io_bank
movs r1, #5 @sio
ldr r0, =0x400140cc
str r1, [r0] @assign sio to gpio25_ctrl
movs r1, #1
lsl r1, r1, #25
ldr r0, =0xd0000024
str r1, [r0] @enable output
ldr r0, =0xd0000014
str r1, [r0] @turn on the led
@ loop: park the core in an endless nop loop.
.thumb_func
loop:
nop
b loop
First in a couple of places you used r3 to save the value you wanted to compare against after writing and reading back. But r3 is used both in the write and read so its contents are lost.
Second, the program was larger than 0x100 bytes. There is something strange about that boundary; I would have to re-research how I originally figured it out, but once I kept the code from running across the boundary (the .balign 0x100 before led), it worked.
As used above sp did not need to go to r4, but I did it to shotgun the problem.
If I remove the items that are not needed (the write to VTOR and the b core up front) and use bl and bx lr to call and return, this saves enough instructions to make the binary smaller than 0x100 bytes, so it can be used without putting that boundary in.
@ Entry for both cores (0x20000001 is later sent as core one's entry point):
@ set sp to 0x20001000 and read SIO CPUID. CPUID == 1 goes to led; otherwise
@ execution falls through into the launch sequence at core.
.cpu cortex-m0
.thumb
ent:
ldr r0, =0x20001000
mov sp, r0 @init stack pointer
ldr r0, =0xd0000000
ldr r1, [r0]
cmp r1, #1
beq led
@ Core-one launch handshake: sends {0, 0, 1, 0x20000000, sp, 0x20000001} and
@ restarts at `core` whenever the value read back differs from the one sent.
@ Helpers are called with bl and return with bx lr.
@ Expected values are held in r4, which none of the FIFO helpers modify.
core:
@ first 0 (FIFO drained beforehand)
bl fifo_drain
mov r1, #0
bl fifo_writ
bl fifo_read
cmp r1, #0
bne core
@ second 0 (FIFO drained again). This must be bl: with a plain b, fifo_drain's
@ bx lr returns through the lr left by the previous bl fifo_read, i.e. to the
@ cmp above with r1 = 0, and the sequence never gets past this point.
bl fifo_drain
mov r1, #0
bl fifo_writ
bl fifo_read
cmp r1, #0
bne core
@ 1
mov r1, #1
bl fifo_writ
bl fifo_read
cmp r1, #1
bne core
@ vector table address
ldr r4, =0x20000000
mov r1, r4 @vector table
bl fifo_writ
bl fifo_read
cmp r1, r4
bne core
@ stack pointer: core zero's own sp value is sent
mov r1, sp @stack pointer
bl fifo_writ
bl fifo_read
cmp r1, sp
bne core
@ entry point with the thumb bit set
ldr r4, =0x20000001
mov r1, r4 @entry point
bl fifo_writ
bl fifo_read
cmp r1, r4
bne core
b loop
@ fifo_stat: r1 = low four bits of the FIFO status register (0xd0000050).
@ Returns with bx lr; overwrites r0, r1, r2. Not referenced elsewhere in this listing.
fifo_stat:
ldr r0, =0xd0000050
ldr r1, [r0]
mov r2, #15
and r1, r1, r2
bx lr
@ fifo_writ: spin until status bit 1 is set, then store r1 to the FIFO write
@ register (0xd0000054) and issue sev.
@ In: r1 = value to send. Returns with bx lr. Overwrites r0, r2, r3.
fifo_writ:
ldr r0, =0xd0000050
ldr r3, [r0]
mov r2, #2
and r3, r3, r2
beq fifo_writ
ldr r0, =0xd0000054
str r1, [r0]
sev
bx lr
@ fifo_read: when status bit 0 is clear, go to _wfe; otherwise load the FIFO
@ read register (0xd0000058) into r1.
@ Out: r1 = value read. Returns with bx lr. Overwrites r0, r2, r3.
fifo_read:
ldr r0, =0xd0000050
ldr r3, [r0]
mov r2, #1
and r3, r3, r2
beq _wfe
ldr r0, =0xd0000058
ldr r1, [r0]
bx lr
@ fifo_drain: read and discard the FIFO read register, repeating while status
@ bit 0 is still set, then issue sev. The read happens before the status check.
@ Returns with bx lr. Overwrites r0, r1, r2 (r1 is 0 on return).
fifo_drain:
ldr r0, =0xd0000058
ldr r1, [r0]
ldr r0, =0xd0000050
ldr r1, [r0]
mov r2, #1
and r1, r1, r2
bne fifo_drain
sev
bx lr
@ _wfe: wait for an event, then retry fifo_read.
@ This has to be a plain branch. A bl here would overwrite lr, so fifo_read's
@ bx lr would return to the instruction after this one and not to the code
@ that originally called fifo_read.
_wfe:
wfe
b fifo_read
@ led: release IO_BANK0 from reset, select the SIO function for GPIO25, enable
@ the pin as an output and drive it high. Falls through into loop.
@ The reset-done status is not polled before the GPIO registers are written.
led:
movs r1, #32 @io_bank
ldr r0, =0x4000f000
str r1, [r0] @release reset on io_bank
movs r1, #5 @sio
ldr r0, =0x400140cc
str r1, [r0] @assign sio to gpio25_ctrl
movs r1, #1
lsl r1, r1, #25
ldr r0, =0xd0000024
str r1, [r0] @enable output
ldr r0, =0xd0000014
str r1, [r0] @turn on the led
@ loop: park the core in an endless nop loop.
loop:
nop
b loop
Note the instruction set allows for things like this:
@ fifo_drain, compact form: one base address in r0 serves both registers, with
@ the read register reached through an immediate offset of 8. Reads and
@ discards a FIFO entry, repeats while status bit 0 is set, then issues sev.
fifo_drain:
ldr r0, =0xd0000050
ldr r1, [r0,#8] @0xd0000058
ldr r1, [r0] @0xd0000050
mov r2, #1
and r1, r1, r2
bne fifo_drain
sev
bx lr
not as brute force and simple to read, but saves instructions.
For someone just learning ARM assembly language, and I presume the rp2040 at the same time, I am quite impressed; keep up the excellent work. This particular mcu is very, very cool, but also poorly documented. The ARM instruction set is well documented, but there is ARM vs thumb and then unified syntax vs not (fortunately you did not hit the difference). And then there is this 0x100-byte thing, which I cannot remember how I figured out; I think I looked at their code and worked it out from that, but I would have to re-research the whole thing. If you want to confirm this for yourself, take a version that is just under 0x100 bytes and then add some nops in the body somewhere to stretch it past 0x100 bytes. Note that with the simple changes described, and with the unused/unneeded code removed, I got yours down to
216 bytes read (0xD8)
216 bytes...
bottom line.
You had the right idea on the three parameters but they needed some work. And then a simple oops on using a register outside a function call that got used within a function call. Then the crazy 0x100 byte thing. This is the thing with bare-metal, hard to debug, gotta grind your way through, do not give up.
The mov r7,pc thing, I am actually impressed about, not critical of - a lot of folks would struggle with the two instructions ahead thing.