I'm trying to write the smallest possible code to extract the firmware of Infineon's XMC4500 microcontroller.
The code must fit into a 30-byte buffer, which leaves room for 15 machine instructions when using the 16-bit Thumb instruction set.
Starting with C my attempt is to dump flash memory through a single GPIO pin (see original question) following this nifty trick.
Basically what I'm doing is:
EDIT:
#include "XMC4500.h"
/*
 * Bit-bang memory contents out of port 1, starting at address 0x00000000.
 *
 * Each 32-bit word is shifted out least-significant bit first. For every bit,
 * PORT1->OUT is written twice: first with 0 (clock low), then with
 * 0x2 | data (clock bit set, current data bit in bit 0). Never returns.
 */
void main() {
    // start dumping at memory address 0x00000000
    // The declared type now matches the cast: the original mixed
    // `unsigned int *` with `uint32_t *`, which are distinct pointer types on
    // toolchains where uint32_t is `unsigned long`. The pointer is volatile
    // because address 0 is also the null pointer value, so the compiler must
    // perform every load rather than reason about it as an invalid access.
    const volatile uint32_t *p = (const volatile uint32_t *)0x0u;

    // configure port1 output (push-pull)
    PORT1->IOCR0 = 0x8080u;

    for (;;) {
        int i = 32;          // bits left to send from the current word
        int data = *(p++);   // fetch the next word and advance the pointer

        do {
            // clock low
            PORT1->OUT = 0x0;
            // clock high with data bits
            // Only bit 0 carries the payload; the remaining bits of `data`
            // are written to OUT as well.
            PORT1->OUT = 0x2u | data;
            data >>= 1;      // move the next bit into bit 0
        } while (--i > 0);
    }
}
; main - clock memory contents out through port 1, one 32-bit word at a
; time, least-significant bit first. Never returns.
;
; Register use:
;   r1  = PORT1 base address (0x48028100); OUT is written at [r1],
;         IOCR0 at [r1, #0x10]
;   r2  = current data word, shifted right by one per inner iteration
;   r3  = bit counter for the current word (starts at 32)
;   r4  = scratch: data word with bit 1 (the clock bit) set
;   r12 = read pointer; never initialised here, the code relies on it
;         being 0 on entry
main:
; PORT1->IOCR0 = 0x8080UL
ldr r1, =0x48028100 ; r1 = PORT1 base address (ldr= pseudo-instruction)
movw r2, #0x8080 ; r2 = 0x8080
str r2, [r1, #0x10] ; write r2 to PORT1 base + 0x10 (IOCR0)
main_1:
; outer loop: fetch the next word
; copying starts at address 0x00000000 because R12 is assumed to be
; zero on entry
ldr.w r2, [r12], #0x4 ; int data = *(p++)  (post-index: r12 += 4 after the load)
movs r3, #32 ; int i = 32
main_2:
; PORT1->OUT = 0x0
; clock low
; NOTE: R12 is zero only before the first load above; after that it holds
; the address of the next word. Those addresses are multiples of 4, so
; bits 0 and 1 of the value stored here are still 0, but the higher
; bits are not.
; NOTE(review): presumably harmless if only pins 0 and 1 of port 1 are
; configured as outputs - confirm against the IOCR0 value.
str r12, [r1]
; PORT1->OUT = 0x2 | data
; clock high with data bits
orr r4, r2, #0x2 ; r4 = data with the clock bit (bit 1) set
str r4, [r1]
asrs r2, r2, #0x1 ; data >>= 1
subs r3, r3, #0x1 ; i--  (sets the flags tested by the branch below)
bne.n main_2 ; while (--i > 0): repeat until r3 reaches 0
b.n main_1 ; while(true): fetch the next word
However, the code size is still too big to meet my requirements.
Is there anything I can do to further shrink down my code? Anything that can be optimized or left out?
If the high bits of Port1 don't change during this process, and you can ensure that you read the data bit slightly after the clock goes high, you could try something like this:
// Sketch of the dump loop with PORT1->OUT written from a constant.
// Relies on the high bits of port 1 not changing while the loop runs.
#define P1_DEFAULT 0x0000u // TODO: placeholder - set to the constant high bits of port 1, zeros in low two bits
int* dp = 0; // maybe use a register which is known to be zeroed.
PORT1->IOCR0 = 0x8080; // should be 3 ins
for (;;) {
    int i = 32;
    int data = *(dp++); // LDMIA instruction may do load and increment in 1 step.
    do {
        PORT1->OUT = P1_DEFAULT; // clock low
        PORT1->OUT = P1_DEFAULT + 2 + (data & 1); // clock high with data
        data >>= 1;
    } while (--i > 0);
}
This should remove three port reads, one port write, and a conditional.
Do it all in one function to avoid any call overhead. I would start with the generated assembly for this, and see what you can do to improve it.