RISC-V GCC Force Linker Relaxation with GP Register to Address Static Data

I'm writing bare-metal firmware for a virtual RISC-V SoC. The code below communicates with a physical device on the SoC, using constant data from the my_data array, which sits in ROM. The linker script defines __global_pointer$ to be equal to the start of this constant data array, and _start loads that address into the GP register (x3). My assumption is that the linker will use the GP register for relaxation to produce code-size-efficient code.

void main(void);

/*
 * Reset entry point.
 *
 * Clears the integer registers, loads the stack pointer (x2) from
 * __stack_start$ and the global pointer (x3) from __global_pointer$ (both
 * symbols come from the linker script), calls main(), and jumps back to
 * _start if main() ever returns.
 *
 * The function is `naked`: the compiler emits no prologue or epilogue, and
 * sp is only initialised inside this function.  GCC guarantees only basic
 * asm statements inside a naked function; C statements (such as a C-level
 * call to main()) and extended asm with clobber lists cannot be relied on
 * there.  The whole body is therefore a single basic asm statement, which
 * also keeps the `.option push` / `.option pop` pair in one unit.
 */
__attribute__((naked, noreturn)) void _start(void) {
  __asm__ __volatile__ (
    /* Relaxation is switched off while gp is loaded, so the linker cannot
       rewrite this `la` into a gp-relative form that would read gp before
       it holds a valid value. */
    ".option push\n\t"
    ".option norelax\n\t"
    "add x1, x0, x0\n\t"
    "la x2, __stack_start$\n\t"
    "la x3, __global_pointer$\n\t"
    "add x4, x0, x0\n\t"
    // ... rest of registers
    "add x31, x0, x0\n\t"
    ".option pop\n\t"
    /* Previous assembler options (relaxation included) are restored here. */
    "call main\n\t"
    "j _start"
  );
}

/*
 * Register window of the PHY device: 0x1000 unsigned int registers.
 *
 * regs is volatile, so every access written in the source is performed as a
 * real load or store.  The aligned attribute follows the typedef name, so it
 * applies to the phy_t typedef rather than to the anonymous struct.
 * NOTE(review): 0x8000 equals LENGTH of the PHY memory region in the linker
 * script; this looks intentional -- confirm.
 */
typedef struct {
  volatile unsigned int regs [0x1000];
} phy_t __attribute__ ((aligned (0x8000)));

/* Declaration only: the address comes from the linker script, which has
   PROVIDE(__phy$ = .) in the .phy output section. */
extern phy_t __phy$;

/* Three constant words that main() writes to the PHY registers.  The map
   output shows the array placed in .srodata. */
const unsigned int my_data[3] = {
  [0] = 0x12345678u,
  [1] = 0x23456789u,
  [2] = 0x3456789Au
};

/*
 * Copies the three my_data words into PHY registers 0..2.
 *
 * my_data is a const array whose initializer is visible here, so the
 * compiler may substitute the values directly.  The -Os listing further
 * down shows exactly that: lui/addi immediates instead of loads from
 * my_data, which leaves nothing for the linker to relax against gp.
 */
void main() {
  __phy$.regs[0x0000] = my_data[0];
  __phy$.regs[0x0001] = my_data[1];
  __phy$.regs[0x0002] = my_data[2];
}

/* Linker script for the firmware image: 32-bit little-endian RISC-V ELF. */
OUTPUT_FORMAT("elf32-littleriscv", "elf32-littleriscv", "elf32-littleriscv")
OUTPUT_ARCH(riscv)

/* Address map: the PHY device window at 0, then RAM, then ROM. */
MEMORY {
    PHY   ( W ) : ORIGIN = 0x00000000, LENGTH = 0x00008000
    RAM   (AWL) : ORIGIN = 0x00010000, LENGTH = 0x00010000
    ROM   (XR ) : ORIGIN = 0x00020000, LENGTH = 0x00010000
}

SECTIONS {
    /* Code, placed first in ROM. */
    .text : ALIGN(4) {
        PROVIDE(__text_start__$ = .);
        *(.text .text.*)
        PROVIDE(__text_end__$ = .);
    } > ROM

    /* Read-only data in ROM.  __global_pointer$ is anchored at the very
       start of this section.  Because it is defined with PROVIDE, it only
       exists when something references it; _start does so through
       `la x3, __global_pointer$`. */
    .rodata : ALIGN(4) {
        PROVIDE(__rodata_start__$ = .);
        PROVIDE(__global_pointer$ = .);
        *(.rodata .rodata.*)
        *(.constdata .constdata.*)
        *(.srodata .srodata.*)
        PROVIDE(__rodata_end__$ = .);
    } > ROM

    /* Initialised data: addressed in RAM, load image stored in ROM (AT>ROM).
       __data_source__$ is the ROM load address of that image.
       NOTE(review): the visible part of _start contains no copy loop from
       __data_source__$ to __data_start__$ -- confirm one exists before
       relying on a non-empty .data. */
    .data : ALIGN(4) {
        PROVIDE(__data_source__$ = LOADADDR(.data));
        PROVIDE(__data_start__$ = .);
        *(.data_begin .data_begin.*)
        *(.data .data.*)
        *(.data_end .data_end.*)
        *(.sdata .sdata.*)
        PROVIDE(__data_end__$ = .);
    } > RAM AT>ROM

    /* Stack: occupies RAM from the current location up to the end of the
       RAM region.  __stack_start$ is the end-of-RAM address that _start
       loads into x2. */
    .stack (NOLOAD) : ALIGN(4) {
        PROVIDE(__stack_end$ = .);
        . = ORIGIN(RAM) + LENGTH(RAM);
        PROVIDE(__stack_start$ = .);
    } > RAM

    /* No contents: only defines __phy$ at the start of the PHY region
       (the disassembly resolves it to address 0). */
    .phy (NOLOAD) : ALIGN(16) {
        PROVIDE(__phy$ = .);
    } > PHY
}

# Compile-time options.  -ffreestanding is a compiler option, so it lives here.
CFLAGS=-march=rv32ic -mabi=ilp32 -Os -mcmodel=medlow -ffreestanding
# Link-time options.  --emit-relocs keeps the relocation records in the output
# file so that `objdump -r` can print them in the listing.
LDFLAGS=-nostdlib -static -Xlinker --emit-relocs

# `all` names a task, not a file on disk.
.PHONY: all

# Recipe lines must start with a TAB character; leading spaces make GNU make
# stop with "missing separator".
all:
	riscv-unknown-elf-gcc $(CFLAGS) $(LDFLAGS) -o firmware.elf -x c firmware.c -T firmware.ld
	riscv-unknown-elf-objdump -M numeric -d -r firmware.elf > firmware.lst

However, the listing contains the data inlined as lui+addi instruction pairs instead. The map output shows the my_data array in the correct section, .srodata; putting it in the .sdata section via __attribute__((section(".sdata"))) doesn't help either. I have tried the options -msmall-data-limit=512, -fno-pie, -Xlinker --relax-gp, and -Xlinker gpsize=512 on GCC 15.1.0. Is there a way to force relaxation via the GP register?

00020020 <main>:
   20020:  12345737  lui  x14,0x12345
   20024:  67870713  addi  x14,x14,1656 # 12345678 <my_data+0x12325630>
            20024: R_RISCV_NONE *ABS*+0x4
            20024: R_RISCV_RELAX    *ABS*
   20028:  00e02023  sw  x14,0(x0) # 0 <__phy$>
            20028: R_RISCV_GPREL_S  __phy$
            20028: R_RISCV_RELAX    *ABS*
   2002c:  23456737  lui  x14,0x23456
   20030:  00000793  li  x15,0
            20030: R_RISCV_GPREL_I  __phy$
            20030: R_RISCV_RELAX    *ABS*
   20034:  78970713  addi  x14,x14,1929 # 23456789 <my_data+0x23436741>
   20038:  c3d8      sw  x14,4(x15)
   2003a:  34568737  lui  x14,0x34568
   2003e:  89a70713  addi  x14,x14,-1894 # 3456789a <my_data+0x34547852>
   20042:  c798      sw  x14,8(x15)
   20044:  8082      ret

...
.rodata         0x0000000000020048        0xc
                [!provide]                        PROVIDE (__rodata_start__$ = .)
                0x0000000000020048                PROVIDE (__global_pointer$ = .)
 *(.rodata .rodata.*)
 *(.constdata .constdata.*)
 *(.srodata .srodata.*)
 .srodata       0x0000000000020048        0xc
                0x0000000000020048                my_data
 *(.sdata .sdata.*)
                [!provide]                        PROVIDE (__rodata_end__$ = .)

Solution

As pointed out by Andrey Turkin, reading the array through a volatile-qualified pointer stops the compiler from folding the constants into immediates, so it emits real memory accesses instead. The updated code is as follows:

/*
 * Register window of the PHY device: 0x1000 unsigned int registers.
 *
 * The volatile qualifier on regs makes every access written in the source a
 * real load or store.  The aligned attribute is placed after the typedef
 * name, so it applies to the phy_t typedef, not to the anonymous struct.
 * NOTE(review): 0x8000 equals LENGTH of the PHY memory region in the linker
 * script; presumably deliberate -- confirm.
 */
typedef struct {
  volatile unsigned int regs [0x1000];
} phy_t __attribute__ ((aligned (0x8000)));

/* Declaration only: the linker script supplies the address with
   PROVIDE(__phy$ = .) in the .phy output section. */
extern phy_t __phy$;

/* Three constant words that main() copies to the PHY registers; main() reads
   them through a volatile-qualified pointer. */
const unsigned int my_data[3] = {
  [0] = 0x12345678u,
  [1] = 0x23456789u,
  [2] = 0x3456789Au
};

/*
 * Copies the three my_data words into PHY registers 0..2.
 *
 * Each source word is read through a volatile-qualified lvalue, so the
 * compiler has to emit a real load from my_data instead of substituting the
 * initializer value as an immediate.  The cast adds `volatile` and keeps
 * `const`, so no qualifier of my_data is discarded (clean under
 * -Wcast-qual).
 */
void main() {
  __phy$.regs[0x0000] = ((const volatile unsigned int *) my_data)[0];
  __phy$.regs[0x0001] = ((const volatile unsigned int *) my_data)[1];
  __phy$.regs[0x0002] = ((const volatile unsigned int *) my_data)[2];
}

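Another issue is that `__global_pointer$` was defined with `PROVIDE` in the linker script, which makes it a weak-like, conditionally defined symbol; it should be defined with a plain assignment instead. The fixed linker script is as follows: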

/* Linker script for the firmware image: 32-bit little-endian RISC-V ELF. */
OUTPUT_FORMAT("elf32-littleriscv", "elf32-littleriscv", "elf32-littleriscv")
OUTPUT_ARCH(riscv)

/* Address map: the PHY device window at 0, then RAM, then ROM. */
MEMORY {
    PHY   ( W ) : ORIGIN = 0x00000000, LENGTH = 0x00008000
    RAM   (AWL) : ORIGIN = 0x00010000, LENGTH = 0x00010000
    ROM   (XR ) : ORIGIN = 0x00020000, LENGTH = 0x00010000
}

SECTIONS {
    /* Code, placed first in ROM. */
    .text : ALIGN(4) {
        PROVIDE(__text_start__$ = .);
        *(.text .text.*)
        PROVIDE(__text_end__$ = .);
    } > ROM

    /* Read-only data in ROM; also defines the global pointer symbol. */
    .rodata : ALIGN(4) {
        PROVIDE(__rodata_start__$ = .);
        *(.rodata .rodata.*)
        *(.constdata .constdata.*)
        *(.srodata .srodata.*)
        PROVIDE(__rodata_end__$ = .);

        /*
           - Do not use PROVIDE(__global_pointer$ = ...) here.
             Otherwise, __global_pointer$ will be ignored during relaxation.
             A plain assignment defines the symbol unconditionally.
           - 0x800 is the reach of the 12-bit signed immediate
             (-0x800 .. +0x7FF) encoded in RISC-V load/store/addi
             instructions; biasing gp by it lets negative gp-relative
             offsets reach back toward the start of .rodata.
           - The 0x38 is the safety margin used during relaxation
             where no relaxation is performed near the gp address.
             NOTE(review): the value 0x38 is kept as found -- confirm it
             against the linker version in use.
        */
        __global_pointer$ = __rodata_start__$ + 0x800 - 0x38;
    } > ROM

    /* Initialised data: addressed in RAM, load image stored in ROM (AT>ROM).
       __data_source__$ is the ROM load address of that image. */
    .data : ALIGN(4) {
        PROVIDE(__data_source__$ = LOADADDR(.data));
        PROVIDE(__data_start__$ = .);
        *(.data_begin .data_begin.*)
        *(.data .data.*)
        *(.data_end .data_end.*)
        *(.sdata .sdata.*)
        PROVIDE(__data_end__$ = .);
    } > RAM AT>ROM

    /* Stack: occupies RAM from the current location up to the end of the
       RAM region; __stack_start$ is the end-of-RAM address. */
    .stack (NOLOAD) : ALIGN(4) {
        PROVIDE(__stack_end$ = .);
        . = ORIGIN(RAM) + LENGTH(RAM);
        PROVIDE(__stack_start$ = .);
    } > RAM

    /* No contents: only defines __phy$ at the start of the PHY region. */
    .phy (NOLOAD) : ALIGN(16) {
        PROVIDE(__phy$ = .);
    } > PHY
}

The compiler now emits relaxable relocations (shown here in the unlinked object file), which the linker can relax to use the ZERO and GP registers for addressing:

main:
 addi   sp,sp,-16
 sw ra,12(sp)
 sw s0,8(sp)
 addi   s0,sp,16
 lui    a5,0x0
    R_RISCV_HI20 my_data
    R_RISCV_RELAX *ABS*
 mv a5,a5
    R_RISCV_LO12_I my_data
    R_RISCV_RELAX *ABS*
 lw a4,0(a5)
 lui    a5,0x0
    R_RISCV_HI20 __phy$
    R_RISCV_RELAX *ABS*
 sw a4,0(a5) # 0 <main>
    R_RISCV_LO12_S __phy$
    R_RISCV_RELAX *ABS*
 lui    a5,0x0
    R_RISCV_HI20 my_data+0x4
    R_RISCV_RELAX *ABS*+0x4
 addi   a5,a5,4 # 4 <main+0x4>
    R_RISCV_LO12_I my_data+0x4
    R_RISCV_RELAX *ABS*+0x4
 lw a4,0(a5)
 ...
 sw a4,8(a5)
 li a5,0
 mv a0,a5
 lw ra,12(sp)
 lw s0,8(sp)
 addi   sp,sp,16
 ret

The resulting instructions after linking are not fully optimal, but now make use of the GP register as below:

main:
 addi   sp,sp,-16
 sw ra,12(sp)
 sw s0,8(sp)
 addi   s0,sp,16
 addi   a5,gp,56 # 20038 <my_data>
 lw a4,0(a5)
 sw a4,0(zero) # 0 <__phy$>
 addi   a5,gp,60 # 2003c <my_data+0x4>
 lw a4,0(a5)
 li a5,0
 sw a4,4(a5)
 addi   a5,gp,64 # 20040 <my_data+0x8>
 lw a4,0(a5)
 li a5,0
 sw a4,8(a5)
 li a5,0
 mv a0,a5
 lw ra,12(sp)
 lw s0,8(sp)
 addi   sp,sp,16
 ret