I have an Apple M2 machine running Asahi Linux. I'm writing a kernel module, and in that module I want to measure the number of cycles spent running a set of assembly instructions. I know that the standard ARM performance-monitor registers don't work on M1 or M2, and that Apple has implemented its own PMC registers.
The problem with reading the PMC0 register directly (assembly equivalent: MRS X0, PMC0) is that the counter first has to be initialized and enabled, and its events have to be defined, via Apple's performance monitor control register (PMCR0), which is a lot of work to include in my own module.
I have seen the Asahi team's work on perf support here, and it works fine. I'm not sure whether I can actually use perf in my case.
So my question is: is it possible to use the perf tool to measure the cycles spent in some part of a kernel module? I'd also very much appreciate any other ideas.
Here's how I used the code already implemented by the Asahi team to set up the performance counters on M1 in my own kernel module:
/*
 * setup_counter() - configure, enable and start one Apple PMC counter.
 * @pmc_counter: counter index; only 0..7 are handled by this helper.
 *
 * Steps, in order:
 *   1. set the EL0 and EL1 count bits for @pmc_counter in PMCR1;
 *   2. write 0xff to PMCR0, i.e. set the enable bits of counters 0-7 and
 *      clear every other PMCR0 bit;
 *   3. clear PMCR0.IACT and program PMCR0.IMODE with PMCR0_IMODE_FIQ.
 *
 * An index outside 0..7 is ignored and no register is touched.
 */
void setup_counter(int pmc_counter)
{
	u64 val, user_bit, kernel_bit;

	/*
	 * The *_0_7 fields are eight bits wide. A larger index would shift
	 * the bit into a neighbouring PMCR1 field (index 8 on the EL0 mask
	 * lands on bit 16, which is the EL1 bit of counter 0), and a negative
	 * index would make the shift count meaningless, so refuse both.
	 */
	if (pmc_counter < 0 || pmc_counter > 7)
		return;

	/* configure: count this counter's events at both EL0 and EL1 */
	user_bit = BIT(get_bit_offset(pmc_counter, PMCR1_COUNT_A64_EL0_0_7));
	kernel_bit = BIT(get_bit_offset(pmc_counter, PMCR1_COUNT_A64_EL1_0_7));
	val = read_sysreg_s(SYS_IMP_APL_PMCR1_EL1);
	val |= user_bit;
	val |= kernel_bit;
	write_sysreg_s(val, SYS_IMP_APL_PMCR1_EL1);
	isb();

	/*
	 * enable: 0xff covers bits 7:0 of PMCR0 (enable bits for counters
	 * 0-7). This is a plain write, so the rest of PMCR0 is zeroed here.
	 */
	write_sysreg_s(0xff, SYS_IMP_APL_PMCR0_EL1);
	isb();

	/* start: drop IACT and select the FIQ value for the IMODE field */
	val = read_sysreg_s(SYS_IMP_APL_PMCR0_EL1);
	val &= ~(PMCR0_IMODE | PMCR0_IACT);
	val |= FIELD_PREP(PMCR0_IMODE, PMCR0_IMODE_FIQ);
	write_sysreg_s(val, SYS_IMP_APL_PMCR0_EL1);
	isb();
}
And here's the header file:
#ifndef __TIMING_h
#define __TIMING_h
#include <linux/bits.h>
#include <asm/sysreg.h>
/*
 * Counters: system-register encodings for PMC0..PMC9.
 *
 * PMC0..PMC7 use 0..7 as the fourth sys_reg() argument (CRm); PMC8 and PMC9
 * use 9 and 10, so the value 8 is skipped in this list.
 * NOTE(review): the gap looks intentional - confirm against the upstream
 * Apple M1 PMU header before "fixing" it.
 *
 * In this file read_clk_counter() reads PMC0 and read_instruction_counter()
 * reads PMC1.
 */
#define SYS_IMP_APL_PMC0_EL1 sys_reg(3, 2, 15, 0, 0)
#define SYS_IMP_APL_PMC1_EL1 sys_reg(3, 2, 15, 1, 0)
#define SYS_IMP_APL_PMC2_EL1 sys_reg(3, 2, 15, 2, 0)
#define SYS_IMP_APL_PMC3_EL1 sys_reg(3, 2, 15, 3, 0)
#define SYS_IMP_APL_PMC4_EL1 sys_reg(3, 2, 15, 4, 0)
#define SYS_IMP_APL_PMC5_EL1 sys_reg(3, 2, 15, 5, 0)
#define SYS_IMP_APL_PMC6_EL1 sys_reg(3, 2, 15, 6, 0)
#define SYS_IMP_APL_PMC7_EL1 sys_reg(3, 2, 15, 7, 0)
#define SYS_IMP_APL_PMC8_EL1 sys_reg(3, 2, 15, 9, 0)
#define SYS_IMP_APL_PMC9_EL1 sys_reg(3, 2, 15, 10, 0)
/*
 * Core PMC control register (PMCR0) and its bit fields.
 *
 * setup_counter() writes 0xff to this register (the PMCR0_CNT_ENABLE_0_7
 * bits) and then programs PMCR0_IMODE with PMCR0_IMODE_FIQ while clearing
 * PMCR0_IACT. The remaining fields are defined here but not used by the
 * code visible in this file; their meaning is inferred from their names.
 */
#define SYS_IMP_APL_PMCR0_EL1 sys_reg(3, 1, 15, 0, 0)
/* Bits 7:0 - by name, one enable bit per counter 0-7. */
#define PMCR0_CNT_ENABLE_0_7 GENMASK(7, 0)
/* Bits 10:8 - mode field; takes one of the PMCR0_IMODE_* values below. */
#define PMCR0_IMODE GENMASK(10, 8)
#define PMCR0_IMODE_OFF 0
#define PMCR0_IMODE_PMI 1
#define PMCR0_IMODE_AIC 2
#define PMCR0_IMODE_HALT 3
#define PMCR0_IMODE_FIQ 4
/* Bit 11 - cleared together with IMODE in setup_counter(). */
#define PMCR0_IACT BIT(11)
/* Bits 19:12 - by name, per-counter PMI enable for counters 0-7. */
#define PMCR0_PMI_ENABLE_0_7 GENMASK(19, 12)
#define PMCR0_STOP_CNT_ON_PMI BIT(20)
#define PMCR0_CNT_GLOB_L2C_EVT BIT(21)
#define PMCR0_DEFER_PMI_TO_ERET BIT(22)
#define PMCR0_ALLOW_CNT_EN_EL0 BIT(30)
/* Counters 8 and 9 have their enable/PMI bits in separate, higher fields. */
#define PMCR0_CNT_ENABLE_8_9 GENMASK(33, 32)
#define PMCR0_PMI_ENABLE_8_9 GENMASK(45, 44)
/*
 * PMCR1: per-counter count bits, split by exception level.
 *
 * setup_counter() sets one bit in the EL0 field and one in the EL1 field for
 * the requested counter; the bit position is the field's lowest bit plus the
 * counter index (see get_bit_offset()). Counters 0-7 and counters 8-9 live
 * in separate fields.
 */
#define SYS_IMP_APL_PMCR1_EL1 sys_reg(3, 1, 15, 1, 0)
#define PMCR1_COUNT_A64_EL0_0_7 GENMASK(15, 8)
#define PMCR1_COUNT_A64_EL1_0_7 GENMASK(23, 16)
#define PMCR1_COUNT_A64_EL0_8_9 GENMASK(41, 40)
#define PMCR1_COUNT_A64_EL1_8_9 GENMASK(49, 48)
/*
 * PMCR2..PMCR4: register encodings only. No fields are defined for them here
 * and the code visible in this file does not access them.
 */
#define SYS_IMP_APL_PMCR2_EL1 sys_reg(3, 1, 15, 2, 0)
#define SYS_IMP_APL_PMCR3_EL1 sys_reg(3, 1, 15, 3, 0)
#define SYS_IMP_APL_PMCR4_EL1 sys_reg(3, 1, 15, 4, 0)
/*
 * PMESR0: four 8-bit fields, one each for counters 2, 3, 4 and 5.
 * NOTE(review): by name these select the event each counter counts; the
 * code visible in this file never writes them - confirm against the driver.
 */
#define SYS_IMP_APL_PMESR0_EL1 sys_reg(3, 1, 15, 5, 0)
#define PMESR0_EVT_CNT_2 GENMASK(7, 0)
#define PMESR0_EVT_CNT_3 GENMASK(15, 8)
#define PMESR0_EVT_CNT_4 GENMASK(23, 16)
#define PMESR0_EVT_CNT_5 GENMASK(31, 24)
/* PMESR1: same layout as PMESR0, for counters 6, 7, 8 and 9. */
#define SYS_IMP_APL_PMESR1_EL1 sys_reg(3, 1, 15, 6, 0)
#define PMESR1_EVT_CNT_6 GENMASK(7, 0)
#define PMESR1_EVT_CNT_7 GENMASK(15, 8)
#define PMESR1_EVT_CNT_8 GENMASK(23, 16)
#define PMESR1_EVT_CNT_9 GENMASK(31, 24)
/*
 * PMSR: bits 9:0 form the overflow field.
 * NOTE(review): presumably one overflow flag per counter 0-9 - confirm.
 */
#define SYS_IMP_APL_PMSR_EL1 sys_reg(3, 1, 15, 13, 0)
#define PMSR_OVERFLOW GENMASK(9, 0)
/*
 * Bit position of counter @index inside the register field @mask:
 * the position of the mask's lowest set bit, advanced by @index.
 * No range check is done; the caller must keep @index within the field.
 */
#define get_bit_offset(index, mask) ((index) + __ffs(mask))
/*
 * Configure, enable and start the PMC counter with index @pmc_counter.
 * Defined in the module's C source.
 */
extern void setup_counter(int pmc_counter);
/*
 * Counter read helpers: read_clk_counter() reads PMC0 and
 * read_instruction_counter() reads PMC1.
 *
 * Both expand to a plain expression with no trailing semicolon, so they can
 * be used inside larger expressions, e.g.
 *	delta = read_clk_counter() - start;
 * and do not leave a stray empty statement in an unbraced if/else.
 */
#define read_clk_counter() read_sysreg_s(SYS_IMP_APL_PMC0_EL1)
#define read_instruction_counter() read_sysreg_s(SYS_IMP_APL_PMC1_EL1)
#endif /* __TIMING_h */