Tags: linux, performance, x86, amd-processor, intel-pmu

How to use rdpmc instruction on AMD (EPYC) processor?


This program displays the number of actual CPU core cycles executed by the current core, using the corresponding PMC (which I believe is UNHALTED_CORE_CYCLES):

#include <unistd.h>
#include <cstdio>

// Reads one performance counter with the rdpmc instruction and prints it.
// argc/argv are accepted but not used.
int main(int argc, char* argv[]){

    // Value handed to rdpmc in ECX: bit 30 set, plus 1. The question's author
    // believes this selects UNHALTED_CORE_CYCLES.
    const unsigned long selector = (1UL << 30) + 1;

    // rdpmc delivers the low half in EAX and the high half in EDX.
    unsigned long lo, hi;
    __asm__ volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (selector));

    // Stitch the two halves back into a single value.
    const unsigned long cycles = ((hi << 32) | lo);
    printf("Current cycles  : %lu\n", cycles);

}

It works well on Intel processors, but fails with a "Segmentation fault" on AMD ones (EPYC 7001 and 7002 series). My first guess was to look for a different value of c corresponding to the AMD CPU_CLOCKS_UNHALTED event (0x76), but without success so far.


Solution

  • The counter number is wrong: AMD uses different RDPMC counter values than Intel. Depending on the processor, multiple events are directly supported through rdpmc; please refer to the AMD manual (section on RDPMC) for further information.

    The core cycle number should be 0 in your case.

    This code works for me to count PERF_COUNT_HW_INSTRUCTIONS:

    #include <asm/unistd.h>
    #include <linux/perf_event.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    
    /*
     * Thin wrapper that issues the perf_event_open system call through
     * syscall(), forwarding all five arguments unchanged.
     *
     * Returns whatever syscall() returned, narrowed to int and widened back to
     * long.
     */
    static long
    perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
                    int cpu, int group_fd, unsigned long flags) {
        return (int)syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
    }
    
    /*
     * rdpmc(counter, low, high): execute the rdpmc instruction with `counter`
     * in ECX and store EAX into `low` and EDX into `high`. `low` and `high`
     * must be assignable lvalues. Use as a statement.
     */
    #define rdpmc(counter, low, high)                        \
        do {                                                 \
            __asm__ __volatile__("rdpmc"                     \
                                 : "=a"(low), "=d"(high)     \
                                 : "c"(counter));            \
        } while (0)
    
    
    /*
     * Opens a PERF_COUNT_HW_INSTRUCTIONS event (pid 0, cpu -1) and, 50 times,
     * prints the difference between two back-to-back rdpmc reads of hardware
     * counter 1 taken while the event is enabled.
     *
     * Returns EXIT_SUCCESS, or exits/returns EXIT_FAILURE when the event cannot
     * be opened or one of the ioctl() calls on it fails.
     *
     * NOTE(review): the counter index passed to rdpmc is hard-coded. The
     * perf_event_open(2) man page describes mmap()ing the event and reading
     * perf_event_mmap_page.index to learn which hardware counter the kernel
     * actually assigned (and whether user-space rdpmc is permitted) -- confirm
     * the fixed index is valid on the target machine.
     */
    int main(void) {
        /* unsigned long long is at least 64 bits everywhere, so the shift by 32
         * below is well defined even where long is 32 bits. */
        unsigned long long before, after;
        unsigned int counter, low, high;
        struct perf_event_attr pe;
        int fd, i;

        //PERF_COUNT_HW_INSTRUCTIONS
        // Performance counter 1 on AMD
        // 1 << 30 on Intel
        counter = 1;

        memset(&pe, 0, sizeof(struct perf_event_attr));
        pe.type = PERF_TYPE_HARDWARE;
        pe.size = sizeof(struct perf_event_attr);
        pe.config = PERF_COUNT_HW_INSTRUCTIONS;
        pe.disabled = 1;        /* started explicitly with IOC_ENABLE below */
        pe.exclude_kernel = 0;
        pe.exclude_hv = 0;
        pe.exclude_idle = 0;

        fd = (int)perf_event_open(&pe, 0, -1, -1, 0);
        if (fd == -1) {
            fprintf(stderr, "Error opening leader %llx\n", pe.config);
            exit(EXIT_FAILURE);
        }
        for (i = 1; i <= 50; i++) {
            /* Do not execute rdpmc if the event could not be reset/enabled. */
            if (ioctl(fd, PERF_EVENT_IOC_RESET, 0) == -1 ||
                ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == -1) {
                perror("ioctl(PERF_EVENT_IOC_RESET/ENABLE)");
                close(fd);
                return EXIT_FAILURE;
            }

            rdpmc(counter, low, high);
            before = ((unsigned long long)high << 32) + (unsigned long long)low;
            /* Code under test would go here; only an lfence is measured. */
            __asm__ __volatile__("lfence": : :"memory");        // test ()
            rdpmc(counter, low, high);
            after = ((unsigned long long)high << 32) + (unsigned long long)low;

            if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0) == -1) {
                perror("ioctl(PERF_EVENT_IOC_DISABLE)");
                close(fd);
                return EXIT_FAILURE;
            }
            printf(" %llu\n", after - before);
        }
        close(fd);
        return EXIT_SUCCESS;
    }
    
    

    Tested on Ryzen 7950X