traceperfbpfftraceebpf

Is it possible to use eBPF or perf to calculate time spent in individual traced functions?


Currently, I can tell trace-cmd (an ftrace front-end) to use ftrace's function graph infrastructure to trace a specified workload. Using some trace data processing tools, I can look at the frequency with which functions were called and the corresponding total duration across each of the function calls. Example output might look like the following, where I trace ext4 function calls generated by my workload:

# trace-cmd post-processing example output
# function name, frequency, total duration (us)
ext4_alloc_inode(),1,35.567
ext4_read_inode_bitmap(),1,12.076
ext4_init_io_end(),2,41.216
ext4_journal_check_start(),39,716.3
ext4_da_write_end(),10,102.661
ext4_da_get_block_prep(),9,257.46999999999997
ext4_block_write_begin(),2,88.069
ext4_bio_write_page(),1,33.016

I've seen the incredible flame graphs that can be generated by eBPF, systemtap, perf, etc, which lead me to believe that I should be able to achieve output similar to that of my trace-cmd output using perf or eBPF. However, there are a few obstacles:

  1. My workload can run for up to 15 minutes, which could generate enormous logs if I were to use perf. For reference, you can view example output that traced bash for far less time.
  2. I am not familiar enough with eBPF to determine whether I can achieve similar data/output as my trace-cmd post-processing example output.

For those of you who are more familiar with eBPF or perf than I am, I would like to know:

If you are feeling extra generous, sample code/commands that would help me achieve my goal would be appreciated.


Solution

  • Absolutely, it is possible to use eBPF to do that.

    There's actually a tool that does something similar in the bcc tools. funclatency traces a set of functions and display the histogram of their latencies (time spent in the function at each call):

    # ./funclatency do_sys_open
    Tracing do_sys_open... Hit Ctrl-C to end.
    ^C
         nsecs           : count     distribution
           0 -> 1        : 0        |                                      |
           2 -> 3        : 0        |                                      |
           4 -> 7        : 0        |                                      |
           8 -> 15       : 0        |                                      |
          16 -> 31       : 0        |                                      |
          32 -> 63       : 0        |                                      |
          64 -> 127      : 0        |                                      |
         128 -> 255      : 0        |                                      |
         256 -> 511      : 0        |                                      |
         512 -> 1023     : 0        |                                      |
        1024 -> 2047     : 0        |                                      |
        2048 -> 4095     : 124      |****************                      |
        4096 -> 8191     : 291      |**************************************|
        8192 -> 16383    : 36       |****                                  |
       16384 -> 32767    : 16       |**                                    |
       32768 -> 65535    : 8        |*                                     |
       65536 -> 131071   : 0        |                                      |
      131072 -> 262143   : 0        |                                      |
      262144 -> 524287   : 0        |                                      |
      524288 -> 1048575  : 0        |                                      |
     1048576 -> 2097151  : 0        |                                      |
     2097152 -> 4194303  : 1        |                                      |
    Detaching...
    

    Starting from funclatency, it's rather easy to derive a tool that does what you're looking for:

    from __future__ import print_function
    from bcc import BPF
    from time import sleep, strftime
    import argparse
    
    parser = argparse.ArgumentParser(
        description="Time functions and print time spent in each function",
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("-p", "--pid", type=int, help="trace this PID only")
    parser.add_argument("pattern", help="search expression for functions")
    args = parser.parse_args()
    
    # define BPF program
    bpf_text = """
    #include <uapi/linux/ptrace.h>
    
    struct stats_t {
        u64 time;
        u64 freq;
    };
    BPF_HASH(start, u32);
    BPF_HASH(ipaddr, u32);
    BPF_HASH(stats, u64, struct stats_t);
    
    int trace_func_entry(struct pt_regs *ctx)
    {
        u64 pid_tgid = bpf_get_current_pid_tgid();
        u32 pid = pid_tgid;
        u64 ts = bpf_ktime_get_ns();
    
        u64 ip = PT_REGS_IP(ctx);
        ipaddr.update(&pid, &ip);
        start.update(&pid, &ts);
    
        return 0;
    }
    
    int trace_func_return(struct pt_regs *ctx)
    {
        u64 *tsp, delta;
        u64 pid_tgid = bpf_get_current_pid_tgid();
        u32 pid = pid_tgid;
    
        // calculate delta time
        tsp = start.lookup(&pid);
        if (tsp == 0) {
            return 0;   // missed start
        }
        delta = bpf_ktime_get_ns() - *tsp;
        start.delete(&pid);
    
        // store as histogram
        u64 ip, *ipp = ipaddr.lookup(&pid);
        if (ipp) {
            ip = *ipp;
            struct stats_t *stat = stats.lookup(&ip);
            if (stat) {
                stat->time += delta;
                stat->freq++;
            } else {
                struct stats_t s = {};
                s.time = delta;
                s.freq = 1;
                stats.update(&ip, &s);
            }
            ipaddr.delete(&pid);
        }
    
        return 0;
    }
    """
    
    # load BPF program
    b = BPF(text=bpf_text)
    pattern = args.pattern.replace('*', '.*')
    pattern = '^' + pattern + '$'
    b.attach_kprobe(event_re=pattern, fn_name="trace_func_entry")
    b.attach_kretprobe(event_re=pattern, fn_name="trace_func_return")
    matched = b.num_open_kprobes()
    
    if matched == 0:
        print("0 functions matched by \"%s\". Exiting." % args.pattern)
        exit()
    # header
    print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." %
        (matched / 2, args.pattern))
    
    while (1):
        try:
            sleep(99999999)
        except KeyboardInterrupt:
            break
    
    print()
    print("%-36s %8s %16s" % ("FUNC", "COUNT", "TIME (nsecs)"))
    stats = b.get_table("stats")
    for k, v in stats.items():
        print("%-36s %8s %16s" % (BPF.sym(k.value, -1), v.freq, v.time))
    

    This tool uses BPF to compute the frequency and total duration of each function call directly in the kernel. The aggregated values are only copied to userspace at the end, before displaying them. Output:

    # ./time-spent do_sys_*
    Tracing 6 functions for "do_sys_*"... Hit Ctrl-C to end.
    ^C
    FUNC                                    COUNT     TIME (nsecs)
    do_sys_poll                              6783     213928440379
    do_sys_open                              3127         43172656
    do_sys_ftruncate.constprop.15              39           189623
    

    This solution relies on bcc to load BPF programs in the kernel, but it's also possible to do everything in C. You can find a tutorial to get started with bcc on the repository.