Currently, I can tell trace-cmd (an ftrace front-end) to use ftrace's function graph infrastructure to trace a specified workload. Using some trace data processing tools, I can then look at how often each function was called and the total duration spent across all of those calls. Example output might look like the following, where I trace the ext4 function calls generated by my workload:
# trace-cmd post-processing example output
# function name, frequency, total duration (us)
ext4_alloc_inode(),1,35.567
ext4_read_inode_bitmap(),1,12.076
ext4_init_io_end(),2,41.216
ext4_journal_check_start(),39,716.3
ext4_da_write_end(),10,102.661
ext4_da_get_block_prep(),9,257.46999999999997
ext4_block_write_begin(),2,88.069
ext4_bio_write_page(),1,33.016
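For context, a rough sketch of this kind of post-processing over trace-cmd report output (not my exact tool) might look like the script below; it is a simplification that only aggregates leaf calls, whose duration appears inline on funcgraph_entry lines:

import re
import sys
from collections import defaultdict

# Leaf calls in `trace-cmd report` function_graph output carry their duration
# inline, e.g. "funcgraph_entry:   3.456 us | ext4_alloc_inode();"
LEAF = re.compile(r'funcgraph_entry:\s+[+!]?\s*([\d.]+) us\s*\|\s*([\w.]+)\(\);')

stats = defaultdict(lambda: [0, 0.0])  # function name -> [count, total duration in us]

for line in sys.stdin:
    m = LEAF.search(line)
    if m:
        duration, name = float(m.group(1)), m.group(2)
        stats[name][0] += 1
        stats[name][1] += duration

# emit CSV rows (function, frequency, total duration in us), sorted by total duration
for name, (count, total) in sorted(stats.items(), key=lambda kv: -kv[1][1]):
    print("%s(),%d,%.3f" % (name, count, total))

Piping the output of trace-cmd report into a script like this yields the function,frequency,duration rows shown above; nested (non-leaf) calls would need their funcgraph_exit lines paired up, which this sketch does not do.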
I've seen the incredible flame graphs that can be generated by eBPF, systemtap, perf, etc., which lead me to believe that I should be able to achieve output similar to my trace-cmd output using perf or eBPF. However, there are a few obstacles:
- perf produces an enormous amount of raw output. For reference, you can view example output that traced bash for far less time.
- I don't know enough eBPF to determine whether I can achieve similar data/output as my trace-cmd post-processing example output.

For those of you who are more familiar with eBPF or perf than I am, I would like to know:
- Is it possible to use eBPF or perf to achieve my goal?
- If you are feeling extra generous, sample code/commands that would help me achieve my goal would be appreciated.
Absolutely, it is possible to use eBPF to do that.
There's actually a tool in the bcc collection that does something similar: funclatency traces a set of functions and displays a histogram of their latencies (the time spent in the function on each call):
# ./funclatency do_sys_open
Tracing do_sys_open... Hit Ctrl-C to end.
^C
nsecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 0 | |
1024 -> 2047 : 0 | |
2048 -> 4095 : 124 |**************** |
4096 -> 8191 : 291 |**************************************|
8192 -> 16383 : 36 |**** |
16384 -> 32767 : 16 |** |
32768 -> 65535 : 8 |* |
65536 -> 131071 : 0 | |
131072 -> 262143 : 0 | |
262144 -> 524287 : 0 | |
524288 -> 1048575 : 0 | |
1048576 -> 2097151 : 0 | |
2097152 -> 4194303 : 1 | |
Detaching...
Starting from funclatency, it's rather easy to derive a tool that does what you're looking for:
from __future__ import print_function
from bcc import BPF
from time import sleep, strftime
import argparse
parser = argparse.ArgumentParser(
    description="Time functions and print time spent in each function",
    formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("-p", "--pid", type=int, help="trace this PID only")
parser.add_argument("pattern", help="search expression for functions")
args = parser.parse_args()
# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
struct stats_t {
u64 time;
u64 freq;
};
BPF_HASH(start, u32);
BPF_HASH(ipaddr, u32);
BPF_HASH(stats, u64, struct stats_t);
int trace_func_entry(struct pt_regs *ctx)
{
u64 pid_tgid = bpf_get_current_pid_tgid();
u32 pid = pid_tgid;
u64 ts = bpf_ktime_get_ns();
u64 ip = PT_REGS_IP(ctx);
ipaddr.update(&pid, &ip);
start.update(&pid, &ts);
return 0;
}
int trace_func_return(struct pt_regs *ctx)
{
u64 *tsp, delta;
u64 pid_tgid = bpf_get_current_pid_tgid();
u32 pid = pid_tgid;
// calculate delta time
tsp = start.lookup(&pid);
if (tsp == 0) {
return 0; // missed start
}
delta = bpf_ktime_get_ns() - *tsp;
start.delete(&pid);
// store as histogram
u64 ip, *ipp = ipaddr.lookup(&pid);
if (ipp) {
ip = *ipp;
struct stats_t *stat = stats.lookup(&ip);
if (stat) {
stat->time += delta;
stat->freq++;
} else {
struct stats_t s = {};
s.time = delta;
s.freq = 1;
stats.update(&ip, &s);
}
ipaddr.delete(&pid);
}
return 0;
}
"""
# substitute the optional PID filter (same approach as bcc's funclatency)
if args.pid:
    bpf_text = bpf_text.replace('FILTER',
                                'if (tgid != %d) { return 0; }' % args.pid)
else:
    bpf_text = bpf_text.replace('FILTER', '')

# load BPF program
b = BPF(text=bpf_text)

# attach entry and return probes to every kernel function matching the pattern
pattern = args.pattern.replace('*', '.*')
pattern = '^' + pattern + '$'
b.attach_kprobe(event_re=pattern, fn_name="trace_func_entry")
b.attach_kretprobe(event_re=pattern, fn_name="trace_func_return")
matched = b.num_open_kprobes()
if matched == 0:
    print("0 functions matched by \"%s\". Exiting." % args.pattern)
    exit()

# header
print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." %
      (matched // 2, args.pattern))

# sleep until Ctrl-C
while True:
    try:
        sleep(99999999)
    except KeyboardInterrupt:
        break

# read the in-kernel aggregates and print them
print()
print("%-36s %8s %16s" % ("FUNC", "COUNT", "TIME (nsecs)"))
stats = b.get_table("stats")
for k, v in stats.items():
    print("%-36s %8s %16s" % (BPF.sym(k.value, -1), v.freq, v.time))
This tool uses BPF to compute the frequency and total duration of each function call directly in the kernel. The aggregated values are copied to userspace only once, at the end, when they are displayed. Output:
# ./time-spent do_sys_*
Tracing 6 functions for "do_sys_*"... Hit Ctrl-C to end.
^C
FUNC                                    COUNT     TIME (nsecs)
do_sys_poll                              6783     213928440379
do_sys_open                              3127         43172656
do_sys_ftruncate.constprop.15              39           189623
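If you want the output in the same shape as the trace-cmd post-processing example in the question (function name, count, total duration in microseconds), the final loop of the script above can be adapted along these lines; note that BPF.sym() may return bytes on Python 3, hence the decode:

# Sketch: print CSV rows like the trace-cmd example, sorted by total time spent.
# Durations are converted from nanoseconds to microseconds.
for k, v in sorted(stats.items(), key=lambda kv: kv[1].time, reverse=True):
    name = BPF.sym(k.value, -1)
    if isinstance(name, bytes):  # BPF.sym() may return bytes on Python 3
        name = name.decode('utf-8', 'replace')
    print("%s(),%d,%.3f" % (name, v.freq, v.time / 1000.0))

Run with a pattern such as 'ext4_*', this gives per-function rows in the same format as the trace-cmd output in the question.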
This solution relies on bcc to load BPF programs into the kernel, but it's also possible to do everything in C. You can find a tutorial to get started with bcc in its repository.