Issue with fentry BPF program attaching to open system call

I'm attempting to write a tracing eBPF program using the fentry attach type to hook into the open system call. Here's a minimal example:

SEC("fentry/__x64_sys_open")
int BPF_PROG(trace_sys_open, const char *filename, int flags, umode_t mode)
{
    if (flags & O_CREAT) {
        bpf_printk("fentry: open() with O_CREAT\n");
    }
    return 0;
}

However, when I attempt to load the BPF program, I get the following error:

$ sudo ./file_tracing
libbpf: prog 'trace_sys_open': BPF program load failed: Permission denied
libbpf: prog 'trace_sys_open': -- BEGIN PROG LOAD LOG --
reg type unsupported for arg#0 function trace_sys_open#15
0: R1=ctx(off=0,imm=0) R10=fp0
; int BPF_PROG(trace_sys_open, const char *filename, int flags, umode_t mode)
0: (79) r1 = *(u64 *)(r1 +8)
func '__x64_sys_open' doesn't have 2-th argument
invalid bpf_context access off=8 size=8
processed 1 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
-- END PROG LOAD LOG --
libbpf: prog 'trace_sys_open': failed to load: -13
libbpf: failed to load object 'file_tracing_bpf'
failed to load BPF object: -13

It seems that the BPF loader expects a different function signature. According to the kernel source (fs/open.c), the open syscall is defined as:

SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
    if (force_o_largefile())
        flags |= O_LARGEFILE;
    return do_sys_open(AT_FDCWD, filename, flags, mode);
}

So I tried updating the prototype to explicitly include __user, like so:

SEC("fentry/__x64_sys_open")
int BPF_PROG(trace_sys_open, const char __user *filename, int flags, umode_t mode)
{
    if (flags & O_CREAT) {
        bpf_printk("fentry: open() with O_CREAT\n");
    }
    return 0;
}

But this fails to compile:

$ clang -O2 -g -target bpf -I../helpers -I.output -D__TARGET_ARCH_x86 -c file_tracing.bpf.c -o .output/file_tracing.bpf.o
file_tracing.bpf.c:24:48: error: expected ')'
int BPF_PROG(trace_sys_open, const char __user *filename, int flags, umode_t mode)
                                               ^
file_tracing.bpf.c:24:5: note: to match this '('
int BPF_PROG(trace_sys_open, const char __user *filename, int flags, umode_t mode)
    ^
/usr/include/bpf/bpf_tracing.h:430:11: note: expanded from macro 'BPF_PROG'
____##name(unsigned long long *ctx, ##args);                                \
          ^
file_tracing.bpf.c:24:5: error: too many arguments to function call, expected 2, have 4
int BPF_PROG(trace_sys_open, const char __user *filename, int flags, umode_t mode)
    ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/include/bpf/bpf_tracing.h:435:20: note: expanded from macro 'BPF_PROG'
        return ____##name(___bpf_ctx_cast(args));                           \
               ~~~~~~~~~~ ^~~~~~~~~~~~~~~~~~~~~
/usr/include/bpf/bpf_tracing.h:410:39: note: expanded from macro '___bpf_ctx_cast'
#define ___bpf_ctx_cast(args...)      ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args)
                                      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/include/bpf/bpf_helpers.h:184:29: note: expanded from macro '___bpf_apply'
#define ___bpf_apply(fn, n) ___bpf_concat(fn, n)
                            ^
note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
<scratch space>:26:1: note: expanded from here
___bpf_ctx_cast3
^
/usr/include/bpf/bpf_tracing.h:400:39: note: expanded from macro '___bpf_ctx_cast3'
#define ___bpf_ctx_cast3(x, args...)  ___bpf_ctx_cast2(args), (void *)ctx[2]
                                      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/include/bpf/bpf_tracing.h:399:63: note: expanded from macro '___bpf_ctx_cast2'
#define ___bpf_ctx_cast2(x, args...)  ___bpf_ctx_cast1(args), (void *)ctx[1]
                                                              ^
file_tracing.bpf.c:24:5: note: '____trace_sys_open' declared here
/usr/include/bpf/bpf_tracing.h:429:48: note: expanded from macro 'BPF_PROG'
static __always_inline typeof(name(0))                                      \
                                                                            ^
<scratch space>:20:1: note: expanded from here
____trace_sys_open
^
file_tracing.bpf.c:24:48: error: expected ')'
int BPF_PROG(trace_sys_open, const char __user *filename, int flags, umode_t mode)
                                               ^
file_tracing.bpf.c:24:5: note: to match this '('
int BPF_PROG(trace_sys_open, const char __user *filename, int flags, umode_t mode)
    ^
/usr/include/bpf/bpf_tracing.h:439:11: note: expanded from macro 'BPF_PROG'
____##name(unsigned long long *ctx, ##args)
          ^
file_tracing.bpf.c:26:9: error: use of undeclared identifier 'flags'
    if (flags & O_CREAT) {
        ^
4 errors generated.
make: *** [Makefile:28: .output/file_tracing.bpf.o] Error 1

As an alternative, I tried attaching to do_sys_openat2() and using the raw struct pt_regs *ctx approach:

SEC("fentry/__x64_sys_open")
int trace_sys_open(struct pt_regs *ctx)
{
    int flags = (int) PT_REGS_PARM2(ctx);
    if (flags & O_CREAT) {
        return 0;
    }
    return 0;
}

This version compiles and loads without issue.

However, when I try to delegate to a helper function:

SEC("fentry/__x64_sys_open")
int trace_sys_open(struct pt_regs *ctx)
{
    int flags = (int) PT_REGS_PARM2(ctx);
    if (flags & O_CREAT) {
        return handle_event_tracing(ctx);
    }
    return 0;
}

It fails again at load time:

$ sudo ./file_tracing
libbpf: prog 'trace_sys_open': BPF program load failed: Permission denied
libbpf: prog 'trace_sys_open': -- BEGIN PROG LOAD LOG --
reg type unsupported for arg#0 function trace_sys_open#13
0: R1=ctx(off=0,imm=0) R10=fp0
; int trace_sys_open(struct pt_regs *ctx)
0: (bf) r6 = r1                       ; R1=ctx(off=0,imm=0) R6_w=ctx(off=0,imm=0)
; int flags = (int) PT_REGS_PARM2(ctx);
1: (79) r1 = *(u64 *)(r6 +104)
invalid bpf_context access off=104 size=8
processed 2 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
-- END PROG LOAD LOG --
libbpf: prog 'trace_sys_open': failed to load: -13
libbpf: failed to load object 'file_tracing_bpf'
failed to load BPF object: -13

It seems like passing struct pt_regs * to this helper function breaks the verifier due to an incompatible or unsafe context pointer.

Here's a reproducible example:

// test.bpf.c

#include <vmlinux.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#define TASK_COMM_LEN 16
#define O_CREAT 0100

/* Record that handle_event_tracing() fills in and emits to user space. */
struct event {
    /* Event timestamp. NOTE(review): never assigned in this example, so it
     * stays zero — confirm whether it was meant to be populated. */
    __u64 ts;
    /* Upper 32 bits of bpf_get_current_pid_tgid(). */
    pid_t pid;
    /* Lower 32 bits of bpf_get_current_pid_tgid(). */
    __u32 tid;
    /* Derived from bpf_get_current_uid_gid(). */
    uid_t uid;
    /* Filled in by bpf_get_current_comm(). */
    char comm[TASK_COMM_LEN];
    /* Value returned by bpf_get_current_cgroup_id(). */
    __u64 cgroup_id;
};

/* Perf event array that handle_event_tracing() passes to
 * bpf_perf_event_output() as the destination for struct event records. */
struct {
    __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
    __uint(key_size, sizeof(u32));
    __uint(value_size, sizeof(u32));
} events SEC(".maps");

/*
 * Gather identity information about the current task into a struct event
 * and emit it through the events perf event array.
 *
 * ctx: the program's original context pointer; it is handed to
 *      bpf_perf_event_output() as-is.
 *
 * Always returns 0; the result of bpf_perf_event_output() is ignored.
 */
static __always_inline
int handle_event_tracing(void *ctx)
{
    struct event event = {};
    u64 pid_tgid = bpf_get_current_pid_tgid();

    event.cgroup_id = bpf_get_current_cgroup_id();
    /* The helper packs the value as (tgid << 32 | pid). */
    event.pid = pid_tgid >> 32;
    event.tid = pid_tgid & 0xFFFFFFFF;
    /*
     * bpf_get_current_uid_gid() returns (gid << 32 | uid), so the UID is
     * the lower 32 bits; shifting right by 32 would store the GID instead.
     */
    event.uid = bpf_get_current_uid_gid() & 0xFFFFFFFF;
    bpf_get_current_comm(&event.comm, sizeof(event.comm));
    bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
                  &event, sizeof(event));
    return 0;
}

SEC("fentry/__x64_sys_open")
int trace_sys_open(struct pt_regs *ctx)
{
    int flags = (int) PT_REGS_PARM2(ctx);
    if (flags & O_CREAT) {
        return handle_event_tracing(ctx);
    }
    return 0;
}

char LICENSE[] SEC("license") = "GPL";

$ bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h
$ clang -O2 -g -I. -target bpf -D__TARGET_ARCH_x86 -c test.bpf.c -o test.bpf.o
$ bpftool gen skeleton test.bpf.o > test.skel.h
$ sudo bpftool prog load ./test.bpf.o /sys/fs/bpf/test_prog

Solution

It seems that the BPF loader expects a different function signature. According to the kernel source (fs/open.c).

Well, notice how SYSCALL_DEFINE3 is used; this is not a normal function declaration. This macro is defined here as:

#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)

The SYSCALL_DEFINEx can be found here. It is a syscall wrapper, as the comments explain:

 * Instead of the generic __SYSCALL_DEFINEx() definition, the x86 version takes
 * struct pt_regs *regs as the only argument of the syscall stub(s) named as:
 * __x64_sys_*()         - 64-bit native syscall
 * __ia32_sys_*()        - 32-bit native syscall or common compat syscall
 * __ia32_compat_sys_*() - 32-bit compat syscall
 * __x64_compat_sys_*()  - 64-bit X32 compat syscall

 * Finally the
 * arguments are passed to the __do_sys_*() function which is the actual
 * syscall.

So, you are attaching to a syscall wrapper. The reason it works this way is that the syscall calling convention (which arguments go in which register) is different from the normal System V calling convention, so it has to be translated. Keep this in mind when reading arguments using the PT_REGS_PARMx macros, since those translate based on the System V calling convention, not the syscall one.

As an alternative, I tried attaching to do_sys_openat2() and using the raw struct pt_regs *ctx approach

You are halfway there. The first argument is indeed pt_regs, but you discarded BPF_PROG, so your context is no longer unpacked correctly (more on this later). The only reason it does not error is that you always return 0, so the compiler will have optimized away int flags = (int) PT_REGS_PARM2(ctx);

However, when I try to delegate to a helper function

It fails again at load time

The moment you try to do anything other than return 0; in the if body, the compiler cannot optimize away the line int flags = (int) PT_REGS_PARM2(ctx); which causes a bad read of the context.

That is because there is a misunderstanding between the context type and arguments.

You are using a fentry program. The context type of which is unsigned long long * (an array of 64 bit values, the arguments). So a plain version of your fentry program should look something like this:

/* Plain fentry version: the context is the raw array of 64-bit arguments. */
SEC("fentry/__x64_sys_open")
int trace_sys_open(__u64 *ctx)
{
    /* The syscall wrapper takes a single argument, struct pt_regs *, so it
     * is the first entry of the context array. */
    struct pt_regs *regs = (struct pt_regs *) ctx[0];
    /* Use the _SYSCALL variant so the syscall calling convention is applied. */
    int flags = (int) PT_REGS_PARM2_SYSCALL(regs);
    if (flags & O_CREAT) {
        /* Hand the original context (not regs) to the helper. */
        return handle_event_tracing(ctx);
    }
    return 0;
}

The BPF_PROG macro typically does the context unpacking and casting for you. So with that helper the program should look something like this:

/* Same program using BPF_PROG, which unpacks and casts the context into the
 * typed argument list declared here. */
SEC("fentry/__x64_sys_open")
int BPF_PROG(trace_sys_open, struct pt_regs *regs)
{
    /* Use the _SYSCALL variant so the syscall calling convention is applied. */
    int flags = (int) PT_REGS_PARM2_SYSCALL(regs);
    if (flags & O_CREAT) {
        /* ctx is the original context, kept available by the macro. */
        return handle_event_tracing(ctx);
    }
    return 0;
}

We still use ctx; that is because the macro preserves the original context under that name so you can pass it to helper functions.

Also, we use PT_REGS_PARM2_SYSCALL instead of PT_REGS_PARM2 which accounts for the calling convention difference.

Keep in mind that all of the above is true because you are using an fentry program type. When using the kprobe program type, your context would indeed have been struct pt_regs *. If you want to write a kprobe that attaches to the syscall wrapper you would get something like:

/* kprobe version: here the context really is a struct pt_regs *, but it is
 * the one created when the probe fires, not the syscall's own pt_regs. */
SEC("kprobe/__x64_sys_open")
int trace_sys_open(struct pt_regs *ctx)
{
    /* Unpack the outer (probe) pt_regs to get the syscall's pt_regs. */
    struct pt_regs *regs = PT_REGS_SYSCALL_REGS(ctx);
    int flags;
    /* Copy the 2nd syscall argument out of the syscall's pt_regs. */
    bpf_probe_read_kernel(&flags, sizeof(flags), &PT_REGS_PARM2_SYSCALL(regs));
    if (flags & O_CREAT) {
        return handle_event_tracing(ctx);
    }
    return 0;
}

So, similar. You could be forgiven for thinking the pt_regs given to this program are the same as the ones that __x64_sys_open operates on, but that isn't so. The pt_regs struct that holds the syscall arguments is created when the syscall happens. The pt_regs the kprobe gets is created when the probe triggers, and the actual syscall pt_regs is in its first parameter. The PT_REGS_SYSCALL_REGS macro unpacks the outer pt_regs and gives you the actual syscall pt_regs.

I hope that clears up the confusion.