A root-owned daemon with the setuid bit set switches back to the real user and creates a mount namespace.
A user-owned executable with the CAP_SYS_ADMIN
and CAP_SYS_CHROOT
file capabilities set tries to enter that namespace and fails.
daemon.c
:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/capability.h>
#include <sys/prctl.h>
#include <unistd.h>
/* Report the failing step together with the errno text, then exit with failure. */
static void die(const char *what)
{
    perror(what);
    exit(EXIT_FAILURE);
}

/*
 * Daemon side of the reproduction.
 *
 * Intended to be installed setuid-root. It switches back to the invoking
 * user's IDs, reduces its capability sets to CAP_SYS_ADMIN only, moves into a
 * new mount namespace and then sleeps until a signal arrives.
 *
 * Takes no arguments. Exits with status 1 (after printing the failing step)
 * if any step fails.
 */
int main(int argc, const char *argv[])
{
    (void)argc;
    (void)argv;

    /* Ask the kernel to keep the permitted capability set across the UID change below. */
    if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) == -1) {
        die("prctl(PR_SET_KEEPCAPS)");
    }

    /*
     * Drop the group before the user: once the UID has been given up the
     * process may no longer be allowed to change its GIDs completely.
     */
    if (setgid(getgid()) == -1) {
        die("setgid");
    }
    if (setuid(getuid()) == -1) {
        die("setuid");
    }

    /* Start from an empty capability state and raise CAP_SYS_ADMIN only. */
    cap_t caps = cap_init();
    if (!caps) {
        die("cap_init");
    }

    cap_value_t wanted[] = {CAP_SYS_ADMIN};
    const int wanted_count = (int)(sizeof(wanted) / sizeof(wanted[0]));

    if (cap_set_flag(caps, CAP_EFFECTIVE, wanted_count, wanted, CAP_SET) == -1) {
        die("cap_set_flag(CAP_EFFECTIVE)");
    }
    if (cap_set_flag(caps, CAP_PERMITTED, wanted_count, wanted, CAP_SET) == -1) {
        die("cap_set_flag(CAP_PERMITTED)");
    }
    if (cap_set_proc(caps) == -1) {
        die("cap_set_proc");
    }
    if (cap_free(caps) == -1) {
        die("cap_free");
    }

    /* Detach into a private mount namespace; this is what the client tries to join. */
    if (unshare(CLONE_NEWNS) == -1) {
        die("unshare(CLONE_NEWNS)");
    }

    /* Stay alive so the process (and its namespace) can be targeted by PID. */
    pause();
    return 0;
}
client.c
:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>
/*
 * Client side of the reproduction.
 *
 * Usage: client <pid>
 *
 * Opens a pidfd for <pid> and tries to join that process's mount namespace
 * with setns(). Returns 0 on success; on any failure prints the failing step
 * to stderr and returns 1.
 */
int main(int argc, const char *argv[])
{
    /* A wrong argument count is a usage error; errno is meaningless here. */
    if (argc != 2) {
        fprintf(stderr, "usage: %s <pid>\n", argc > 0 ? argv[0] : "client");
        return EXIT_FAILURE;
    }

    /* Parse strictly: reject empty input, trailing junk, overflow and non-positive PIDs. */
    char *end = NULL;
    errno = 0;
    const long parsed = strtol(argv[1], &end, 10);
    if (errno != 0 || end == argv[1] || *end != '\0' ||
        parsed <= 0 || (long)(pid_t)parsed != parsed) {
        fprintf(stderr, "invalid pid: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    /* syscall() returns long; the narrowing to an fd is made explicit. */
    const int pidfd = (int)syscall(SYS_pidfd_open, (pid_t)parsed, 0);
    if (pidfd == -1) {
        perror("pidfd_open");
        return EXIT_FAILURE;
    }

    int status = EXIT_SUCCESS;
    if (setns(pidfd, CLONE_NEWNS) == -1) {
        perror("setns");
        status = EXIT_FAILURE;
    }

    /* The pidfd is only needed for the setns() call itself. */
    close(pidfd);
    return status;
}
build-run.sh
:
#!/bin/bash
# Builds the daemon and the client, installs the privileges each one needs,
# then runs the reproduction: start the daemon and let the client try to join
# its mount namespace.

# Stop at the first failing step instead of running stale or missing binaries.
set -euo pipefail

gcc -o daemon daemon.c -lcap
# The client does not use libcap, so it is not linked against it.
gcc -o client client.c

# daemon: setuid root. client: file capabilities only.
sudo chown root:root ./daemon
sudo chmod u+s ./daemon
sudo setcap cap_sys_admin,cap_sys_chroot+ep ./client

./daemon &
daemon_pid=$!
# The daemon pauses forever; make sure it does not outlive this script.
trap 'kill "$daemon_pid" 2>/dev/null || true' EXIT

# Give the daemon time to switch user and unshare before the client looks at it.
sleep 1

./client "$daemon_pid"
Running this gives "Operation not permitted" - the setns()
call fails. The client has the correct capabilities, and the user namespace is the same. What's wrong?
Try using:
sudo setcap cap_sys_admin,cap_sys_chroot,cap_sys_ptrace=ep ./client
It looks like the requirement for the cap_sys_ptrace
capability is only mentioned in a code comment in the kernel patch:
+ * This syscall gets a copy of a file descriptor from another process
+ * based on the pidfd, and file descriptor number. It requires that
+ * the calling process has the ability to ptrace the process represented
+ * by the pidfd. The process which is having its file descriptor copied
+ * is otherwise unaffected.