Tags: linux, setuid, linux-namespaces

Can't enter mount namespace created by a setuid process


A root-owned daemon with the setuid bit set switches back to the real user and creates a mount namespace.

A user-owned executable with the CAP_SYS_ADMIN and CAP_SYS_CHROOT file capabilities set tries to enter that namespace and fails.

daemon.c:

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/capability.h>
#include <sys/prctl.h>
#include <unistd.h>

/* Report the failed call named by `what` together with errno, then exit. */
static void die(const char* what)
{
  perror(what);
  exit(1);
}

/*
 * Setuid-root daemon: permanently switches to the invoking (real) user while
 * retaining CAP_SYS_ADMIN, moves itself into a new mount namespace and then
 * sleeps until a signal arrives.
 *
 * Exits with status 1 and a message naming the failed call on any error.
 */
int main(int argc, const char* argv[])
{
  (void)argc;
  (void)argv;

  /* Ask the kernel to keep the permitted capabilities across the UID change. */
  if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) == -1)
    die("prctl(PR_SET_KEEPCAPS)");

  /*
   * Drop the group ID before the user ID: once the process is no longer
   * root it may not be allowed to change all of its group IDs any more.
   */
  if (setgid(getgid()) == -1)
    die("setgid");
  if (setuid(getuid()) == -1)
    die("setuid");

  /* Start from an empty capability set and raise only CAP_SYS_ADMIN. */
  cap_t cap = cap_init();
  if (!cap)
    die("cap_init");

  cap_value_t cap_values[] = {CAP_SYS_ADMIN};
  const int cap_count = (int)(sizeof(cap_values) / sizeof(cap_values[0]));

  if (cap_set_flag(cap, CAP_EFFECTIVE, cap_count, cap_values, CAP_SET) == -1)
    die("cap_set_flag(CAP_EFFECTIVE)");
  if (cap_set_flag(cap, CAP_PERMITTED, cap_count, cap_values, CAP_SET) == -1)
    die("cap_set_flag(CAP_PERMITTED)");
  if (cap_set_proc(cap) == -1)
    die("cap_set_proc");
  if (cap_free(cap) == -1)
    die("cap_free");

  /* Detach into a private mount namespace. */
  if (unshare(CLONE_NEWNS) == -1)
    die("unshare(CLONE_NEWNS)");

  /* Keep the process (and therefore its namespace) alive. */
  pause();

  return 0;
}

client.c:

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Report the failed call named by `what` together with errno, then exit. */
static void die(const char* what)
{
  perror(what);
  exit(1);
}

/*
 * Usage: client <pid>
 *
 * Opens a pidfd for the process <pid> and tries to join that process's
 * mount namespace through it.
 *
 * Returns 0 on success; exits with status 1 and a message on bad arguments
 * or when a system call fails.
 */
int main(int argc, const char* argv[])
{
  if (argc != 2) {
    /* No call has failed here, so errno is meaningless: print usage instead. */
    fprintf(stderr, "usage: %s <pid>\n", argc > 0 ? argv[0] : "client");
    exit(1);
  }

  /* Parse the PID strictly; atoi() would silently turn garbage into 0. */
  errno = 0;
  char* end = NULL;
  const long parsed = strtol(argv[1], &end, 10);
  if (errno != 0 || end == argv[1] || *end != '\0' || parsed <= 0 ||
      (long)(pid_t)parsed != parsed) {
    fprintf(stderr, "invalid pid: %s\n", argv[1]);
    exit(1);
  }

  const int fd = (int)syscall(SYS_pidfd_open, (pid_t)parsed, 0);
  if (fd == -1)
    die("pidfd_open");

  /* Join the mount namespace of the process the pidfd refers to. */
  if (setns(fd, CLONE_NEWNS) == -1)
    die("setns");

  close(fd);

  return 0;
}

build-run.sh:

#!/bin/bash
# Builds daemon and client, gives each its privileges, then starts the daemon
# in the background and runs the client against the daemon's PID.

# Stop at the first failing step instead of running stale or unprivileged binaries.
set -euo pipefail

# Only daemon.c uses libcap; client.c makes plain system calls.
gcc -o daemon daemon.c -lcap
gcc -o client client.c

# Make the daemon a root-owned setuid executable.
sudo chown root:root ./daemon
sudo chmod u+s ./daemon
# Give the client its file capabilities.
sudo setcap cap_sys_admin,cap_sys_chroot+ep ./client

./daemon &
./client "$!"

Running this prints "Operation not permitted": the setns() call fails. The client has the required capabilities, and both processes are in the same user namespace. What's wrong?


Solution

  • Try using:

    sudo setcap cap_sys_admin,cap_sys_chroot,cap_sys_ptrace=ep ./client
    

    The requirement for the cap_sys_ptrace capability appears to be documented only in a code comment in the kernel patch:

    + * This syscall gets a copy of a file descriptor from another process
    + * based on the pidfd, and file descriptor number. It requires that
    + * the calling process has the ability to ptrace the process represented
    + * by the pidfd. The process which is having its file descriptor copied
    + * is otherwise unaffected.