linux-kernel page-fault

Userfaultfd write protection appears unsupported when checking through the UFFDIO_API ioctl


I am trying to use the write protection feature of Linux's userfaultfd, but it does not appear to be enabled in my kernel even though I am using version 5.13 (write protection should be fully supported in 5.10+).

When I run

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Report the failed call named by msg through perror(), then exit with a failure status. */
#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0)

/* Return 1 when every bit set in `bit` is also set in `val`, otherwise 0. */
static int has_bit(uint64_t val, uint64_t bit) {
    const uint64_t missing = bit & ~val; /* requested bits absent from val */
    return missing == 0;
}

/*
 * Open a userfaultfd, perform the UFFDIO_API handshake while requesting
 * UFFD_FEATURE_PAGEFAULT_FLAG_WP, and print which ioctl and feature bits
 * the kernel reported back.
 */
int main(void) {
    long uffd;     /* userfaultfd file descriptor */
    struct uffdio_api uffdio_api = {0};

    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1)
        errExit("userfaultfd");

    uffdio_api.api = UFFD_API;
    uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
    if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
        errExit("ioctl-UFFDIO_API");

    /* The ioctls field is a 64-bit mask indexed by the _UFFDIO_* bit
       numbers. The masks are built from UINT64_C(1) so that a high bit
       number never shifts a 32-bit unsigned long past its width, which
       would be undefined behavior. */
    printf("UFFDIO_API: %d\n", has_bit(uffdio_api.ioctls, UINT64_C(1) << _UFFDIO_API));
    printf("UFFDIO_REGISTER: %d\n", has_bit(uffdio_api.ioctls, UINT64_C(1) << _UFFDIO_REGISTER));
    printf("UFFDIO_UNREGISTER: %d\n", has_bit(uffdio_api.ioctls, UINT64_C(1) << _UFFDIO_UNREGISTER));
    /* NOTE(review): UFFDIO_WRITEPROTECT acts on a registered range, so its
       availability is presumably reported in uffdio_register.ioctls by
       UFFDIO_REGISTER rather than here - confirm against
       ioctl_userfaultfd(2). */
    printf("UFFDIO_WRITEPROTECT: %d\n", has_bit(uffdio_api.ioctls, UINT64_C(1) << _UFFDIO_WRITEPROTECT));
    printf("UFFD_FEATURE_PAGEFAULT_FLAG_WP: %d\n", has_bit(uffdio_api.features, UFFD_FEATURE_PAGEFAULT_FLAG_WP));

    return 0;
}

The output is

UFFDIO_API: 1
UFFDIO_REGISTER: 1
UFFDIO_UNREGISTER: 1
UFFDIO_WRITEPROTECT: 0
UFFD_FEATURE_PAGEFAULT_FLAG_WP: 1

The UFFD_FEATURE_PAGEFAULT_FLAG_WP feature is enabled, but the UFFDIO_WRITEPROTECT ioctl, which is necessary to enable write protection, is marked as not supported.

What might lead to this feature being disabled, and how can I enable it?

I am using Ubuntu MATE 21.10 with Linux kernel version 5.13.0-30-generic.

EDIT:

It seems that, despite what the man page section on the UFFDIO_API ioctl (https://man7.org/linux/man-pages/man2/ioctl_userfaultfd.2.html) suggests, this might be the intended behavior even on a system where write protection is enabled. However, when I run a full program that spawns a poller thread and writes to the protected memory, the poller thread does not receive any notification.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* Report the failed call named by msg through perror(), then exit with a failure status. */
#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0)

/* System page size in bytes; assigned in main() from sysconf(_SC_PAGE_SIZE). */
static int page_size;

/*
 * Thread entry point: block in poll() on the userfaultfd passed through
 * arg, report the first wake-up, and then terminate the whole process.
 */
static void* fault_handler_thread(void* arg) {
    const long fd = (long) arg; /* userfaultfd file descriptor */

    /* Wait for events on the userfaultfd file descriptor. Every pass ends
       in exit(), so the loop body runs at most once. */
    while (1) {
        struct pollfd pfd = { .fd = (int) fd, .events = POLLIN };

        /* Block without a timeout until poll() has something to report. */
        const int nready = poll(&pfd, 1, -1);
        if (nready == -1)
            errExit("poll");

        const int got_pollin = (pfd.revents & POLLIN) != 0;
        const int got_pollerr = (pfd.revents & POLLERR) != 0;

        printf("\nfault_handler_thread():\n");
        printf(
            "    poll() returns: nready = %d; "
            "POLLIN = %d; POLLERR = %d\n",
            nready, got_pollin, got_pollerr);

        /* An event arrived: stop the program. */
        exit(EXIT_FAILURE);
    }
}

/*
 * Reproducer: register one anonymous page with userfaultfd in
 * write-protect mode, write-protect it, start the poller thread, and then
 * write to the page from the main thread.
 */
int main(void) {
    long uffd;     /* userfaultfd file descriptor */
    char* addr;    /* Start of region handled by userfaultfd */
    uint64_t len;  /* Length of region handled by userfaultfd */
    pthread_t thr; /* ID of thread that handles page faults */
    struct uffdio_api uffdio_api = {0};
    struct uffdio_register uffdio_register = {0};
    struct uffdio_writeprotect uffdio_wp = {0};
    int s;

    page_size = sysconf(_SC_PAGE_SIZE);
    len = page_size;

    /* Create and enable userfaultfd object. */

    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1)
        errExit("userfaultfd");

    uffdio_api.api = UFFD_API;
    uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
    if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
        errExit("ioctl-UFFDIO_API");

    addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (addr == MAP_FAILED)
        errExit("mmap");

    printf("Address returned by mmap() = %p\n", (void*) addr);

    /* Register the memory range of the mapping we just created for
       handling by the userfaultfd object. */

    uffdio_register.range.start = (unsigned long) addr;
    uffdio_register.range.len = len;
    uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
    if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
        errExit("ioctl-UFFDIO_REGISTER");

    printf("uffdio_register.ioctls = 0x%llx\n", uffdio_register.ioctls);
    /* _UFFDIO_WRITEPROTECT is a bit number, so it has to be turned into a
       mask before it is tested against the ioctls bit field. */
    printf("Have _UFFDIO_WRITEPROTECT? %s\n",
           (uffdio_register.ioctls & (1ULL << _UFFDIO_WRITEPROTECT)) ? "YES" : "NO");

    /* Write-protect the whole registered range. */

    uffdio_wp.range.start = (unsigned long) addr;
    uffdio_wp.range.len = len;
    uffdio_wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;
    if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffdio_wp) == -1)
        errExit("ioctl-UFFDIO_WRITEPROTECT");

    /* Create a thread that will process the userfaultfd events. */

    s = pthread_create(&thr, NULL, fault_handler_thread, (void*) uffd);
    if (s != 0) {
        /* pthread_create() returns the error number instead of setting
           errno, so copy it over for perror(). */
        errno = s;
        errExit("pthread_create");
    }

    /* Main thread now writes to the mapping at locations 1024 bytes
       apart. Each write is intended to raise a userfaultfd event for the
       write-protected page. */

    /* Presumably gives the handler thread time to reach poll(). */
    usleep(100000);

    size_t l;
    l = 0xf; /* Ensure that faulting address is not on a page
                boundary, in order to test that we correctly
                handle that case in fault_handler_thread(). */
    char i = 0;
    while (l < len) {
        printf("Write address %p in main(): ", (void*) (addr + l));
        addr[l] = i++;
        printf("%d\n", addr[l]);
        l += 1024;
        usleep(100000); /* Slow things down a little */
    }

    exit(EXIT_SUCCESS);
}

Solution

  • I found the solution. The pages that are to be write-protected must be touched after registering them but before marking them as write-protected. This is an undocumented requirement, from what I can tell.

    In other words, add

    /* Write one byte into every page of the region so that each page has
       been touched before it is write-protected. */
    for (size_t i = 0; i < len; i += page_size) {
        addr[i] = 0;
    }
    

    between registering and write-protecting.

    It works if I change the full example to

    #define _GNU_SOURCE
    #include <errno.h>
    #include <fcntl.h>
    #include <inttypes.h>
    #include <linux/userfaultfd.h>
    #include <poll.h>
    #include <pthread.h>
    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <unistd.h>
    
    /* Report the failed call named by msg through perror(), then exit with a failure status. */
    #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0)
    
    /* System page size in bytes; assigned in main() from sysconf(_SC_PAGE_SIZE). */
    static int page_size;
    
    /*
     * Thread entry point: block in poll() on the userfaultfd passed through
     * arg, report the first wake-up, and then terminate the whole process.
     */
    static void* fault_handler_thread(void* arg) {
        const long fd = (long) arg; /* userfaultfd file descriptor */

        /* Wait for events on the userfaultfd file descriptor. Every pass
           ends in exit(), so the loop body runs at most once. */
        while (1) {
            struct pollfd pfd = { .fd = (int) fd, .events = POLLIN };

            /* Block without a timeout until poll() has something to report. */
            const int nready = poll(&pfd, 1, -1);
            if (nready == -1)
                errExit("poll");

            const int got_pollin = (pfd.revents & POLLIN) != 0;
            const int got_pollerr = (pfd.revents & POLLERR) != 0;

            printf("\nfault_handler_thread():\n");
            printf(
                "    poll() returns: nready = %d; "
                "POLLIN = %d; POLLERR = %d\n",
                nready, got_pollin, got_pollerr);

            /* An event arrived: stop the program. */
            exit(EXIT_FAILURE);
        }
    }
    
    /*
     * Working example: register one anonymous page with userfaultfd in
     * write-protect mode, touch the page, write-protect it, start the
     * poller thread, and then write to the page from the main thread.
     */
    int main(void) {
        long uffd;     /* userfaultfd file descriptor */
        char* addr;    /* Start of region handled by userfaultfd */
        uint64_t len;  /* Length of region handled by userfaultfd */
        pthread_t thr; /* ID of thread that handles page faults */
        struct uffdio_api uffdio_api = {0};
        struct uffdio_register uffdio_register = {0};
        struct uffdio_writeprotect uffdio_wp = {0};
        int s;

        page_size = sysconf(_SC_PAGE_SIZE);
        len = page_size;

        /* Create and enable userfaultfd object. */

        uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
        if (uffd == -1)
            errExit("userfaultfd");

        uffdio_api.api = UFFD_API;
        uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
        if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
            errExit("ioctl-UFFDIO_API");

        addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (addr == MAP_FAILED)
            errExit("mmap");

        printf("Address returned by mmap() = %p\n", (void*) addr);

        /* Register the memory range of the mapping we just created for
           handling by the userfaultfd object. */

        uffdio_register.range.start = (unsigned long) addr;
        uffdio_register.range.len = len;
        uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
            errExit("ioctl-UFFDIO_REGISTER");

        printf("uffdio_register.ioctls = 0x%llx\n", uffdio_register.ioctls);
        /* _UFFDIO_WRITEPROTECT is a bit number, so it has to be turned into
           a mask before it is tested against the ioctls bit field. */
        printf("Have _UFFDIO_WRITEPROTECT? %s\n",
               (uffdio_register.ioctls & (1ULL << _UFFDIO_WRITEPROTECT)) ? "YES" : "NO");

        /* Touch every page after registering and before write-protecting.
           Without this step the poller thread received no notification
           for the later writes (observed on Linux 5.13). */
        for (size_t off = 0; off < len; off += page_size) {
            addr[off] = 0;
        }

        /* Write-protect the whole registered range. */

        uffdio_wp.range.start = (unsigned long) addr;
        uffdio_wp.range.len = len;
        uffdio_wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;
        if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffdio_wp) == -1)
            errExit("ioctl-UFFDIO_WRITEPROTECT");

        /* Create a thread that will process the userfaultfd events. */

        s = pthread_create(&thr, NULL, fault_handler_thread, (void*) uffd);
        if (s != 0) {
            /* pthread_create() returns the error number instead of setting
               errno, so copy it over for perror(). */
            errno = s;
            errExit("pthread_create");
        }

        /* Main thread now writes to the mapping at locations 1024 bytes
           apart. Each write raises a userfaultfd event for the
           write-protected page. */

        /* Presumably gives the handler thread time to reach poll(). */
        usleep(100000);

        size_t l;
        l = 0xf; /* Ensure that faulting address is not on a page
                    boundary, in order to test that we correctly
                    handle that case in fault_handler_thread(). */
        char i = 0;
        while (l < len) {
            printf("Write address %p in main(): ", (void*) (addr + l));
            addr[l] = i++;
            printf("%d\n", addr[l]);
            l += 1024;
            usleep(100000); /* Slow things down a little */
        }

        exit(EXIT_SUCCESS);
    }