I had recently been reading Linux Device Drivers 3rd edition, and had reached chapter 15: Memory Mapping and DMA.
I had also come across linux-kernel-labs, specifically their exercises in the Memory mapping lab.
I tried doing the second exercise, which is to implement a device driver that maps non-contiguous physical memory (e.g. obtained via vmalloc()
) to userspace.
It read in the book that vmalloc()
does not obtain physically contiguous memory, so each page needs to be mapped separately.
Here is my try -
/*
* PSO - Memory Mapping Lab(#11)
*
* Exercise #2: memory mapping using vmalloc'd kernel areas
*/
#include <linux/version.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <asm/io.h>
#include <linux/uaccess.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
MODULE_DESCRIPTION("simple mmap driver");
MODULE_AUTHOR("PSO");
MODULE_LICENSE("Dual BSD/GPL");
#define MY_MAJOR 42
/* how many pages do we actually vmalloc */
#define NPAGES 16
/* character device basic structure */
static struct cdev mmap_cdev;
/* pointer to the vmalloc'd area, rounded up to a page boundary */
static char *vmalloc_area;
static int my_open(struct inode *inode, struct file *filp)
{
return 0;
}
static int my_release(struct inode *inode, struct file *filp)
{
return 0;
}
static int my_mmap(struct file *filp, struct vm_area_struct *vma)
{
int i;
long length = vma->vm_end - vma->vm_start;
unsigned long start = vma->vm_start;
char *vmalloc_area_ptr = vmalloc_area;
unsigned long pfn;
if (length > NPAGES * PAGE_SIZE)
return -EIO;
/* TODO 1: map pages individually */
for (i = 0; i < length; i += PAGE_SIZE) {
pfn = vmalloc_to_pfn(vmalloc_area_ptr + i);
remap_pfn_range(vma, vma->vm_start + i, pfn, PAGE_SIZE, vma->vm_page_prot);
}
return 0;
}
static const struct file_operations mmap_fops = {
.owner = THIS_MODULE,
.open = my_open,
.release = my_release,
.mmap = my_mmap,
};
static int __init my_init(void)
{
int ret = 0;
int i;
ret = register_chrdev_region(MKDEV(MY_MAJOR, 0), 1, "maps");
if (ret < 0) {
pr_err("could not register region\n");
goto out;
}
/* TODO 1: allocate NPAGES using vmalloc */
vmalloc_area = (char *) vmalloc(NPAGES * PAGE_SIZE);
if (!vmalloc_area) {
pr_err("Failed to allocate vmalloc area\n");
ret = -ENOMEM;
goto out_unreg;
}
/* TODO 1: mark pages as reserved */
for (i = 0; i < NPAGES * PAGE_SIZE; i += PAGE_SIZE) {
SetPageReserved(vmalloc_to_page((void*) vmalloc_area + i));
}
/* TODO 1: write data in each page */
for (i = 0; i < NPAGES * PAGE_SIZE; i += PAGE_SIZE) {
vmalloc_area[i + 0] = 0xdd;
vmalloc_area[i + 1] = 0xcc;
vmalloc_area[i + 2] = 0xbb;
vmalloc_area[i + 3] = 0xaa;
}
cdev_init(&mmap_cdev, &mmap_fops);
mmap_cdev.owner = THIS_MODULE;
ret = cdev_add(&mmap_cdev, MKDEV(MY_MAJOR, 0), 1);
if (ret < 0) {
pr_err("could not add device\n");
goto out_vfree;
}
return 0;
out_vfree:
vfree(vmalloc_area);
out_unreg:
unregister_chrdev_region(MKDEV(MY_MAJOR, 0), 1);
out:
return ret;
}
static void __exit my_exit(void)
{
int i;
cdev_del(&mmap_cdev);
/* TODO 1: clear reservation on pages and free mem.*/
if (vmalloc_area) {
for (i = 0; i < NPAGES * PAGE_SIZE; i += PAGE_SIZE) {
ClearPageReserved(vmalloc_to_page((void*)vmalloc_area + i));
}
vfree(vmalloc_area);
}
unregister_chrdev_region(MKDEV(MY_MAJOR, 0), 1);
}
module_init(my_init);
module_exit(my_exit);
The point of writing the first 4 bytes of each page is so I can test for those values in user-space after mapping the memory.
Here is the program that I wrote to test this driver -
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/mman.h>
int main(void) {
int fd, i, page_size = getpagesize();
void* mapped_memory = NULL;
fd = open("/dev/maps0", O_RDONLY);
if (fd < 0) {
printf("Failed to open /dev/maps\n");
return -1;
}
mapped_memory = mmap(NULL, page_size*16, PROT_READ, MAP_PRIVATE, fd, 0);
close(fd);
if (mapped_memory == MAP_FAILED) {
printf("Mapping failed\n");
return -1;
}
printf("Mapped memory is at %p\n", mapped_memory);
printf("[%x]\n", ((char*)mapped_memory)[0]);
return 0;
}
The problem is, when I load the driver and try to test it with the program, it crashes and I get the following output -
Mapped memory is at 0x7f502b436000
Bus error (core dumped)
Can anyone point me to what I'm doing wrong?
P.S. I know the book uses the nopage
function of vm_operations_struct
, but I wanted to follow the lab and try doing it my own way.
TL;DR: use MAP_SHARED
.
There is a check in remap_pfn_range()
that ensures that if a mapping is copy-on-write (CoW), the requested range to remap must be exactly from vma->vm_start
to vma->vm_end
(i.e. it has to be physically contiguous).
/* [...]
*
* There's a horrible special case to handle copy-on-write
* behaviour that some programs depend on. We mark the "original"
* un-COW'ed pages by matching them up with "vma->vm_pgoff".
* See vm_normal_page() for details.
*/
if (is_cow_mapping(vma->vm_flags)) {
if (addr != vma->vm_start || end != vma->vm_end)
return -EINVAL;
vma->vm_pgoff = pfn;
}
A mapping is considered CoW if its vma->vm_flags
don't have VM_SHARED
set and have VM_MAYWRITE
set (i.e. the mapping is not shared and may be made writable through mprotect
).
In your case, the VMA is considered CoW and the check fails because you are mapping one page at a time, so you will never match both vma->vm_start
and vma->vm_end
. Your remap_pfn_range()
is therefore failing with -EINVAL
, and you are missing it because you don't check the return value for error.
You have 3 options:
mmap
the whole area with MAP_SHARED
.mmap
single pages separately with MAP_PRIVATE
.VM_MAYWRITE
from vma->vm_flags
before mapping the pages to userspace to disallow making the page writeable in the future (i.e. with mprotect
), which will in turn make it non-CoW.Number 1 above is what IMHO makes the most sense and (from what I've seen) the most common option when mapping special devices.
P.S.: note that printf("[%x]\n", ((char*)mapped_memory)[0]);
is wrong, it will read a single char
(one byte) and promote it to int
with sign extension so you'll get [ffffffdd]
. You should do ((unsigned*)mapped_memory)[0])
instead if you wish to get [aabbccdd]
.