I'm trying to read the raw GPU memory from a userspace application. The idea is to mmap /sys/bus/pci/devices/[device addr]/resource1 from the application and do loads and stores to it.
The device here is an Nvidia 3060 Ti with 8 GiB of on-board memory. The BAR is configured to be resizable, so all 8 GiB of the memory should be accessible:
(base) [xps] pcimem git:(master) ✗ ls -lah /sys/bus/pci/devices/0000:01:00.0/resource*
-r--r--r-- 1 root root 4,0K avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource
-rw------- 1 root root 16M avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource0
-rw------- 1 root root 8,0G avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource1
-rw------- 1 root root 8,0G avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource1_wc
-rw------- 1 root root 32M avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource3
-rw------- 1 root root 32M avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource3_wc
-rw------- 1 root root 128 avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource5
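Roughly, the access pattern I have in mind looks like this (a minimal sketch of the mmap-the-resource-file approach, which is essentially what pcimem does under the hood; it needs root, and the path is the one from the listing above):

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main() {
  const char *path = "/sys/bus/pci/devices/0000:01:00.0/resource1";
  int fd = open(path, O_RDWR | O_SYNC);
  if (fd < 0) { perror("open"); return 1; }

  size_t len = 4096;  // map a single page for the sketch
  void *map = mmap(nullptr, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (map == MAP_FAILED) { perror("mmap"); return 1; }

  volatile uint64_t *p = static_cast<volatile uint64_t *>(map);
  p[0] = 0;  // store to BAR1

  auto t0 = std::chrono::steady_clock::now();
  uint64_t v = p[0];  // uncached MMIO load, goes over PCIe
  auto t1 = std::chrono::steady_clock::now();

  printf("read 0x%016lx in %ld ns\n", v,
         (long)std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count());

  munmap(map, len);
  close(fd);
  return 0;
}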
Accessing the memory using pcimem doesn't work. Writing 0 to a location returns zero on the next read, but returns 0x000000005665BDF5 on any subsequent reads. The value 0x000000005665BDF5 is the same across all locations after the first read.
Benchmarking these (failed) reads/writes seems to suggest that they do actually reach the GPU: the read latency is around 900 ns, which is close to a PCIe round-trip time.
I have tried mmaping the framebuffer directly (/dev/fb0) and reading/writing to it. This works, and I see similar read/write latencies. But the framebuffer is way too small for my use case.
CUDA doesn't work because, on a CPU read of device memory (managed/unified memory), the driver migrates that page to the host.
Is there a way to access the memory on the GPU from Linux?
My goal here is to be able to map the GPU's memory into the userspace application and use it as memory expansion. The userspace application (running on the CPU) would allocate and access data structures directly in the GPU's memory.
TIA
The solution is to use the Vulkan API to allocate a heap on the GPU and map it into the application. However, since x86 cannot cache MMIO addresses, every access goes to the GPU over PCIe.
The implementation has about the same latency as Nvidia's server solution.
Here is a quick-and-dirty implementation in C++ that abstracts the GPU memory as a heap and allows malloc() and free() on it.
To find out the heap types your GPU exposes, check: http://vulkan.gpuinfo.org/displayreport.php?id=14928#memory
You'd need that to check which property flags your GPU supports when making the call to findMemoryType() from createVertexBuffer().
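You can also query this at runtime instead of the database; a minimal sketch using the standard vkGetPhysicalDeviceMemoryProperties call (assuming a VkPhysicalDevice handle named physicalDevice, obtained as in the full listing below):

// Print each memory type and its property flags, so you can see whether a
// DEVICE_LOCAL | HOST_VISIBLE | HOST_COHERENT type exists on your GPU.
VkPhysicalDeviceMemoryProperties memProperties;
vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties);
for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) {
  VkMemoryPropertyFlags f = memProperties.memoryTypes[i].propertyFlags;
  printf("type %u: heap %u, flags 0x%x%s%s%s\n", i,
         memProperties.memoryTypes[i].heapIndex, f,
         (f & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) ? " DEVICE_LOCAL" : "",
         (f & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) ? " HOST_VISIBLE" : "",
         (f & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) ? " HOST_COHERENT" : "");
}

The full listing below does the same enumeration inside findMemoryType():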
#include <chrono>
#include <vulkan/vulkan.h>
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <optional>
#include <set>
#include <stdexcept>
#include <vector>
#include "libvram/libvram.hh"
class VRamWrapper;
VRamWrapper *vrw_obj;
const size_t DEV_EXT_LEN = 1;
const char *deviceExtensions[] = {VK_KHR_SWAPCHAIN_EXTENSION_NAME};
struct QueueFamilyIndices {
std::optional<uint32_t> graphicsFamily;
bool isComplete() { return graphicsFamily.has_value(); }
};
class VRamWrapper {
public:
void init() { initVulkan(); }
void *malloc(size_t bytes) { return this->createVertexBuffer(bytes); }
void free(void *buf) { assert(0); /* not implemented in this quick sketch */ }
private:
VkInstance instance;
VkPhysicalDevice physicalDevice = VK_NULL_HANDLE;
VkDevice device;
VkQueue graphicsQueue;
std::vector<VkBuffer> buffers;
std::vector<VkDeviceMemory> bufferMemories;
void initVulkan() {
createInstance();
pickPhysicalDevice();
createLogicalDevice();
}
void cleanup() {
for (auto buf : buffers) {
vkDestroyBuffer(device, buf, nullptr);
}
for (auto mem : bufferMemories) {
vkFreeMemory(device, mem, nullptr);
}
vkDestroyDevice(device, nullptr);
vkDestroyInstance(instance, nullptr);
}
void createInstance() {
VkApplicationInfo appInfo{};
appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
appInfo.pApplicationName = "Hello Triangle";
appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
appInfo.pEngineName = "No Engine";
appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
appInfo.apiVersion = VK_API_VERSION_1_0;
VkInstanceCreateInfo createInfo{};
createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
createInfo.pApplicationInfo = &appInfo;
createInfo.enabledLayerCount = 0;
createInfo.pNext = nullptr;
if (vkCreateInstance(&createInfo, nullptr, &instance) != VK_SUCCESS) {
throw std::runtime_error("failed to create instance!");
}
}
void pickPhysicalDevice() {
uint32_t deviceCount = 0;
vkEnumeratePhysicalDevices(instance, &deviceCount, nullptr);
if (deviceCount == 0) {
throw std::runtime_error("failed to find GPUs with Vulkan support!");
}
std::vector<VkPhysicalDevice> devices(deviceCount);
vkEnumeratePhysicalDevices(instance, &deviceCount, devices.data());
for (const auto &device : devices) {
if (isDeviceSuitable(device)) {
physicalDevice = device;
break;
}
}
if (physicalDevice == VK_NULL_HANDLE) {
throw std::runtime_error("failed to find a suitable GPU!");
}
}
void createLogicalDevice() {
QueueFamilyIndices indices = findQueueFamilies(physicalDevice);
std::vector<VkDeviceQueueCreateInfo> queueCreateInfos;
std::set<uint32_t> uniqueQueueFamilies = {indices.graphicsFamily.value()};
float queuePriority = 1.0f;
for (uint32_t queueFamily : uniqueQueueFamilies) {
VkDeviceQueueCreateInfo queueCreateInfo{};
queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
queueCreateInfo.queueFamilyIndex = queueFamily;
queueCreateInfo.queueCount = 1;
queueCreateInfo.pQueuePriorities = &queuePriority;
queueCreateInfos.push_back(queueCreateInfo);
}
VkPhysicalDeviceFeatures deviceFeatures{};
VkDeviceCreateInfo createInfo{};
createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
createInfo.queueCreateInfoCount =
static_cast<uint32_t>(queueCreateInfos.size());
createInfo.pQueueCreateInfos = queueCreateInfos.data();
createInfo.pEnabledFeatures = &deviceFeatures;
createInfo.enabledExtensionCount = static_cast<uint32_t>(DEV_EXT_LEN);
createInfo.ppEnabledExtensionNames = deviceExtensions;
createInfo.enabledLayerCount = 0;
if (vkCreateDevice(physicalDevice, &createInfo, nullptr, &device) !=
VK_SUCCESS) {
throw std::runtime_error("failed to create logical device!");
}
vkGetDeviceQueue(device, indices.graphicsFamily.value(), 0, &graphicsQueue);
}
void *createVertexBuffer(size_t bytes) {
VkBuffer buffer;
VkDeviceMemory bufferMemory;
VkBufferCreateInfo bufferInfo{};
bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
bufferInfo.size = bytes;
bufferInfo.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
if (vkCreateBuffer(device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) {
throw std::runtime_error("failed to create vertex buffer!");
}
VkMemoryRequirements memRequirements;
vkGetBufferMemoryRequirements(device, buffer, &memRequirements);
// The driver may round the allocation size up to its alignment requirement.
assert(memRequirements.size >= bytes);
VkMemoryAllocateInfo allocInfo{};
allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
allocInfo.allocationSize = memRequirements.size;
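// Ask for memory that is both device-local (VRAM) and host-mappable; with
// Resizable BAR enabled, this type can cover the whole 8 GiB.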
allocInfo.memoryTypeIndex =
findMemoryType(memRequirements.memoryTypeBits,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
if (auto res = vkAllocateMemory(device, &allocInfo, nullptr, &bufferMemory);
res != VK_SUCCESS) {
throw std::runtime_error("failed to allocate vertex buffer memory");
}
vkBindBufferMemory(device, buffer, bufferMemory, 0);
void *data;
auto res = vkMapMemory(device, bufferMemory, 0, bytes, 0, &data);
if (res != VK_SUCCESS) {
throw std::runtime_error("Map failed");
}
fprintf(stderr, "Map completed. Allocated %lu MiB at %p\n",
(bytes) / (1024UL * 1024), data);
this->buffers.push_back(buffer);
this->bufferMemories.push_back(bufferMemory);
return data;
}
uint32_t findMemoryType(uint32_t typeFilter,
VkMemoryPropertyFlags properties) {
VkPhysicalDeviceMemoryProperties memProperties;
vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties);
for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) {
if ((typeFilter & (1 << i)) &&
(memProperties.memoryTypes[i].propertyFlags & properties) ==
properties) {
return i;
}
}
throw std::runtime_error("failed to find suitable memory type!");
}
bool isDeviceSuitable(VkPhysicalDevice device) {
QueueFamilyIndices indices = findQueueFamilies(device);
bool extensionsSupported = checkDeviceExtensionSupport(device);
return indices.isComplete() &&
extensionsSupported /* && swapChainAdequate */;
}
bool checkDeviceExtensionSupport(VkPhysicalDevice device) {
uint32_t extensionCount;
vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount,
nullptr);
std::vector<VkExtensionProperties> availableExtensions(extensionCount);
vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount,
availableExtensions.data());
std::set<std::string> requiredExtensions(deviceExtensions,
deviceExtensions + DEV_EXT_LEN);
for (const auto &extension : availableExtensions) {
requiredExtensions.erase(extension.extensionName);
}
return requiredExtensions.empty();
}
QueueFamilyIndices findQueueFamilies(VkPhysicalDevice device) {
QueueFamilyIndices indices;
uint32_t queueFamilyCount = 0;
vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount,
nullptr);
std::vector<VkQueueFamilyProperties> queueFamilies(queueFamilyCount);
vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount,
queueFamilies.data());
int i = 0;
for (const auto &queueFamily : queueFamilies) {
if (queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT) {
indices.graphicsFamily = i;
}
if (indices.isComplete()) {
break;
}
i++;
}
return indices;
}
};
void ctor_libvram() {
fprintf(stderr, "%s() called\n", __FUNCTION__);
vrw_obj = new VRamWrapper();
vrw_obj->init();
}
void *libvram::malloc(size_t bytes) {
return vrw_obj->malloc(bytes);
}
void libvram::free(void *ptr) {
vrw_obj->free(ptr);
}
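A usage sketch (assuming libvram/libvram.hh declares ctor_libvram(), libvram::malloc() and libvram::free() as used above; note that free() is not implemented yet):

#include <cstdio>
#include <cstring>
#include "libvram/libvram.hh"

int main() {
  ctor_libvram();                       // set up the Vulkan instance, device and wrapper
  size_t bytes = 256UL * 1024 * 1024;   // ask for 256 MiB of device-local memory
  char *p = static_cast<char *>(libvram::malloc(bytes));
  memset(p, 0xab, bytes);               // plain CPU stores, landing in VRAM over PCIe
  printf("first byte: 0x%02x\n", (unsigned char)p[0]);
  return 0;
}

Keep in mind that every access goes over PCIe as uncached MMIO, so reads in particular are far slower than the same access to system RAM.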