c++opengl-esgpueglmesa

Ways to reduce memory transfer overhead in OpenGL ES 2.0 / OpenGL 2.1 with EGL 1.4 for rendering offscreen


I'm at an early stage of trying to do some image manipulation tasks on a headless embedded device running Linux with a Mali-400 GPU. The GPU supports OpenGL ES 2.0 through the official driver, and possibly a mostly complete OpenGL 2.1 through the unofficial Lima driver.

Specifically, images arrive in DMA-mapped memory from an external system. I load each one into a (MONO8/LUMINANCE) texture, run a shader program that renders into another texture, and then read the result back with glReadPixels. I can post more complete code if it would help, but for now I show just the relevant parts of the setup to avoid clutter (all pretty standard, I think):

// Setup code: default EGL display, an RGBA8888 pbuffer surface and an OpenGL ES 2 context.
display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
eglInitialize(display, &major, &minor);
// Request a config that can back a pbuffer and is renderable with OpenGL ES 2.
const EGLint configAttributes[] = {
        EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
        EGL_SURFACE_TYPE, EGL_PBUFFER_BIT,
        EGL_BLUE_SIZE, 8,
        EGL_GREEN_SIZE, 8,
        EGL_RED_SIZE, 8,
        EGL_ALPHA_SIZE, 8,
        EGL_NONE
  };
// Only the first matching config is requested (config_size == 1).
eglChooseConfig(display, configAttributes, &config, 1, &numConfigs);
// Offscreen surface sized to match the 1920x1200 images processed below.
const EGLint pbufferAttributes[] = {
        EGL_WIDTH, 1920,
        EGL_HEIGHT, 1200,
        EGL_NONE
};
surface = eglCreatePbufferSurface(display, config, pbufferAttributes);
// Client version 2 selects an OpenGL ES 2.x context.
const EGLint contextAttributes[] = {
        EGL_CONTEXT_CLIENT_VERSION, 2,
        EGL_NONE
};
context = eglCreateContext(display, config, EGL_NO_CONTEXT, contextAttributes);
// Bind the context with the pbuffer as both draw and read surface.
eglMakeCurrent(display, surface, surface, context);

... Setup shaders, VBOs, etc ...

// Texture used to load image
glGenTextures(1, &textureID);
glBindTexture(GL_TEXTURE_2D, textureID);
// Clamp-to-edge wrapping and non-mipmapped linear filtering for the 1920x1200 texture.
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
// Allocate storage only. A NULL data pointer is allowed and leaves the contents unspecified
// until the first real upload in the main loop, so the dummy host buffer (previously
// malloc'd, uploaded uninitialised and never freed) is not needed.
glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, 1920, 1200, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, NULL);
// Interleaved vertex layout, 5 floats per vertex: attribute 0 reads 3 floats at offset 0,
// attribute 1 reads 2 floats right after them (presumably position and texture coordinate).
glBindBuffer(GL_ARRAY_BUFFER, VBOVertices);
glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 5 * sizeof(GLfloat), (GLvoid*)0);
glEnableVertexAttribArray(0);
glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 5 * sizeof(GLfloat), (GLvoid*)(3 * sizeof(GLfloat)));
glEnableVertexAttribArray(1);

// Texture used to render into
// A framebuffer object with an RGBA texture as its color attachment; nothing in this snippet
// rebinds the framebuffer afterwards.
GLuint framebuffer;
glGenFramebuffers(1, &framebuffer);
glBindFramebuffer(GL_FRAMEBUFFER, framebuffer);
GLuint texture;
glGenTextures(1, &texture);
glBindTexture(GL_TEXTURE_2D, texture);
// NULL data: allocate 1920x1200 RGBA storage without uploading any pixels.
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 1920, 1200, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
// Attach mip level 0 of the texture as color attachment 0.
// NOTE(review): completeness is never verified here - consider checking glCheckFramebufferStatus.
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, texture, 0);
// Put the input texture back on unit 0; binding the render texture above replaced it.
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, textureID);
// LATER: In main loop
auto start = std::chrono::steady_clock::now(); // Start timer for image loading
// Select the unit and texture explicitly before uploading instead of relying on whatever
// binding was left over from setup or a previous iteration.
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, textureID);
glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, 1920, 1200, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, image.img); // Load current image into texture from before
// glUniform* writes into the program that is currently in use, so the program has to be
// made current before the sampler uniform is set.
glUseProgram(shaderProgram);
glUniform1i(glGetUniformLocation(shaderProgram, "texture1"), 0); // Sampler reads texture unit 0
auto text_loaded = std::chrono::steady_clock::now(); // Texture is loaded, end timer for image loading
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
glFinish(); // Block until the GPU has finished, so the next timestamp covers the whole render
auto gl_finished = std::chrono::steady_clock::now(); // All rendering should be done when glFinish returns?
glReadPixels(0, 0, 1920, 1200, GL_RGBA, GL_UNSIGNED_BYTE, preprocessed_img_buf);
auto end = std::chrono::steady_clock::now(); // End timer for readout
// This is needed to get back into MONO8 format: keep only the first (red) byte of every RGBA pixel
for (int i = 0; i < 1920 * 1200; i++){
  processed_img_buf[i] = preprocessed_img_buf[i * 4];
}
// Print frame times
auto text_loaded_time = std::chrono::duration_cast<std::chrono::microseconds>(text_loaded - start).count();
auto frame_render_time = std::chrono::duration_cast<std::chrono::microseconds>(gl_finished - text_loaded).count();
auto frame_readout_time = std::chrono::duration_cast<std::chrono::microseconds>(end - gl_finished).count();

Attempting to get a loose benchmark with the above code, I am seeing times I can barely believe:

Image loading time: 7148 us
Render time: 85720 us
Readout time: 158734 us
Total frame time: 251602 us

Image loading time: 4797 us
Render time: 85841 us
Readout time: 152563 us
Total frame time: 243201 us

Image loading time: 6018 us
Render time: 85757 us
Readout time: 158420 us
Total frame time: 250195 us

I went into this expecting glReadPixels to be slow, but not twice the render time, or slower than 10 FPS on its own (assuming I am benchmarking in a fairly sane manner). This leads me to think I am doing something else wrong; however, everything I have tried seems to be unsupported in some way or another:

So my title question: Is there a way I can do this more optimally in the case I have described?

What would that combination of options/setup look like? I came into this thinking it was do-able, and now I am doubting if this GPU can be useful to me without major time investment. Is that the case or am I a few enums away from a more reasonable frame time?

And some more thoughts/questions I've had, but don't want to consider part of the main question to avoid unfocusing it:


Solution

  • As I mentioned in the comments, you can use the GBM library for your task. Here's a step-by-step guide:

    1. Create a GBM surface with one or more GBM BOs (Buffer Objects).

    2. Proceed to create an EGLSurface from the GBM surface for rendering purposes.

    3. If your graphics driver supports it, you can obtain a DMA file descriptor (fd) for each BO using the DRM PRIME API. This will allow you to mmap these buffers and read their contents as needed.

    4. It's important to note that when creating the GBM surface, ensure that you use a linear format modifier so that the mapped memory's content is meaningful for your use case.

    Below is an example that demonstrates rendering content into a GBM surface and saving the result into a PNG image using the mapped DMA file descriptor:

    meson.build

    # Build definition for the 'dma-read' example: one C source file linked against
    # EGL, GLESv2, libdrm, GBM and FreeImage.
    project(
        'OpenGL DMA Read Example',
        'c',
        version : '0.1.0',
        meson_version: '>= 0.59.0',
        default_options: [
            'warning_level=2',
            'buildtype=debug'
        ]
    )
    
    c = meson.get_compiler('c')
    
    # Filled in below with the system include directories that exist on this machine.
    include_paths = []
    
    # Candidate include directories to probe.
    include_paths_sys = [
        '/usr/local/include',
        '/usr/include/drm',
        '/usr/include/libdrm',
        '/usr/include/freetype2'
    ]
    
    # Run `[ -d <path> ]` for each candidate and keep only the paths for which it exits with 0.
    foreach p : include_paths_sys
        if run_command('[', '-d', p, ']', check : false).returncode() == 0
          include_paths += [include_directories(p)]
        endif
    endforeach
    
    # Libraries are looked up by name through the C compiler.
    egl_dep             = c.find_library('EGL')
    glesv2_dep          = c.find_library('GLESv2')
    drm_dep             = c.find_library('drm')
    gbm_dep             = c.find_library('gbm')
    freeimage_dep       = c.find_library('freeimage')
    
    # The example binary itself.
    executable(
        'dma-read',
        sources : ['main.c'],
        include_directories : include_paths,
        dependencies : [
            egl_dep,
            glesv2_dep,
            drm_dep,
            gbm_dep,
            freeimage_dep
        ])
    

    main.c

    #include <EGL/egl.h>
    #include <errno.h>
    #include <gbm.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <xf86drmMode.h>
    #include <sys/ioctl.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/mman.h>
    #include <GLES2/gl2.h>
    #include <FreeImage.h>
    #include <linux/dma-buf.h>
    #include <linux/dma-heap.h>
    
    // File written by savePNG().
    #define PNG_PATH "/tmp/dma_read.png"
    // DRM device node opened by init().
    #define DRM_DEVICE "/dev/dri/card0"
    // Size of the GBM surface in pixels.
    #define WIDTH 512
    #define HEIGHT 512
    // Pixel format of the GBM surface; init() also passes it as the visual id to match
    // when choosing the EGL config.
    #define FORMAT GBM_FORMAT_ARGB8888
    
    // drmFd: fd of DRM_DEVICE; dmaFd: fd for the buffer object, obtained by getDMAFDFromBO().
    static int drmFd, dmaFd;
    // CPU mapping of the buffer object, set by mapDMA().
    static char *map;
    // Offset (plane 0) and stride of the buffer object as reported by GBM in init().
    static int offset;
    static unsigned int stride;
    static struct gbm_device *gbmDevice;
    static struct gbm_surface *gbmSurface;
    // Buffer object returned by the first gbm_surface_lock_front_buffer() call in init().
    static struct gbm_bo *gbmBO;
    static EGLDisplay eglDisplay;
    static EGLContext eglContext;
    static EGLSurface eglSurface;
    static EGLConfig eglConfig;
    
    // Attributes handed to eglChooseConfig(): window-capable, 8 bits per RGBA channel,
    // renderable with OpenGL ES 2.
    static const EGLint eglConfigAttribs[] =
    {
        EGL_SURFACE_TYPE, EGL_WINDOW_BIT,
        EGL_RED_SIZE, 8,
        EGL_GREEN_SIZE, 8,
        EGL_BLUE_SIZE, 8,
        EGL_ALPHA_SIZE, 8,
        EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
        EGL_NONE
    };
    
    // Returns the index of the first entry in configs[0..count) whose EGL_NATIVE_VISUAL_ID
    // equals visual_id, or -1 when there is none. Configs whose attribute cannot be
    // queried are skipped.
    static int matchConfigToVisual(EGLDisplay egl_display, EGLint visual_id, EGLConfig *configs, int count)
    {
        int index = 0;
    
        while (index < count)
        {
            EGLint nativeVisual;
    
            if (eglGetConfigAttrib(egl_display, configs[index], EGL_NATIVE_VISUAL_ID, &nativeVisual)
                && nativeVisual == visual_id)
                return index;
    
            ++index;
        }
    
        return -1;
    }
    
    // Picks an EGL config that satisfies attribs and stores it in *config_out.
    // With visual_id == 0 the first matching config is taken; otherwise the first match
    // whose native visual id equals visual_id.
    // Returns 1 when a config was stored, 0 otherwise.
    static int chooseEGLConfiguration(EGLDisplay egl_display, const EGLint *attribs, EGLint visual_id, EGLConfig *config_out)
    {
        EGLint available = 0;
        EGLint numMatched = 0;
        EGLConfig *candidates;
        int chosen = -1;
    
        // First ask only for the total number of configs so the array can be sized.
        if (!eglGetConfigs(egl_display, NULL, 0, &available) || available < 1)
        {
            printf("No EGL configs to choose from.\n");
            return 0;
        }
    
        candidates = (EGLConfig *)malloc(available * sizeof *candidates);
    
        if (candidates == NULL)
            return 0;
    
        if (eglChooseConfig(egl_display, attribs, candidates, available, &numMatched) && numMatched)
        {
            if (visual_id)
                chosen = matchConfigToVisual(egl_display, visual_id, candidates, numMatched);
            else
                chosen = 0;
    
            if (chosen != -1)
                *config_out = candidates[chosen];
        }
        else
        {
            printf("No EGL configs with appropriate attributes.\n");
        }
    
        free(candidates);
        return chosen != -1;
    }
    
    int getDMAFDFromBO(int drmFd, struct gbm_bo *bo)
    {
        struct drm_prime_handle prime_handle;
        memset(&prime_handle, 0, sizeof(prime_handle));
        prime_handle.handle = gbm_bo_get_handle(bo).u32;
        prime_handle.flags = DRM_CLOEXEC | DRM_RDWR;
        prime_handle.fd = -1;
    
        if (ioctl(drmFd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &prime_handle) != 0)
            goto fail;
    
        if (prime_handle.fd < 0)
            goto fail;
    
        // Set read and write permissions on the file descriptor
        if (fcntl(prime_handle.fd, F_SETFL, fcntl(prime_handle.fd, F_GETFL) | O_RDWR) == -1)
        {
            close(prime_handle.fd);
            goto fail;
        }
    
        printf("Got BO DMA fd using DRM_IOCTL_PRIME_HANDLE_TO_FD.\n");
        return prime_handle.fd;
    
    fail:
    
        prime_handle.fd = gbm_bo_get_fd(bo);
    
        if (prime_handle.fd >= 0)
        {
            printf("Got BO DMA fd using gbm_bo_get_fd().\n");
            return prime_handle.fd;
        }
    
        printf("Failed to get fd for handle %u: %s\n", prime_handle.handle, strerror(errno));
        return -1;
    }
    
    int mapDMA()
    {
        map = mmap(NULL, HEIGHT * stride, PROT_READ | PROT_WRITE, MAP_SHARED, dmaFd, 0);
    
        if (map == MAP_FAILED)
        {
            map = mmap(NULL, HEIGHT * stride, PROT_WRITE, MAP_SHARED, dmaFd, 0);
    
            if (map == MAP_FAILED)
            {
                void **dummy = NULL;
                map = gbm_bo_map(gbmBO, 0, 0, WIDTH, HEIGHT, GBM_BO_TRANSFER_READ, &stride, dummy);
    
                if (!map)
                {
                    printf("Failed to map DMA fd.\n");
                    return 0;
                }
            }
        }
    
        return 1;
    }
    
    int init()
    {
        drmFd = open(DRM_DEVICE, O_RDWR);
    
        if (drmFd < 0)
        {
            printf("Failed to open DRM device %s.\n", DRM_DEVICE);
            return 0;
        }
    
        gbmDevice = gbm_create_device(drmFd);
    
        if (!gbmDevice)
        {
            printf("Failed to create GBM device.\n");
            return 0;
        }
    
        eglDisplay = eglGetDisplay(gbmDevice);
    
        if (eglDisplay == EGL_NO_DISPLAY)
        {
            printf("Failed to get EGL display.\n");
            return 0;
        }
    
        if (!eglInitialize(eglDisplay, NULL, NULL))
        {
            printf("Failed to initialize EGL display.\n");
            return 0;
        }
    
        if (!chooseEGLConfiguration(eglDisplay, eglConfigAttribs, FORMAT, &eglConfig))
        {
            printf("Failed to choose EGL configuration.\n");
            return 0;
        }
    
        eglContext = eglCreateContext(eglDisplay, eglConfig, EGL_NO_CONTEXT, NULL);
    
        if (eglContext == EGL_NO_CONTEXT)
        {
            printf("Failed to create EGL context.\n");
            return 0;
        }
    
        gbmSurface = gbm_surface_create(
            gbmDevice,
            WIDTH,
            HEIGHT,
            FORMAT,
            GBM_BO_USE_RENDERING | GBM_BO_USE_LINEAR);
    
        if (!gbmSurface)
        {
            printf("Failed to create GBM surface.\n");
            return 0;
        }
    
        eglSurface = eglCreateWindowSurface(eglDisplay, eglConfig, (EGLNativeWindowType)gbmSurface, NULL);
    
        if (eglSurface == EGL_NO_SURFACE)
        {
            printf("Failed to create EGL surface.\n");
            return 0;
        }
    
        eglMakeCurrent(eglDisplay,
                       eglSurface,
                       eglSurface,
                       eglContext);
    
        eglSwapBuffers(eglDisplay, eglSurface);
    
        // Create a single BO (calling gbm_surface_lock_front_buffer() again before gbm_surface_release_buffer() would create another BO)
        gbmBO = gbm_surface_lock_front_buffer(gbmSurface);
        gbm_surface_release_buffer(gbmSurface, gbmBO);
    
        stride = gbm_bo_get_stride(gbmBO);
        offset = gbm_bo_get_offset(gbmBO, 0);
        dmaFd = getDMAFDFromBO(drmFd, gbmBO);
    
        if (dmaFd < 0)
            return 0;
    
        if (!mapDMA())
            return 0;
    
        return 1;
    }
    
    void savePNG()
    {
        eglSwapBuffers(eglDisplay, eglSurface);
        gbm_surface_lock_front_buffer(gbmSurface);
    
        struct dma_buf_sync sync;
        sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ;
        ioctl(dmaFd, DMA_BUF_IOCTL_SYNC, &sync);
    
        FIBITMAP *image = FreeImage_ConvertFromRawBits((BYTE*)&map[offset],
                                                       WIDTH,
                                                       HEIGHT,
                                                       stride,
                                                       32,
                                                       0xFF0000, 0x00FF00, 0x0000FF,
                                                       false);
    
        if (FreeImage_Save(FIF_PNG, image, PNG_PATH, PNG_DEFAULT))
            printf("PNG image saved: %s.\n", PNG_PATH);
        else
            printf("Failed to save PNG image: %s.\n", PNG_PATH);
    
        FreeImage_Unload(image);
    
        sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ;
        ioctl(dmaFd, DMA_BUF_IOCTL_SYNC, &sync);
    
        gbm_surface_release_buffer(gbmSurface, gbmBO);
    }
    
    void render()
    {
        glEnable(GL_SCISSOR_TEST);
    
        // Red
        glViewport(0, 0, WIDTH/2, HEIGHT/2);
        glScissor(0, 0, WIDTH/2, HEIGHT/2);
        glClearColor(1.f, 0.f, 0.f, 1.f);
        glClear(GL_COLOR_BUFFER_BIT);
    
        // Green
        glViewport(WIDTH/2, 0, WIDTH/2, HEIGHT/2);
        glScissor(WIDTH/2, 0, WIDTH/2, HEIGHT/2);
        glClearColor(0.f, 1.f, 0.f, 1.f);
        glClear(GL_COLOR_BUFFER_BIT);
    
        // Blue
        glViewport(0, HEIGHT/2, WIDTH/2, HEIGHT/2);
        glScissor(0, HEIGHT/2, WIDTH/2, HEIGHT/2);
        glClearColor(0.f, 0.f, 1.f, 1.f);
        glClear(GL_COLOR_BUFFER_BIT);
    
        // Black
        glViewport(WIDTH/2, HEIGHT/2, WIDTH/2, HEIGHT/2);
        glScissor(WIDTH/2, HEIGHT/2, WIDTH/2, HEIGHT/2);
        glClearColor(0.f, 0.f, 0.f, 1.f);
        glClear(GL_COLOR_BUFFER_BIT);
    }
    
    // Entry point: set everything up, draw one frame and save it as a PNG.
    // Exit status is 0 on success and 1 when initialization fails.
    int main()
    {
        const int ready = init();
    
        if (ready)
        {
            render();
            savePNG();
        }
    
        return ready ? 0 : 1;
    }
    

    To test it, place the files in the same directory and run these commands:

    $ meson setup build
    $ cd build
    $ meson compile
    $ ./dma-read
    

    If everything goes well, a PNG file like the one shown should be saved in /tmp/dma_read.png.

    enter image description here