cudanvidiacubprefix-sum

maximum supported size for cub library


Does anyone know what is the maximum supported size for cub::scan ? I got core dump for input sizes over 500 million. I wanted to make sure I'm not doing anything wrong...

Here is my code:

#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;

bool                    g_verbose = false;  // Whether to display input/output to console
CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
typedef int mytype;

/**
 * Solve inclusive-scan problem
 */

static void solve(mytype *h_in, mytype *h_cpu, int n)
{
    mytype inclusive = 0;
    for (int i = 0; i < n; ++i) {
      inclusive += h_in[i];
      h_cpu[i] = inclusive;
    }
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
    for (int i = 0; i < n; i++) {
      if (h_cpu[i] != h_o[i]) {
        return i + 1;
      }
    }
    return 0;
}

/**
 * Main
 */
int main(int argc, char** argv)
{
    cudaSetDevice(0);
    struct timeval start, end;
    int num_items = 1073741824;
    const int repetitions = 5;
    mytype *h_in, *h_out, *h_cpu;
    const int size = num_items * sizeof(mytype);
    // Allocate host arrays
    h_in = (mytype *)malloc(size);
    h_out = (mytype *)malloc(size);
    h_cpu = (mytype *)malloc(size);


    // Initialize problem and solution
    for (int i = 0; i < num_items; i++) {
        h_in[i] = i;
        h_out[i] = 0;
        h_cpu[i] = 0;
    }

    solve(h_in, h_cpu, num_items);

    // Allocate problem device arrays
    mytype *d_in = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));

    // Initialize device input
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));

    // Allocate device output array
    mytype *d_out = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));

    // Allocate temporary storage
    void            *d_temp_storage = NULL;
    size_t          temp_storage_bytes = 0;


    CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Run
    gettimeofday(&start, NULL);
    for (long i = 0; i < repetitions; i++) 
        DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaThreadSynchronize();
    gettimeofday(&end, NULL);
    double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;

    cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
    int cmp = compare(h_cpu, h_out, num_items);
    printf("%d\t", num_items);
    if (!cmp)
        printf("\t%7.4fs \n", ctime);
    printf("\n");
    if (h_in) delete[] h_in;
    if (h_out) delete[] h_out;
    if (h_cpu) delete[] h_cpu;
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    printf("\n\n");

    return 0;
}

Solution

  • The problem is here:

    const int size = num_items * sizeof(mytype);
    

    And it can be fixed by changing it to:

    const size_t size = num_items * sizeof(mytype);
    

    The value of num_items in the code is over 1 Billion. When we multiply that by sizeof(mytype) we are multiplying it by 4, so the result is over 4 Billion. This value cannot be stored in an int variable. If you try to use it anyway like that, then your subsequent host code will do bad things. This problem (the core dump) actually has nothing to do with CUDA. The code would core dump if you removed all the CUB elements.

    When I modify the line of code above, and compile for the correct GPU (e.g. -arch=sm_35 in my case, or -arch=sm_52 for a Titan X GPU), then I get the correct answer (and no seg fault/core dump).

    In general, the correct starting point when chasing a seg fault/core dump type error, is to recognize that this error arises from host code and you should attempt to localize the exact line of source code that is generating this error. This can be done trivially/tediously by putting many printf statements in your code, until you identify the line of your code after which you don't see any printf output, or by using a host code debugger, such as gdb on linux.

    Also note that this code as written will require slightly more than 12GB of memory on the host, and slightly more than 8GB of memory on the GPU, so it will only run properly in such settings.

    For reference, here is the fixed code (based on what OP posted here):

    #define CUB_STDERR
    #include <stdio.h>
    #include "cub/util_allocator.cuh"
    #include "cub/device/device_scan.cuh"
    #include <sys/time.h>
    using namespace cub;
    
    bool                    g_verbose = false;  // Whether to display input/output to console
    CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
    typedef int mytype;
    
    /**
     * Solve inclusive-scan problem
     */
    
    static void solve(mytype *h_in, mytype *h_cpu, int n)
    {
        mytype inclusive = 0;
        for (int i = 0; i < n; ++i) {
          inclusive += h_in[i];
          h_cpu[i] = inclusive;
        }
    }
    static int compare(mytype *h_cpu, mytype *h_o, int n)
    {
        for (int i = 0; i < n; i++) {
          if (h_cpu[i] != h_o[i]) {
            return i + 1;
          }
        }
        return 0;
    }
    
    /**
     * Main
     */
    int main(int argc, char** argv)
    {
        cudaSetDevice(0);
        struct timeval start, end;
        int num_items = 1073741824;
        const int repetitions = 5;
        mytype *h_in, *h_out, *h_cpu;
        const size_t size = num_items * sizeof(mytype);
        // Allocate host arrays
        h_in = (mytype *)malloc(size);
        h_out = (mytype *)malloc(size);
        h_cpu = (mytype *)malloc(size);
    
    
        // Initialize problem and solution
        for (int i = 0; i < num_items; i++) {
            h_in[i] = i;
            h_out[i] = 0;
            h_cpu[i] = 0;
        }
    
        solve(h_in, h_cpu, num_items);
    
        // Allocate problem device arrays
        mytype *d_in = NULL;
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));
    
        // Initialize device input
        CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));
    
        // Allocate device output array
        mytype *d_out = NULL;
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));
    
        // Allocate temporary storage
        void            *d_temp_storage = NULL;
        size_t          temp_storage_bytes = 0;
    
    
        CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
        CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
    
        // Run
        gettimeofday(&start, NULL);
        for (long i = 0; i < repetitions; i++) 
            DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
        cudaThreadSynchronize();
        gettimeofday(&end, NULL);
        double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;
    
        cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
        int cmp = compare(h_cpu, h_out, num_items);
        printf("%d\t", num_items);
        if (!cmp)
            printf("\t%7.4fs \n", ctime);
        printf("\n");
        if (h_in) delete[] h_in;
        if (h_out) delete[] h_out;
        if (h_cpu) delete[] h_cpu;
        if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
        if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
        if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
    
        printf("\n\n");
    
        return 0;
    }