cuda gpu cub

CUB segmented reduction not producing results


I'm trying to use CUB's segmented-reduction sum primitive, and I'm stuck on it.

Here is my code:

// Minimal attempt at a segmented sum with cub::DeviceSegmentedReduce::Sum:
// fills 7 ints with the value 3, splits them into 3 segments via an offsets
// array, runs the reduction, and prints one result per segment.
//
// NOTE(review): none of the return values of cudaMalloc, cudaMemcpy or
// cub::DeviceSegmentedReduce::Sum are inspected in this function, so any
// failure goes unnoticed. Nothing allocated here (malloc / cudaMalloc) is
// released before main returns.
int main() {


     // Problem size: N input values, num_segments segments.
     const int N = 7;
     const int num_segments  = 3;
     // Segment boundaries: segment i is [d_offsets[i], d_offsets[i+1]).
     // NOTE(review): despite the d_ prefix this is an ordinary host array;
     // it is never copied to the device, yet it is handed to CUB below.
     int d_offsets[]= {0,3,3,7};


    // Host buffers for the input values and the per-segment results.
    int *h_data       = (int *)malloc(N * sizeof(int));
    int *h_result = (int *)malloc(num_segments * sizeof(int));


    // Every input element is 3.
    for (int i=0; i<N; i++) {
        h_data[i] = 3;

    }


    // Device copy of the input values.
    int *d_data;
    cudaMalloc((int**)&d_data, N * sizeof(int));
    cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice);


    // Device buffer receiving one sum per segment.
    int           *d_result;
    cudaMalloc((int**)&d_result, num_segments * sizeof(int));

    // Scratch space for CUB and its size in bytes.
    void            *d_temp_storage = NULL;
    size_t          temp_storage_bytes = 0;


    // NOTE(review): temp_storage_bytes is still 0 here; no earlier call has
    // asked CUB how much scratch space it needs, so this requests 0 bytes.
    cudaMalloc((void**)&d_temp_storage, temp_storage_bytes);


    // The only CUB call in the program. The begin/end offset arguments are
    // the host array d_offsets and the same array shifted by one element.
    // NOTE(review): there is no preceding size-query call (one made with a
    // null temp-storage pointer) before this one.
    cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_data, d_result,
        num_segments, d_offsets, d_offsets + 1);


    // Fetch the per-segment sums back to the host.
    cudaMemcpy(h_result, d_result, num_segments*sizeof(int), cudaMemcpyDeviceToHost);




    printf("Results:\n");

   // One output line per segment.
   for (int i=0; i<num_segments; i++) {
        printf("CUB: %d\n", h_result[i]);

    }


}

But as a result I got this :

Results:
CUB: 0
CUB: 0
CUB: 0

I can't figure out what exactly the problem is. In the real example I have a very large array with segments equal to 400. Can I optimize the code so that I don't need to declare and allocate memory for d_offsets?


Solution

  • You have really not seriously tried to debug your code:

    It is inappropriate for you to waste the SO community's time in debugging your code when you have not taken time to do so yourself.

    Still, there's something you could do to avoid having to check for errors yourself, at least: use a library of some kind which does it for you (e.g. by throwing on error). If you did that - for example, using my CUDA Runtime API wrappers (sorry for the self-plug) - and properly allocated memory for everything you need, you'd end up with something like this:

    #include <cub/cub.cuh>
    #include <cuda/api_wrappers.h>
    #include <algorithm>
    #include <cstdlib>
    #include <iostream>
    #include <vector>
    
    // Segmented sum of 7 integers (all equal to 3) over 3 segments using
    // cub::DeviceSegmentedReduce::Sum, with device memory managed through the
    // cuda::memory wrappers. Prints one line per segment.
    int main() {
    
        const int N = 7;
        const int num_segments  = 3;
    
        // Host-side inputs: segment i spans [offsets[i], offsets[i + 1]).
        std::vector<int> host_offsets {0, 3, 3, 7};
        std::vector<int> host_input(N, 3);
        std::vector<int> host_sums(num_segments);
    
        const size_t offsets_bytes = host_offsets.size() * sizeof(int);
        const size_t input_bytes = host_input.size() * sizeof(int);
        const size_t sums_bytes = num_segments * sizeof(int);
    
        auto device = cuda::device::current::get();
    
        // Device buffers for the offsets and the input, filled from the host.
        auto dev_offsets = cuda::memory::device::make_unique<int[]>(
            device, host_offsets.size());
        auto dev_input = cuda::memory::device::make_unique<int[]>(
            device, N);
        cuda::memory::copy(dev_offsets.get(), host_offsets.data(), offsets_bytes);
        cuda::memory::copy(dev_input.get(), host_input.data(), input_bytes);
    
        // Device buffer receiving one sum per segment.
        auto dev_sums = cuda::memory::device::make_unique<int[]>(
            device, num_segments);
    
        // Begin and end offsets share one array; the end view is the begin
        // view shifted by one element.
        int* const segment_begins = dev_offsets.get();
        int* const segment_ends = segment_begins + 1;
    
        // First pass: with a null temp-storage pointer the call only reports
        // the required scratch size through scratch_bytes.
        size_t scratch_bytes = 0;
        cub::DeviceSegmentedReduce::Sum(
            nullptr, scratch_bytes, dev_input.get(), dev_sums.get(),
            num_segments, segment_begins, segment_ends);
    
        auto dev_scratch = cuda::memory::device::make_unique<char[]>(
            device, scratch_bytes);
    
        // Second pass: the actual reduction, using the scratch buffer.
        cub::DeviceSegmentedReduce::Sum(
            dev_scratch.get(), scratch_bytes, dev_input.get(), dev_sums.get(),
            num_segments, segment_begins, segment_ends);
    
        // Bring the per-segment sums back to the host and report them.
        cuda::memory::copy(host_sums.data(), dev_sums.get(), sums_bytes);
    
        std::cout << "Results:\n";
    
        int segment = 0;
        for (int sum : host_sums) {
            std::cout << "Segment " << segment << " data sums up to " << sum << "\n";
            ++segment;
        }
    
        return EXIT_SUCCESS;
    }
    

    which works:

    Results:
    Segment 0 data sums up to 9
    Segment 1 data sums up to 0
    Segment 2 data sums up to 12
    

    Additional tips: