CUB segmented reduction not producing results

I'm trying to use CUB's segmented-reduction sum primitive, and I'm stuck on it.

Here is my code:

// Attempts a segmented sum over N ints (each set to 3) split into
// num_segments segments with cub::DeviceSegmentedReduce::Sum, then prints
// one result per segment.
//
// NOTE(review): none of the CUDA runtime / CUB calls below have their return
// value checked, so any failure is silent. Nothing allocated here (malloc or
// cudaMalloc) is released before main returns.
int main() {


     const int N = 7;
     const int num_segments  = 3;
     // NOTE(review): despite the d_ prefix this is an ordinary host array;
     // it is handed to CUB below as the segment-offset pointers.
     int d_offsets[]= {0,3,3,7};


    int *h_data       = (int *)malloc(N * sizeof(int));
    int *h_result = (int *)malloc(num_segments * sizeof(int));


    // Fill the host input with a constant so each segment sum is 3 * length.
    for (int i=0; i<N; i++) {
        h_data[i] = 3;

    }


    // Device copy of the input data.
    int *d_data;
    cudaMalloc((int**)&d_data, N * sizeof(int));
    cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice);


    // Device buffer that receives one sum per segment.
    int           *d_result;
    cudaMalloc((int**)&d_result, num_segments * sizeof(int));

    void            *d_temp_storage = NULL;
    size_t          temp_storage_bytes = 0;


    // NOTE(review): temp_storage_bytes is still 0 here, so this requests a
    // 0-byte allocation; the required scratch size has not been queried yet.
    cudaMalloc((void**)&d_temp_storage, temp_storage_bytes);


    // Single CUB call. Segment i is described by d_offsets[i] (begin) and
    // d_offsets[i + 1] (end), since the end pointer aliases the begin pointer.
    // NOTE(review): presumably d_temp_storage is still NULL at this point, in
    // which case CUB only reports the scratch size and does no reduction -
    // confirm against the CUB documentation.
    cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_data, d_result,
        num_segments, d_offsets, d_offsets + 1);


    // Copy the per-segment sums back to the host.
    cudaMemcpy(h_result, d_result, num_segments*sizeof(int), cudaMemcpyDeviceToHost);




    printf("Results:\n");

   for (int i=0; i<num_segments; i++) {
        printf("CUB: %d\n", h_result[i]);

    }


}

But as a result I got this :

Results:
CUB: 0
CUB: 0
CUB: 0

I can't figure out what exactly the problem is. In the real example I have a very large array whose segments are all of equal size (400). Also, can I optimize the code so that I don't need to declare and allocate memory for d_offsets?

Solution

You have really not seriously tried to debug your code:

You were missing the allocation of memory for d_result (you fixed that)
You were trying to pass a host memory address for a device memory address in d_offsets. Of course this results in a CUDA runtime error - but
You did not check for runtime errors.
You only called the CUB function once - although you must run it twice for it to actually do anything: Once with a nullptr as the scratch space, to get the scratch space size, then again with an actual scratch space to do work. It's an annoying API but that's how it works.

It is inappropriate for you to waste the SO community's time in debugging your code when you have not taken time to do so yourself.

Still, there's something you could do to avoid having to check for errors, at least, which is use a library of some kind which does it for you (e.g. by throwing on error). If you did that - for example, using my CUDA Runtime API wrappers (sorry for the self-plug), and properly allocate memory for everything you need to, you'd end up with something like this:

#include <cub/cub.cuh>
#include <cuda/api_wrappers.h>

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

// Sums each segment of a small device array with
// cub::DeviceSegmentedReduce::Sum and prints one total per segment.
//
// Segment i covers data[offsets[i], offsets[i + 1]); with offsets {0,3,3,7}
// the middle segment is empty.
//
// Returns EXIT_SUCCESS. A failing CUB call is reported by throwing
// std::runtime_error, so no failure goes unnoticed.
int main() {

    const int N = 7;
    const int num_segments  = 3;
    auto h_offsets = std::vector<int> {0,3,3,7};

    auto h_data = std::vector<int>(N);
    auto h_results = std::vector<int>(num_segments);

    std::fill(h_data.begin(), h_data.end(), 3);

    // CUB reports failure through its cudaError_t return value rather than by
    // throwing, so convert a non-success status into an exception here.
    auto throw_on_failure = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            throw std::runtime_error(
                std::string(what) + ": " + cudaGetErrorString(status));
        }
    };

    // Device-side copies of the offsets and the input data, plus the output
    // buffer. The offsets must live in device memory for CUB to read them.
    auto current_device = cuda::device::current::get();
    auto d_offsets = cuda::memory::device::make_unique<int[]>(
        current_device, h_offsets.size());
    auto d_data = cuda::memory::device::make_unique<int[]>(
        current_device, N);
    cuda::memory::copy(
        d_offsets.get(), h_offsets.data(), h_offsets.size() * sizeof(int));
    cuda::memory::copy(
        d_data.get(), h_data.data(), h_data.size() * sizeof(int));
    auto d_results = cuda::memory::device::make_unique<int[]>(
        current_device, num_segments);

    auto d_start_offsets = d_offsets.get();
    auto d_end_offsets = d_start_offsets + 1; // aliasing, see CUB documentation

    size_t temp_storage_bytes = 0;

    // First call: with a null scratch pointer CUB only writes the required
    // scratch size into temp_storage_bytes (taken by non-const reference)
    // and performs no reduction.
    throw_on_failure(
        cub::DeviceSegmentedReduce::Sum(
            nullptr, temp_storage_bytes, d_data.get(), d_results.get(),
            num_segments, d_start_offsets, d_end_offsets),
        "cub::DeviceSegmentedReduce::Sum (temp storage size query)");

    auto d_temp_storage = cuda::memory::device::make_unique<char[]>(
        current_device, temp_storage_bytes);

    // Second call: with real scratch space, the reduction is actually run.
    throw_on_failure(
        cub::DeviceSegmentedReduce::Sum(
            d_temp_storage.get(), temp_storage_bytes, d_data.get(),
            d_results.get(), num_segments, d_start_offsets, d_end_offsets),
        "cub::DeviceSegmentedReduce::Sum (reduction)");

    cuda::memory::copy(
        h_results.data(), d_results.get(), num_segments * sizeof(int));

    std::cout << "Results:\n";

    for (int i=0; i<num_segments; i++) {
        std::cout << "Segment " << i << " data sums up to " << h_results[i] << "\n";
    }

    return EXIT_SUCCESS;
}

which works:

Results:
Segment 0 data sums up to 9
Segment 1 data sums up to 0
Segment 2 data sums up to 12

Additional tips:

Always investigate compiler warnings.
Use cuda-memcheck (or its successor, compute-sanitizer, on newer CUDA toolkits) to detect memory leaks, accesses to memory on the wrong side (host vs. device), etc.
If you're using the CUDA Runtime API directly, you must check every call for errors.