cudacuda-streams

What's the capacity of a CUDA stream (=queue)?


A CUDA stream is a queue of tasks: memory copies, event firing, event waits, kernel launches, callbacks...

But - these queues don't have infinite capacity. In fact, empirically, I find that this limit is not super-high, e.g. in the thousands, not millions.

My questions:

  1. Is the size/capacity of a CUDA stream fixed in terms of any kind of enqueued items, or does the capacity behave differently based on what kind of actions/tasks you enqueue?
  2. How can I determine this capacity other than enqueuing more and more stuff until I can no longer fit any?

Solution

  • Is the size/capacity of a CUDA stream fixed in terms of any kind of enqueued items, or does the capacity behave differently based on what kind of actions/tasks you enqueue?

    The "capacity" behaves differently based on actions/tasks you enqueue.

    Here is a demonstration:

    If we enqueue a single host function/callback in the midst of a number of kernel calls, on a Tesla V100 on CUDA 11.4 I observe a "capacity" for ~1000 enqueued items. However if I alternate kernel calls and host functions, I observe a capacity for ~100 enqueued items.

    // test case with alternating kernels and callbacks
    
    $ cat t2042a.cu
    #include <iostream>
    #include <vector>
    #include <mutex>
    #include <condition_variable>
    
    #define CUDACHECK(x) x
    // empty kernel
    __global__ void NoOpKernel() {}
    
    // for blocking stream to wait for host signal
    class Event {
     private:
      std::mutex mtx_condition_;
      std::condition_variable condition_;
      bool signalled = false;
    
     public:
      void Signal() {
        {
          std::lock_guard<decltype(mtx_condition_)> lock(mtx_condition_);
          signalled = true;
        }
        condition_.notify_all();
      }
    
      void Wait() {
        std::unique_lock<decltype(mtx_condition_)> lock(mtx_condition_);
        while (!signalled) {
          condition_.wait(lock);
        }
      }
    };
    
    void CUDART_CB block_op_host_fn(void* arg) {
      Event* evt = (Event*)arg;
      evt->Wait();
    }
    
    int main() {
      cudaStream_t stream;
      CUDACHECK(cudaStreamCreate(&stream));
    
      int num_events = 60; // 50 is okay, 60 will hang
      std::vector<std::shared_ptr<Event>> event_vec;
    
      for (int i = 0; i < num_events; i++) {
        std::cout << "Queuing NoOp " << i << std::endl;
        NoOpKernel<<<1, 128, 0, stream>>>(); // HERE : is where it hangs
        std::cout << "Queued NoOp " << i << std::endl;
    
        event_vec.push_back(std::make_shared<Event>());
        cudaLaunchHostFunc(stream, block_op_host_fn, event_vec.back().get());
    
        std::cout << "Queued block_op " << i << std::endl;
      }
    
    
      for (int i = 0; i < num_events; i++) {
        event_vec[i]->Signal();
      }
    
      // clean up
      CUDACHECK(cudaDeviceSynchronize());
      CUDACHECK(cudaStreamDestroy(stream));
      return 0;
    }
    $ nvcc -o t2042a t2042a.cu
    $ ./t2042a
    Queuing NoOp 0
    Queued NoOp 0
    Queued block_op 0
    Queuing NoOp 1
    Queued NoOp 1
    Queued block_op 1
    Queuing NoOp 2
    Queued NoOp 2
    Queued block_op 2
    Queuing NoOp 3
    Queued NoOp 3
    Queued block_op 3
    Queuing NoOp 4
    Queued NoOp 4
    Queued block_op 4
    Queuing NoOp 5
    Queued NoOp 5
    Queued block_op 5
    Queuing NoOp 6
    Queued NoOp 6
    Queued block_op 6
    Queuing NoOp 7
    Queued NoOp 7
    Queued block_op 7
    Queuing NoOp 8
    Queued NoOp 8
    Queued block_op 8
    Queuing NoOp 9
    Queued NoOp 9
    Queued block_op 9
    Queuing NoOp 10
    Queued NoOp 10
    Queued block_op 10
    Queuing NoOp 11
    Queued NoOp 11
    Queued block_op 11
    Queuing NoOp 12
    Queued NoOp 12
    Queued block_op 12
    Queuing NoOp 13
    Queued NoOp 13
    Queued block_op 13
    Queuing NoOp 14
    Queued NoOp 14
    Queued block_op 14
    Queuing NoOp 15
    Queued NoOp 15
    Queued block_op 15
    Queuing NoOp 16
    Queued NoOp 16
    Queued block_op 16
    Queuing NoOp 17
    Queued NoOp 17
    Queued block_op 17
    Queuing NoOp 18
    Queued NoOp 18
    Queued block_op 18
    Queuing NoOp 19
    Queued NoOp 19
    Queued block_op 19
    Queuing NoOp 20
    Queued NoOp 20
    Queued block_op 20
    Queuing NoOp 21
    Queued NoOp 21
    Queued block_op 21
    Queuing NoOp 22
    Queued NoOp 22
    Queued block_op 22
    Queuing NoOp 23
    Queued NoOp 23
    Queued block_op 23
    Queuing NoOp 24
    Queued NoOp 24
    Queued block_op 24
    Queuing NoOp 25
    Queued NoOp 25
    Queued block_op 25
    Queuing NoOp 26
    Queued NoOp 26
    Queued block_op 26
    Queuing NoOp 27
    Queued NoOp 27
    Queued block_op 27
    Queuing NoOp 28
    Queued NoOp 28
    Queued block_op 28
    Queuing NoOp 29
    Queued NoOp 29
    Queued block_op 29
    Queuing NoOp 30
    Queued NoOp 30
    Queued block_op 30
    Queuing NoOp 31
    Queued NoOp 31
    Queued block_op 31
    Queuing NoOp 32
    Queued NoOp 32
    Queued block_op 32
    Queuing NoOp 33
    Queued NoOp 33
    Queued block_op 33
    Queuing NoOp 34
    Queued NoOp 34
    Queued block_op 34
    Queuing NoOp 35
    Queued NoOp 35
    Queued block_op 35
    Queuing NoOp 36
    Queued NoOp 36
    Queued block_op 36
    Queuing NoOp 37
    Queued NoOp 37
    Queued block_op 37
    Queuing NoOp 38
    Queued NoOp 38
    Queued block_op 38
    Queuing NoOp 39
    Queued NoOp 39
    Queued block_op 39
    Queuing NoOp 40
    Queued NoOp 40
    Queued block_op 40
    Queuing NoOp 41
    Queued NoOp 41
    Queued block_op 41
    Queuing NoOp 42
    Queued NoOp 42
    Queued block_op 42
    Queuing NoOp 43
    Queued NoOp 43
    Queued block_op 43
    Queuing NoOp 44
    Queued NoOp 44
    Queued block_op 44
    Queuing NoOp 45
    Queued NoOp 45
    Queued block_op 45
    Queuing NoOp 46
    Queued NoOp 46
    Queued block_op 46
    Queuing NoOp 47
    Queued NoOp 47
    Queued block_op 47
    Queuing NoOp 48
    Queued NoOp 48
    Queued block_op 48
    Queuing NoOp 49
    Queued NoOp 49
    Queued block_op 49
    Queuing NoOp 50
    Queued NoOp 50
    Queued block_op 50
    Queuing NoOp 51
    Queued NoOp 51
    Queued block_op 51
    Queuing NoOp 52
    Queued NoOp 52
    Queued block_op 52
    Queuing NoOp 53
    Queued NoOp 53
    Queued block_op 53
    Queuing NoOp 54
    Queued NoOp 54
    Queued block_op 54
    Queuing NoOp 55
    Queued NoOp 55
    Queued block_op 55
    Queuing NoOp 56
    Queued NoOp 56
    Queued block_op 56
    Queuing NoOp 57
    ^C
    $
    
    // test case with a single callback and many kernels
    
    $ cat t2042.cu
    #include <iostream>
    #include <vector>
    #include <mutex>
    #include <condition_variable>
    #include <cstdlib>
    #define CUDACHECK(x) x
    // empty kernel
    __global__ void NoOpKernel() {}
    
    // for blocking stream to wait for host signal
    class Event {
     private:
      std::mutex mtx_condition_;
      std::condition_variable condition_;
      bool signalled = false;
    
     public:
      void Signal() {
        {
          std::lock_guard<decltype(mtx_condition_)> lock(mtx_condition_);
          signalled = true;
        }
        condition_.notify_all();
      }
    
      void Wait() {
        std::unique_lock<decltype(mtx_condition_)> lock(mtx_condition_);
        while (!signalled) {
          condition_.wait(lock);
        }
      }
    };
    
    void CUDART_CB block_op_host_fn(void* arg) {
      Event* evt = (Event*)arg;
      evt->Wait();
    }
    
    int main(int argc, char *argv[]) {
      cudaStream_t stream;
      CUDACHECK(cudaStreamCreate(&stream));
    
      int num_loops = 2000; // 50 is okay, 60 will hang
      int num_events = 0;
      std::vector<std::shared_ptr<Event>> event_vec;
      if (argc > 1) num_loops = atoi(argv[1]);
    
      for (int i = 0; i < num_loops; i++) {
        std::cout << "Queuing NoOp " << i << std::endl;
        NoOpKernel<<<1, 128, 0, stream>>>(); // HERE : is where it hangs
        std::cout << "Queued NoOp " << i << std::endl;
        if (i == 0){
          num_events++;
          event_vec.push_back(std::make_shared<Event>());
          cudaLaunchHostFunc(stream, block_op_host_fn, event_vec.back().get());
    
          std::cout << "Queued block_op " << i << std::endl;}
      }
    
    
      for (int i = 0; i < num_events; i++) {
        event_vec[i]->Signal();
      }
    
      // clean up
      CUDACHECK(cudaDeviceSynchronize());
      CUDACHECK(cudaStreamDestroy(stream));
      return 0;
    }
    $ nvcc -o t2042 t2042.cu
    $ nvcc -o t2042 t2042.cu
    $ ./t2042
    ... <snip>
    Queuing NoOp 1019
    Queued NoOp 1019
    Queuing NoOp 1020
    Queued NoOp 1020
    Queuing NoOp 1021
    Queued NoOp 1021
    Queuing NoOp 1022
    ^C
    $
    

    (the code hangs when the queue becomes "full", and I terminate at that point with ctrl-C)

    How can I determine this capacity other than enqueuing more and more stuff until I can no longer fit any?

    Currently, there is no specification for this in CUDA, nor any explicit method to query for this at runtime.