I have a very simple vector addition kernel written for CUDA. I want to calculate the arithmetic intensity as well as the GFLOP/s for this kernel. The values I calculate differ noticeably from the values shown in Nsight Compute's Roofline Analysis section.
Since I have a very simple vector addition kernel of the form C = A + B, with all three vectors having size N, I would expect N arithmetic operations and 3 x N x 4 bytes accessed (assuming sizeof(float) == 4), which results in an arithmetic intensity of roughly 0.083. Further, I would then expect the GFLOP/s to be N x 1e-9 / kernel_time_in_seconds.
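Written out, the two expected quantities are:

arithmetic intensity = N / (3 * N * 4) = 1/12 ≈ 0.0833 FLOP/byte
GFLOP/s = (N * 1e-9) / kernel_time_in_seconds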
The values I compute are visibly different from those of Nsight Compute. I am aware that Nsight Compute decreases the clock speed, but I would expect the values for the arithmetic intensity (operations per byte) to be the same, or at least roughly the same given how it profiles the code.
My CUDA code looks as follows:
#include <iostream>
#include <cuda_runtime.h>

#define N 200000

__global__ void vectorAdd(float* a, float* b, float* c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N)
    {
        c[tid] = a[tid] + b[tid];
    }
}
int main()
{
    // Declare and initialize host vectors
    float* host_a = new float[N];
    float* host_b = new float[N];
    float* host_c = new float[N];
    for (int i = 0; i < N; ++i)
    {
        host_a[i] = i;
        host_b[i] = 2 * i;
    }

    // Declare and allocate device vectors
    float* dev_a, * dev_b, * dev_c;
    cudaMalloc((void**)&dev_a, N * sizeof(float));
    cudaMalloc((void**)&dev_b, N * sizeof(float));
    cudaMalloc((void**)&dev_c, N * sizeof(float));

    // Copy host vectors to device
    cudaMemcpy(dev_a, host_a, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, host_b, N * sizeof(float), cudaMemcpyHostToDevice);

    // Define kernel launch configuration
    int blockSize, gridSize;
    cudaOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, vectorAdd, 0, N);

    // Start timer
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    // Launch kernel
    vectorAdd<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c);

    // Stop timer and calculate execution duration
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    // Copy result from device to host
    cudaMemcpy(host_c, dev_c, N * sizeof(float), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();

    // Print execution duration
    std::cout << "Kernel execution duration: " << milliseconds << " ms" << std::endl;

    int numFloatingPointOps = N;
    int numBytesAccessed = 3 * N * sizeof(float);
    float opsPerByte = static_cast<float>(numFloatingPointOps) / static_cast<float>(numBytesAccessed);
    std::cout << "Floating-point operations per byte: " << opsPerByte << std::endl;

    float executionTimeSeconds = milliseconds / 1e3;
    float numGFLOPs = static_cast<float>(numFloatingPointOps) / 1e9;
    float GFLOPs = numGFLOPs / executionTimeSeconds;
    std::cout << "GFLOP/s: " << GFLOPs << std::endl;

    // Cleanup
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    delete[] host_a;
    delete[] host_b;
    delete[] host_c;
    return 0;
}
Example output on my PC:
Kernel execution duration: 0.014144 ms
Floating-point operations per byte: 0.0833333
GFLOP/s: 14.1403
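As a sanity check with the formula above: 200000 x 1e-9 GFLOP / 1.4144e-5 s ≈ 14.14 GFLOP/s, which matches the printed value.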
Compiled and run/profiled simply with:
nvcc vectorAdd.cu
sudo env "PATH=$PATH" ncu -f -o vectorAdd_rep --set full ./a.out
Nsight Compute, however, reports an arithmetic intensity of 0.12 in its Roofline Analysis section.
When I look at the instruction statistics, the operations related to global loads (LDG) and stores (STG) are three times as many as the FADDs (element-wise floating-point adds). With the 4-byte element size I would again expect 0.083, but that is not the case. What is the cause of the discrepancy between the two arithmetic intensities; am I doing something wrong? The other instructions I checked don't seem to be relevant for the arithmetic intensity calculation.
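One way to cross-check the profiler is to query the raw counters its roofline is built from. A minimal sketch, assuming the metric names from NVIDIA's roofline methodology (dram__bytes.sum for DRAM traffic, smsp__sass_thread_inst_executed_op_fadd_pred_on.sum for executed FADDs; names can vary between GPU architectures and ncu versions):

# FADD count divided by DRAM bytes should reproduce ncu's arithmetic intensity
ncu --metrics smsp__sass_thread_inst_executed_op_fadd_pred_on.sum,dram__bytes.sum ./a.out

If the measured DRAM traffic is smaller than the assumed 3 * N * 4 bytes (for example because some accesses are served from cache), the intensity ncu reports will come out higher than the hand-computed 0.083.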
With the updated code below, following the advice of Jérôme Richard, I could identify the problem. Firstly, the results obtained with Nsight Compute are not accurate for small grid sizes. With enough elements, the results from Nsight Compute are pretty close to my own.
Another important observation is that the profiled code runs at a lower clock speed: the theoretical bounds reported by Nsight Compute (memory bandwidth and achievable peak FLOP/s) are both lower than the values that can be obtained through calls to the CUDA API (see the sketch after the output at the end). I can further verify that both in my code and in Nsight Compute, the achieved percentage of peak performance (with respect to the arithmetic intensity) is quite similar. Here is the updated code:
#include <algorithm>
#include <cstdio>
#include <iostream>
#include <stdexcept>
#include <string>
#include <cuda_runtime.h>

// 1e9 bytes per vector => 250,000,000 float elements
constexpr size_t N = static_cast<size_t>(1e9 / static_cast<float>(sizeof(float)));

#define CHECK_ERR checkErr(__FILE__,__LINE__)

std::string PrevFile = "";
int PrevLine = 0;

void checkErr(const std::string &File, int Line) {
#ifndef NDEBUG
    cudaError_t Error = cudaGetLastError();
    if (Error != cudaSuccess) {
        std::cout << std::endl << File
                  << ", line " << Line
                  << ": " << cudaGetErrorString(Error)
                  << " (" << Error << ")"
                  << std::endl;
        if (PrevLine > 0)
            std::cout << "Previous CUDA call:" << std::endl
                      << PrevFile << ", line " << PrevLine << std::endl;
        throw std::runtime_error(cudaGetErrorString(Error));
    }
    PrevFile = File;
    PrevLine = Line;
#endif
}
__global__ void vectorAdd(float* a, float* b, float* c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N)
    {
        c[tid] = a[tid] + b[tid];
    }
}
int main()
{
    // Declare and initialize host vectors
    float* host_a = new float[N];
    float* host_b = new float[N];
    float* host_c = new float[N];
    for (size_t i = 0; i < N; ++i)
    {
        host_a[i] = i;
        host_b[i] = 2 * i;
    }

    // Declare and allocate device vectors
    float* dev_a, * dev_b, * dev_c;
    cudaMalloc((void**)&dev_a, N * sizeof(float)); CHECK_ERR;
    cudaMalloc((void**)&dev_b, N * sizeof(float)); CHECK_ERR;
    cudaMalloc((void**)&dev_c, N * sizeof(float)); CHECK_ERR;

    // Copy host vectors to device
    cudaMemcpy(dev_a, host_a, N * sizeof(float), cudaMemcpyHostToDevice); CHECK_ERR;
    cudaMemcpy(dev_b, host_b, N * sizeof(float), cudaMemcpyHostToDevice); CHECK_ERR;

    // Define kernel launch configuration. The grid size returned by
    // cudaOccupancyMaxPotentialBlockSize is the minimum grid size needed to
    // achieve maximum occupancy, NOT a grid size that covers all N elements:
    // int blockSize, gridSize;
    // cudaOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, vectorAdd, 0, N); CHECK_ERR;
    int blockSize = 256;
    int gridSize = (N + blockSize - 1) / blockSize;

    // Warm-up: fire first kernel and discard the result
    vectorAdd<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c); CHECK_ERR;
    cudaDeviceSynchronize();
    // Start timer
    cudaEvent_t start, stop;
    cudaEventCreate(&start); CHECK_ERR;
    cudaEventCreate(&stop); CHECK_ERR;
    cudaEventRecord(start); CHECK_ERR;

    // Launch kernel
    vectorAdd<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c); CHECK_ERR;

    // Stop timer and calculate execution duration
    cudaEventRecord(stop); CHECK_ERR;
    cudaEventSynchronize(stop); CHECK_ERR;
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop); CHECK_ERR;

    // Copy result from device to host
    cudaMemcpy(host_c, dev_c, N * sizeof(float), cudaMemcpyDeviceToHost); CHECK_ERR;
    cudaDeviceSynchronize(); CHECK_ERR;

    // Verify results (with a tolerance, since large indices are rounded as floats)
    for (size_t i = 0; i < N; ++i)
    {
        if (host_c[i] > 1.001f * (3.0f * static_cast<float>(i)) ||
            host_c[i] < 0.999f * (3.0f * static_cast<float>(i))) {
            throw std::runtime_error("Results different from expected " + std::to_string(host_c[i]) + " != " + std::to_string(3.0f * static_cast<float>(i)));
        }
    }
    // Print execution duration
    std::cout << "Kernel execution duration: " << milliseconds << " ms" << std::endl;

    size_t numFloatingPointOps = N;
    size_t numBytesAccessed = 3 * N * sizeof(float);
    float opsPerByte = static_cast<float>(numFloatingPointOps) / static_cast<float>(numBytesAccessed);
    std::cout << "Floating-point operations per byte: " << opsPerByte << std::endl;

    float executionTimeSeconds = milliseconds / 1e3;
    float numGFLOPs = static_cast<float>(numFloatingPointOps) / 1e9;
    float GFLOPs = numGFLOPs / executionTimeSeconds;
    std::cout << "GFLOP/s: " << GFLOPs << std::endl;

    // Theoretical peaks of my GPU, obtained through the CUDA API (see below)
    float peakMemoryBandwidthTheo = 176.032f; // GB/s
    float peakGFLOPTheo = 4329.47f;           // GFLOP/s

    // Roofline: the attainable peak at this intensity is min(BW * AI, peak FLOP/s)
    float peakGFLOPforIntensity = std::min(peakMemoryBandwidthTheo * opsPerByte, peakGFLOPTheo);
    float achievedPeak = (GFLOPs / peakGFLOPforIntensity) * 100.0f;

    char strAchievedPeak[16];
    std::snprintf(strAchievedPeak, sizeof(strAchievedPeak), "%.2f", achievedPeak);
    std::cout << "Percentage of Peak Performance: " << strAchievedPeak << "%" << std::endl;

    float GBPerSecond = (static_cast<float>(numBytesAccessed) * 1e-9) / executionTimeSeconds;
    std::cout << "GB per Second: " << GBPerSecond << std::endl;
    // Cleanup
    cudaFree(dev_a); CHECK_ERR;
    cudaFree(dev_b); CHECK_ERR;
    cudaFree(dev_c); CHECK_ERR;
    delete[] host_a;
    delete[] host_b;
    delete[] host_c;
    return 0;
}
Example output from my RTX 3050:
Kernel execution duration: 17.6701 ms
Floating-point operations per byte: 0.0833333
GFLOP/s: 14.1482
Percentage of Peak Performance: 96.45%
GB per Second: 169.778
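For reference, the theoretical peaks hard-coded in the program (176.032 GB/s and 4329.47 GFLOP/s) can be approximated from the device properties. A minimal sketch, assuming an Ampere-class GPU with 128 FP32 cores per SM (the cores-per-SM constant is architecture-dependent, and these API values correspond to the full clocks, which can be higher than the reduced clocks ncu profiles at):

#include <iostream>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    // Memory clock is reported in kHz, the bus width in bits; factor 2 for DDR
    double bandwidthGBs = 2.0 * (prop.memoryClockRate * 1e3)
                        * (prop.memoryBusWidth / 8.0) * 1e-9;
    // FP32 peak: SMs * FP32 cores per SM * 2 FLOPs per FMA * clock (kHz -> GHz)
    const int fp32CoresPerSM = 128; // assumption: Ampere (e.g. RTX 3050, sm_86)
    double peakGFLOPs = prop.multiProcessorCount * fp32CoresPerSM * 2.0
                      * (prop.clockRate * 1e3) * 1e-9;
    std::cout << "Theoretical bandwidth: " << bandwidthGBs << " GB/s" << std::endl;
    std::cout << "Theoretical FP32 peak: " << peakGFLOPs << " GFLOP/s" << std::endl;
    return 0;
}

The printed numbers are also self-consistent: 3 x N x 4 bytes = 3 GB moved in 17.6701 ms is 169.778 GB/s, and 169.778 / 176.032 ≈ 96.45%, exactly the percentage of peak the program reports.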