Compiling CUDA sample program

I have a very simple CUDA program that refuses to compile.

This is main.cpp

#include <iostream>
#include <cstdlib>
#include "/opt/cuda/targets/x86_64-linux/include/cuda_runtime.h"
#include "/opt/cuda/targets/x86_64-linux/include/cublas_v2.h"

// Fills the first rows * cols entries of `matrix` with pseudo-random values,
// each computed as rand() divided by RAND_MAX.
void initMatrix(float* matrix, int rows, int cols) {
    const int count = rows * cols;
    int idx = 0;
    while (idx < count) {
        matrix[idx] = rand() / static_cast<float>(RAND_MAX);
        ++idx;
    }
}

// Writes `matrix` to std::cout, reading it as `rows` consecutive runs of
// `cols` values. Every value is followed by a single space and every row is
// terminated with std::endl.
void printMatrix(float* matrix, int rows, int cols) {
    for (int r = 0; r < rows; ++r) {
        const int base = r * cols;  // offset of the first element of row r
        for (int c = 0; c < cols; ++c) {
            std::cout << matrix[base + c] << " ";
        }
        std::cout << std::endl;
    }
}

int main() {
    // Matrix dimensions
    int m = 3;  // Number of rows
    int n = 3;  // Number of columns

    // Allocate host memory for the matrix and vector
    float* h_matrix = new float[m * n];
    float* h_vector = new float[n];

    // Initialize matrix and vector with random values
    initMatrix(h_matrix, m, n);
    initMatrix(h_vector, n, 1);

    // Print the matrix and vector
    std::cout << "Matrix:" << std::endl;
    printMatrix(h_matrix, m, n);

    std::cout << "\nVector:" << std::endl;
    printMatrix(h_vector, n, 1);

    // Allocate device memory for the matrix and vector
    float* d_matrix;
    float* d_vector;
    cudaMalloc((void**)&d_matrix, m * n * sizeof(float));
    cudaMalloc((void**)&d_vector, n * sizeof(float));

    // Transfer the matrix and vector from host to device
    cudaMemcpy(d_matrix, h_matrix, m * n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_vector, h_vector, n * sizeof(float), cudaMemcpyHostToDevice);

    // cuBLAS handle
    cublasHandle_t handle;
    cublasCreate(&handle);

    // Perform matrix-vector multiplication
    float alpha = 1.0;  // Scaling factor
    float beta = 0.0;   // Scaling factor
    cublasSgemv(handle, CUBLAS_OP_N, m, n, &alpha, d_matrix, m, d_vector, 1, &beta, d_vector, 1);

    // Transfer the result back to the host
    cudaMemcpy(h_vector, d_vector, m * sizeof(float), cudaMemcpyDeviceToHost);

    // Print the result
    std::cout << "\nResult:" << std::endl;
    printMatrix(h_vector, m, 1);

    // Clean up
    delete[] h_matrix;
    delete[] h_vector;
    cudaFree(d_matrix);
    cudaFree(d_vector);
    cublasDestroy(handle);

    return 0;
}

This is CMakeLists.txt:

# Build script from the question. It configures successfully, but the link step
# fails with the undefined references to cudaMalloc, cublasCreate_v2, etc.
# quoted further down.
cmake_minimum_required(VERSION 3.17)
# Enables both the C++ and the CUDA language, although the only source file
# added to the target is main.cpp.
project(CUBLAS_Matrix_Vector_Multiplication LANGUAGES CXX CUDA)

# The toolkit is looked up twice: once through FindCUDAToolkit and once through
# the older FindCUDA module.
find_package(CUDAToolkit REQUIRED)
find_package(CUDA REQUIRED)

# NOTE(review): the CUDA_* variables below look like FindCUDA settings, which
# presumably only apply to targets created with cuda_add_executable() - confirm.
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Passes -std=c++11 on the compile line through the definitions mechanism.
add_definitions(-std=c++11)

set(CUDA_SEPARABLE_COMPILATION ON)

# Ordinary executable target built from the single C++ source file.
add_executable(matrix_vector_multiplication main.cpp)

target_include_directories(matrix_vector_multiplication PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
set_target_properties(matrix_vector_multiplication PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_STANDARD 11)
# NOTE(review): FindCUDAToolkit documents imported targets such as CUDA::cudart
# and CUDA::cublas rather than a CUDAToolkit_LIBRARIES variable. If this
# variable expands to nothing, no CUDA library is linked, which would match the
# undefined references reported by ld - confirm.
target_link_libraries(matrix_vector_multiplication PRIVATE ${CUDAToolkit_LIBRARIES})

Running cmake generates the build files successfully, but running make afterwards fails with a lot of linker errors:

/usr/bin/ld: CMakeFiles/matrix_vector_multiplication.dir/main.cpp.o: in function `main':
main.cpp:(.text+0x8056): undefined reference to `cudaMalloc'
/usr/bin/ld: main.cpp:(.text+0x8072): undefined reference to `cudaMalloc'
/usr/bin/ld: main.cpp:(.text+0x8098): undefined reference to `cudaMemcpy'
/usr/bin/ld: main.cpp:(.text+0x80ba): undefined reference to `cudaMemcpy'
/usr/bin/ld: main.cpp:(.text+0x80c6): undefined reference to `cublasCreate_v2'
/usr/bin/ld: main.cpp:(.text+0x8112): undefined reference to `cublasSgemv_v2'
/usr/bin/ld: main.cpp:(.text+0x8138): undefined reference to `cudaMemcpy'
/usr/bin/ld: main.cpp:(.text+0x81c6): undefined reference to `cudaFree'
/usr/bin/ld: main.cpp:(.text+0x81d2): undefined reference to `cudaFree'
/usr/bin/ld: main.cpp:(.text+0x81de): undefined reference to `cublasDestroy_v2'
collect2: Fehler: ld gab 1 als Ende-Status zurück
make[2]: *** [CMakeFiles/matrix_vector_multiplication.dir/build.make:97: matrix_vector_multiplication] Fehler 1
make[1]: *** [CMakeFiles/Makefile2:83: CMakeFiles/matrix_vector_multiplication.dir/all] Fehler 2
make: *** [Makefile:91: all] Fehler 2

Attempt to Resolve

I tried to follow this question, and used:

 g++ main.cpp -o main -L/opt/cuda/lib64/ -lcudart -lcuda -L/opt/cuda

This throws the error:

In file included from /opt/cuda/targets/x86_64-linux/include/cublas_v2.h:69,
                 from main.cpp:4:
/opt/cuda/targets/x86_64-linux/include/cublas_api.h:77:10: fatal error: cuda_fp16.h: No such file or directory
   77 | #include <cuda_fp16.h>
      |          ^~~~~~~~~~~~~
compilation terminated.

How can I resolve the issue?

 nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Fri_Sep__8_19:17:24_PDT_2023
Cuda compilation tools, release 12.3, V12.3.52
Build cuda_12.3.r12.3/compiler.33281558_0

Solution

When you change the includes to

#include <cuda_runtime.h>
#include <cublas_v2.h>

this should work:

# Plain C++ project: main.cpp only calls the CUDA runtime and cuBLAS host APIs,
# so only the CXX language is enabled.
cmake_minimum_required(VERSION 3.17)
project(CUBLAS_Matrix_Vector_Multiplication LANGUAGES CXX)

# Provides the imported CUDA:: targets used below.
find_package(CUDAToolkit REQUIRED)

add_executable(matrix_vector_multiplication main.cpp)

# Require at least C++11 for this target.
target_compile_features(matrix_vector_multiplication PRIVATE cxx_std_11)

# Link cuBLAS and the CUDA runtime through their imported targets.
target_link_libraries(matrix_vector_multiplication PRIVATE CUDA::cublas CUDA::cudart)

The CUDA language in CMake and many of the options you are trying to use are only needed for compiling code written in the CUDA C++ dialect, i.e. custom kernels. Other than that, I used the cleanest and most modern way of specifying the C++ standard version. With the CUDA:: targets, CMake takes care of passing the right include paths to the compiler with -I, so the hard-coded paths you used are no longer needed (I had to change them to test the CMakeLists.txt, as the paths are different on my system). It also takes care of any dependencies of these targets.

For more information on the targets made available, see FindCUDAToolkit.