I have a very simple CUDA program that refuses to compile
This is main.cpp
#include <iostream>
#include <cstdlib>
#include "/opt/cuda/targets/x86_64-linux/include/cuda_runtime.h"
#include "/opt/cuda/targets/x86_64-linux/include/cublas_v2.h"
// Function to initialize a matrix with random values
void initMatrix(float* matrix, int rows, int cols) {
for (int i = 0; i < rows * cols; ++i) {
matrix[i] = rand() / (float)RAND_MAX; // Initialize with random values
}
}
// Function to print a matrix
void printMatrix(float* matrix, int rows, int cols) {
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
std::cout << matrix[i * cols + j] << " ";
}
std::cout << std::endl;
}
}
int main() {
// Matrix dimensions
int m = 3; // Number of rows
int n = 3; // Number of columns
// Allocate host memory for the matrix and vector
float* h_matrix = new float[m * n];
float* h_vector = new float[n];
// Initialize matrix and vector with random values
initMatrix(h_matrix, m, n);
initMatrix(h_vector, n, 1);
// Print the matrix and vector
std::cout << "Matrix:" << std::endl;
printMatrix(h_matrix, m, n);
std::cout << "\nVector:" << std::endl;
printMatrix(h_vector, n, 1);
// Allocate device memory for the matrix and vector
float* d_matrix;
float* d_vector;
cudaMalloc((void**)&d_matrix, m * n * sizeof(float));
cudaMalloc((void**)&d_vector, n * sizeof(float));
// Transfer the matrix and vector from host to device
cudaMemcpy(d_matrix, h_matrix, m * n * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_vector, h_vector, n * sizeof(float), cudaMemcpyHostToDevice);
// cuBLAS handle
cublasHandle_t handle;
cublasCreate(&handle);
// Perform matrix-vector multiplication
float alpha = 1.0; // Scaling factor
float beta = 0.0; // Scaling factor
cublasSgemv(handle, CUBLAS_OP_N, m, n, &alpha, d_matrix, m, d_vector, 1, &beta, d_vector, 1);
// Transfer the result back to the host
cudaMemcpy(h_vector, d_vector, m * sizeof(float), cudaMemcpyDeviceToHost);
// Print the result
std::cout << "\nResult:" << std::endl;
printMatrix(h_vector, m, 1);
// Clean up
delete[] h_matrix;
delete[] h_vector;
cudaFree(d_matrix);
cudaFree(d_vector);
cublasDestroy(handle);
return 0;
}
This is CMakeLists.txt
:
cmake_minimum_required(VERSION 3.17)
project(CUBLAS_Matrix_Vector_Multiplication LANGUAGES CXX CUDA)
find_package(CUDAToolkit REQUIRED)
find_package(CUDA REQUIRED)
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
add_definitions(-std=c++11)
set(CUDA_SEPARABLE_COMPILATION ON)
add_executable(matrix_vector_multiplication main.cpp)
target_include_directories(matrix_vector_multiplication PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
set_target_properties(matrix_vector_multiplication PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_STANDARD 11)
target_link_libraries(matrix_vector_multiplication PRIVATE ${CUDAToolkit_LIBRARIES})
trying to compile it by cmake
create the config files. But then make
causes a lot of errors:
/usr/bin/ld: CMakeFiles/matrix_vector_multiplication.dir/main.cpp.o: in function `main':
main.cpp:(.text+0x8056): undefined reference to `cudaMalloc'
/usr/bin/ld: main.cpp:(.text+0x8072): undefined reference to `cudaMalloc'
/usr/bin/ld: main.cpp:(.text+0x8098): undefined reference to `cudaMemcpy'
/usr/bin/ld: main.cpp:(.text+0x80ba): undefined reference to `cudaMemcpy'
/usr/bin/ld: main.cpp:(.text+0x80c6): undefined reference to `cublasCreate_v2'
/usr/bin/ld: main.cpp:(.text+0x8112): undefined reference to `cublasSgemv_v2'
/usr/bin/ld: main.cpp:(.text+0x8138): undefined reference to `cudaMemcpy'
/usr/bin/ld: main.cpp:(.text+0x81c6): undefined reference to `cudaFree'
/usr/bin/ld: main.cpp:(.text+0x81d2): undefined reference to `cudaFree'
/usr/bin/ld: main.cpp:(.text+0x81de): undefined reference to `cublasDestroy_v2'
collect2: Fehler: ld gab 1 als Ende-Status zurück
make[2]: *** [CMakeFiles/matrix_vector_multiplication.dir/build.make:97: matrix_vector_multiplication] Fehler 1
make[1]: *** [CMakeFiles/Makefile2:83: CMakeFiles/matrix_vector_multiplication.dir/all] Fehler 2
make: *** [Makefile:91: all] Fehler 2
Attempt to Resolve
I tried to follow this question, and used:
g++ main.cpp -o main -L/opt/cuda/lib64/ -lcudart -lcuda -L/opt/cuda
This throws the error:
In file included from /opt/cuda/targets/x86_64-linux/include/cublas_v2.h:69,
from main.cpp:4:
/opt/cuda/targets/x86_64-linux/include/cublas_api.h:77:10: fatal error: cuda_fp16.h: No such file or directory
77 | #include <cuda_fp16.h>
| ^~~~~~~~~~~~~
compilation terminated.
How can I resolve the issue?
nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Fri_Sep__8_19:17:24_PDT_2023
Cuda compilation tools, release 12.3, V12.3.52
Build cuda_12.3.r12.3/compiler.33281558_0
When you change the includes to
#include <cuda_runtime.h>
#include <cublas_v2.h>
this should work:
cmake_minimum_required(VERSION 3.17)
project(CUBLAS_Matrix_Vector_Multiplication LANGUAGES CXX)
find_package(CUDAToolkit REQUIRED)
add_executable(matrix_vector_multiplication main.cpp)
target_link_libraries(matrix_vector_multiplication
PRIVATE
CUDA::cublas
CUDA::cudart
)
target_compile_features(matrix_vector_multiplication
PRIVATE
cxx_std_11
)
The CUDA
language in CMake and many options that you are trying to use are only needed for compiling code written in the CUDA C++ dialect, i.e. custom kernels. Other than that I used the cleanest/modernest way of specifying the C++ standard version. With the CUDA::
targets, CMake will take care of specifying the right include paths to the compiler with -I
such that the hard-coded paths you used are not needed anymore (I had to change them for testing the CMakeLists.txt
as the paths are different on my system). It will also take care of any dependencies of these targets.
For more information on the targets made available, see FindCUDAToolkit
.