I have a problem! I need to initialize a constant global array in cuda c. To initialize the array i need to use a for! I need to do this because I have to use this array in some kernels and my professor told me to define as a constant visible only in the device.
How can I do this??
I want to do something like this:
#include <stdio.h>
#include <math.h>
#define N 8
__constant__ double H[N*N];
__global__ void prodotto(double *v, double *w){
int k=threadIdx.x+blockDim.x*blockIdx.x;
w[k]=0;
for(int i=0;i<N;i++) w[k]=w[k]+H[k*N+i]*v[i];
}
int main(){
double v[8]={1, 1, 1, 1, 1, 1, 1, 1};
double *dev_v, *dev_w, *w;
double *host_H;
host_H=(double*)malloc((N*N)*sizeof(double));
cudaMalloc((void**)&dev_v,sizeof(double));
cudaMalloc((void**)&dev_w,sizeof(double));
for(int k=0;k<N;k++){
host_H[2*N*k+2*k]=1/1.414;
host_H[2*N*k+2*k+1]=1/1.414;
host_H[(2*k+1)*N+2*k]=1/1.414;
host_H[(2*k+1)+2*k+1]=-1/1.414;
}
cudaMemcpyToSymbol(H, host_H, (N*N)*sizeof(double));
cudaMemcpy(dev_v, v, N*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dev_w, w, N*sizeof(double), cudaMemcpyHostToDevice);
prodotto<<<1,N>>>(dev_v, dev_w);
cudaMemcpy(v, dev_v, N*sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(w, dev_w, N*sizeof(double), cudaMemcpyDeviceToHost);
for(int i=0;i<N;i++) printf("\n%f %f", v[i], w[i]);
return 0;
}
But the output is an array of zeros...I want the output array to be filled with the product of the matrix H(here seen as an array) and the array v. Thanks !!!!!
Something like this should work:
#define DSIZE 32
__constant__ int mydata[DSIZE];
int main(){
...
int *h_mydata;
h_mydata = new int[DSIZE];
for (int i = 0; i < DSIZE; i++)
h_mydata[i] = ....; // initialize however you wish
cudaMemcpyToSymbol(mydata, h_mydata, DSIZE*sizeof(int));
...
}
Not difficult. You can then use the __constant__
data directly in a kernel:
__global__ void mykernel(...){
...
int myval = mydata[threadIdx.x];
...
}
You can read about __constant__
variables in the programming guide. __constant__
variables are read-only from the perspective of device code (kernel code). But from the host, they can be read from or written to using the cudaMemcpyToSymbol/cudaMemcpyFromSymbol
API.
EDIT: Based on the code you've now posted, there were at least 2 errors:
dev_v
and dev_w
were not correct.w
. The following code seems to work correctly for me with those 2 fixes:
$ cat t579.cu
#include <stdio.h>
#include <math.h>
#define N 8
__constant__ double H[N*N];
__global__ void prodotto(double *v, double *w){
int k=threadIdx.x+blockDim.x*blockIdx.x;
w[k]=0;
for(int i=0;i<N;i++) w[k]=w[k]+H[k*N+i]*v[i];
}
int main(){
double v[N]={1, 1, 1, 1, 1, 1, 1, 1};
double *dev_v, *dev_w, *w;
double *host_H;
host_H=(double*)malloc((N*N)*sizeof(double));
w =(double*)malloc( (N)*sizeof(double));
cudaMalloc((void**)&dev_v,N*sizeof(double));
cudaMalloc((void**)&dev_w,N*sizeof(double));
for(int k=0;k<N;k++){
host_H[2*N*k+2*k]=1/1.414;
host_H[2*N*k+2*k+1]=1/1.414;
host_H[(2*k+1)*N+2*k]=1/1.414;
host_H[(2*k+1)+2*k+1]=-1/1.414;
}
cudaMemcpyToSymbol(H, host_H, (N*N)*sizeof(double));
cudaMemcpy(dev_v, v, N*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dev_w, w, N*sizeof(double), cudaMemcpyHostToDevice);
prodotto<<<1,N>>>(dev_v, dev_w);
cudaMemcpy(v, dev_v, N*sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(w, dev_w, N*sizeof(double), cudaMemcpyDeviceToHost);
for(int i=0;i<N;i++) printf("\n%f %f", v[i], w[i]);
printf("\n");
return 0;
}
$ nvcc -arch=sm_20 -o t579 t579.cu
$ cuda-memcheck ./t579
========= CUDA-MEMCHECK
1.000000 0.000000
1.000000 -0.707214
1.000000 -0.707214
1.000000 -1.414427
1.000000 1.414427
1.000000 0.707214
1.000000 1.414427
1.000000 0.707214
========= ERROR SUMMARY: 0 errors
$
A few notes:
cuda-memcheck
(just as I have above) to get a quick read of whether any CUDA errors are encountered.printf
vs. cout
, and what have you. I'm focused primarily on CUDA topics in this answer.