cuda, gpu, constant-memory

Initialize constant global array CUDA C


I have a problem! I need to initialize a constant global array in CUDA C, and to initialize the array I need to use a for loop. I need to do this because I use this array in some kernels, and my professor told me to define it as a constant visible only on the device.

How can I do this??

I want to do something like this:

#include <stdio.h>
#include <math.h>
#define N 8

__constant__ double H[N*N];

__global__ void prodotto(double *v, double *w){

        int k=threadIdx.x+blockDim.x*blockIdx.x;

        w[k]=0;
        for(int i=0;i<N;i++) w[k]=w[k]+H[k*N+i]*v[i];
}

int main(){

        double v[8]={1, 1, 1, 1, 1, 1, 1, 1};
        double *dev_v, *dev_w, *w;
        double *host_H;
        host_H=(double*)malloc((N*N)*sizeof(double));
        cudaMalloc((void**)&dev_v,sizeof(double));
        cudaMalloc((void**)&dev_w,sizeof(double));

        for(int k=0;k<N;k++){
            host_H[2*N*k+2*k]=1/1.414;
            host_H[2*N*k+2*k+1]=1/1.414;
            host_H[(2*k+1)*N+2*k]=1/1.414;
            host_H[(2*k+1)+2*k+1]=-1/1.414;

        }

        cudaMemcpyToSymbol(H, host_H, (N*N)*sizeof(double));
        cudaMemcpy(dev_v, v, N*sizeof(double), cudaMemcpyHostToDevice); 
        cudaMemcpy(dev_w, w, N*sizeof(double), cudaMemcpyHostToDevice); 

        prodotto<<<1,N>>>(dev_v, dev_w);

        cudaMemcpy(v, dev_v, N*sizeof(double), cudaMemcpyDeviceToHost); 
        cudaMemcpy(w, dev_w, N*sizeof(double), cudaMemcpyDeviceToHost); 


        for(int i=0;i<N;i++) printf("\n%f   %f", v[i], w[i]);

        return 0;
    }

But the output is an array of zeros... I want the output array to be filled with the product of the matrix H (here stored as a flattened array) and the vector v. Thanks!


Solution

  • Something like this should work:

    #define DSIZE 32
    __constant__ int mydata[DSIZE];
    
    int main(){
      ...
      int *h_mydata;
      h_mydata = new int[DSIZE];
      for (int i = 0; i < DSIZE; i++)
        h_mydata[i] = ....;   // initialize however you wish
      cudaMemcpyToSymbol(mydata, h_mydata, DSIZE*sizeof(int));
      ...
    }
    

    Not difficult. You can then use the __constant__ data directly in a kernel:

    __global__ void mykernel(...){
      ...
      int myval = mydata[threadIdx.x];
      ...
    }
    

    You can read about __constant__ variables in the programming guide. __constant__ variables are read-only from the perspective of device code (kernel code). But from the host, they can be read from or written to using the cudaMemcpyToSymbol/cudaMemcpyFromSymbol API.
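
    For example, a minimal self-contained sketch of that host-side round trip (the array name mydata mirrors the snippet above, and the read-back is purely illustrative):

    #include <cstdio>
    #define DSIZE 32

    __constant__ int mydata[DSIZE];

    int main(){
      int h_in[DSIZE], h_out[DSIZE];
      for (int i = 0; i < DSIZE; i++) h_in[i] = i;            // initialize on the host
      cudaMemcpyToSymbol(mydata, h_in, DSIZE*sizeof(int));    // host -> __constant__ symbol
      cudaMemcpyFromSymbol(h_out, mydata, DSIZE*sizeof(int)); // __constant__ symbol -> host
      printf("%d %d\n", h_out[0], h_out[DSIZE-1]);
      return 0;
    }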

    EDIT: Based on the code you've now posted, there were at least 2 errors:

    1. Your allocation sizes for dev_v and dev_w were not correct.
    2. You had no host allocation for w.

    The following code seems to work correctly for me with those 2 fixes:

    $ cat t579.cu
    #include <stdio.h>
    #include <math.h>
    #define N 8
    
    __constant__ double H[N*N];
    
    __global__ void prodotto(double *v, double *w){
    
            int k=threadIdx.x+blockDim.x*blockIdx.x;
    
            w[k]=0;
            for(int i=0;i<N;i++) w[k]=w[k]+H[k*N+i]*v[i];
    }
    
    int main(){
    
            double v[N]={1, 1, 1, 1, 1, 1, 1, 1};
            double *dev_v, *dev_w, *w;
            double *host_H;
            host_H=(double*)malloc((N*N)*sizeof(double));
            w     =(double*)malloc(  (N)*sizeof(double));  // fix 2: host allocation for w
            cudaMalloc((void**)&dev_v,N*sizeof(double));   // fix 1: allocate N doubles, not 1
            cudaMalloc((void**)&dev_w,N*sizeof(double));   // fix 1: allocate N doubles, not 1
    
            for(int k=0;k<N;k++){
                host_H[2*N*k+2*k]=1/1.414;
                host_H[2*N*k+2*k+1]=1/1.414;
                host_H[(2*k+1)*N+2*k]=1/1.414;
                host_H[(2*k+1)+2*k+1]=-1/1.414;
    
            }
    
            cudaMemcpyToSymbol(H, host_H, (N*N)*sizeof(double));
            cudaMemcpy(dev_v, v, N*sizeof(double), cudaMemcpyHostToDevice);
            cudaMemcpy(dev_w, w, N*sizeof(double), cudaMemcpyHostToDevice);
    
            prodotto<<<1,N>>>(dev_v, dev_w);
    
            cudaMemcpy(v, dev_v, N*sizeof(double), cudaMemcpyDeviceToHost);
            cudaMemcpy(w, dev_w, N*sizeof(double), cudaMemcpyDeviceToHost);
    
    
            for(int i=0;i<N;i++) printf("\n%f   %f", v[i], w[i]);
            printf("\n");
            return 0;
        }
    $ nvcc -arch=sm_20 -o t579 t579.cu
    $ cuda-memcheck ./t579
    ========= CUDA-MEMCHECK
    
    1.000000   0.000000
    1.000000   -0.707214
    1.000000   -0.707214
    1.000000   -1.414427
    1.000000   1.414427
    1.000000   0.707214
    1.000000   1.414427
    1.000000   0.707214
    ========= ERROR SUMMARY: 0 errors
    $
    

    A few notes:

    1. Any time you're having trouble with a CUDA code, it's good practice to use proper CUDA error checking (a minimal sketch follows this list).
    2. You can run your code with cuda-memcheck (just as I have above) to get a quick read of whether any CUDA errors are encountered.
    3. I've not verified the numerical results or worked through the math. If it's not what you wanted, I assume you can sort it out.
    4. I've not made any changes to your code other than what seemed sensible to me to fix the obvious errors and make the results presentable for educational purposes. Certainly there can be discussions about preferred allocation methods, printf vs. cout, and what have you. I'm focused primarily on CUDA topics in this answer.
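
    As a concrete illustration of note 1, a minimal error-checking sketch (the cudaCheck macro name is just an example here, not part of the CUDA API) could look like this:

    #include <cstdio>
    #include <cstdlib>

    // illustrative macro: abort with a message if a CUDA runtime call fails
    #define cudaCheck(call)                                          \
      do {                                                           \
        cudaError_t err = (call);                                    \
        if (err != cudaSuccess) {                                    \
          fprintf(stderr, "CUDA error %s at %s:%d\n",                \
                  cudaGetErrorString(err), __FILE__, __LINE__);      \
          exit(EXIT_FAILURE);                                        \
        }                                                            \
      } while (0)

    // usage in the code above, for example:
    //   cudaCheck(cudaMemcpyToSymbol(H, host_H, (N*N)*sizeof(double)));
    //   prodotto<<<1,N>>>(dev_v, dev_w);
    //   cudaCheck(cudaGetLastError());        // catches launch errors
    //   cudaCheck(cudaDeviceSynchronize());   // catches asynchronous execution errors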