I am allocating a 2d matrix using malloc and trying to insert values in relative address. I do not understand why it is core dumped error. Please look at my code below.
#include <stdio.h>
#include <stdlib.h>
int main()
{
int width = 4;
FILE *fp = fopen("matB.txt", "r");
int *x;
x = (int*)malloc(width*width*sizeof(int));
int i, j;
for(i=0; i<width; i++)
{
for(j=0; j<width; j++)
{
fscanf(fp, "%d", x[i*width+j]);
}
}
for(i=0; i<width; i++)
{
for(j=0; j<width; j++)
{
printf("%d", x[i*width+j]);
}
}
return 0;
}
matB.txt
1 2 3 4
1 2 3 4
1 2 3 4
1 2 3 4
I have made the above sample program to check with the relative address and placing &x[] in fscanf cleared this problem.
The above sample C code is done because of the same read problem in Cuda. When using the same way of allocation of 2d array and its relative address, it is reading the file and when trying to print the same.. it prints 0's instead of 1,2,3,4.. I am in learning phase of CUDA. I see there is no allocation problem for the host array and placing in its relative address but why the file read is printing 0's??
Cuda Program is below
//Matrix multiplication using shared and non shared kernal
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define TILE_WIDTH 2
/*matrix multiplication kernels*/
//non shared
__global__ void MatrixMul( float *Md , float *Nd , float *Pd , const int WIDTH )
{
// calculate thread id
unsigned int col = TILE_WIDTH*blockIdx.x + threadIdx.x ;
unsigned int row = TILE_WIDTH*blockIdx.y + threadIdx.y ;
for (int k = 0 ; k<WIDTH ; k++ )
{
Pd[row*WIDTH + col]+= Md[row * WIDTH + k ] * Nd[ k * WIDTH + col] ;
}
}
// shared
__global__ void MatrixMulSh( float *Md , float *Nd , float *Pd , const int WIDTH )
{
//Taking shared array to break the MAtrix in Tile widht and fatch them in that array per ele
__shared__ float Mds [TILE_WIDTH][TILE_WIDTH] ;
__shared__ float Nds [TILE_WIDTH][TILE_WIDTH] ;
// calculate thread id
unsigned int col = TILE_WIDTH*blockIdx.x + threadIdx.x ;
unsigned int row = TILE_WIDTH*blockIdx.y + threadIdx.y ;
for (int m = 0 ; m<WIDTH/TILE_WIDTH ; m++ ) // m indicate number of phase
{
Mds[threadIdx.y][threadIdx.x] = Md[row*WIDTH + (m*TILE_WIDTH + threadIdx.x)] ;
Nds[threadIdx.y][threadIdx.x] = Nd[ ( m*TILE_WIDTH + threadIdx.y) * WIDTH + col] ;
__syncthreads() ; // for syncronizeing the threads
// Do for tile
for ( int k = 0; k<TILE_WIDTH ; k++ )
Pd[row*WIDTH + col]+= Mds[threadIdx.x][k] * Nds[k][threadIdx.y] ;
__syncthreads() ; // for syncronizeing the threads
}
}
// main routine
int main (int argc, char* argv[])
{
const int WIDTH = 4 ;
printf("%d\n", WIDTH);
//float array1_h[WIDTH][WIDTH] ,array2_h[WIDTH][WIDTH], M_result_array_h[WIDTH][WIDTH] ;
float *array1_h, *array2_h, *M_result_array_h;
float *array1_d , *array2_d ,*result_array_d ,*M_result_array_d; // device array
int i , j ;
cudaEvent_t start_full,stop_full;
float time;
cudaEventCreate(&start_full);
cudaEventCreate(&stop_full);
cudaEventRecord(start_full, 0);
//char *file1 = argv[2];
//char *file2 = argv[3];
//char *file3 = argv[4];
FILE *fp1 = fopen("matA.txt", "r");
FILE *fp2 = fopen("matB.txt", "r");
FILE *fp3 = fopen("matC.txt", "w");
//create device array cudaMalloc ( (void **)&array_name, sizeofmatrixinbytes) ;
cudaMallocHost((void **) &array1_h , WIDTH*WIDTH*sizeof (float) ) ;
cudaMallocHost((void **) &array2_h , WIDTH*WIDTH*sizeof (float) ) ;
cudaMallocHost((void **) &M_result_array_h , WIDTH*WIDTH*sizeof (float) ) ;
//input in host array
for ( i = 0 ; i<WIDTH ; i++ )
{
for (j = 0 ; j<WIDTH ; j++ )
{
fscanf(fp1, "%d", &array1_h[i*WIDTH + j]);
printf("%d\t", array1_h[i*WIDTH + j]);
}
// fscanf(fp1, "\n");
}
/*
for ( i = 0 ; i<WIDTH ; i++ )
{
for (j = 0 ; j<WIDTH ; j++ )
{
printf("%d\t", array1_h[i*WIDTH+j]);
}
printf("\n");
}*/
for ( i = 0 ; i<WIDTH ; i++ )
{
for (j = 0 ; j<WIDTH ; j++ )
{
fscanf(fp2, "%d", &array2_h[i*WIDTH+j]);
}
fscanf(fp2, "\n");
}
//create device array cudaMalloc ( (void **)&array_name, sizeofmatrixinbytes) ;
cudaMalloc((void **) &array1_d , WIDTH*WIDTH*sizeof (float) ) ;
cudaMalloc((void **) &array2_d , WIDTH*WIDTH*sizeof (float) ) ;
//copy host array to device array; cudaMemcpy ( dest , source , WIDTH , direction )
cudaMemcpy ( array1_d , array1_h , WIDTH*WIDTH*sizeof (float) , cudaMemcpyHostToDevice ) ;
cudaMemcpy ( array2_d , array2_h , WIDTH*WIDTH*sizeof (float) , cudaMemcpyHostToDevice ) ;
//allocating memory for resultant device array
cudaMalloc((void **) &result_array_d , WIDTH*WIDTH*sizeof (float) ) ;
cudaMalloc((void **) &M_result_array_d , WIDTH*WIDTH*sizeof (float) ) ;
//calling kernal
dim3 dimGrid ( WIDTH/TILE_WIDTH , WIDTH/TILE_WIDTH ,1 ) ;
dim3 dimBlock( TILE_WIDTH, TILE_WIDTH, 1 ) ;
// Change if 0 to if 1 for running non shared code and make if 0 for shared memory code
#if 0
MatrixMul <<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;
#endif
#if 1
MatrixMulSh<<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;
#endif
// all GPU function blocked till kernel is working
//copy back result_array_d to result_array_h
cudaMemcpy(M_result_array_h , M_result_array_d , WIDTH*WIDTH*sizeof(int), cudaMemcpyDeviceToHost) ;
//printf the result array
for ( i = 0 ; i<WIDTH ; i++ )
{
for ( j = 0 ; j < WIDTH ; j++ )
{
fprintf (fp3, "%d\t", M_result_array_h[i*WIDTH+j]) ;
}
fprintf (fp3, "\n") ;
}
//system("pause") ;
cudaFree(array1_d);
cudaFree(array2_d);
cudaFree(M_result_array_d);
cudaFreeHost(array1_h);
cudaFreeHost(array2_h);
cudaFreeHost(M_result_array_h);
cudaEventRecord(stop_full, 0);
cudaEventSynchronize(stop_full);
cudaEventElapsedTime(&time, start_full, stop_full);
printf ("Total execution Time is : %1.5f ms\n", time);
}
Should be fscanf(fp, "%d", &x[i*width+j]);
. The scanf
family requires the address of a location in which to write the scanned value.
Also, don't cast malloc.