[SOLVED] Cant read the file and place in 2d relative matrix address in CUDA

I am allocating a 2d matrix using malloc and trying to insert values in relative address. I do not understand why it is core dumped error. Please look at my code below.

#include <stdio.h>
#include <stdlib.h>

int main()
{
    int width = 4;
    FILE *fp = fopen("matB.txt", "r");
    int *x;
    x = (int*)malloc(width*width*sizeof(int));
    int i, j;
    for(i=0; i<width; i++)
    {
        for(j=0; j<width; j++)
        {
            fscanf(fp, "%d", x[i*width+j]);
        }
    }

    for(i=0; i<width; i++)
    {
        for(j=0; j<width; j++)
        {
            printf("%d", x[i*width+j]);
        }
    }
    return 0;
}

matB.txt
1 2 3 4
1 2 3 4
1 2 3 4
1 2 3 4

I have made the above sample program to check with the relative address and placing &x[] in fscanf cleared this problem.

The above sample C code is done because of the same read problem in Cuda. When using the same way of allocation of 2d array and its relative address, it is reading the file and when trying to print the same.. it prints 0's instead of 1,2,3,4.. I am in learning phase of CUDA. I see there is no allocation problem for the host array and placing in its relative address but why the file read is printing 0's??

Cuda Program is below

//Matrix multiplication using shared and non shared kernal
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define TILE_WIDTH 2

/*matrix multiplication kernels*/

//non shared
__global__ void MatrixMul( float *Md , float *Nd , float *Pd , const int WIDTH )
{
           // calculate thread id
        unsigned int col = TILE_WIDTH*blockIdx.x + threadIdx.x ;
        unsigned int row = TILE_WIDTH*blockIdx.y + threadIdx.y ;
        for (int k = 0 ; k<WIDTH ; k++ )
        {
            Pd[row*WIDTH + col]+= Md[row * WIDTH + k ] * Nd[ k * WIDTH + col] ;
        }
}

// shared
__global__ void MatrixMulSh( float *Md , float *Nd , float *Pd , const int WIDTH )
{
        //Taking shared array to break the MAtrix in Tile widht and fatch them in that array per ele
          __shared__ float Mds [TILE_WIDTH][TILE_WIDTH] ;
           __shared__ float Nds [TILE_WIDTH][TILE_WIDTH] ;

         // calculate thread id
          unsigned int col = TILE_WIDTH*blockIdx.x + threadIdx.x ;
          unsigned int row = TILE_WIDTH*blockIdx.y + threadIdx.y ;
        for (int m = 0 ; m<WIDTH/TILE_WIDTH ; m++ ) // m indicate number of phase
       {
            Mds[threadIdx.y][threadIdx.x] =  Md[row*WIDTH + (m*TILE_WIDTH + threadIdx.x)]  ;
            Nds[threadIdx.y][threadIdx.x] =  Nd[ ( m*TILE_WIDTH + threadIdx.y) * WIDTH + col] ;
         __syncthreads() ; // for syncronizeing the threads
         // Do for tile
           for ( int k = 0; k<TILE_WIDTH ; k++ )
                       Pd[row*WIDTH + col]+= Mds[threadIdx.x][k] * Nds[k][threadIdx.y] ;
         __syncthreads() ; // for syncronizeing the threads
     }
}

// main routine
int main (int argc, char* argv[])
{

    const int WIDTH = 4 ;
    printf("%d\n", WIDTH);
    //float array1_h[WIDTH][WIDTH] ,array2_h[WIDTH][WIDTH], M_result_array_h[WIDTH][WIDTH]  ;
    float *array1_h, *array2_h, *M_result_array_h;
    float *array1_d , *array2_d ,*result_array_d  ,*M_result_array_d; // device array
    int i , j ;
    cudaEvent_t start_full,stop_full;
    float time;
    cudaEventCreate(&start_full);
    cudaEventCreate(&stop_full);
    cudaEventRecord(start_full, 0);

    //char *file1 = argv[2];
    //char *file2 = argv[3];
    //char *file3 = argv[4];

    FILE *fp1 = fopen("matA.txt", "r");
    FILE *fp2 = fopen("matB.txt", "r");
    FILE *fp3 = fopen("matC.txt", "w");

    //create device array cudaMalloc ( (void **)&array_name, sizeofmatrixinbytes) ;
    cudaMallocHost((void **) &array1_h , WIDTH*WIDTH*sizeof (float) ) ;
    cudaMallocHost((void **) &array2_h , WIDTH*WIDTH*sizeof (float) ) ;
    cudaMallocHost((void **) &M_result_array_h , WIDTH*WIDTH*sizeof (float) ) ;
    //input in host array
    for ( i = 0 ; i<WIDTH ; i++ )
    {
        for (j = 0 ; j<WIDTH ; j++ )
        {
            fscanf(fp1, "%d", &array1_h[i*WIDTH + j]);
            printf("%d\t", array1_h[i*WIDTH + j]);
        }
      //  fscanf(fp1, "\n");
    }
    /*
    for ( i = 0 ; i<WIDTH ; i++ )
    {
        for (j = 0 ; j<WIDTH ; j++ )
        {
            printf("%d\t", array1_h[i*WIDTH+j]);
        }
        printf("\n");
    }*/
    for ( i = 0 ; i<WIDTH ; i++ )
    {
        for (j = 0 ; j<WIDTH ; j++ )
        {
            fscanf(fp2, "%d", &array2_h[i*WIDTH+j]);
        }
        fscanf(fp2, "\n");
    }

    //create device array cudaMalloc ( (void **)&array_name, sizeofmatrixinbytes) ;
    cudaMalloc((void **) &array1_d , WIDTH*WIDTH*sizeof (float) ) ;
    cudaMalloc((void **) &array2_d , WIDTH*WIDTH*sizeof (float) ) ;

    //copy host array to device array; cudaMemcpy ( dest , source , WIDTH , direction )
    cudaMemcpy ( array1_d , array1_h , WIDTH*WIDTH*sizeof (float) , cudaMemcpyHostToDevice ) ;
    cudaMemcpy ( array2_d , array2_h , WIDTH*WIDTH*sizeof (float) , cudaMemcpyHostToDevice ) ;

    //allocating memory for resultant device array
    cudaMalloc((void **) &result_array_d , WIDTH*WIDTH*sizeof (float) ) ;
    cudaMalloc((void **) &M_result_array_d , WIDTH*WIDTH*sizeof (float) ) ;

    //calling kernal
    dim3 dimGrid ( WIDTH/TILE_WIDTH , WIDTH/TILE_WIDTH ,1 ) ;
    dim3 dimBlock( TILE_WIDTH, TILE_WIDTH, 1 ) ;

    // Change if 0 to if 1 for running non shared code and make if 0 for shared memory code
    #if 0
            MatrixMul <<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;
    #endif

    #if 1
            MatrixMulSh<<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;
    #endif

    // all GPU function blocked till kernel is working
    //copy back result_array_d to result_array_h
    cudaMemcpy(M_result_array_h , M_result_array_d , WIDTH*WIDTH*sizeof(int), cudaMemcpyDeviceToHost) ;

    //printf the result array
    for ( i = 0 ; i<WIDTH ; i++ )
    {
        for ( j = 0 ; j < WIDTH ; j++ )
        {
            fprintf (fp3, "%d\t", M_result_array_h[i*WIDTH+j]) ;
        }
        fprintf (fp3, "\n") ;
    }
    //system("pause") ;
    cudaFree(array1_d);
    cudaFree(array2_d);
    cudaFree(M_result_array_d);

    cudaFreeHost(array1_h);
    cudaFreeHost(array2_h);
    cudaFreeHost(M_result_array_h);

    cudaEventRecord(stop_full, 0);
    cudaEventSynchronize(stop_full);

    cudaEventElapsedTime(&time, start_full, stop_full);
    printf ("Total execution Time is : %1.5f ms\n", time);

}