I'm trying to create a program that calculate edit distance between two files. I read with the funcution fread and I use the code to read binary ("rb"). I put in input two PDF files and during the debug I found out that when I try to fill the matrix of the Levenshtein distance algorithm I get a "SIGSEGV (Segmentation fault)" at char n° 1354 of the first file and the program exit with:
Process finished with exit code -1073741819 (0xC0000005)
I controlled and char n° 1354 is \n .
The code that I use to read the files is:
long getFileSize(FILE *file) {
long int size;
fseek(file, 0, SEEK_END);
size = ftell(file);
fseek(file, 0, SEEK_SET);
return size;
}
char *readFromBinary(char *path) {
FILE *file;
file = fopen(path, "rb");
if (file == NULL)
printf("Error!\n");
long fileSize = getFileSize(file);
char *buffer = malloc((fileSize + 1) * sizeof(char));
fread(buffer, sizeof(char), fileSize, file);
return buffer;
}
This is the code that I use to calculate the edit distance:
int calculateDistance(char *pathFile1, char *pathFile2, int choice, char *path) {
FILE *f1 = fopen(pathFile1, "rb");
FILE *f2 = fopen(pathFile2, "rb");
char *contentFile1 = readFromBinary(pathFile1);
char *contentFile2 = readFromBinary(pathFile2);
int distance = 0;
int dim1 = getFileSize(f1);
int dim2 = getFileSize(f2);
int **matrix = constructMatrix(dim1, dim2);
fillMatrix(matrix, dim1, dim2, contentFile1, contentFile2);
distance = matrix[dim1][dim2];
struct Instruction instruction[distance + 1];
int initActions = initInstructions(matrix, pathFile1, &dim1, pathFile2, &dim2, instruction);
endInstructions(pathFile1, &dim1, pathFile2, &dim2, instruction, initActions);
if (choice == 1)
printOnFile(instruction, distance, path);
for (int i = 0; i <= dim1; i++)
free(matrix[i]);
free(matrix);
if (numberOfDivisions > 0)
numberOfDivisions--;
return distance;
}
And this is the code that i use to create and fill the matrix:
int **constructMatrix(int dim1, int dim2) {
//matrice di puntatori
int **matrice = (int **) malloc((dim1 + 1) * sizeof(int *));
//matrice di puntatori
for (int i = 0; i <= dim1; i++)
matrice[i] = (int *) malloc((dim2 + 1) * sizeof(int));
return matrice;
}
void fillMatrix(int **matrix, int dim1, int dim2, char *file1, char *file2) {
for (int i = 0; i <= dim1; i++)
matrix[i][0] = i;
for (int j = 1; j <= dim2; j++)
matrix[0][j] = j;
for (int i = 1; i <= dim1; i++) {
for (int j = 1; j <= dim2; j++) {
if (file1[i - 1] != file2[j - 1]) {
int k = minimum(matrix[i][j - 1], matrix[i - 1][j], matrix[i - 1][j - 1]);
matrix[i][j] = k + 1;
} else
matrix[i][j] = matrix[i - 1][j - 1];
}
}
}
In particular the debugger stops in this line of calculateDistance(fillMatrix(matrix, dim1, dim2, contentFile1, contentFile2);
), and in this line of fillMatrix(matrix[i][0] = i;
) when i=1354.
Information about PDF:
The PDF file is 188671 byte
It has 1355 lines
PS. My program works with txt files.
When any of the memory allocation functions, including malloc, calloc, and realloc() make a request to the OS to obtain memory, unless the OS can find a single block of contiguous memory of the size requested, the function will return NULL
. Since you are asking for a block of incredible size, it is likely to fail.
It is always recommended that the return of any of these functions is tested before attempting to use the value that was returned:
char *buffer = malloc((fileSize + 1) * sizeof(char));
if(!buffer)
{
//handle error
And in this case, it would be good to re-evaluate your algorithm.