so Im working on this project just to refresh on hash tables and some C libraries..
I have implemented general hash functions and everything needed for a basic table using file i/o..but I am stuck when trying to think of how to go about resizing the table.
I am wondering if it would be best to copy over the data from my table into a new one..or just initialize a new hash table when the first one fills..
My confusion here lies mainly in how to copy one hashtable's values into a new hashtable file effectively and how to handle the previous file's then useless memory allocation
Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
int currSize = 0;
int tableSize = 25;
typedef struct mydata_tag{
int used;
/* 0 - empty 1- used */
int key;
char name[25];
char ailment[30];
char insurance[30];
char social[10];
} mydata;
int setInfo(mydata * data)
{
printf("What is the client's ailment?");
scanf("%s", data->ailment);
printf("What is the client's insurance provider?");
scanf("%s", data->insurance);
printf("What is the client's Social Security Number?");
scanf("%s", data->social);
}
int hashKey(char * name){
int key, len, i;
key = 0;
len = strlen(name);
for(i = 0; i < len; i++)
{
key += name[i] * 10;
}
key %= 19;
key %= 7;
return key;
}
void init_table(char *filename, int size)
{
if(!filename || size < 1)
{
exit(1);
}
currSize++;
FILE * fp;
mydata data;
int i;
memset(&data, 0, sizeof(data));
fp = fopen(filename, "w+"); //create file with write capabilities
for(i = 0; i < size; i++)//initialize table
{
fwrite(&data, sizeof(mydata), i, fp);
}
}
void insert_data(double key, char *name, char *filename)
{
if(!name || !filename)
{
exit(1);
}
FILE * fp;
mydata data, slot;
int pos;
pos = key;
data.used = 1;
data.key = key;
strcpy(data.name, name);
setInfo(&data);
fp = fopen(filename, "r+");
while(1)
{
fseek(fp, pos*sizeof(mydata), SEEK_SET);
fread(&slot, sizeof(mydata), 1, fp);
if(slot.used != 1)
{
break;
}
printf("COLLISION!\n");
pos++;
pos %= 19;
pos %= 7;
}
printf("pos = %d\n", pos);
printf("key = %d\n", data.key);
fseek(fp, pos*sizeof(mydata), SEEK_SET);
fwrite(&data, sizeof(mydata), 1, fp);
fclose(fp);
}
void print_buckets(char * filename)
{
FILE * fp;
mydata data;
int i;
fp = fopen(filename, "r+");
if(fp == NULL)
{
perror("fopen: print_buckets");
exit(1);
}
for(i = 0; i < 25; i++)
{
fread(&data, sizeof(mydata), 1, fp);
if(data.used == 1){
printf("used = %d \n key = %d \n Name = %s\n Ailment: %s \n",
data.used, data.key, data.name, data.ailment);
}
}
fclose(fp);
}
int main(int argc, char** argv) {
int i, key;
init_table("myhashtable", tableSize);
while(1)
{
char choice[1];
printf("----------Menu-----------\n");
printf("------(A)dd Client-------\n");
printf("---(P)rint Client List---\n");
printf("---------(E)xit----------\n");
scanf("%s", &choice);
if(choice[0] == 'e' || choice[0] == 'E'){break;}
else if(choice[0] == 'a' || choice[0] == 'A')
{
char name[20];
printf("What is the clients name?");
scanf("%s", &name);
key = hashKey(name);
insert_data(key, name, "myhashtable");
}else if(choice[0] == 'p' || choice[0] == 'P'){
print_buckets("myhashtable");
}
}
return (EXIT_SUCCESS);
}
edit: This was an old, silly question, disregard
I'm not sure how you came up with
key %= 19
key %= 7
But I find that very suspicious, and possibly wrong.
The basic idea behind using prime numbers is a good one, but %
is the modulo operation, thus key
after the first line will be in the range [0,18]
, and [0,6]
after the second line, thus there are only 7
possible hash values.
This would be a better way to do it:
for(i = 0; i < len; i++)
key = 19*key + name[i];
key % tableSize;
Simply multiplying each value by 10 (as you did) isn't ideal, as abc
and cab
would have the same hash value - you want to have the weight them differently, i.e. multiply the first value by 1, the second by 10, the third by 100, etc. - using prime numbers here helps to minimize conflicts.
Making tableSize
a prime number could also help to minimize conflicts.
Taking the modulo of division by a prime number twice doesn't help.
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 ...
% 19 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 0 1 2 3 4 5 6 ...
% 7 0 1 2 3 4 5 6 0 1 2 3 4 5 6 0 1 2 3 4 0 1 2 3 4 5 6 ...
See what happened there at 19
- we skipped 5
and 6
- this isn't good - that means that 0-4
's are more likely to occur, so that instantly increases the chance of conflicts for those.
You'll need to rehash everything anyway (because if the file size changes, the hash values may change, and entries offset by collisions may no longer be offset), so it doesn't really matter whether you use the same file or another one - you'll still need to read everything, and write everything.
Keep in mind that using the same file adds complexity - you'll need to have an in-memory structure to store all the data before you wipe the file and start writing to it again. With the separate files, you could probably use your code with just minor changes.
If the entire file can't fit into memory (which is one reason why you'd choose to use files rather than memory structures), you don't really have another choice than using different files, as the intermediate in-memory structure won't be an option.
Once done with the old file, you can just delete it.