ccs50edx

Frowny face for cs50 pset 5 "handles most basic words properly" and "spell-checking is case-insensitive"


I ran check50 and I got a frowny face for "spell-checking is case-insensitive" and "handles most basic words properly."

How could I fix this? Does it have to do with my hash function?

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#include "dictionary.h"

// Represents a node in a hash table
typedef struct node
{
    char word[LENGTH + 1];
    struct node *next;
}
node;

// Number of words in dictionary
int word_count = 0;

// Number of buckets in hash table
const unsigned int N = 26;

// Hash table
node *table[N];

// Returns true if word is in dictionary, else false
bool check(const char *word)
{
    unsigned int n = hash(word);

    node *cursor = table[n];

    while (cursor != NULL)
    {
        if (strcasecmp(word, cursor->word) == 0)
        {
            return true;
        }

        cursor = cursor->next;
    }
    return false;
}

// Hashes word to a number
// Function credit to staff on CS50 reddit page
unsigned int hash(const char *word)
{
    unsigned int hash_value = 0;

    for (int i = 0, n = strlen(word); i < n; i++)
    {
         hash_value = (hash_value << 2) ^ word[i];
    }
    return hash_value % N; 
}

// Loads dictionary into memory, returning true if successful else false
bool load(const char *dictionary)
{
    // Open dictionary and check for memory issue
    //Function guide credit to CS50 Guide by Anvea on YouTube
    FILE *dict = fopen(dictionary, "r");
    char word[LENGTH + 1];

    // Check for memory issue with dict
    if (dict == NULL)
    {
        printf("Dictionary is null\n");
        unload();
        return false;
    }

    while (fscanf(dict, "%s", word) != EOF)
    {
        node *n = malloc(sizeof(node));
        if (n == NULL)
        {
            return false;
        }

        strcpy(n->word, word);
        word_count++;

        // Index word using hash function
        int dict_index = hash(word);

        // Insert into hash table if already empty
        if (table[dict_index] == NULL)
        {
            n->next = NULL;
        }
        // Insert work as new node if not empyty
        else
        {
            n->next = table[dict_index];
        }

        table[dict_index] = n;
    }

    // Close dictionary file
    fclose(dict);

    // Indicate success
    return true;
}

// Returns number of words in dictionary if loaded else 0 if not yet loaded
unsigned int size(void)
{
    return word_count;
    return 0;
}

// Unloads dictionary from memory, returning true if successful else false
bool unload(void)
{
    for (int i = 0; i < N; i++)
    {
        node *cursor = table[i];

        while (cursor)
        {
            node *tmp = cursor;
            cursor = cursor->next;
            free(tmp);
        }
    }
    return true;
}

Solution

  • A few issues ...

    1. All dictionary words are lowercase, one per line. So, the hash is done on lowercase characters.
    2. But, the check text can be uppercase. In check, it calls hash on the mixed case text, so it can generate different hash values for certain words (e.g.) Won't this work? and No, it won't.
    3. In check, we need to lowercase the string before calling hash so we get the same hash value/index. As a beneficial side effect of this, we can replace strcasecmp with strcmp (which is faster).
    4. In load, there is no need to have a special case for when table[n] is NULL, so we can simplify this.
    5. Although I didn't check this, in hash doing << may produce a hash value that isn't evenly distributed, slowing down the search in check.
    6. Because the dictionary file is [guaranteed to be] all lowercase and one entry per line, in load, we can replace fscanf with fgets [stripping the newline] as fgets is much faster.
    7. With gcc 8.3.1, the compiler flagged node *table[N]; because of: const unsigned int N = 26; Unlike C++, this isn't allowed in C--it requires a literal constant and not just const. So, I changed this to enum { N = 26 };
    8. Style note: Never do (e.g.): x -> y instead of x->y.

    Here is the corrected code. I've checked it against all possible input files and dictionaries. It is annotated with the bugs and fixes:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <strings.h>
    #if 1
    #include <ctype.h>
    #endif
    
    #include "dictionary.h"
    
    // Represents a node in a hash table
    typedef struct node {
        char word[LENGTH + 1];
        struct node *next;
    } node;
    
    // Number of words in dictionary
    int word_count = 0;
    
    // Number of buckets in hash table
    // NOTE/BUG: C (vs C++) doesn't support global arrays from const
    #if 0
    const unsigned int N = 26;
    #else
    enum {
        N = 26
    };
    #endif
    
    // Hash table
    node *table[N];
    
    // locase -- copy lower cased word
    void
    locase(char *dst,const char *src)
    {
    
        for (;  *src != 0;  ++src, ++dst)
            *dst = tolower((unsigned char) *src);
        *dst = 0;
    }
    
    // Returns true if word is in dictionary, else false
    bool
    check(const char *word)
    {
    // NOTE/FIX: we _must_ take lowercase of word _before_ calculating the hash
    // otherwise, the hash value will differ for uppercase words here and lowercase
    // words in the dictionary (i.e. different hash buckets)
    // NOTE/FIX: we should convert test word to lowercase _once_ (it allows us to
    // use strcmp below)
    #if 1
        char lobuf[LENGTH + 1];
        locase(lobuf,word);
        word = lobuf;
    #endif
        unsigned int n = hash(word);
    
        node *cursor = table[n];
    
        while (cursor != NULL) {
    // NOTE/BUG: strcasecmp is very slow compared to strcmp
    #if 0
            if (strcasecmp(word, cursor->word) == 0)
                return true;
    #else
            if (strcmp(word, cursor->word) == 0)
                return true;
    #endif
            cursor = cursor->next;
        }
    
        return false;
    }
    
    // Hashes word to a number
    // Function credit to staff on CS50 reddit page
    unsigned int
    hash(const char *word)
    {
        unsigned int hash_value = 0;
    
    #if 0
        for (int i = 0, n = strlen(word); i < n; i++)
            hash_value = (hash_value << 2) ^ word[i];
    #endif
    #if 0
        for (int i = 0; word[i] != 0; i++)
            hash_value = (hash_value << 2) ^ word[i];
    #endif
    #if 1
        for (int i = 0; word[i] != 0; i++)
            hash_value = (hash_value * 31) ^ word[i];
    #endif
    
        return hash_value % N;
    }
    
    // Loads dictionary into memory, returning true if successful else false
    bool
    load(const char *dictionary)
    {
        // Open dictionary and check for memory issue
        // Function guide credit to CS50 Guide by Anvea on YouTube
        FILE *dict = fopen(dictionary, "r");
    #if 0
        char word[LENGTH + 1];
    #else
        char word[LENGTH + 10];
    #endif
    
        // Check for memory issue with dict
        if (dict == NULL) {
            printf("Dictionary is null\n");
            unload();
            return false;
        }
    
    // NOTE/BUG: dictionary is guaranteed to be one word per line and fscanf is
    // slow compared to fgets
    #if 0
        while (fscanf(dict, "%s", word) != EOF) {
    #else
        while (1) {
            if (fgets(word,sizeof(word),dict) == NULL)
                break;
            char *cp = strchr(word,'\n');
            if (cp != NULL)
                *cp = 0;
    #endif
    
            node *n = malloc(sizeof(node));
    
            if (n == NULL) {
                return false;
            }
    
            strcpy(n->word, word);
    
            word_count++;
    
            // Index word using hash function
            int dict_index = hash(word);
    
    // NOTE/BUG: no need to special case this
    #if 0
            // Insert into hash table if already empty
            if (table[dict_index] == NULL) {
                n->next = NULL;
            }
            // Insert work as new node if not empyty
            else {
                n->next = table[dict_index];
            }
    #else
            n->next = table[dict_index];
    #endif
    
            table[dict_index] = n;
        }
    
        // Close dictionary file
        fclose(dict);
    
        // Indicate success
        return true;
    }
    
    // Returns number of words in dictionary if loaded else 0 if not yet loaded
    unsigned int
    size(void)
    {
        return word_count;
        return 0;
    }
    
    // Unloads dictionary from memory, returning true if successful else false
    bool
    unload(void)
    {
        for (int i = 0; i < N; i++) {
    
            node *cursor = table[i];
    
            while (cursor) {
                node *tmp = cursor;
    
                cursor = cursor->next;
                free(tmp);
            }
        }
        return true;
    }
    

    In the code above, I've used cpp conditionals to denote old vs. new code:

    #if 0
    // old code
    #else
    // new code
    #endif
    
    #if 1
    // new code
    #endif
    

    Note: this can be cleaned up by running the file through unifdef -k


    Here is the cleaned up version (run through unifdef -k with some comment cleanup):

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <strings.h>
    #include <ctype.h>
    
    #include "dictionary.h"
    
    // Represents a node in a hash table
    typedef struct node {
        char word[LENGTH + 1];
        struct node *next;
    } node;
    
    // Number of words in dictionary
    int word_count = 0;
    
    // Number of buckets in hash table
    enum {
        N = 26
    };
    
    // Hash table
    node *table[N];
    
    // locase -- copy lower cased word
    void
    locase(char *dst,const char *src)
    {
    
        for (;  *src != 0;  ++src, ++dst)
            *dst = tolower((unsigned char) *src);
        *dst = 0;
    }
    
    // Returns true if word is in dictionary, else false
    bool
    check(const char *word)
    {
        char lobuf[LENGTH + 1];
        locase(lobuf,word);
        word = lobuf;
        unsigned int n = hash(word);
    
        node *cursor = table[n];
    
        while (cursor != NULL) {
            if (strcmp(word, cursor->word) == 0)
                return true;
            cursor = cursor->next;
        }
    
        return false;
    }
    
    // Hashes word to a number
    // Function credit to staff on CS50 reddit page
    unsigned int
    hash(const char *word)
    {
        unsigned int hash_value = 0;
    
        for (int i = 0; word[i] != 0; i++)
            hash_value = (hash_value * 31) ^ word[i];
    
        return hash_value % N;
    }
    
    // Loads dictionary into memory, returning true if successful else false
    bool
    load(const char *dictionary)
    {
        // Open dictionary and check for memory issue
        // Function guide credit to CS50 Guide by Anvea on YouTube
        FILE *dict = fopen(dictionary, "r");
        char word[LENGTH + 10];
    
        // Check for memory issue with dict
        if (dict == NULL) {
            printf("Dictionary is null\n");
            unload();
            return false;
        }
    
        while (1) {
            if (fgets(word,sizeof(word),dict) == NULL)
                break;
            char *cp = strchr(word,'\n');
            if (cp != NULL)
                *cp = 0;
    
            node *n = malloc(sizeof(node));
    
            if (n == NULL) {
                return false;
            }
    
            strcpy(n->word, word);
    
            word_count++;
    
            // Index word using hash function
            int dict_index = hash(word);
    
            n->next = table[dict_index];
    
            table[dict_index] = n;
        }
    
        // Close dictionary file
        fclose(dict);
    
        // Indicate success
        return true;
    }
    
    // Returns number of words in dictionary if loaded else 0 if not yet loaded
    unsigned int
    size(void)
    {
        return word_count;
        return 0;
    }
    
    // Unloads dictionary from memory, returning true if successful else false
    bool
    unload(void)
    {
        for (int i = 0; i < N; i++) {
    
            node *cursor = table[i];
    
            while (cursor) {
                node *tmp = cursor;
    
                cursor = cursor->next;
                free(tmp);
            }
        }
        return true;
    }
    

    Although the above code is [now] correct, it can be further improved.

    For some additional fixes and speedups, see my cs50 speller answer: cs50 pset5 Speller optimisation

    Here is a comparison of the elapsed time for the check function between your [fixed] version vs my [optimized] version from the link:

     Yours       Mine     File
     27.900433   0.271936 aca.txt
      8.713822   0.093062 austen.txt
      1.553954   0.015378 birdman.txt
      3.996230   0.041159 burnett.txt
      1.927505   0.021139 carroll.txt
      0.001437   0.000007 cat.txt
      0.477209   0.005412 constitution.txt
     12.684350   0.141040 federalist.txt
      5.947873   0.060730 frankenstein.txt
      6.810451   0.081574 grimm.txt
      1.279325   0.013508 her.txt
     78.311622   0.861220 holmes.txt
     14.265868   0.145593 homer.txt
      1.297946   0.013600 lalaland.txt
      4.110416   0.042714 mansfield.txt
      0.002796   0.000017 pneumonoultramicroscopicsilicovolcanoconiosis.txt
      1.739467   0.017283 revenant.txt
      5.238748   0.054731 rinehart.txt
     68.048034   0.680697 shakespeare.txt
      6.071052   0.064508 stein.txt
     11.721317   0.121086 stoker.txt
     14.137166   0.146902 surgery.txt
     40.615153   0.433262 tolstoy.txt
      9.731160   0.099112 wells.txt
     39.266734   0.457958 whittier.txt
      0.014416   0.000132 wordsworth.txt
     13.774792   0.144299 xueqin1.txt
     18.768864   0.196494 xueqin2.txt