c++algorithmrabin-karp

Rabin-Karp algorithm in c++


I am trying to understand the implementation of the Rabin-Karp algorithm. d is the number of characters in the input alphabet, but if I replace 0 or any other value instead of 20, it won't affect anything. Why is this happening like this ?

    // Rabin-Karp algorithm in C++
#include <string.h>
#include <iostream>
using namespace std;
#define d 20

void rabinKarp(char pattern[], char text[], int q) {
    int m = strlen(pattern);
    int n = strlen(text);
    int i, j;
    int p = 0;
    int t = 0;
    int h = 1;

    for (i = 0; i < m - 1; i++)
        h = (h * d) % q;

    // Calculate hash value for pattern and text
    for (i = 0; i < m; i++) {
        p = (d * p + pattern[i]) % q;
        t = (d * t + text[i]) % q;
    }

    // Find the match
    for (i = 0; i <= n - m; i++) {
        if (p == t) {
            for (j = 0; j < m; j++) {
                if (text[i + j] != pattern[j])
                    break;
            }

            if (j == m)
                cout << "Pattern is found at position: " << i + 1 << endl;
        }

        if (i < n - m) {
            t = (d * (t - text[i] * h) + text[i + m]) % q;

            if (t < 0)
                t = (t + q);
        }
    }
}

int main() {

   // char text[] = "ABCCDXAEFGX";
    char text[] = "QWERTYUIOPASDFGHJKLXQWERTYUIOPASDFGHJKLX";
    char pattern[] = "KLXQW";
    int q = 13;
    rabinKarp(pattern, text, q);
} 

Solution

  • I believe the short answer is that the lower d is the more hash collisions you will have, but you go about verifying the match anyway so it does not affect anything.

    A bit more verbose:

    First let me modify your code to be have more expressive variables:

    // Rabin-Karp algorithm in C++
    #include <string.h>
    #include <iostream>
    using namespace std;
    #define HASH_BASE 0
    
    void rabinKarp(char pattern[], char text[], int inputBase) {
        int patternLen = strlen(pattern);
        int textLen = strlen(text);
        int i, j; //predefined iterators
        int patternHash = 0;
        int textHash = 0;
        int patternLenOut = 1;
    
        for (i = 0; i < patternLen - 1; i++)
            patternLenOut = (patternLenOut * HASH_BASE) % inputBase; // hash of pattern len
    
        // Calculate hash value for pattern and text
        for (i = 0; i < patternLen; i++) {
            patternHash = (HASH_BASE * patternHash + pattern[i]) % inputBase;
            textHash = (HASH_BASE * textHash + text[i]) % inputBase;
        }
    
        // Find the match
        for (i = 0; i <= textLen - patternLen; i++) {
            if (patternHash == textHash) {
                for (j = 0; j < patternLen; j++) {
                    if (text[i + j] != pattern[j])
                        break;
                }
    
                if (j == patternLen)
                    cout << "Pattern is found at position: " << i + 1 << endl;
            }
    
            if (i < textLen - patternLen) {
                textHash = (HASH_BASE * (textHash - text[i] * patternLenOut) + text[i + patternLen]) % inputBase;
                
                if (textHash < 0)
                    textHash = (textHash + inputBase);
            }
        }
    }
    
    int main() {
    
       // char text[] = "ABCCDXAEFGX";
        char text[] = "QWEEERTYUIOPASDFGHJKLXQWERTYUIOPASDFGHJKLX";
        char pattern[] = "EE";
        int q = 13;
        rabinKarp(pattern, text, q);
    } 
    

    The easiest way to attack it is to set HASH_BASE (previously d) to zero and see where we can simplify. The rabinKarp function can then be reduced to:

    void rabinKarp(char pattern[], char text[], int inputBase) {
        int patternLen = strlen(pattern);
        int textLen = strlen(text);
        int i, j; //predefined iterators
        int patternHash = 0;
        int textHash = 0;
        int patternLenOut = 0;
    
        // Calculate hash value for pattern and text
        for (i = 0; i < patternLen; i++) {
            patternHash = (pattern[i]) % inputBase;
            textHash = (text[i]) % inputBase;
        }
    
        // Find the match
        for (i = 0; i <= textLen - patternLen; i++) {
            if (patternHash == textHash) {
                for (j = 0; j < patternLen; j++) {
                    if (text[i + j] != pattern[j])
                        break;
                }
    
                if (j == patternLen)
                    cout << "Pattern is found at position: " << i + 1 << endl;
            }
    
            if (i < textLen - patternLen) {
                textHash = (text[i + patternLen]) % inputBase;
                
                if (textHash < 0)
                    textHash = (textHash + inputBase);
            }
        }
    }
    

    now you'll notice that all the hashes becomes is the sum of the letters mod some number (in your case 13, in my case 2). This is a bad hash, meaning many things will sum to the same number. However, in this portion of the code:

    if (patternHash == textHash) {
        for (j = 0; j < patternLen; j++) {
            if (text[i + j] != pattern[j])
                  break;
        }
        
        if (j == patternLen)
            cout << "Pattern is found at position: " << i + 1 << endl;
    }
    

    you explicitly check the match, letter by letter, if the hashes match. The worse your hash function is, the more often you will have false positives (which will mean a longer runtime for your function). There are more details, but I believe that directly answers your question. What might be interesting is to record false positives and see how the false positive rate increases as d and q decrease.