java algorithm performance trie levenshtein-distance

Implementing a simple Trie for efficient Levenshtein Distance calculation - Java

UPDATE 3

Done. Below is the code that finally passed all of my tests. Again, this is modeled after Murilo Vasconcelo's modified version of Steve Hanov's algorithm. Thanks to all that helped!

/**
 * Computes the minimum Levenshtein Distance between the given word (represented as an array of Characters) and the
 * words stored in theTrie. This algorithm is modeled after Steve Hanov's blog article "Fast and Easy Levenshtein
 * distance using a Trie" and Murilo Vasconcelo's revised version in C++.
 * 
 * http://stevehanov.ca/blog/index.php?id=114
 * http://murilo.wordpress.com/2011/02/01/fast-and-easy-levenshtein-distance-using-a-trie-in-c/
 * 
 * @param ArrayList<Character> word - the characters of an input word as an array representation
 * @return int - the minimum Levenshtein Distance
 */
private int computeMinimumLevenshteinDistance(ArrayList<Character> word) {

    theTrie.minLevDist = Integer.MAX_VALUE;

    int iWordLength = word.size();
    int[] currentRow = new int[iWordLength + 1];

    for (int i = 0; i <= iWordLength; i++) {
        currentRow[i] = i;
    }

    for (int i = 0; i < iWordLength; i++) {
        traverseTrie(theTrie.root, word.get(i), word, currentRow);
    }
    return theTrie.minLevDist;
}

/**
 * Recursive helper function. Traverses theTrie in search of the minimum Levenshtein Distance.
 * 
 * @param TrieNode node - the current TrieNode
 * @param char letter - the current character of the current word we're working with
 * @param ArrayList<Character> word - an array representation of the current word
 * @param int[] previousRow - a row in the Levenshtein Distance matrix
 */
private void traverseTrie(TrieNode node, char letter, ArrayList<Character> word, int[] previousRow) {

    int size = previousRow.length;
    int[] currentRow = new int[size];
    currentRow[0] = previousRow[0] + 1;

    int minimumElement = currentRow[0];
    int insertCost, deleteCost, replaceCost;

    for (int i = 1; i < size; i++) {

        insertCost = currentRow[i - 1] + 1;
        deleteCost = previousRow[i] + 1;

        if (word.get(i - 1) == letter) {
            replaceCost = previousRow[i - 1];
        } else {
            replaceCost = previousRow[i - 1] + 1;
        }

        currentRow[i] = minimum(insertCost, deleteCost, replaceCost);

        if (currentRow[i] < minimumElement) {
            minimumElement = currentRow[i];
        }
    }

    if (currentRow[size - 1] < theTrie.minLevDist && node.isWord) {
        theTrie.minLevDist = currentRow[size - 1];
    }

    if (minimumElement < theTrie.minLevDist) {

        for (Character c : node.children.keySet()) {
            traverseTrie(node.children.get(c), c, word, currentRow);
        }
    }
}

UPDATE 2

Finally, I've managed to get this to work for most of my test cases. My implementation is practically a direct translation from Murilo's C++ version of Steve Hanov's algorithm. So how should I refactor this algorithm and/or make optimizations? Below is the code...

public int search(String word) {

    theTrie.minLevDist = Integer.MAX_VALUE;

    int size = word.length();
    int[] currentRow = new int[size + 1];

    for (int i = 0; i <= size; i++) {
        currentRow[i] = i;
    }
    for (int i = 0; i < size; i++) {
        char c = word.charAt(i);
        if (theTrie.root.children.containsKey(c)) {
            searchRec(theTrie.root.children.get(c), c, word, currentRow);
        }
    }
    return theTrie.minLevDist;
}
private void searchRec(TrieNode node, char letter, String word, int[] previousRow) {

    int size = previousRow.length;
    int[] currentRow = new int[size];
    currentRow[0] = previousRow[0] + 1;

    int insertCost, deleteCost, replaceCost;

    for (int i = 1; i < size; i++) {

        insertCost = currentRow[i - 1] + 1;
        deleteCost = previousRow[i] + 1;

        if (word.charAt(i - 1) == letter) {
            replaceCost = previousRow[i - 1];
        } else {
            replaceCost = previousRow[i - 1] + 1;
        }
        currentRow[i] = minimum(insertCost, deleteCost, replaceCost);
    }

    if (currentRow[size - 1] < theTrie.minLevDist && node.isWord) {
        theTrie.minLevDist = currentRow[size - 1];
    }

    if (minElement(currentRow) < theTrie.minLevDist) {

        for (Character c : node.children.keySet()) {
            searchRec(node.children.get(c), c, word, currentRow);

        }
    }
}

Thank you everyone who contributed to this question. I tried getting the Levenshtein Automata to work, but I couldn't make it happen.

So I'm looking for suggestions on refactoring and/or optimizations regarding the above code. Please let me know if there's any confusion. As always, I can provide the rest of the source code as needed.

UPDATE 1

So I've implemented a simple Trie data structure and I've been trying to follow Steve Hanov's python tutorial to compute the Levenshtein Distance. Actually, I'm interested in computing the minimum Levenshtein Distance between a given word and the words in the Trie, thus I've been following Murilo Vasconcelos's version of Steve Hanov's algorithm. It's not working very well, but here's my Trie class:

public class Trie {

    public TrieNode root;
    public int minLevDist;

    public Trie() {
        this.root = new TrieNode(' ');
    }

    public void insert(String word) {

        int length = word.length();
        TrieNode current = this.root;

        if (length == 0) {
            current.isWord = true;
        }
        for (int index = 0; index < length; index++) {

            char letter = word.charAt(index);
            TrieNode child = current.getChild(letter);

            if (child != null) {
                current = child;
            } else {
                current.children.put(letter, new TrieNode(letter));
                current = current.getChild(letter);
            }
            if (index == length - 1) {
                current.isWord = true;
            }
        }
    }
}

... and the TrieNode class:

public class TrieNode {

    public final int ALPHABET = 26;

    public char letter;
    public boolean isWord;
    public Map<Character, TrieNode> children;

    public TrieNode(char letter) {
        this.isWord = false;
        this.letter = letter;
        children = new HashMap<Character, TrieNode>(ALPHABET);
    }

    public TrieNode getChild(char letter) {

        if (children != null) {
            if (children.containsKey(letter)) {
                return children.get(letter); 
            }
        }
        return null;
    }
}

Now, I've tried to implement the search as Murilo Vasconcelos has it, but something is off and I need some help debugging this. Please give suggestions on how to refactor this and/or point out where the bugs are. The very first thing I'd like to refactor is the "minCost" global variable, but that's the smallest of things. Anyway, here's the code...

public void search(String word) {

    int size = word.length();
    int[] currentRow = new int[size + 1];

    for (int i = 0; i <= size; i++) {
        currentRow[i] = i;
    }
    for (int i = 0; i < size; i++) {
        char c = word.charAt(i);
        if (theTrie.root.children.containsKey(c)) {
            searchRec(theTrie.root.children.get(c), c, word, currentRow);
        }
    }
}

private void searchRec(TrieNode node, char letter, String word, int[] previousRow) {

    int size = previousRow.length;
    int[] currentRow = new int[size];
    currentRow[0] = previousRow[0] + 1;

    int replace, insertCost, deleteCost;

    for (int i = 1; i < size; i++) {

        char c = word.charAt(i - 1);

        insertCost = currentRow[i - 1] + 1;
        deleteCost = previousRow[i] + 1;
        replace = (c == letter) ? previousRow[i - 1] : (previousRow[i - 1] + 1);

        currentRow[i] = minimum(insertCost, deleteCost, replace);
    }

    if (currentRow[size - 1] < minCost && !node.isWord) {
        minCost = currentRow[size - 1];
    }
    Integer minElement = minElement(currentRow);
    if (minElement < minCost) {

        for (Map.Entry<Character, TrieNode> entry : node.children.entrySet()) {
            searchRec(node, entry.getKey(), word, currentRow);
        }
    }
}

I apologize for the lack of comments. So what am I doing wrong?

INITIAL POST

I've been reading an article, Fast and Easy Levenshtein distance using a Trie, in hopes of figuring out an efficient way to compute the Levenshtein Distance between two Strings. My main goal with this is, given a large set of words, to be able to find the minimal Levenshtein Distance between an input word(s) and this set of words.

In my trivial implementation, I compute the Levenshtein Distance between an input word and the set of words, for each input word, and return the minimum. It works, but it is not efficient...

I've been looking for implementations of a Trie, in Java, and I've come across two seemingly good sources:

Koders.com version
code.google.com version (EDIT: This seems to have moved to github.com/rkapsi)

However, these implementations seem too complicated for what I'm trying to do. As I've been reading through them to understand how they work and how Trie data structures work in general, I've only become more confused.

So how would I implement a simple Trie data structure in Java? My intuition tells me that each TrieNode should store the String it represents and also references to letters of the alphabet, not necessarily all letters. Is my intuition correct?

Once that is implemented, the next task is to compute the Levenshtein Distance. I read through the Python code example in the article above, but I don't speak Python, and my Java implementation runs out of Heap memory once I hit the recursive searching. So how would I compute the Levenshtein Distance using the Trie data structure? I have a trivial implementation, modeled after this source code, but it doesn't use a Trie... it is inefficient.

It would be really nice to see some code in addition to your comments and suggestions. After all, this is a learning process for me... I've never implemented a Trie... so I have plenty to learn from this experience.

Thanks.

p.s. I can provide any source code if need be. Also, I've already read through and tried using a BK-Tree as suggested in Nick Johnson's blog, but its not as efficient as I think it can be... or maybe my implementation is wrong.

Solution

I've implemented the algo described on "Fast and Easy Levenshtein distance using a Trie" article in C++ and it is really fast. If you want (understand C++ better than Python), I can past the code in somewhere.

Edit:

I posted it on my blog. Here's a copy:

#include <iostream>
#include <map>
#include <string>
#include <vector>
#include <algorithm>
#include <cctype>

/*
 * Algorithm: Edit distance using a trie-tree (Dynamic Programming)
 * Author: Murilo Adriano Vasconcelos <muriloufg@gmail.com>
 */

using namespace std;

// Trie's node
struct trie
{
    typedef map<char, trie*> next_t;

    // The set with all the letters which this node is prefix
    next_t next;

    // If word is equal to "" is because there is no word in the
    //  dictionary which ends here.
    string word;

    trie() : next(map<char, trie*>()) {}

    void insert(string w)
    {
        w = string("$") + w;
        
        int sz = w.size();
        
        trie* n = this;
        for (int i = 0; i < sz; ++i) {
            if (n->next.find(w[i]) == n->next.end()) {
                n->next[w[i]] = new trie();
            }

            n = n->next[w[i]];
        }

        n->word = w;
    }
};

// The tree
trie tree;

// The minimum cost of a given word to be changed to a word of the dictionary
int min_cost;

//
void search_impl(trie* tree, char ch, vector<int> last_row, const string& word)
{
    int sz = last_row.size();

    vector<int> current_row(sz);
    current_row[0] = last_row[0] + 1;

    // Calculate the min cost of insertion, deletion, match or substution
    int insert_or_del, replace;
    for (int i = 1; i < sz; ++i) {
        insert_or_del = min(current_row[i-1] + 1, last_row[i] + 1);
        replace = (word[i-1] == ch) ? last_row[i-1] : (last_row[i-1] + 1);

        current_row[i] = min(insert_or_del, replace);
    }

    // When we find a cost that is less than the min_cost, is because
    // it is the minimum until the current row, so we update
    if ((current_row[sz-1] < min_cost) && (tree->word != "")) {
        min_cost = current_row[sz-1];
    }

    // If there is an element wich is smaller than the current minimum cost,
    //  we can have another cost smaller than the current minimum cost
    if (*min_element(current_row.begin(), current_row.end()) < min_cost) {
        for (trie::next_t::iterator it = tree->next.begin(); it != tree->next.end(); ++it) {
            search_impl(it->second, it->first, current_row, word);
        }
    }
}

int search(string word)
{
    word = string("$") + word;
    
    int sz = word.size();
    min_cost = 0x3f3f3f3f;

    vector<int> current_row(sz + 1);

    // Naive DP initialization
    for (int i = 0; i < sz; ++i) current_row[i] = i;
    current_row[sz] = sz;
    
    
    
    // For each letter in the root map wich matches with a
    //  letter in word, we must call the search
    for (int i = 0 ; i < sz; ++i) {
        if (tree.next.find(word[i]) != tree.next.end()) {
            search_impl(tree.next[word[i]], word[i], current_row, word);
        }
    }

    return min_cost;
}