javascriptalgorithmdata-structurespaginationtrie

How to paginate a trie, so you can find 100 words matching prefix in the trie, starting at a paginated offset?


I am using a trie made to work on alphabets/scripts from any language.

class TrieNode {
  constructor(key) {
    // the "key" value will be the character in sequence
    this.key = key;

    // we keep a reference to parent
    this.parent = null;

    // we have hash of children
    this.children = {};

    // check to see if the node is at the end
    this.end = false;
  }

  getWord() {
    let output = [];
    let node = this;

    while (node !== null) {
      output.unshift(node.key)
      node = node.parent
    }

    return output.join('')
  }
}

class Trie {
  constructor() {
    this.base = new TrieNode(null)
  }

  insert(word) {
    let node = this.base

    const points = Array.from(word)

    for (const i in points) {
      const point = points[i]
      if (!node.children[point]) {
        const child = node.children[point] = new TrieNode(point)
        child.parent = node
      }

      node = node.children[point]

      if (i == word.length - 1) {
        node.end = true
      }
    }
  }

  contains(word) {
    let node = this.base

    const points = Array.from(word)

    for (const i in points) {
      const point = points[i]

      if (node.children[point]) {
        node = node.children[point]
      } else {
        return false
      }
    }

    return node.end;
  }

  find(prefix) {
    let node = this.base
    let output = []

    const points = Array.from(prefix)

    for (const i in points) {
      const point = points[i]

      // make sure prefix actually has words
      if (node.children[point]) {
        node = node.children[point]
      } else {
        // there's none. just return it.
        return output
      }
    }

    const stack = [node]
    while (stack.length) {
      node = stack.shift()
      // base case, if node is at a word, push to output
      if (node.end) {
        output.unshift(node.getWord())
      }

      // iterate through each children, call recursive findAllWords
      for (var child in node.children) {
        stack.push(node.children[child])
      }
    }

    return output
  }
}

Here is how you can find all words matching prefix. These scrabble words were used (tmp/scrabble.csv).

const fs = require('fs')
const Trie = require('./Trie')

const words = fs.readFileSync('tmp/scrabble.csv', 'utf-8')
  .trim()
  .split(/\n+/)
  .map(x => x.trim())

const trie = new Trie()

words.forEach(word => trie.insert(word))

console.log(trie.find('zy'))
[
  'zygodactylous', 'zygomorphies', 'zygapophysis',
  'zygapophyses',  'zygomorphic',  'zymologies',
  'zygospores',    'zygosities',   'zygomorphy',
  'zygodactyl',    'zymurgies',    'zymograms',
  'zymogenes',     'zygotenes',    'zygospore',
  'zygomatic',     'zyzzyvas',     'zymosans',
  'zymology',      'zymogram',     'zymogens',
  'zymogene',      'zygotene',     'zygosity',
  'zygomata',      'zyzzyva',      'zymurgy',
  'zymotic',       'zymosis',      'zymoses',
  'zymosan',       'zymogen',      'zymases',
  'zygotic',       'zygotes',      'zygosis',
  'zygoses',       'zygomas',      'zydecos',
  'zymase',        'zygote',       'zygose',
  'zygoma',        'zygoid',       'zydeco',
  'zymes',         'zyme'
]

However, search find('a') will find over 10,000 matches (words starting with a). If this were a web app, you might not want to show all 10,000 at once, maybe only 100 or 1,000 at a time. How could you implement a pagination system against this trie? Is it possible?

From my initial thought-attempts, I was trying to think how you could keep track of the "path" you left off with in the trie (say we are paginating 100 matches/words at a time). So I am trying to modify this function:

class Trie {
  // ...

  find(prefix, max = 100) {
    let node = this.base
    let output = []

    const points = Array.from(prefix)

    for (const i in points) {
      const point = points[i]

      // make sure prefix actually has words
      if (node.children[point]) {
        node = node.children[point]
      } else {
        // return path [-1] since there is no match?
        return [output, [-1]]
      }
    }

    const stack = [node]
    const pathStack = [[0]]
    while (stack.length) {
      const path = pathStack.shift()
      node = stack.shift()
      // base case, if node is at a word, push to output
      if (node.end) {
        output.unshift(node.getWord())
        if (output.length === max) {
          return [output, path]
        }
      }

      // iterate through each children, call recursive findAllWords
      let i = 0
      for (var child in node.children) {
        pathStack.push(path.concat(i++))
        stack.push(node.children[child])
      }
    }

    return [output, [-1]]
  }
}

Maybe this is partially correct? But I don't see how to use the path to jump back to the right spot on the next query.

[
  [
    'abye', 'abut', 'abri', 'abos', 'ably', 'able', 'abet', 'abed',
    'abbe', 'abba', 'abas', 'aals', 'aahs', 'azo',  'ays',  'aye',
    'axe',  'awn',  'awl',  'awe',  'awa',  'avo',  'ave',  'ava',
    'auk',  'att',  'ate',  'ass',  'asp',  'ask',  'ash',  'art',
    'ars',  'arm',  'ark',  'arf',  'are',  'arc',  'arb',  'apt',
    'ape',  'any',  'ant',  'ani',  'ane',  'and',  'ana',  'amu',
    'amp',  'ami',  'ama',  'alt',  'als',  'alp',  'all',  'ale',
    'alb',  'ala',  'ait',  'ais',  'air',  'ain',  'aim',  'ail',
    'aid',  'aha',  'ago',  'age',  'aga',  'aft',  'aff',  'adz',
    'ads',  'ado',  'add',  'act',  'ace',  'aby',  'abs',  'abo',
    'aba',  'aas',  'aal',  'aah',  'ay',   'ax',   'aw',   'at',
    'as',   'ar',   'an',   'am',   'al',   'ai',   'ah',   'ag',
    'ae',   'ad',   'ab',   'aa'
  ],
  [ 0, 1, 17, 0 ]
]

And plus, the words appear to be in unusual order, maybe I am not inserting them the correct order?

This way, in theory, if you had the path, then when you call find(prefix, startAtPath), you would pass the startAtPath as a second parameter, and start the search from that point forward, to get the next 100 words. I haven't seen any "trie pagination" examples on the web, so maybe it's not a thing that's possible, not sure.


Solution

  • In TrieNode.constructor add a this.count property.

    In insert, update the counts for the insertion.

    In find add startAt and max parameters. As long as node.count < startAt you will skip past node (do not add its children to pathStack) and decrease startAt by node.count.

    And then you collect up to max words to return.

    This allows you to skip past previous results without having to enumerate them. You do not need to pass in the path, just where you expect to start.

    The cause of your order problem appears to be that you are using unshift instead of push.


    Here is your code edited to give the right answer. I also fixed a minor bug that your output order depends on the order in which node.children was inserted. This will work if you load the trie in alphabetical order, but not otherwise.

    class TrieNode {
      constructor(key) {
        // the "key" value will be the character in sequence
        this.key = key;
    
        // we keep a reference to parent
        this.parent = null;
    
        // we have hash of children
        this.children = {};
    
        // check to see if the node is at the end
        this.end = false;
    
        // count how many words are at this node or below.
        this.count = 0;
    
        // Ensure children are in the right order.
        this.lastChildKey = '';
      }
    
      getWord() {
        let output = [];
        let node = this;
    
        while (node !== null) {
          output.unshift(node.key)
          node = node.parent
        }
    
        return output.join('')
      }
    }
    
    class Trie {
      constructor() {
        this.base = new TrieNode(null)
      }
    
      insert(word) {
        let node = this.base
    
        const points = Array.from(word)
    
        for (const i in points) {
          const point = points[i]
          node.count++
          if (!node.children[point]) {
            const child = node.children[point] = new TrieNode(point)
            child.parent = node
          }
    
          node = node.children[point]
    
          if (i == word.length - 1) {
            node.end = true
          }
    
          // If words were inserted out of order, fix order of children.
          if (node.lastChildKey <= point) {
            node.lastChildKey = point
          }
          else {
            // Need to fix order of children.
            let keys = Object.keys(node.children)
            keys.sort()
            let orderedChildren = {}
            for (const key in keys) {
              orderedChildren[key] = node.children[key]
            }
            node.children = orderedChildren
          }
        }
      }
    
      contains(word) {
        let node = this.base
    
        const points = Array.from(word)
    
        for (const i in points) {
          const point = points[i]
    
          if (node.children[point]) {
            node = node.children[point]
          } else {
            return false
          }
        }
    
        return node.end;
      }
      find(prefix, startAt, maxCount) {
        let node = this.base
        let output = []
    
        const points = Array.from(prefix)
    
        for (const i in points) {
          const point = points[i]
    
          // make sure prefix actually has words
          if (node.children[point]) {
            node = node.children[point]
          } else {
            // there's none. just return it.
            return output
          }
        }
    
        const stack = [node]
        while (stack.length) {
          if (maxCount <= output.length) {
              return output
          }
          node = stack.shift()
          if (node.count < startAt) {
            startAt -= node.count
          }
          else {
            // base case, if node is at a word, push to output
            if (node.end) {
              if (0 < startAt) {
                startAt--;
              }
              else {
                output.push(node.getWord())
              }
            }
    
            // iterate through each children, call recursive findAllWords
            for (var child in node.children) {
              stack.push(node.children[child])
            }
          }
        }
    
        return output
      }
    }
    
    exports.Trie = Trie
    

    Here is your demo program edited for the changes.

    const fs = require('fs')
    const Trie = require('./Trie')
    
    const words = fs.readFileSync('tmp/scrabble.csv', 'utf-8')
      .trim()
      .split(/\n+/)
      .map(x => x.trim())
    
    const trie = new Trie.Trie()
    
    words.forEach(word => trie.insert(word))
    
    console.log(trie.find('a', 0, 12))
    
    console.log(trie.find('a', 6, 6))