javascriptalgorithmparsingtreerecursive-descent

How to construct tree pattern matching algorithm in JavaScript?


Okay this is a bit of an involved question, but tl;dr it's basically how do you parse an "actual tree" using a "pattern tree"? How do you check if a particular tree instance is matched by a specific pattern tree?

To start, we have the structure of our pattern tree. The pattern tree can generally contain these types of nodes:

That should be good enough for this question. There are a few more node types, but these are the main ones. Essentially it is like a regular expression or grammar tree.

We can start with a simple pattern tree:

<sequence>
  <text value="foo">
    <text value="bar" />
  </text>
</sequence>

This should match any of the following trees of text.

<foo><bar/></foo><foo><bar/></foo><foo><bar/></foo>
<foo><bar/></foo>
<foo><bar/></foo><foo><bar/></foo>

More specifically, you should imagine that as JSON and the pattern tree as JSON as well.

{
  "tag": "foo",
  "children": [
    { "tag": "bar" }
  ]
}

And the sequence pattern tree is like this:

{
  "type": "sequence",
  "children": [
    {
      "type": "text",
      "value": "foo",
      "children": [
        {
          "type": "text",
          "value": "bar"
        }
      ]
    }
  ]
}

A more complex example would be like this for a pattern tree:

matchThing = <text name="thing">
  <text />
  <sequence>
    <first>
      <class name="value"/>
      <class name="function"/>
    </first>
  </sequence>
</text>

matchFunction = <text name="function">
  <text />
  <sequence>
    <text name="input">
      <text />
    </text>
  </sequence>
  <sequence>
    <text name="call">
      <text />
    </text>
  </sequence>
</text>

matchValue = <text name="value">
  <text />
</text>

It would match text like this:

<thing>
  <call-me-anything />
  <function>
    <input><foo/></input>
    <input><bar/></input>
    <call><foo/></call>
    <call><foo/></call>
    <call><bar/></call>
    <call><bar/></call>
  </function>
  <function>
    <call><baz/></call>
  </function>
  <value>
    <hello/>
  </value>
  <function>
    <input><hello/></input>
    <call><world/></input>
  </function>
</thing>

So imagine that as JSON as well.

Wondering how you go about creating an algorithm for this. I am stuck at the beginning, but it seems to require something like recursive descent of some sort, but on trees rather than on sequences.

So you have a function:

function checkIfMatch(actualTree, patternTree) {
  for (node in actualTree) {
    if (!checkIfMatchesSubtree(node, patternTree)) {
      return false
    }
  }
  return true
}

I don't really know how to begin this one, and am searching Google for "tree pattern matching algorithms" or "arbology". Going to take a lot of time to try and translate those mathematical abstractions into code, if I am even going in the right direction. Wondering if you could help construct a simple algorithm for this case. It's hard to figure out how you should be traversing the actual tree, while also traversing the pattern tree, and keeping track of the position you are in each tree.

Spending quite a bit of time on it doesn't get me very far:

function parse(patternTree, textTree) {
  let state = { ti: 0, pi: 0 }
  while (state.ti < textTree.length) {
    let pattern = patternTree[state.pi]
    let result = parsePattern(pattern, textTree[state.ti])
    if (!result) {
      return
    }
  }
}

function parsePattern(patternNode, textNode, state) {
  if (patternNode.type == 'sequence') {
    return parseSequencePattern(patternNode, textNode, state)
  }
}

function parseSequencePattern(patternNode, textNode, state) {
  while (true) {
    let i = 0
    while (i < patternNode.children.length) {
      let childPattern = patternNode.children[i++]
      let result = parsePattern(childPattern, textNode)
      if (!result) {
        return false
      }

    }
  }
}


while (stack.length) {
  let {
    parents,
    node,
  } = stack.shift()

  stack.push({
    parents: [node, ...parents]
  })
}

Solution

  • When attempting to match your pattern p to a tree t, you have to first match the root of p and its children to the root of t and t's children. If p does not match at the root, then the children of t have to be traversed, and each of those child values matched against p.

    Based on your posted input and node types, there are three main input scenarios that a recursive pattern matching function has to handle:

    1. p(object) and t(object). Here, the pattern is an object with type and child keys. The input tree is also an object with at minimum tag and children keys. The pattern matching function delegates to a helper function that carries out the match for p's object.type.
    2. p(object) and t(Array). In this case, the tree is an Array of objects, and based on the pattern's object, the code has to decide how tree's Array should be treated (sequence, first, etc).
    3. p(Array) and t(Array). Here, both p and t are in the form of a sequence, and the match operation is to find, for every a in p's Array, a corresponding a1 in t's Array such that pattern_match(a, a1) produces true.

    As a start, the code below implements a find_match function that supports the foundational nodes text and sequence:

    //object that stores node match handlers
    node_types = {'sequence':sequence_match, 'text':text_match}
    //main find_match function
    function find_match(pattern, tree_root, tree_children=null){
        var tree = tree_children === null? tree_root.children : tree_children;
        if (!Array.isArray(pattern)){ 
            //pattern is an object - delegate to handler for node type
            if (node_types[pattern.type](pattern, tree_root, tree)){
                return true
            }
            //no match at the root - check the children
            return tree.some(x => find_match(pattern, x, null));
        }
        //pattern is an array - delegate to sequence
        return sequence_match({'children':pattern}, tree_root, tree)
    }
    //text match handler
    function text_match(pattern, tree_root, tree_children){
        if (!('value' in pattern) && !('name' in pattern)){
            //empty text node found
            return true
        }
        if (tree_root.tag === pattern['value' in pattern ? 'value' : 'name']){
            //match found at the root - continue match with pattern and tree children
            return find_match(pattern.children, null, tree_root.children)
        } 
        //no match - check the tree's children
        return (tree_children === null ? tree_root.children : tree_children).some(x => find_match(pattern, x))
    
    }
    //sequence match handler
    function sequence_match(pattern, tree_root, tree_children){
        var tree = tree_children === null ? tree_root.children : tree_children;
        //copy the pattern and tree objects, as "p" is mutated as part of the match process
        var p = JSON.parse(JSON.stringify(pattern))
        var t = JSON.parse(JSON.stringify(tree))
        while (p.children.length){
            if (!t.length){
                return false
            }
            var f = false;
            var f_seq = p.children.shift();
            for (var _n_t of t){
                //compare sub-pattern and sub-tree
                if (find_match(f_seq, _n_t, t)){
                    f = true
                    break;
                }
            }
            //no match found for sub-pattern in sequence, return false
            if (!f){
                return false
            }
        }
        //matches found for all sub-patterns in the sequence, return true
        return true
    }
    

    tree = [{'tag': 'foo', 'children': [{'tag': 'bar', 'children': []}]}, {'tag': 'foo', 'children': [{'tag': 'bar', 'children': []}]}, {'tag': 'foo', 'children': [{'tag': 'bar', 'children': []}]}]
    pattern = {'type': 'text', 'name': 'thing', 'children': [{'type': 'text', 'children': []}, {'type': 'sequence', 'children': [{'type': 'first', 'children': [{'type': 'class', 'name': 'value', 'children': []}, {'type': 'class', 'name': 'function', 'children': []}]}]}]}
    console.log(find_match(pattern, {'tag':null, 'children':tree}))
    //prints "true"