I'm working on a complex rule parser.
I'm able to handle simple rules, but I'm having trouble evaluating complex rules with nested parentheses.
Here's the nested rule definition I'm trying to evaluate:
definition = '((K00925 K00625),K01895) (K00193+K00197+K00194) (K00577+K00578+K00579+K00580+K00581-K00582-K00583+K00584) (K00399+K00401+K00402) (K22480+K22481+K22482,K03388+K03389+K03390,K08264+K08265,K03388+K03389+K03390+K14127+(K14126+K14128,K22516+K00125))'
which contains these five top-level rules (separated by spaces):
((K00925 K00625),K01895)
(K00193+K00197+K00194)
(K00577+K00578+K00579+K00580+K00581-K00582-K00583+K00584)
(K00399+K00401+K00402)
(K22480+K22481+K22482,K03388+K03389+K03390,K08264+K08265,K03388+K03389+K03390+K14127+(K14126+K14128,K22516+K00125))
Here's the code I have that is almost correct:
import re

def rule_splitter(rule: str, split_characters: set = {"+", "-", ",", "(", ")", " "}) -> set:
    """
    Split rule by characters.

    Args:
        rule (str): Boolean logical string.
        split_characters (set): Characters to split on in rule.

    Returns:
        set: Unique tokens in a rule.
    """
    rule_decomposed = str(rule)
    if split_characters:
        for character in split_characters:
            character = character.strip()
            if character:
                rule_decomposed = rule_decomposed.replace(character, " ")
    unique_tokens = set(filter(bool, rule_decomposed.split()))
    return unique_tokens
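
# For reference, an illustrative call (not part of the original snippet):
# rule_splitter("(K00193+K00197+K00194)") -> {'K00193', 'K00197', 'K00194'}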

def find_rules(definition: str) -> list:
    """
    Find and extract rules from the definition string.

    Args:
        definition (str): Complex boolean logical string with multiple rules.

    Returns:
        list: List of extracted rules as strings.
    """
    rules = []
    stack = []
    current_rule = ""
    outside_rule = ""
    for char in definition:
        if char == '(':
            if stack:
                current_rule += char
            if outside_rule.strip():
                rules.append(outside_rule.strip())
                outside_rule = ""
            stack.append(char)
        elif char == ')':
            stack.pop()
            if stack:
                current_rule += char
            else:
                current_rule = f"({current_rule.strip()})"
                rules.append(current_rule)
                current_rule = ""
        else:
            if stack:
                current_rule += char
            else:
                outside_rule += char
    # Add any remaining outside_rule at the end of the loop
    if outside_rule.strip():
        rules.append(outside_rule.strip())
    return rules
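
# For reference, an illustrative call (not part of the original snippet):
# find_rules('((K00925 K00625),K01895) (K00193+K00197+K00194)')
#   -> ['((K00925 K00625),K01895)', '(K00193+K00197+K00194)']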

def evaluate_rule(rule: str, tokens: set, replace={"+": " and ", ",": " or "}) -> bool:
    """
    Evaluate a string of boolean logicals.

    Args:
        rule (str): Boolean logical string.
        tokens (set): Set of tokens to check the rule against.
        replace (dict, optional): Replace boolean characters. Defaults to {"+": " and ", ",": " or "}.

    Returns:
        bool: Evaluated rule.
    """
    # Handle optional tokens prefixed by '-'
    rule = re.sub(r'-\w+', '', rule)
    # Replace characters for standard logical formatting
    if replace:
        for character_before, character_after in replace.items():
            rule = rule.replace(character_before, character_after)
    # Split the rule into individual symbols
    unique_symbols = rule_splitter(rule, replace.values())
    # Create a dictionary with the presence of each symbol in the tokens
    token_to_bool = {sym: (sym in tokens) for sym in unique_symbols}

    # Parse and evaluate the rule using a recursive descent parser
    def parse_expression(expression: str) -> bool:
        expression = expression.strip()
        # Handle nested expressions
        if expression.startswith('(') and expression.endswith(')'):
            return parse_expression(expression[1:-1])
        # Evaluate 'OR' conditions
        if ' or ' in expression:
            parts = expression.split(' or ')
            return any(parse_expression(part) for part in parts)
        # Evaluate 'AND' conditions
        elif ' and ' in expression:
            parts = expression.split(' and ')
            return all(parse_expression(part) for part in parts)
        # Evaluate individual token presence
        else:
            return token_to_bool.get(expression.strip(), False)

    return parse_expression(rule)
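
# For reference, an illustrative call (not part of the original snippet):
# evaluate_rule("K00193+K00197+K00194", {"K00193", "K00197", "K00194"}) -> True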

def evaluate_definition(definition: str, tokens: set) -> dict:
    """
    Evaluate a complex definition string involving multiple rules.

    Args:
        definition (str): Complex boolean logical string with multiple rules.
        tokens (set): Set of tokens to check against the rules.

    Returns:
        dict: Dictionary with each rule and its evaluated result.
    """
    # Extract individual rules from the definition
    rules = find_rules(definition)
    # Evaluate each rule
    rule_results = {}
    for rule in rules:
        try:
            # Remove outer parentheses if they exist
            cleaned_rule = rule[1:-1] if rule.startswith('(') and rule.endswith(')') else rule
            result = evaluate_rule(cleaned_rule, tokens)
        except SyntaxError:
            # Handle syntax errors due to incorrect formatting
            result = False
        rule_results[rule] = result
    return rule_results

# Example usage
definition = '((K00925 K00625),K01895) (K00193+K00197+K00194) (K00577+K00578+K00579+K00580+K00581-K00582-K00583+K00584) (K00399+K00401+K00402) (K22480+K22481+K22482,K03388+K03389+K03390,K08264+K08265,K03388+K03389+K03390+K14127+(K14126+K14128,K22516+K00125))'
tokens = {
    'K00925',
    'K00625',
    # 'K01895',
    'K00193',
    'K00197',
    'K00194',
    'K00577',
    'K00578',
    'K00579',
    'K00580',
    'K00581',
    # 'K00582',
    'K00584',
    'K00399',
    'K00401',
    'K00402',
    'K22480',
    # 'K22481',
    # 'K22482',
    'K03388',
    'K03389',
    'K03390',
    # 'K08264',
    # 'K08265',
    'K14127',
    'K14126',
    'K14128',
    'K22516',
    # 'K00125'
}
result = evaluate_definition(definition, tokens)
# result
# {'((K00925 K00625),K01895)': False,
# '(K00193+K00197+K00194)': True,
# '(K00577+K00578+K00579+K00580+K00581-K00582-K00583+K00584)': True,
# '(K00399+K00401+K00402)': True,
# '(K22480+K22481+K22482,K03388+K03389+K03390,K08264+K08265,K03388+K03389+K03390+K14127+(K14126+K14128,K22516+K00125))': True}
Note that the implementation is splitting the first rule apart instead of evaluating the parenthesized group as a unit.
Here is the output I'm expecting:
{
    '((K00925 K00625),K01895)': True,  # True because `K00925` and `K00625` are present together (because of the parentheses) OR `K01895` is present
    '(K00193+K00197+K00194)': True,
    '(K00577+K00578+K00579+K00580+K00581-K00582-K00583+K00584)': True,  # Still True even though the optional ('-') tokens are missing
    '(K00399+K00401+K00402)': True,
    '(K22480+K22481+K22482,K03388+K03389+K03390,K08264+K08265,K03388+K03389+K03390+K14127+(K14126+K14128,K22516+K00125))': True,  # Satisfied by {K03388, K03389, K03390, K14127} together with {K14126, K14128}
}
[Figure omitted: graphical representation of the information flow for this complex rule definition.]
It should also be able to handle this rule:
rule_edge_case = '((K00134,K00150) K00927,K11389)'
with the following query tokens: tokens = {'K00134', 'K00927'}. This should evaluate to True because (K00134 OR K00150) together with K00927 is present, which is sufficient.
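Spelled out as plain Python booleans (just for illustration), the expected evaluation of the edge case is:
tokens = {'K00134', 'K00927'}
(('K00134' in tokens or 'K00150' in tokens) and 'K00927' in tokens) or ('K11389' in tokens)  # -> True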
Edit 1: I had an error before and the last rule was actually True and not False.
Edit 2: I've changed "items" to "tokens" and modified which tokens are used for evaluation to capture better edge cases.
A great tool for this job is a proper parser for expression grammars. I'm using parsimonious for this answer, which allows you to define a BNF- or EBNF-like syntax for your grammar, to assist with decoding your DSL.
Edit: I updated the grammar to check for the or operator "," before checking for the rule-break operator " ". This also means you check for maybe_or in the brace_expression definition, rather than in the rules definition.
Given the rules, you've defined a few operators with different precedence (loosest-binding first):
"," (or)
" " (rule break)
"+" or the ignore operator "-" ("+" and "-" have the same precedence)
"(...)" (grouping)
Given these rules, here's how you can define a grammar.
from parsimonious.grammar import Grammar

grammar = Grammar(
    """
    maybe_or = (maybe_rule or+) / maybe_rule
    or = "," maybe_rule
    maybe_rule = (maybe_and rule+) / maybe_and
    rule = " " maybe_and
    maybe_and = (expression and+) / expression
    and = and_op expression
    and_op = "+" / "-"
    expression = brace_expression / variable
    brace_expression = "(" maybe_or ")"
    variable = ~r"[A-Z0-9]+"
    """
)
In this example, I've made explicit steps that allow following the operator precedence in the grammar. Depending on your descent parser, you may wish to order differently. In the case of parsimonious, items are greedily captured, meaning we want to evaluate the long form of the definition first.
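To make the ordering point concrete, here's a small standalone sketch of parsimonious' ordered-choice behaviour (the items/more/item rules below are purely illustrative, not part of the grammar above):

from parsimonious.grammar import Grammar
from parsimonious.exceptions import IncompleteParseError

# PEG ordered choice commits to the first alternative that matches, so the
# longer (list) form has to be listed before the bare single-item form.
good = Grammar(
    """
    items = (item more+) / item
    more = "," item
    item = ~r"[A-Z0-9]+"
    """
)
good.parse("K00134,K00150")  # consumes the whole input

bad = Grammar(
    """
    items = item / (item more+)
    more = "," item
    item = ~r"[A-Z0-9]+"
    """
)
try:
    bad.parse("K00134,K00150")  # only "K00134" is consumed, leaving ",K00150"
except IncompleteParseError as exc:
    print(exc)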
Now that we have a grammar, we can construct containers to represent its hierarchy. We'll want to be able to test against the items set by evaluating each container against its children.
At this point we can consider which of the operators need to be defined by separate containers. The and-operation "+" and the rule operator " " have the same boolean evaluation, so we can combine them into a single And class. The other containers are Or, Ignored, and Var, which represents the boolean variable codename.
We'll also benefit from a str repr to see what's going on in the case of errors, which we can use to dump out a normalized version of the expression.
from dataclasses import dataclass
from typing import List, Protocol


class Matcher(Protocol):
    def matches(self, variables: List[str]) -> bool:
        pass


@dataclass
class Var:
    value: str

    def __str__(self):
        return self.value

    def matches(self, variables: List[str]) -> bool:
        return self.value in variables


@dataclass
class Ignored:
    value: Matcher

    def __str__(self):
        return f"{self.value}?"

    def matches(self, variables: List[str]) -> bool:
        return True


@dataclass
class And:
    values: List[Matcher]

    def __str__(self):
        return "(" + "+".join(map(str, self.values)) + ")"

    def matches(self, variables: List[str]) -> bool:
        return all(v.matches(variables) for v in self.values)


@dataclass
class Or:
    values: List[Matcher]

    def __str__(self):
        return "(" + ",".join(map(str, self.values)) + ")"

    def matches(self, variables: List[str]) -> bool:
        return any(v.matches(variables) for v in self.values)
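As a quick sanity check of these containers, the question's edge-case rule can be built by hand before wiring up the parser:

# ((K00134,K00150) K00927,K11389), constructed manually:
expr = Or([And([Or([Var("K00134"), Var("K00150")]), Var("K00927")]), Var("K11389")])
print(expr)                                # (((K00134,K00150)+K00927),K11389)
print(expr.matches({"K00134", "K00927"}))  # True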
With a grammar and a set of containers we can now begin to unpack the statement. We can use the NodeVisitor class from parsimonious for this. Each definition in the grammar can be given its own handler method to use while unpacking values.
You'll want to add lots of test cases for this class, as the parsing logic used is highly dependent on the grammar being evaluated. I've found that if the grammar is logical, so will the decoder methods be on the NodeVisitor implementation.
from parsimonious.nodes import NodeVisitor


class Visitor(NodeVisitor):
    def visit_maybe_rule(self, node, visited_children):
        # If there are multiple rules, combine them.
        children, *_ = visited_children
        if isinstance(children, list):
            return And([children[0], *children[1]])
        return children

    def visit_rule(self, node, visited_children):
        # Strip out the " " rule operator child token
        return visited_children[1]

    def visit_maybe_or(self, node, visited_children):
        # If there are multiple or values, combine them.
        children, *_ = visited_children
        if isinstance(children, list):
            return Or([children[0], *children[1]])
        return children

    def visit_or(self, node, visited_children):
        # Strip out the "," or operator child token
        return visited_children[1]

    def visit_maybe_and(self, node, visited_children):
        # If there are multiple and values, combine them.
        children, *_ = visited_children
        if isinstance(children, list):
            return And([children[0], *children[1]])
        return children

    def visit_and(self, node, visited_children):
        # Strip out the operator child token, and
        # handle the case where we ignore values.
        if visited_children[0] == "-":
            return Ignored(visited_children[1])
        return visited_children[1]

    def visit_and_op(self, node, visited_children):
        # Get the text of the operator.
        return node.text

    def visit_expression(self, node, visited_children):
        # Expressions only have one item
        return visited_children[0]

    def visit_brace_expression(self, node, visited_children):
        # Strip out the "(" opening and ")" closing braces
        return visited_children[1]

    def visit_variable(self, node, visited_children):
        # Parse the variable name
        return Var(node.text)

    def generic_visit(self, node, visited_children):
        # Catch-all response.
        return visited_children or node
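A minimal sketch of the kind of test cases mentioned above (this assumes pytest; the parse_rule helper is illustrative, not part of the answer):

def parse_rule(text: str):
    # Convenience helper: parse a definition and decode it into containers.
    return Visitor().visit(grammar.parse(text))

def test_or_binds_looser_than_rule_break():
    assert str(parse_rule("(A1,B2 C3)")) == "(A1,(B2+C3))"

def test_ignored_token_always_matches():
    assert parse_rule("(A1-B2)").matches({"A1"}) is True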
Following your example set, we can see each output and how it is being evaluated quite clearly.
items = {
    'K00925', 'K00193', 'K00197', 'K00194', 'K00577', 'K00578', 'K00579', 'K00580', 'K00581',
    'K00582', 'K00584', 'K00399', 'K00401', 'K00402', 'K22480', 'K22481', 'K22482', 'K03388',
    'K03389', 'K03390', 'K08264', 'K08265', 'K14127', 'K14126', 'K22516',
}

definitions = [
    "((K00925 K00625),K01895)",
    "(K00193+K00197+K00194)",
    "(K00577+K00578+K00579+K00580+K00581-K00582-K00583+K00584)",
    "(K00399+K00401+K00402)",
    "(K22480+K22481+K22482,K03388+K03389+K03390,K08264+K08265,K03388+K03389+K03390+K14127+(K14126+K14128,K22516+K00125))",
    "((K00134,K00150) K00927,K11389)",
]

for definition in definitions:
    tree = grammar.parse(definition)
    val = Visitor().visit(tree)
    print(f"Test: {definition}")
    print(f"Parsed as: {val}")
    print(f"Result: {val.matches(items)}")
    print()
Which outputs as expected:
Test: ((K00925 K00625),K01895)
Parsed as: ((K00925+K00625),K01895)
Result: False

Test: (K00193+K00197+K00194)
Parsed as: (K00193+K00197+K00194)
Result: True

Test: (K00577+K00578+K00579+K00580+K00581-K00582-K00583+K00584)
Parsed as: (K00577+K00578+K00579+K00580+K00581+K00582?+K00583?+K00584)
Result: True

Test: (K00399+K00401+K00402)
Parsed as: (K00399+K00401+K00402)
Result: True

Test: (K22480+K22481+K22482,K03388+K03389+K03390,K08264+K08265,K03388+K03389+K03390+K14127+(K14126+K14128,K22516+K00125))
Parsed as: ((K22480+K22481+K22482),(K03388+K03389+K03390),(K08264+K08265),(K03388+K03389+K03390+K14127+((K14126+K14128),(K22516+K00125))))
Result: True

Test: ((K00134,K00150) K00927,K11389)
Parsed as: (((K00134,K00150)+K00927),K11389)
Result: False
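The last result is False only because the items set used above doesn't contain the edge case's tokens. With the query tokens from the question, the same parsed expression matches (illustrative check):

edge = Visitor().visit(grammar.parse("((K00134,K00150) K00927,K11389)"))
print(edge.matches({"K00134", "K00927"}))  # True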