pythonparsingbooleanboolean-logicevaluate

How to evaluate nested boolean/logical expressions in Python?


I'm working on a complex rule parser that has the following properties:

I'm able to do simple rules but having trouble evaluating complex rules in nested parenthesis.

Here's nested rule definition I'm trying to evaluate:

definition = '((K00925 K00625),K01895) (K00193+K00197+K00194) (K00577+K00578+K00579+K00580+K00581-K00582-K00583+K00584) (K00399+K00401+K00402) (K22480+K22481+K22482,K03388+K03389+K03390,K08264+K08265,K03388+K03389+K03390+K14127+(K14126+K14128,K22516+K00125))'

Here's the code I have that is almost correct:

import re

def rule_splitter(rule: str, split_characters: set = {"+", "-", ",", "(", ")", " "}) -> set:
    """
    Split rule by characters.

    Args:
        rule (str): Boolean logical string.
        split_characters (list): List of characters to split in rule.

    Returns:
        set: Unique tokens in a rule.
    """
    rule_decomposed = str(rule)
    if split_characters:
        for character in split_characters:
            character = character.strip()
            if character:
                rule_decomposed = rule_decomposed.replace(character, " ")
    unique_tokens = set(filter(bool, rule_decomposed.split()))
    return unique_tokens

def find_rules(definition: str) -> list:
    """
    Find and extract rules from the definition string.

    Args:
        definition (str): Complex boolean logical string with multiple rules.

    Returns:
        list: List of extracted rules as strings.
    """
    rules = []
    stack = []
    current_rule = ""
    outside_rule = ""

    for char in definition:
        if char == '(':
            if stack:
                current_rule += char
            if outside_rule.strip():
                rules.append(outside_rule.strip())
                outside_rule = ""
            stack.append(char)
        elif char == ')':
            stack.pop()
            if stack:
                current_rule += char
            else:
                current_rule = f"({current_rule.strip()})"
                rules.append(current_rule)
                current_rule = ""
        else:
            if stack:
                current_rule += char
            else:
                outside_rule += char

    # Add any remaining outside_rule at the end of the loop
    if outside_rule.strip():
        rules.append(outside_rule.strip())

    return rules

def evaluate_rule(rule: str, tokens: set, replace={"+": " and ", ",": " or "}) -> bool:
    """
    Evaluate a string of boolean logicals.

    Args:
        rule (str): Boolean logical string.
        tokens (set): List of tokens in rule.
        replace (dict, optional): Replace boolean characters. Defaults to {"+":" and ", "," : " or "}.

    Returns:
        bool: Evaluated rule.
    """
    # Handle optional tokens prefixed by '-'
    rule = re.sub(r'-\w+', '', rule)

    # Replace characters for standard logical formatting
    if replace:
        for character_before, character_after in replace.items():
            rule = rule.replace(character_before, character_after)

    # Split the rule into individual symbols
    unique_symbols = rule_splitter(rule, replace.values())

    # Create a dictionary with the presence of each symbol in the tokens
    token_to_bool = {sym: (sym in tokens) for sym in unique_symbols}

    # Parse and evaluate the rule using a recursive descent parser
    def parse_expression(expression: str) -> bool:
        expression = expression.strip()
        
        # Handle nested expressions
        if expression.startswith('(') and expression.endswith(')'):
            return parse_expression(expression[1:-1])

        # Evaluate 'OR' conditions
        if ' or ' in expression:
            parts = expression.split(' or ')
            return any(parse_expression(part) for part in parts)
        
        # Evaluate 'AND' conditions
        elif ' and ' in expression:
            parts = expression.split(' and ')
            return all(parse_expression(part) for part in parts)
        
        # Evaluate individual token presence
        else:
            return token_to_bool.get(expression.strip(), False)

    return parse_expression(rule)

def evaluate_definition(definition: str, tokens: set) -> dict:
    """
    Evaluate a complex definition string involving multiple rules.

    Args:
        definition (str): Complex boolean logical string with multiple rules.
        tokens (set): Set of tokens to check against the rules.

    Returns:
        dict: Dictionary with each rule and its evaluated result.
    """
    # Extract individual rules from the definition
    rules = find_rules(definition)
    
    # Evaluate each rule
    rule_results = {}
    for rule in rules:
        try:
            cleaned_rule = rule[1:-1] if rule.startswith('(') and rule.endswith(')') else rule  # Remove outer parentheses if they exist
            result = evaluate_rule(cleaned_rule, tokens)
        except SyntaxError:
            # Handle syntax errors from eval() due to incorrect formatting
            result = False
        rule_results[rule] = result
    
    return rule_results

# Example usage
definition = '((K00925 K00625),K01895) (K00193+K00197+K00194) (K00577+K00578+K00579+K00580+K00581-K00582-K00583+K00584) (K00399+K00401+K00402) (K22480+K22481+K22482,K03388+K03389+K03390,K08264+K08265,K03388+K03389+K03390+K14127+(K14126+K14128,K22516+K00125))'
tokens = {
    'K00925',
    'K00625',
    # 'K01895',
    'K00193',
    'K00197',
    'K00194',
    'K00577',
    'K00578',
    'K00579',
    'K00580',
    'K00581',
    # 'K00582',
    'K00584',
    'K00399',
    'K00401',
    'K00402',
    'K22480',
    # 'K22481',
    # 'K22482',
    'K03388',
    'K03389',
    'K03390',
    # 'K08264',
    # 'K08265',
    'K14127',
    'K14126',
    'K14128',
    'K22516',
    # 'K00125'
}

result = evaluate_definition(definition, tokens)
# result
# {'((K00925 K00625),K01895)': False,
#  '(K00193+K00197+K00194)': True,
#  '(K00577+K00578+K00579+K00580+K00581-K00582-K00583+K00584)': True,
#  '(K00399+K00401+K00402)': True,
#  '(K22480+K22481+K22482,K03388+K03389+K03390,K08264+K08265,K03388+K03389+K03390+K14127+(K14126+K14128,K22516+K00125))': True}

Note that the implementation is splitting the first rule.

Here is the following output I'm expecting:

{
    '(K00925 K00625),K01895)':True, # Note this is True because of `K00925` and `K00625` together (b/c the parenthesis) OR K01895 being present
    '(K00193+K00197+K00194)':True,
    '(K00577+K00578+K00579+K00580+K00581-K00582-K00583+K00584)':True, # This is missing optional tokens
    '(K00399+K00401+K00402)':True,
    '(K22480+K22481+K22482,K03388+K03389+K03390,K08264+K08265,K03388+K03389+K03390+K14127+(K14126+K14128,K22516+K00125)':True, # This combination allows for {K03388, K03389, K03390, K14127} +{K14126+K14128}
}

Here's a graphical representation of the information flow for this complex rule definition:

enter image description here

It should also be able to handle this rule:

rule_edge_case='((K00134,K00150) K00927,K11389)' with the following query tokens: tokens = { 'K00134', 'K00927'}. This should be True because either (K00134 OR K00150) with K00927 are present which is sufficient.

Edit 1: I had an error before and the last rule was actually True and not False.

Edit 2: I've changed "items" to "tokens" and modified which tokens are used for evaluation to capture better edge cases.


Solution

  • A great tool for this job would be a proper parser for expression grammars. I'm using parsimonious for this answer, which allows you to define a BNF or eBNF like syntax for your grammar, to assist with decoding your DSL.


    edit I updated the grammar to check the or "," operator before checking for the rule break operator " ". This also means you check for the maybe_or in the brace_expression definition, rather than for the rules definition.


    Defining the grammar

    Given the rules, you've defined a few operators with different precedence.

    1. Break all values by the or operator ","
    2. Break all top level items by the rules " "
    3. Break all values by the and operator "+" or ignore operator "-"
      • "+" and "-" have the same precedence
    4. Values may be defined in braces "(...)"
      • These are to be recursively parsed by referencing the root definition.

    Given these rules, here's how you can define a grammar.

    from parsimonious.grammar import Grammar
    
    
    grammar = Grammar(
        """
        maybe_or = (maybe_rule or+) / maybe_rule
        or = "," maybe_rule
    
        maybe_rule = (maybe_and rule+) / maybe_and
        rule = " " maybe_and
    
        maybe_and = (expression and+) / expression
        and = and_op expression
        and_op = "+" / "-"
    
        expression = brace_expression / variable
        brace_expression = "(" maybe_or ")"
    
        variable = ~r"[A-Z0-9]+"
        """
    )
    

    In this example, I've made explicit steps that allow following the operator precedence in the grammar. Depending on your descent parser, you may wish to order differently. In the case of parsimonious, items are greedily captured, meaning we want to evaluate the long form of the definition first.

    Defining containers

    Now that we have a grammar, we can construct containers to represent its hierarchy. We'll want to be able to test against the items set by evaluating each container against its children.

    At this point we can consider which of the operators need to be defined by separate containers. The and-operation "+" and rules operator " " have the same boolean evaluation, so we can combine them to a single And class. Other containers would be Or, Ignored and Var to represent the bool var codename.

    We'll also benefit from a str repr to see whats going on in the case of errors, which we can use to dump out a normalized version of the expression.

    from dataclasses import dataclass
    from typing import List, Protocol
    
    
    class Matcher(Protocol):
        def matches(self, variables: List[str]) -> bool:
            pass
    
    @dataclass
    class Var:
        value: str
    
        def __str__(self):
            return self.value
    
        def matches(self, variables: List[str]) -> bool:
            return self.value in variables
    
    @dataclass
    class Ignored:
        value: Matcher
    
        def __str__(self):
            return f"{self.value}?"
    
        def matches(self, variables: List[str]) -> bool:
            return True
    
    @dataclass
    class And:
        values: List[Matcher]
    
        def __str__(self):
            return "(" + "+".join(map(str, self.values))  + ")"
    
        def matches(self, variables: List[str]) -> bool:
            return all(v.matches(variables) for v in self.values)
    
    @dataclass
    class Or:
        values: List[Matcher]
    
        def __str__(self):
            return "(" + ",".join(map(str, self.values)) + ")"
    
        def matches(self, variables: List[str]) -> bool:
            return any(v.matches(variables) for v in self.values)
    

    Parsing the grammar

    With a grammar and a set of containers we can now begin to unpack the statement. We can use the NodeVistor class from parsimonious for this. Each definition in the grammar can be given its own handler method to use while unpacking values.

    You'll want to add lots of test cases for this class, as the parsing logic used is highly dependent on the grammar being evaluated. I've found if the grammar is logical, so will the decoder methods be on the NodeVistor implementation.

    from parsimonious.nodes import NodeVisitor
    
    
    class Visitor(NodeVisitor):
        def visit_maybe_rule(self, node, visited_children):
            # If there are multiple rules, combine them.
    
            children, *_ = visited_children
            if isinstance(children, list):
                return And([children[0], *children[1]])
            return children
    
        def visit_rule(self, node, visited_children):
            # Strip out the " " rule operator child token
    
            return visited_children[1]
    
        def visit_maybe_or(self, node, visited_children):
            # If there are multiple or values, combine them.
    
            children, *_ = visited_children
            if isinstance(children, list):
                return Or([children[0], *children[1]])
            return children
    
        def visit_or(self, node, visited_children):
            # Strip out the "," or operator child token
    
            return visited_children[1]
    
        def visit_maybe_and(self, node, visited_children):
            # If there are multiple and values, combine them.
    
            children, *_ = visited_children
            if isinstance(children, list):
                return And([children[0], *children[1]])
            return children
    
        def visit_and(self, node, visited_children):
            # Strip out the operator child token, and
            # handle the case where we ignore values.
    
            if visited_children[0] == "-":
                return Ignored(visited_children[1])
            return visited_children[1]
    
        def visit_and_op(self, node, visited_children):
            # get the text of the operator.
    
            return node.text
    
        def visit_expression(self, node, visited_children):
            # expressions only have one item
    
            return visited_children[0]
    
        def visit_brace_expression(self, node, visited_children):
            # Strip out the "(" opening and ")" closing braces
    
            return visited_children[1]
    
        def visit_variable(self, node, visited_children):
            # Parse the variable name
    
            return Var(node.text)
    
        def generic_visit(self, node, visited_children):
            # Catchall response.
    
            return visited_children or node
    

    Testing

    Following your example set, we can see each output and how it is being evaluated quite clearly.

    items = {
        'K00925', 'K00193', 'K00197', 'K00194', 'K00577', 'K00578', 'K00579', 'K00580', 'K00581',
        'K00582', 'K00584', 'K00399', 'K00401', 'K00402', 'K22480', 'K22481', 'K22482', 'K03388',
        'K03389', 'K03390', 'K08264', 'K08265', 'K14127', 'K14126', 'K22516',
    }
    
    definitions = [
        "((K00925 K00625),K01895)",
        "(K00193+K00197+K00194)",
        "(K00577+K00578+K00579+K00580+K00581-K00582-K00583+K00584)",
        "(K00399+K00401+K00402)",
        "(K22480+K22481+K22482,K03388+K03389+K03390,K08264+K08265,K03388+K03389+K03390+K14127+(K14126+K14128,K22516+K00125))",
        "((K00134,K00150) K00927,K11389)",
    ]
    
    for definition in definitions:
        tree = grammar.parse(definition)
        val = Visitor().visit(tree)
        print(f"Test: {definition}")
        print(f"Parsed as: {val}")
        print(f"Result: {val.matches(items)}")
        print()
    

    Which outputs as expected:

    Test: ((K00925 K00625),K01895)
    Parsed as: ((K00925+K00625),K01895)
    Result: False
    
    Test: (K00193+K00197+K00194)
    Parsed as: (K00193+K00197+K00194)
    Result: True
    
    Test: (K00577+K00578+K00579+K00580 K00581-K00582-K00583+K00584)
    Parsed as: ((K00577+K00578+K00579+K00580)+(K00581+K00582?+K00583?+K00584))
    Result: True
    
    Test: (K00399+K00401+K00402)
    Parsed as: (K00399+K00401+K00402)
    Result: True
    
    Test: (K22480+K22481+K22482,K03388+K03389+K03390,K08264+K08265,K03388+K03389+K03390+K14127+(K14126+K14128,K22516+K00125))
    Parsed as: ((K22480+K22481+K22482),(K03388+K03389+K03390),(K08264+K08265),(K03388+K03389+K03390+K14127+((K14126+K14128),(K22516+K00125))))
    Result: True
    
    Test: ((K00134,K00150) K00927,K11389)
    Parsed as: (((K00134,K00150)+K00927),K11389)
    Result: False