I am working on a solution that converts a mathematical instruction given as a string into a mathematical formula. One column contains the string instruction and a second column should contain the corresponding formula, e.g. div(mul(mstr,mul(div(baseline_year,transaction_yr),spnd_val)),1000) --> ((mstr*((baseline_year/transaction_yr)*spnd_val))/1000)
To achieve the above output I am using the Python code below.
import re
import pandas as pd
def get_operations(calculation):
    """Return the operation names that are actually called in *calculation*.

    Args:
        calculation: Prefix-notation text such as ``"div(mul(a,b),c)"``.

    Returns:
        A list with one entry (``"add"``, ``"mul"``, ``"div"`` or ``"sub"``)
        per call, in order of appearance, duplicates included.
    """
    # The lookahead demands an opening parenthesis right after the name so an
    # operand that merely shares a name with an operation (e.g. a column
    # called ``sub``) is not reported as a call.
    return re.findall(r'\b(add|mul|div|sub)(?=\()', calculation)
def sort_operations(operations):
    """Return a new list of operation names ordered mul, div, add, sub.

    The sort is stable, so equal names keep their relative order. A name
    outside the four known operations raises ``KeyError``.
    """
    rank = dict(mul=1, div=2, add=3, sub=4)
    return sorted(operations, key=rank.__getitem__)
# function to generate the column expression and update the new column name
# Infix symbol emitted for each supported prefix operation.
_OPERATOR_SYMBOLS = {'mul': '*', 'div': '/', 'add': '+', 'sub': '-'}

# Start of a call such as ``mul(``. The word boundary keeps longer names that
# merely end in an operation name (``xmul(``) from being treated as calls.
_CALL_START = re.compile(r'\b(mul|div|add|sub)\(')


def _find_closing_paren(text, start):
    """Return the index of the ``)`` closing a ``(`` located just before *start*, or -1."""
    depth = 1
    for index in range(start, len(text)):
        char = text[index]
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
            if depth == 0:
                return index
    return -1


def _split_top_level_args(body):
    """Split *body* on the commas that are not nested inside parentheses."""
    parts = []
    depth = 0
    start = 0
    for index, char in enumerate(body):
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
        elif char == ',' and depth == 0:
            parts.append(body[start:index])
            start = index + 1
    parts.append(body[start:])
    return parts


def replace_operations(expression):
    """Convert prefix calls (``mul``, ``div``, ``add``, ``sub``) to infix form.

    Every call ``op(x1, x2, ..., xn)`` becomes ``(x1 <sym> x2 <sym> ... xn)``
    with no spaces around the symbol. Arguments are located by counting
    parentheses, so calls may be nested to any depth and may take any number
    of arguments. Because each call is wrapped in its own parentheses, the
    evaluation order of the input is preserved without precedence sorting.

    Args:
        expression: Text containing zero or more prefix calls.

    Returns:
        The text with every well-formed call rewritten. Text outside calls is
        copied through unchanged; whitespace around arguments is stripped.
        A call whose parentheses never close is left as written, but calls
        nested inside it are still rewritten.
    """
    pieces = []
    position = 0
    while True:
        call = _CALL_START.search(expression, position)
        if call is None:
            pieces.append(expression[position:])
            break
        closing = _find_closing_paren(expression, call.end())
        if closing == -1:
            # Unbalanced call: emit its name and "(" verbatim, then keep
            # scanning so that well-formed calls further on are still handled.
            pieces.append(expression[position:call.end()])
            position = call.end()
            continue
        pieces.append(expression[position:call.start()])
        symbol = _OPERATOR_SYMBOLS[call.group(1)]
        arguments = _split_top_level_args(expression[call.end():closing])
        # Recurse per argument so nested calls are resolved inside-out.
        operands = (replace_operations(argument.strip()) for argument in arguments)
        pieces.append(f"({symbol.join(operands)})")
        position = closing + 1
    return ''.join(pieces)
# Build a one-column sample frame holding a prefix-notation instruction.
data = dict(
    expression=[
        'div(mul(mul(div(baseline_year,transaction_year),mstr),spnd_val),1000)',
    ],
)
df = pd.DataFrame(data)
# Run replace_operations on every row and store the result in a new column.
df = df.assign(updated_expression=df['expression'].apply(replace_operations))
# NOTE(review): display is not imported in this snippet - presumably the
# notebook built-in; confirm before running as a plain script.
display(df)
When I try to generalize it to more than two arguments I get the wrong output, and sometimes it does not follow the BODMAS rule.
Could you please look into this?
I have tried writing the code in Python. It works fine when a string instruction is passed two arguments, but when I pass more than two arguments I get the wrong output.
I propose doing it recursively. The function parse_args takes the inner string of a call, e.g. the text inside mul(...), and splits it into its arguments.
The function parse_expression matches the operation, obtains its arguments from parse_args,
calls itself on each argument, and formats the resulting string.
Solution:
import re
# Matches a string that is, from start to end, "<operation>(<args>)" and
# exposes the operation name and the raw text between the outer parentheses.
operation_q = re.compile(r"^(?P<operation>(mul|div|add|sub))\((?P<args>.*)\)$")

# Infix symbol for each operation name.
mapping = dict(mul="*", div="/", add="+", sub="-")
def parse_args(args_string):
    """Split an argument string into its top-level, comma-separated arguments.

    Only commas outside every pair of parentheses act as separators, so a
    nested call such as ``div(a,b)`` stays in one piece.

    Args:
        args_string: The text between the outer parentheses of a call,
            e.g. ``"div(a,b),mstr"``.

    Returns:
        A list of all arguments with surrounding whitespace removed. An
        empty string gives ``[]``; a blank final segment (trailing comma) is
        dropped.
    """
    arguments = []
    depth = 0
    start = 0
    # A single left-to-right pass: no assumption is made about which
    # character follows a closing parenthesis, and no recursion is needed
    # however many arguments there are.
    for index, character in enumerate(args_string):
        if character == "(":
            depth += 1
        elif character == ")":
            depth -= 1
        elif character == "," and depth == 0:
            arguments.append(args_string[start:index].strip())
            start = index + 1
    last = args_string[start:].strip()
    if last:
        arguments.append(last)
    return arguments
def parse_expression(exp):
    """Rewrite a prefix operation string as a parenthesised infix string.

    If *exp* does not match ``operation_q`` it is returned unchanged.
    Otherwise its arguments are obtained from ``parse_args``, each one is
    converted by a recursive call, and the results are joined with the
    operation's symbol from ``mapping`` (one space on each side) inside a
    pair of parentheses.
    """
    found = operation_q.match(exp)
    if found is None:
        # Not an operation call: nothing to rewrite.
        return exp
    symbol = mapping[found.group("operation")]
    converted = (parse_expression(part) for part in parse_args(found.group("args")))
    return "(" + f" {symbol} ".join(converted) + ")"
# Demonstration: one converted expression per output line.
print(
    parse_expression("div(mul(mul(div(baseline_year,transaction_year),mstr),spnd_val),1000)"),
    # >> ((((baseline_year / transaction_year) * mstr) * spnd_val) / 1000)
    parse_expression("add(sub(o,p,q),div(p,q,r),mul(c,d,e))"),
    # >> ((o - p - q) + (p / q / r) + (c * d * e))
    sep="\n",
)
It does not do anything with the order of the operations though. That would be another exercise.