Tags: python, parsing, ply, lark-parser

Is there a Python parsing library that can parse a TOML-like format that specifies nested fields with [ParentHeader_ChildSection]?


I want to parse an externally defined (and undocumented) file format in Python. It looks somewhat similar to TOML, but with different text styles, and no quoting. For example:

[Schedule_Step122]
m_nMaxCurrent=0
m_szAddIn=Relay OFF
m_szLabel=06 - End Charge
m_uLimitNum=2

[Schedule_Step122_Limit0]
Equation0_szCompareSign=>=
Equation0_szRight=F_05_Charge_Capacity
Equation0_szLeft=PV_CHAN_Charge_Capacity
m_bStepLimit=1
m_szGotoStep=End Test


[Schedule_Step122_Limit1]
Equation0_szCompareSign=>=
Equation0_szLeft=PV_CHAN_Voltage
Equation0_szRight=3
m_bStepLimit=1
m_szGotoStep=End Test

(This is Arbin's test schedule format.)

I would like the parsed structure to be something like:

"steps": [
  {
    "max_current": 0,
    "add_in": RELAY_OFF,
    "label": "06 - End Charge",
    "limits": [
      {
        "equations": [
          {
            "left": PV_CHAN_CHARGE_CAPACITY,
            "compare_sign": ">=",
            "right": F_05_CHARGE_CAPACITY
          }
        ],
        "step_limit": 1,
        "goto_step": END_TEST
      },
      {
        "equations": [
          {
            "left": PV_CHAN_VOLTAGE,
            "compare_sign": ">=",
            "right": 3
          }
        ],
        "step_limit": 1,
        "goto_step": END_TEST
      }
    ]
  }
]

The format seems superficially similar to TOML, including some of the nesting, but the string handling is different. I would also like to capture certain values as named constants.

I was also looking into defining a context-free grammar and using a lexer/parser like ANTLR, PLY, pyparsing, or Lark. I'm familiar with reading grammars in documentation, but haven't written or used one with a parser before. However, I don't know how one would represent the nesting structure (such as Schedule_Step122_Limit0 being a member of Schedule_Step122) or the lack of guaranteed order among related keys (like Equation0_szCompareSign, Equation0_szLeft, etc.).

Is there a generic parsing tool I could write a definition for, which would give me the parsed/structured output? Or is the best approach here to write custom parsing logic?


Solution

  • Tools like ANTLR, PLY, pyparsing, or Lark will give you almost no help with this problem. configparser might help a little, but I suspect it'd be more bother than it's worth.

    The following code is close to what you want. You'll need to tweak it based on what you discover about the input format and what you'd like for the output structure.

    import re, json
    
    def main():
        """Parse 'input.txt' and print the resulting structure as indented JSON."""
        parsed = parse('input.txt')
        rendered = json.dumps(parsed, indent=2)
        print(rendered)
    
    def parse(filename):
        """Parse a bracket-headed, name=value text file into nested dicts.

        Each ``[Header_Path]`` line selects (creating it as needed) the object
        that the following ``name=value`` lines populate; the path is split on
        ``_`` and resolved with ``get_nested_object``.  In a ``name=value``
        line the name is split on ``_`` as well: a sole ``m`` prefix means the
        field belongs directly to the selected object, any other prefix is
        resolved as a nested path below it.  The last piece of the name must
        be a lowercase type indicator followed by a Pascal-cased field name
        (e.g. ``szLabel``); the indicator is discarded and the remainder is
        snake-cased.  Values are stored as the raw string after the first
        ``=``; trailing whitespace is stripped from every line.

        Args:
            filename: Path of the file to read.

        Returns:
            The root dict holding everything that was parsed.

        Raises:
            AssertionError: If a line is neither blank, a header, nor a
                well-formed name=value line, or if a name=value line appears
                before any header.  The offending line is the message.
        """
        root_object = {}
        current_object = None
        # The context manager closes the file even if a line fails to parse.
        with open(filename) as lines:
            for line in lines:
                # trim trailing whitespace (including the newline):
                line = line.rstrip()

                if line == '':
                    # blank line
                    continue

                if mo := re.fullmatch(r'\[(\w+)\]', line):
                    # header line
                    # This identifies, via a 'path' from the root object,
                    # the object that subsequent name-value lines are
                    # talking about.
                    header_pieces = mo.group(1).split('_')
                    current_object = get_nested_object(root_object, header_pieces)

                elif mo := re.fullmatch(r'([^=]+)=(.*)', line):
                    # name-value line
                    (name_part, value_str) = mo.groups()
                    if current_object is None:
                        # No header seen yet, so there is nothing to attach
                        # the field to; report the line rather than failing
                        # later with an opaque TypeError.
                        raise AssertionError(line)

                    # The {name_part} identifies a field in {current_object}
                    # or some object nested within {current_object}.
                    name_pieces = name_part.split('_')
                    prefix_pieces = name_pieces[:-1]
                    field_name_piece = name_pieces[-1]

                    if prefix_pieces == ['m']:
                        # This is an 'immediate' field of {current_object}
                        obj_w_field = current_object
                    else:
                        # This is a field of some object nested within
                        # {current_object}
                        obj_w_field = get_nested_object(current_object, prefix_pieces)

                    name_mo = re.fullmatch(r'([a-z]+)([A-Z][a-zA-Z]*)', field_name_piece)
                    if name_mo is None:
                        # Field name lacks the expected type-indicator prefix.
                        raise AssertionError(line)
                    # group(1) is the type indicator, which is not used.
                    field_name_pc = name_mo.group(2)

                    obj_w_field[to_snake_case(field_name_pc)] = value_str

                else:
                    # Raised explicitly (not via `assert`) so that malformed
                    # lines are still rejected when Python runs with -O.
                    raise AssertionError(line)
        return root_object
    
    def get_nested_object(base_object, header_pieces):
        """Return the object reached from base_object by following header_pieces.

        Missing objects along the way are created.  A purely alphabetic piece
        (e.g. "Schedule") names a field whose value is an object.  A piece of
        letters followed by digits (e.g. "Step122") names an element of a
        collection stored under the snake-cased letters plus 's' ("steps"),
        keyed by the integer index.  The collection is a dict, not a list.
        An empty list of pieces yields base_object itself.
        """
        if header_pieces == []:
            return base_object

        tail = header_pieces[-1]
        # Resolve everything before the final piece first.
        parent = get_nested_object(base_object, header_pieces[:-1])

        if re.fullmatch(r'[A-Za-z]+', tail):
            # Plain field: create an empty object on first sight.
            return parent.setdefault(to_snake_case(tail), {})

        indexed = re.fullmatch(r'([A-Za-z]+)(\d+)', tail)
        if indexed:
            stem, digits = indexed.groups()
            collection_key = to_snake_case(stem) + 's'
            position = int(digits)
            # In practice, you might want to make this a list.
            elements = parent.setdefault(collection_key, {})
            return elements.setdefault(position, {})

        assert 0, tail
    
    # "_pc" suffix denotes a Pascal-cased name, e.g. "MaxCurrent"
    
    def to_snake_case(name_pc):
        """Convert a Pascal-cased name such as "MaxCurrent" to "max_current".

        Every ASCII capital letter is lowercased and, unless it is the first
        character, preceded by an underscore.  The input must not already
        contain an underscore.
        """
        assert '_' not in name_pc
        converted = []
        for position, char in enumerate(name_pc):
            if 'A' <= char <= 'Z':
                if position != 0:
                    converted.append('_')
                converted.append(char.lower())
            else:
                converted.append(char)
        return ''.join(converted)
    
    # Script entry point: runs unconditionally (there is no __main__ guard),
    # so importing this file also parses 'input.txt' and prints the result.
    main()
    

    For the example input, it prints:

    {
      "schedule": {
        "steps": {
          "122": {
            "max_current": "0",
            "add_in": "Relay OFF",
            "label": "06 - End Charge",
            "limit_num": "2",
            "limits": {
              "0": {
                "equations": {
                  "0": {
                    "compare_sign": ">=",
                    "right": "F_05_Charge_Capacity",
                    "left": "PV_CHAN_Charge_Capacity"
                  }
                },
                "step_limit": "1",
                "goto_step": "End Test"
              },
              "1": {
                "equations": {
                  "0": {
                    "compare_sign": ">=",
                    "left": "PV_CHAN_Voltage",
                    "right": "3"
                  }
                },
                "step_limit": "1",
                "goto_step": "End Test"
              }
            }
          }
        }
      }
    }