pythonjsonregexlambdasyslog

Converting syslog to Json - expecting as object


I have a python lambda code to convert syslog records to Json. when deployed it getting unexpected errors.

from __future__ import print_function

import base64
import json
import gzip
import re

print('Loading function')


def lambda_handler(event, context):
    output = []
    succeeded_record_cnt = 0
    failed_record_cnt = 0

    for record in event['records']:
        print(record['recordId'])
        payload = base64.b64decode(record['data'])

        regex_string = (r"^((?:\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?"
                        r"|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b\s+(?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9])\s+"
                        r"(?:(?:2[0123]|[01]?[0-9]):(?:[0-5][0-9]):(?:(?:[0-5]?[0-9]|60)(?:[:\.,][0-9]+)?)))) (?:<(?:[0-9]+).(?:[0-9]+)> )"
                        r"?((?:[a-zA-Z0-9._-]+)) ([\w\._/%-]+)(?:\[((?:[1-9][0-9]*))\])?: (.*)")
        p = re.compile(regex_string)
        m = p.match(payload)
        if m:
            succeeded_record_cnt += 1
            data_field = {
                'timestamp': m.group(1),
                'hostname': m.group(2),
                'program': m.group(3),
                'processid': m.group(4),
                'message': m.group(5)
            }
            output_record = {
                'recordId': record['recordId'],
                'result': 'Ok',
                'data': base64.b64encode(json.dumps(data_field))
            }
        else:
            print('Parsing failed')
            failed_record_cnt += 1
            output_record = {
                'recordId': record['recordId'],
                'result': 'ProcessingFailed',
                'data': record['data']
            }

        output.append(output_record)

    print('Processing completed.  Successful records {}, Failed records {}.'.format(succeeded_record_cnt, failed_record_cnt))
    return {'records': output}

When I deploy this I am getting error like, it is expecting data as an object and I decoded that records and deployed it again but getting similar errors:

[ERROR] TypeError: cannot use a string pattern on a bytes-like object
Traceback (most recent call last):
  File "/var/task/ec2_logs_parquet.py", line 24, in lambda_handler
    m = p.match(payload)

I tried to fix that using below patches to decode it and created seperate variable to pass the data but it is not working.

    payload = base64.b64decode(record['data'])
    payload_str = payload.decode('utf-8')

    p = re.compile(regex_string)
    m = p.match(payload_str)

Still getting error. Did I miss anything here ?


Solution

  • This is a much simpler way to parse syslog records.

    def convert_log( row ):
        time = row[:15]
        host,prog,message = row[16:].split(maxsplit=2)
        if '[' in prog:
            prog,process = prog.split('[')
            process = process[:-2]
        else:
            proc = prog[:-1]
            process = ''
    
        data_field = {
            'timestamp': time,
            'hostname': host,
            'program': prog,
            'processid': process,
            'message': message
        }
        return data_field
    
    for line in open('/var/log/syslog'):
        print( convert_log( line ))