perlparse-recdescent

Parse::RecDescent performance issue


I'm using Parse::RecDescent to parse lines in a Cisco IOS ACL. The ACL is used on the edge router of a large network, so it contains almost 8k lines which are set by the government. I'm looping through each of those lines and putting the values into a hash. Although it is 8k lines I'm still spending over 14 seconds parsing the lines? Does this sound reasonable? It seems VERY slow to me. Is there some overhead to using a hash verse another data structure?

Sample Input: (about 8k of these or similar)

deny   ip 2.3.4.5 0.0.0.7 any log-input
deny   ip 5.6.7.8 0.0.0.255 any log-input
deny   ip host 9.10.11.12 any log-input
deny   ip 13.14.15.16 0.0.31.255 any log-input
permit tcp host 17.18.19.20 host 21.22.23.24 eq bgp
permit icmp 25.26.0.0 0.0.255.255 27.28.0.0 0.0.255.255

Here is my entire parser:

package AccessList::Parser;

use strict;
use warnings;
use Carp;
use Scalar::Util 'blessed';
use Parse::RecDescent;

our $VERSION = '0.05';

sub new {
    my ($class) = @_;
    my $self = { PARSER => undef, };
    bless $self, $class;
    $self->_init();
    return $self;
}

sub _init {
    my ($self) = @_;
    $self->{PARSER} = Parse::RecDescent->new( $self->_grammar() );
}

sub parse {
    my ( $self, $string ) = @_;
    defined ($string) or confess "blank line received";
    my $tree = $self->{PARSER}->startrule($string);
    defined($tree) or confess "unrecognized line\n";
    return visit($tree);
}

#
# Finished tests
#

sub visit {
    my ($node) = @_;

    my $Rule_To_Key_Map = {
        "acl_action"              => 1,
        "acl_protocol"            => 1,
        "acl_src_ip"              => 1,
        "acl_src_port"            => 1,
        "acl_dst_ip"              => 1,
        "acl_dst_port"            => 1,
        "acl_remark"              => 1
    };

    my $parent_key;
    my $result;

    # set s of explored vertices
    my %seen;

    #stack is all neighbors of s
    my @stack;
    push @stack, [ $node, $parent_key ];

    my $key;

    while (@stack) {

        my $rec = pop @stack;

        $node       = $rec->[0];
        $parent_key = $rec->[1];    #undef for root

        next if ( $seen{$node}++ );

        my $rule_id = ref($node);

        if ( exists( $Rule_To_Key_Map->{$rule_id} ) ) {
            $parent_key = $rule_id;
        }

        foreach my $key ( keys %$node ) {
            next if ( $key eq "EOL" );
            my $next = $node->{$key};
            if ( blessed($next) ) {
                if ( exists( $next->{__VALUE__} ) ) {
                    #print ref($node), " ", ref($next), " ", $next->{__VALUE__},"\n";
                    my $rule  = ref($node);
                    my $token = $next->{__VALUE__};
                    $result->{$parent_key} = $token;
                    #print $rule, " ", $result->{$rule}, "\n";
                }
                push @stack, [ $next, $parent_key ];
                #push @stack, $next;
            }
        }
    }
    return $result;
}

sub _grammar {
    my ($self) = @_;

    my $grammar = q{
<autotree>

startrule :
        access_list EOL
    |   acl_remark EOL
    |   <error>

#
# access-lists
#

access_list : acl_action

acl_remark :
        "remark" REMARKS

acl_action :
        ACTIONS acl_protocol

#
# protocol options
#

acl_protocol :
        PROTOCOL acl_src_ip

#
# access-list source IP addresses
#

acl_src_ip :
        address acl_dst_ip
    |   address acl_src_port

#
# access-list source ports
#

acl_src_port : 
        port acl_dst_ip

#
# access-list destination IP address
#

acl_dst_ip :
        address acl_dst_port
    |   address acl_options
    | address CONNECTION_TYPE
    | address LAYER3_OPTIONS
    | IPRANGE

#
# access-list destination ports
#

acl_dst_port : 
        port acl_options
    |   acl_icmp_type acl_options

#
# icmp_types
#

acl_icmp_type :
       ICMP_TYPE

#
# access-list options
#

acl_options :
      acl_logging LAYER3_OPTIONS
    |   acl_logging
    |   EOL
    |   <error>

acl_logging :
            "log-input"
    |       "log"

#
# IP address types
#
# "object" should be fine here because "object" can not  
# be used to specify ports 

address :
        "host" IPADDRESS
    |   "host" NAME
    |   IPNETWORK
    | WILDCARD_NETWORK
    |   ANY


#
# port types
#

port :
        port_eq
    |   port_range
    |   port_gt
    |   port_lt
    |   port_neq

port_eq :
    "eq" PORT_ID

port_range :
    "range" PORT_RANGE

port_gt :
    "gt" PORT_GT

port_lt :
    "lt" PORT_LT

port_neq :
    "neq" <error: neq is unsupported>

#
# Token Definitions
#

STRING :
        /\S+/

DIGIT :
        /\d+/

NAME :
        /((^|\s[a-zA-Z])(\.|[0-9a-zA-Z_-]+)+)/

RULE_REF :
        /\S+/

ANY:
        "any"

IPADDRESS :
        /((\d{1,3})((\.)(\d{1,3})){3})/

MASK :
        /(((255\.){3}(255|254|252|248|240|224|192|128|0+))|((255\.){2}(255|254|252|248|240|224|192|128|0+)\.0)|((255\.)(255|254|252|248|240|224|192|128|0+)(\.0+){2})|((255|254|252|248|240|224|192|128|0+)(\.0+){3}))/

INVERSE_MASK :
        /(0+|1|3|7|15|31|63|127|255)((\.)(255|127|63|31|15|7|3|1|0)){3}/

WILDCARD_NETWORK :
        /((\d{1,3})((\.)(\d{1,3})){3}) (0+|1|3|7|15|31|63|127|255)((\.)(255|127|63|31|15|7|3|1|0)){3}/

IPNETWORK :
        /((\d{1,3})((\.)(\d{1,3})){3}) (((255\.){3}(255|254|252|248|240|224|192|128|0+))|((255\.){2}(255|254|252|248|240|224|192|128|0+)\.0)|((255\.)(255|254|252|248|240|224|192|128|0+)(\.0+){2})|((255|254|252|248|240|224|192|128|0+)(\.0+){3}))/

IPRANGE :
        /((\d{1,3})((\.)(\d{1,3})){3}) ((\d{1,3})((\.)(\d{1,3})){3})/

PROTOCOL :
        /\d+/ | "ahp" | "eigrp" | "esp" | "gre" | "icmp" | "icmp6" | "igmp" 
    | "igrp" | "ip" | "ipinip" | "ipsec" | "nos" | "ospf" | "pcp" 
    | "pim" | "pptp" | "snp" | "tcp" | "udp" | "41" 

GROUP_PROTOCOL :
        "tcp-udp" | "tcp" | "udp"

ICMP_TYPE : 
        /\d+/ | "alternate-address" | "conversion-error" | "echo-reply" | "echo"
    | "information-reply" | "information-request" | "mask-reply" | "mask-request"
    | "mobile-redirect" | "parameter-problem" | "redirect" | "router-advertisement"
    | "router-solicitation" | "source-quench" | "time-exceeded" | "timestamp-reply"
    | "timestamp-request" | "traceroute" | "unreachable"

CONNECTION_TYPE:
        "established"

LAYER3_OPTIONS:
        "fragments" | "packet-too-big"

PORT_ID :
        /\S+/

PORT_GT :
        /\S+/
{
    bless {__VALUE__=>"$item[1] 65535"}, $item[0]
}

PORT_LT :
        /\S+/
{
    bless {__VALUE__=>"1 $item[1]"}, $item[0]
}

PORT_RANGE :
        /\S+ \S+/

ACTIONS :
        "permit"
    |   "deny"

REMARKS :
        /.*$/

LOG_LEVEL :
        /\d+/ | "emergencies" | "alerts" | "critical" | "errors" 
    | "warnings" | "notifications" | "informational" | "debugging"
    | "disable"

EOL :
        /$/ 
};

    return $grammar;
}

1;

Solution

  • Performance problems:

    Functionality problems:

    I started fixing up your code. (COMPLETELY UNTESTED)

    # make_parser.pl
    
    use strict;
    use warnings;
    
    use Parse::RecDescent qw( );
    
    my $grammar = <<'__EOI__';
    
       {
          use strict;
          use warnings;
    
          use Socket qw( inet_aton );
    
          my %protocol_names = map { $_ => 1 } qw(
             ahp   eigrp  esp     gre    icmp  icmp6  igmp
             igrp  ip     ipinip  ipsec  nos   ospf   pcp
             pim   pptp   snp     tcp    udp
          );
    
          my %protocol_group_names = map { $_ => 1 } qw(
             tcp-udp  tcp  udp
          );
    
          my %icmp_type_names = map { $_ => 1 } qw(
             alternate-address    conversion-error     echo-reply     echo
             information-reply    information-request  mask-reply     mask-request
             mobile-redirect      parameter-problem    redirect       router-advertisement
             router-solicitation  source-quench        time-exceeded  timestamp-reply
             timestamp-request    traceroute           unreachable
          );
    
          sub parse_ipv4_addr {
             my ($addr) = @_;
             return inet_aton($addr);
          }
       }
    
       parse            : <skip: qr/[ \t]*/> line(s) /\Z/ { $item[2] }
    
       line             : line_body /\n|\Z/ { $item[1] }
    
       line_body        : PERMIT <commit> permit_deny_args { [ $item[1], $item[3] ] }
                        | DENY   <commit> permit_deny_args { [ $item[1], $item[3] ] }
                        | REMARK <commit> /[^\n]*/         { 0 }
                        | /[ \t]+/                         { 0 }
    
       permit_deny_args : protocol permit_deny_src permit_deny_dst { [ @item[1,2,3] ] }
    
       permit_deny_src  : addrs ports { [ @item[1, 2] ] }
    
       permit_deny_dst  : ...
    
       addrs            : HOST      <commit> ( IPv4_ADDR | DOMAIN ) { [ host  => $item[3]           ] }
                        | IPv4_ADDR <commit> IPv4_ADDR              { [ range => $item[1], $item[3] ] }
                        | ANY       <commit>                        { [ any   =>                    ] }
    
       ports            : EQ    <commit> IDENT       { [ permit => $item[2], $item[2] ] }
                        | NEQ   <commit> IDENT       { [ deny   => $item[2], $item[2] ] }
                        | GT    <commit> IDENT       { [ deny   => 1,        $item[2] ] }
                        | LT    <commit> IDENT       { [ deny   => $item[2], 65535    ] }
                        | RANGE <commit> IDENT IDENT { [ permit => $item[2], $item[3] ] }
                        |                            { [ permit => 1,        65535    ] }
    
    
       # Rules that match simply return what they match (i.e. no type info is returned).
    
       PROTOCOL_NAME    : IDENT { $protocol_names{$item[1]} ? $item[1] : undef }
    
       DOMAIN           : ...
    
       IPv4_ADDR        : /[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/ { parse_ipv4_addr($item[1]) }
    
       # Keywords
       REMARK           : IDENT { $item[1] eq 'remark' ? $item[1] : undef }
       PERMIT           : IDENT { $item[1] eq 'permit' ? $item[1] : undef }
       DENY             : IDENT { $item[1] eq 'deny'   ? $item[1] : undef }
       ANY              : IDENT { $item[1] eq 'any'    ? $item[1] : undef }
       EQ               : IDENT { $item[1] eq 'eq'     ? $item[1] : undef }
       NEQ              : IDENT { $item[1] eq 'neq'    ? $item[1] : undef }
       LT               : IDENT { $item[1] eq 'lt'     ? $item[1] : undef }
       GT               : IDENT { $item[1] eq 'gt'     ? $item[1] : undef }
    
       IDENT            : /[a-zA-Z][a-zA-Z0-9_]*/
    
    __EOI__
    
    Parse::RecDescent->Precompile($grammar, 'Parser')
        or die("Bad grammar\n");
    

    Run the above file, then you can use the parse as follows:

    # test.pl
    
    use strict;
    use warnings;
    
    use Data::Dumper qw( Dumper );
    use Parser       qw( );
    
    my $text = '...';
    
    my $parser = Parser->new();
    
    print(Dumper($parser->parse($text)));