perl, perl-data-structures

What would be the best approach to work with two hashes of arrays in this scenario?


What would be the best approach to process these two hashes of arrays? The first data set contains XML data and the second comes from a CSV file. The idea is to check whether each file name from the second data set is present in the first one and, if so, to calculate the delay in file delivery. I'm not sure how best to produce a workable hash (for example, by changing the existing structures to use the file names as their keys, or by somehow merging them together). Any feedback would be greatly appreciated.

dataset 1 (xml data):

$VAR1 = [
      {
        'StartTimestamp' => 1478146371,
        'EndTimestamp' => 1478149167,
        'FileName' => 'a3_file_20161024.req',
        'Stage' => 'SentUserResponse'
      },
      {
        'StartTimestamp' => 1478146375,
        'EndTimestamp' => 1478149907,
        'FileName' => 'a2_file_20161024.req',
        'Stage' => 'SentUserResponse'
      },
      {
        'StartTimestamp' => 1478161030,
        'EndTimestamp' => 1478161234,
        'FileName' => 'file_DEX_0.req',
        'Stage' => 'SentUserResponse'
      },

Data Set 2 from csv file:

$VAR1 = [
      {
        'FileName' => 'a3_file_20161024.req',
        'ExpectedTime' => '20:04:07'
      },
      {
        'FileName' => 'a2_file_20161024.req',
        'ExpectedTime' => '20:14:39'
      },
      {
        'FileName' => 'file_DEX_0.req',
        'ExpectedTime' => '20:48:40'
      },

code used:

    # Demo: fetch the API records via GetData() and the CSV records via
    # ReadDataFile(), then dump both structures to STDOUT, each followed by
    # a separator line, so they can be compared by eye.
    sub Demo {
        my $api_records = GetData($apicall);
        my $csv_records = ReadDataFile();

        # First the API data, then the CSV data; the separators make the
        # boundary between the two dumps easy to spot.
        print Dumper($api_records);
        print "-------------------------*********--------------************------------------\n";
        print Dumper($csv_records);
        print "#####################\n";
    }


    # ReadDataFile: read the CSV file named by $datafile with
    # Text::CSV::Simple, mapping the two columns to the keys FileName and
    # ExpectedTime. Returns a reference to an array of the parsed rows.
    sub ReadDataFile {
        my $csv_reader = Text::CSV::Simple->new;

        # Name the columns so that every row comes back keyed by field name.
        $csv_reader->field_map( 'FileName', 'ExpectedTime' );

        return [ $csv_reader->read_file($datafile) ];
    }


    # GetData($xml): parse $xml with XML::Twig and return a reference to an
    # array of hashes, one per UserRequest element that yielded any field.
    # Each hash may carry the keys FileName, Stage, StartTimestamp and
    # EndTimestamp.
    sub GetData {
        my ($xml) = @_;

        my @api_data;    # finished records
        my %request;     # fields collected for the request being read

        # Child element name => hash key that receives its trimmed text.
        my %text_key_for = (
            HomeFileName => 'FileName',
            Stage        => 'Stage',
        );

        # Child elements whose text is converted with str2time().
        my @time_fields = qw(StartTimestamp EndTimestamp);

        my %handlers = (
            '//UserRequest' => sub {
                my ( undef, $elt ) = @_;

                # Store a copy of the collected fields (if any), then start
                # over with an empty hash for the next request.
                if (%request) {
                    push @api_data, {%request};
                }
                %request = ();
                $elt->purge;    # free memory
            },
        );

        for my $tag ( keys %text_key_for ) {
            my $key = $text_key_for{$tag};
            $handlers{"//UserRequest/$tag"} = sub {
                my ( undef, $elt ) = @_;
                $request{$key} = $elt->trimmed_text;
            };
        }

        for my $tag (@time_fields) {
            $handlers{"//UserRequest/$tag"} = sub {
                my ( undef, $elt ) = @_;

                # Only the last 8 characters of the text go to str2time().
                $request{$tag} = str2time( substr( $elt->trimmed_text, -8 ) );
            };
        }

        my $twig = XML::Twig->new( twig_handlers => \%handlers );
        $twig->xparse($xml);
        $twig->purge;

        return \@api_data;
    }

Solution

  • I am assuming that you can map the elements of the first array to the elements of the second array by comparing file names, and that this is a 1:1 relation. In that case I would perform the following steps:

    1. Sort the lists by filename or generate an index hash
    2. Combine both sets into a single array of hashes or use the index to process through your data set
    3. Do whatever you need to do with the data sets

    Just a little example:

    #!/usr/bin/env perl

    # Example: join two arrays of hashes on their FileName field by building
    # an index hash, then compare each file's expected time with the time it
    # actually took (EndTimestamp - StartTimestamp).
    #
    # Note: this example treats ExpectedTime ("HH:MM:SS") as a duration and
    # converts it to a number of seconds before comparing.

    use strict;
    use warnings;

    my $api_ref = [
        {
            'StartTimestamp' => 1478146371,
            'EndTimestamp'   => 1478149167,
            'FileName'       => 'a3_file_20161024.req',
            'Stage'          => 'SentUserResponse'
        },
        {
            'StartTimestamp' => 1478146375,
            'EndTimestamp'   => 1478149907,
            'FileName'       => 'a2_file_20161024.req',
            'Stage'          => 'SentUserResponse'
        },
        {
            'StartTimestamp' => 1478161030,
            'EndTimestamp'   => 1478161234,
            'FileName'       => 'file_DEX_0.req',
            'Stage'          => 'SentUserResponse'
        }
    ];

    my $csvdata = [
        {
            'FileName'     => 'a3_file_20161024.req',
            'ExpectedTime' => '20:04:07'
        },
        {
            'FileName'     => 'a2_file_20161024.req',
            'ExpectedTime' => '20:14:39'
        },
        {
            'FileName'     => 'file_DEX_0.req',
            'ExpectedTime' => '20:48:40'
        }
    ];

    # Generate the index:
    #   FileName => { api_idx => position in @$api_ref,
    #                 csv_idx => position in @$csvdata }
    # If a file name occurs more than once in a data set, the last
    # occurrence wins (the example assumes a 1:1 relation).
    my %index;

    for my $i ( 0 .. $#{$api_ref} ) {
        $index{ $api_ref->[$i]{FileName} }{api_idx} = $i;
    }

    for my $i ( 0 .. $#{$csvdata} ) {
        $index{ $csvdata->[$i]{FileName} }{csv_idx} = $i;
    }

    # Keep only the file names present in both data sets. The keys are
    # sorted so that the report order is the same on every run (plain
    # "keys" returns them in an unpredictable order).
    my @filename_intersection =
      grep { exists $index{$_}{api_idx} && exists $index{$_}{csv_idx} }
      sort keys %index;

    foreach my $filename (@filename_intersection) {

        my $api_entry = $api_ref->[ $index{$filename}{api_idx} ];
        my $csv_entry = $csvdata->[ $index{$filename}{csv_idx} ];

        # Convert ExpectedTime into seconds; skip rows whose value is not
        # in HH:MM:SS form instead of computing with stale capture values.
        my $exp_sec = hms_to_seconds( $csv_entry->{ExpectedTime} );
        if ( !defined $exp_sec ) {
            warn "Skipping $filename: ExpectedTime is not in HH:MM:SS format\n";
            next;
        }

        my $real_sec = $api_entry->{EndTimestamp} - $api_entry->{StartTimestamp};

        my $msg = $exp_sec >= $real_sec ? 'in time' : 'late';

        printf
          "Filename %s was %s; expected time: %d seconds, real time: %d seconds\n",
          $filename, $msg, $exp_sec, $real_sec;
    }

    # hms_to_seconds($hms): convert a two-digit "HH:MM:SS" string into a
    # number of seconds. Returns nothing (undef in scalar context) when the
    # argument is undefined or does not have that form.
    sub hms_to_seconds {
        my ($hms) = @_;

        return unless defined $hms;

        # The list assignment yields the number of captures (0 on a failed
        # match), so "or return" fires exactly when the pattern fails.
        my ( $hours, $minutes, $seconds ) =
          $hms =~ /^(\d{2}):(\d{2}):(\d{2})$/
          or return;

        return ( $hours * 60 + $minutes ) * 60 + $seconds;
    }
    

    Best, Frank