#!/usr/bin/perl -w

#
# dsc_to_rssacint
#
# Copyright (C) 2016 University of Southern California.
# All rights reserved.                                            
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License,
# version 2, as published by the Free Software Foundation.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# 

=head1 NAME

dsc_to_rssacint - convert Hedgehog's DSC output to rssac intermediate format

=head1 SYNOPSIS

dsc_to_rssacint < foo.dscdata.xml > foo.rssacint

=head1 DESCRIPTION

This program reads the output of C<dsc -d -f> with the RSSAC
extensions (in XML) and converts it to rssacint format (tab-delimited
text with operators for merging).

The formats are nearly equivalent, semantically.
Rssacint adds operators to support merging.
DSC includes a great deal more information than is passed here.
In principle that information could also be preserved,
but that is left as an exercise to the reader.

The input file is read into memory for XML parsing.
In principle that limitation could be relaxed,
but for a proof-of-concept that seems unnecessary.


=head1 OPTIONS

=over

=item B<-d>

Enable debugging output.

=item B<-v>

Enable verbose output.

=item B<--help>

Show help.

=item B<--man>

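Show full manual.

=back

=head1 OUTPUT

By default, we output information needed for RSSAC-002:

=over

=item C<+3[tu][01][46]>

Number of queries (0) or responses (1),
by transport (t for TCP, u for UDP) and IP version (4 or 6).
(RSSAC-002v2 section 2.3.)

=item C<+4[tu][01]:SIZE>

Query (0) or response (1) size distribution, by transport.
(RSSAC-002v2 section 2.4.)

=item C<+51:RCODE>

Rcode distribution.
(RSSAC-002v2 section 2.5.)

=item C<+64:ADDR>, C<+66:ADDR>, C<+6a:PREFIX>

Unique sources: IPv4 addresses, IPv6 addresses,
and IPv6 addresses truncated to their first four groups.
(RSSAC-002v2 section 2.6.)

=back

We include additional information to evaluate data completeness:

=over

=item C<E<lt>ts>, C<E<gt>ts>

Start and stop time of the measurement period.

=back

Optional information for additional analysis:
this program does not currently emit any.


=head1 SAMPLE USAGE

=head2 Input

    <dscdata>
    <array name="pcap_stats" dimensions="2" start_time="1464705181" stop_time="1464705259">
      <dimension number="1" type="ifname"/>
      <dimension number="2" type="pcap_stat"/>
      <data>
        <ifname val="L3RtcC9maWxlLnBjYXA=" base64="1">
          <pcap_stat val="pkts_captured" count="6815406"/>
        </ifname>
      </data>
    </array>
    <array name="traffic_sizes_responses" dimensions="2" start_time="1464705181" stop_time="1464705259">
      <dimension number="1" type="Transport"/>
      <dimension number="2" type="MsgLen"/>
      <data>
        <Transport val="udp">
          <MsgLen val="460" count="264701"/>
          <MsgLen val="646" count="78671"/>
          <!-- much more -->

=head2 Command

    dsc_to_rssacint < foo.dscdata.xml > foo.rssacint

=head2 Output

    #fsdb -F t key count
    <ts     1464705181
    >ts     1464705259
    +4u1:460        264701
    +4u1:646        78671
    ...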

=cut

use strict;
use Pod::Usage;
use Getopt::Long;
use XML::Simple;
use Net::IP;

# Command-line handling.  --help and --man print usage from the POD and exit;
# a bare '-?' as the first argument is treated as a usage error.
Getopt::Long::Configure ("bundling");
pod2usage(2) if ($#ARGV >= 0 && $ARGV[0] eq '-?');
my(@orig_argv) = @ARGV;
my($prog) = $0;
# -d and -v are accepted (and may be repeated), but nothing in the visible
# source reads $debug or $verbose afterwards.
my $debug = undef;
my $verbose = undef;
GetOptions(
    'help|?' => sub { pod2usage(1); },
    'man' => sub { pod2usage(-verbose => 2); },
    'd|debug+' => \$debug,
    'v|verbose+' => \$verbose) or pod2usage(2);

my($out_schema) = "#fsdb -F t key count";

# Emit the schema header exactly once.  (It used to be printed a second
# time after the XML was parsed, which duplicated the first output line.)
binmode STDOUT, ":utf8";
print $out_schema . "\n";

# The whole document is read from standard input ('-') and parsed in memory.
my $xs = XML::Simple->new;
my $xml = $xs->XMLin('-');

# Truncate an IPv6 address to its first four 16-bit groups and return the
# result in Net::IP's short notation.
#
#   $srcip  - address text understood by Net::IP
#   returns - the shortened, truncated address, or the literal string
#             "error" when Net::IP cannot parse $srcip
#
# Two Net::IP objects are built per call (one to expand the input, one to
# re-shorten the truncated text), so this is not cheap for large inputs.
sub truncate_ip6($) {
    my($srcip) = @_;

    my $parsed = Net::IP->new($srcip);
    return "error" unless ($parsed);

    # The substitution assumes ip() returned the fully expanded form (four
    # hex digits per group): the last four groups are replaced by a single
    # ':' which, together with the preceding separator, leaves a trailing
    # '::'.
    my $expanded = $parsed->ip();
    $expanded =~ s/....:....:....:....$/:/;

    my $truncated = Net::IP->new($expanded);
    return $truncated->short();
}

# Write one output row to STDOUT: the key, a tab, the value, and a newline.
sub output_key_value_pair($$) {
    my $key = $_[0];
    my $value = $_[1];
    print $key . "\t" . $value . "\n";
}
# Write several rows in order.  The first argument is a reference to an
# array of [key, value] pairs; each pair is handed to
# output_key_value_pair().  Any further arguments are ignored.
sub output_key_value_pairs(@) {
    my($pairs_aref) = @_;
    for my $pair (@{$pairs_aref}) {
        output_key_value_pair($pair->[0], $pair->[1]);
    }
}

# Turn one parsed two-dimensional DSC <array> into a hash of hashes.
#
#   $xml_array - the XML::Simple structure for a single <array> element
#   returns    - hash ref with $d->{$x}->{$y} = count, where $x and $y are
#                the "val" attributes of the first- and second-dimension
#                elements; undef when the array holds no data at all
#   dies       - if the array has no <dimension> children or its
#                "dimensions" attribute is not 2
#
# XML::Simple represents a lone child element as a hash rather than as a
# one-element array.  That can happen at either level (a single row, or a
# row with a single cell), so both levels are normalised to a list before
# iterating.
sub flatten_xml_2d_array($) {
    my($xml_array) = @_;
    die ("xml array without dimension\n") unless ($xml_array->{dimension});
    die ("xml 2d array with wrong number of dimensions\n") unless ($xml_array->{dimensions} == 2);

    # The element names used inside <data> are the dimension types.
    my($xn, $yn) = ($xml_array->{dimension}->[0]->{type}, $xml_array->{dimension}->[1]->{type});

    # Normalise "absent", "one element" and "many elements" to a plain list.
    my $as_list = sub {
        my($node) = @_;
        return () unless (defined($node));
        return (ref($node) eq 'ARRAY') ? @$node : ($node);
    };

    my($d);
    foreach my $row ($as_list->($xml_array->{data}->{$xn})) {
        my $x = $row->{val};
        foreach my $elem ($as_list->($row->{$yn})) {
            my $y = $elem->{val};
            $d->{$x}->{$y} = $elem->{count};
        }
    }
    return $d;
}

# Write one row per entry of a hash, in string-sorted key order.
#
#   $prefix    - text prepended to every key
#   $xml_array - hash ref mapping key to count (one inner level of the
#                structure returned by flatten_xml_2d_array)
#
# Each row is "<prefix><key>", a tab, the value, and a newline.
sub output_array_dimension($$) {
    my($prefix, $xml_array) = @_;
    my @ordered_keys = sort keys %$xml_array;
    foreach my $k (@ordered_keys) {
        print $prefix . $k . "\t" . $xml_array->{$k} . "\n";
    }
}


# All named <array> elements of the parsed document.
my $dsc_arrays = $xml->{array};

# extra: keep track of measurement period
# (start/stop times are taken from the pcap_stats array's attributes)
my $pcap_stats = $dsc_arrays->{pcap_stats};
output_key_value_pairs([
    [ '<ts', $pcap_stats->{start_time} ],
    [ '>ts', $pcap_stats->{stop_time} ],
]);

# rssac-002v2 section 2.3: number of queries
# Keys are +3<t|u><0|1><4|6>: transport, then 0 for queries / 1 for
# responses, then IP version.  Queries are written first, then responses;
# within each, tcp before udp and IPv4 before IPv6.
foreach my $volume ([ '0', 'traffic_volume_queries' ],
                    [ '1', 'traffic_volume_responses' ]) {
    my($direction, $array_name) = @$volume;
    my $counts = flatten_xml_2d_array($dsc_arrays->{$array_name});
    my @rows;
    foreach my $transport ('tcp', 'udp') {
        foreach my $ip_version ('4', '6') {
            my $key = '+3' . substr($transport, 0, 1) . $direction . $ip_version;
            push(@rows, [ $key, $counts->{$transport}->{'IPv' . $ip_version} ]);
        }
    }
    output_key_value_pairs(\@rows);
}

# rssac-002v2 section 2.4: query and response size distribution
# Keys are +4<t|u><0|1>:<size>.
foreach my $sizes ([ '0', 'traffic_sizes_queries' ],
                   [ '1', 'traffic_sizes_responses' ]) {
    my($direction, $array_name) = @$sizes;
    my $histogram = flatten_xml_2d_array($dsc_arrays->{$array_name});
    output_array_dimension('+4t' . $direction . ':', $histogram->{tcp});
    output_array_dimension('+4u' . $direction . ':', $histogram->{udp});
}

# rssac-002v2 section 2.5: rcode distribution (responses only, but we do both)
my $rcode_counts = flatten_xml_2d_array($dsc_arrays->{rcode});
output_array_dimension('+51:', $rcode_counts->{ALL});

# rssac-002v2 section 2.6: unique sources
my $sources = flatten_xml_2d_array($dsc_arrays->{unique_sources});
output_array_dimension('+64:', $sources->{IPv4});
output_array_dimension('+66:', $sources->{IPv6});
# The truncated IPv6 rows are not in the input and are derived here.
# Different addresses may truncate to the same key; each is still written
# as its own row.
my $v6_sources = $sources->{IPv6};
foreach my $addr (sort keys %$v6_sources) {
    my $truncated = truncate_ip6($addr);
    print '+6a:' . $truncated . "\t" . $v6_sources->{$addr} . "\n";
}



# trailer comment naming this program
print "# dsc_to_rssacint\n";


exit 0;

=head1 SEE ALSO

L<dsc(1)>,
L<message_to_rssacint(1)>,
L<rssacint_reduce(1)>,
L<rssacfin_to_rssacyaml(1)>


=head1 AUTHOR and COPYRIGHT

This program was written by John Heidemann.

Copyright (C) 2016 University of Southern California.

This program is distributed under terms of the GNU general
public license, version 2.  See the file COPYING
with the distribution for details.

=cut


