#!/usr/bin/perl -w

#
# rssacfin_to_rssacyaml
#
# Copyright (C) 2016-2024 University of Southern California.
# All rights reserved.                                            
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License,
# version 2, as published by the Free Software Foundation.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# 

=head1 NAME

rssacint_to_rssacyaml - RSSAC intermediate format (post-counted) to final RSSAC YAML

=head1 SYNOPSIS

rssacint_to_rssacyaml < foo.rssacint

=head1 DESCRIPTION

This program reads RSSAC intermediate-format files,
after they have been IP counted, and generates 
final RSSAC YAML output.

It runs in linear time and has constant memory usage.

=head1 OPTIONS

=over

=item B<--directory> DESTROOT or B<-D> DESTROOT

Write output into an RSSAC tree rooted at DESTROOT.
Under DESTROOT will a file in YYYY/MM/PART/rootname-YYYYMMDD-PART.yaml.
Default: none and all output is written to standard output.

=item B<--service> FULL_SERVICE

Specify the full service name.  Typically x.root-servers.net.
What appears in the "service" field inside an RSSAC YAML file.
Default: unknown.example.net.

=item B<--short-service> SHORT_SERVICE

Specify the short service name.  Typically x-root.
What appears in the filename of an RSSAC YAML file.
Default: unknown.

=item B<--service-address-regexp> RE

Give a regular expression RE of valid service addresses;
only these appear in queries-to-service-address.
Default: no filtering is done.

=item B<--period> PERIOD

Specifies the day (PERIOD) under consideration, in ISO-date format (2015-12-27).
Default: inferred from the data, assuming it's about 24 hours (within 3 hours).

=item B<--version> VERSION

Specifies the RSSAC-002 VERSION to report, either rssac002v2, rssac002v3,
rssac002v4, rssac002v4-strict, rssac002v4-lax, rssac002v5
(or just v2, v3, v4, v4-strict, v4-lax, v5).
Default: v5.

=item B<--license> LICENSE

Specific the LICENSE for the data, an arbitrary string that 
is reported in b-extra.
(No special quoting is done.)
Default: none.

=item B<-d>

Enable debugging output.

=item B<-v>

Enable verbose output.

=item B<--help>

Show help.

=item B<--man>

Show full manual.


=head1 SAMPLE USAGE

=head2 Input

=head2 Command

=head2 Output

=back

=cut

use strict;
use Pod::Usage;
use Getopt::Long;
use DateTime;
use DateTime::Format::ISO8601;

Getopt::Long::Configure ("bundling");
pod2usage(2) if ($#ARGV >= 0 && $ARGV[0] eq '-?');
#my(@orig_argv) = @ARGV;
my($prog) = $0;
my $debug = undef;
my $verbose = undef;
my $service = 'unknown.example.net';
my $short_service = 'unknown';
my $service_address_regexp = undef;
my $period = undef;
my $precise_period = undef;
my $out_dir = undef;
my $version = 'v5';
my $license = undef;
&GetOptions(
	'D|directory=s' => \$out_dir,
	'license=s' => \$license,
	'period=s' => \$period,
	'precise-period!' => \$precise_period,
	'short-service=s' => \$short_service,
	'service=s' => \$service,
	'service-address-regexp=s' => \$service_address_regexp,
	'version=s' => \$version,
 	'help|?' => sub { pod2usage(1); },
	'man' => sub { pod2usage(-verbose => 2); },
	'd|debug+' => \$debug,   
        'v|verbose+' => \$verbose) or pod2usage(2);

if (!defined($version)) {
    die "rssacfin_to_rssacyaml: no --version given\n";
};
$version = lc($version);
$version =~ s/^rssac002//;
my $short_version = $version;
$version = 'v4' if ($version eq 'v4-strict');
$short_version =~ s/-[a-z]+//;
if (!($version eq 'v2' || $version eq 'v3' || $version eq 'v4' || $version eq 'v4' || $version eq 'v4-lax' || $version eq 'v5')) {
    die "rssacfin_to_rssacyaml: --version must be v2, v3, v4, v4-strict, v4-lax, or v5.\n";
};


#die "$prog: must specify a service with --service=x.rootservers.net\n"
#    unless ($service);
# can auto-infer period
#die "$prog: must specify a period with --period=2015-12-27\n"
#    unless ($period);
my($in_schema) = "#fsdb -F t key count";
binmode STDOUT, ":utf8";

my $period_start_dt = undef;
my $period_end_dt = undef;
my(%fin) = ();  # the data!

sub infer_dt_from_fin {
    my($fin_field, $offset) = @_;
    $offset //= 0;
    die "$prog: trying to infer date from nonexistent $fin_field\n"
	unless ($fin{$fin_field});
    return DateTime->from_epoch(epoch => $fin{$fin_field} + $offset);
}

sub infer_period {
    return if (defined($period_start_dt) && defined($period_end_dt));
    if (defined($period)) {
	# just compute dates
	die ("$prog: date must be ISO8601 YYYY-MM-DD format, not $period\n")
	    if ($period !~ /^\d{4}-\d{2}-\d{2}$/);
	$period_start_dt = DateTime::Format::ISO8601->parse_datetime($period . "T00:00:00Z");
	$period_end_dt = DateTime::Format::ISO8601->parse_datetime($period . "T23:59:59Z");
    } elsif ($precise_period) {
	$period_start_dt = infer_dt_from_fin('<ts');
	$period_end_dt = infer_dt_from_fin('>te');
    } else {
	my($first_dt) = infer_dt_from_fin('<ts', 3*60*60);
	my($last_dt) = infer_dt_from_fin('>te', -3*60*60);
	die "$prog: <ts and >te dates don't appear to be on same day (" .
		$first_dt->iso8601() . "Z and " . $last_dt->iso8601() ."Z; you may force the date with --period=yyyy-mm-dd\n"
	    if ($first_dt->year != $last_dt->year ||
		$first_dt->month != $last_dt->month ||
		$first_dt->day != $last_dt->day);
	$period_start_dt = DateTime->new(
	    year => $first_dt->year,
	    month => $first_dt->month,
	    day => $first_dt->day,
	    hour => 0,
	    minute => 0,
	    second => 0,
	    time_zone => 'UTC');
	$period_end_dt = DateTime->new(
	    year => $first_dt->year,
	    month => $first_dt->month,
	    day => $first_dt->day,
	    hour => 23,
	    minute => 59,
	    second => 59,
	    time_zone => 'UTC');
    };
}

sub mkdir_p {
    my($path) = '';
    $path = shift @_;
    foreach (@_) {
	$path .= "/" . $_;
	if (! -d $path) {
	    mkdir($path, 0755) || die "$prog: cannot mkdir $path\n";
	};
    };
    return $path;
}

#
# new_output: open a file and write the yaml header
#
sub new_output {
    my($metric) = @_;

    infer_period();

    my($open_mode, $open_place) = (">:utf8");
    my $need_chmod = undef;
    if ($out_dir) {
	my($yyyy) = sprintf("%04d", $period_start_dt->year);
	my($mm) = sprintf("%02d", $period_start_dt->month);
	my($dd) = sprintf("%02d", $period_start_dt->day);
	my($path) = mkdir_p($out_dir, $yyyy, $mm, $metric);
	$open_place = "$path/$short_service-$yyyy$mm$dd-$metric.yaml";
        $need_chmod = 1;
    };
    if (!defined($open_place) || $open_place eq '-') {
	($open_mode, $open_place) = (">&=:utf8", "1");
    };

    my($out); 
    if (!open($out, $open_mode, $open_place)) {
        die "$prog: cannot open $open_place\n";
    };
    if (defined($open_place) && $open_place ne '-' && $need_chmod) {
        chmod(0644, $open_place) or warn "$prog: cannot chmod $open_place\n";
    };
    print $out "---\n";
    if ($version ne 'v2') {
        print $out "version: rssac002$short_version\n";
    };
    print $out "service: $service\n";
    print $out "start-period: " . $period_start_dt->iso8601() . "Z\n";
    if ($version eq 'v2') {
	print $out "end-period: " . $period_end_dt->iso8601() . "Z\n";
    };
    print $out "metric: $metric\n";
    return $out;
}

#
# output_mapping
# take counts from rssacfin directly to yaml with renaming
#
# for example, +64 to num-sources-ipv4
#
sub output_mapping($@) {
    my($out, $mapping, $required) = @_;
    # mapping: aref of aref pairs of yaml, rssacfin fields
    foreach (@$mapping) {
	my ($yaml_field, $rssacfin_field) = @$_;
	my($v) = $fin{$rssacfin_field};
        if (!defined($v)) {
            die "$prog: missing expected rssacfin field $rssacfin_field\n" if ($required);
            $v = 0;
        };
        next if ($v == 0 && !$required);
	print $out "$yaml_field: $v\n";
    };
}

#
# output_key_value
# yaml report a specific (key, value) pair
#
sub output_key_value($$$) {
    my($out, $key, $value) = @_;
    # mapping: aref of aref pairs of yaml, rssacfin fields
    print $out "$key: $value\n";
}

#
# output_bins
# take counts from rssacfin with a prefix, group them into bins
# thus: +4u1:911 and +4u1:921, etc. is mpaped to udp-response-size: 912-927
#
sub output_bins($$$$$$) {
    my($out, $yaml_field, $rssacfin_prefix, $min, $max, $bin_size) = @_;
    die "$prog: assertion failed: output_bins has max $max that is not multiple of bin_size $bin_size\n"
	if (int($max / $bin_size) != ($max + 0.0) / $bin_size);
    my $max_bin = int($max / $bin_size);
    my(@bins) = (0) x ($max_bin + 1);
    my($rssacfin_prefix_re) = quotemeta($rssacfin_prefix);
    my($bins_found) = 0;
    foreach my $key (grep(/^$rssacfin_prefix_re/, keys %fin)) {
	my($size_pre_bin) = ($key =~ /^$rssacfin_prefix_re(\d+)$/);
	if (!defined($size_pre_bin)) {
	    warn "bad key: $key\n";
	    next;
	};
	my($value) = $fin{$key};
	my $b = int($size_pre_bin /$bin_size);
	$b = $max_bin if ($b > $max_bin);
	$bins[$b] += $value;
	$bins_found++;
    };
    return if ($bins_found == 0);
    print $out "$yaml_field:\n";
    foreach (0..$#bins) {
	next if ($bins[$_] == 0);
        my($start) = $_ * $bin_size;
	my($end) = $start + $bin_size - 1;
	next if ($end < $min);
	my($display_start) = $start < $min ? $min : $start;
	my($display_end) = ($_ == $#bins) ? '' : $end;
	print $out "  ${display_start}-${display_end}: $bins[$_]\n";
    };
}

#
# output_tails
# find all keys that match the prefix and output them
# thus: +51:0 with prefix +51: is mapped to 0 for an rcode
#
# $yaml_field can be undef, in which case we output the tails at the toplevel.
#
sub output_tails($$$;$$) {
    my($out, $yaml_field, $rssacfin_prefix, $default_tail, $filter_accept_regexp) = @_;
    my $nesting = '';
    if (defined($yaml_field)) {
        print $out "$yaml_field:\n";
        $nesting = '  ';
    };
    my($rssacfin_prefix_re) = quotemeta($rssacfin_prefix);
    foreach my $key (sort grep(/^$rssacfin_prefix_re/, keys %fin)) {
	my($tail) = ($key =~ /^$rssacfin_prefix_re(.*)$/);
	if (!defined($tail) || $tail eq '') {
	    if (defined($default_tail)) {
		$tail = $default_tail;
	    } else {
		warn "rssacfin_to_rssacyaml: missing tail on $key\n";
		next;
	    };
	};
        # filter?
        if (defined($filter_accept_regexp)) {
            if ($tail !~ /$filter_accept_regexp/) {
                warn("rssacfin_to_rssacyaml: filtering $yaml_field $tail\n") if ($verbose);
                next;
            };
        };
	my($value) = $fin{$key};
        if ($tail =~ /:/) {
            $tail = '"' . $tail . '"';
        };
	print $out "$nesting$tail: $value\n"; 
    }; 
 }


sub process_file($) {
    my($file) = @_;
    my($open_mode, $open_place) = ("<:utf8", $file);
    if ($file eq '-') {
	($open_mode, $open_place) = ("<&=:utf8", 0);
    };
    my $in;
    if (!open($in, $open_mode, $open_place)) {
	die "$prog: cannot open $open_place\n";
    };
    while (<$in>) {
	chomp;
        if (/^#/) {
            if (/^#fsdb/) {
                die "unexpected schema: $_\n"
                    if ($_ !~ /^$in_schema/);
		next;
            };
            next;
        };
	my($key, $value) = split("\t");
	die ("$prog: cannot parse line $_\n") if (!defined($key) || !defined($value));
	die ("$prog: duplicate key $key in line $_\n") if (defined($fin{$key}));
	die ("$prog: input has rssacint data (like +64:192.0.2.1). reduce it with rssacint_reducer --count-ips\n")
	    if ($key =~ /^\+6[46a]:/);
	$fin{$key} = $value;
    };
    #
    # 2.1 load time
    # 2.2 zone size
    # : cannot be determined from rssacfin data
    #
    # 2.3 traffic-volume
    #
    my($out);
    $out = new_output('traffic-volume');
    output_mapping($out, [ [ 'dns-udp-queries-received-ipv4', '+3u04', 1 ],
			   [ 'dns-udp-queries-received-ipv6', '+3u06', 1 ],
			   [ 'dns-tcp-queries-received-ipv4', '+3t04', 1 ],
			   [ 'dns-tcp-queries-received-ipv6', '+3t06', 1 ],
			   [ 'b-dns-tls-queries-received-ipv4', '+3s04', 0 ],
			   [ 'b-dns-tls-queries-received-ipv6', '+3s06', 0 ],
			   [ 'b-dns-https-queries-received-ipv4', '+3h04', 0 ],
			   [ 'b-dns-https-queries-received-ipv6', '+3h06', 0 ],
			   [ 'dns-udp-responses-sent-ipv4', '+3u14', 1 ],
			   [ 'dns-udp-responses-sent-ipv6', '+3u16', 1 ],
			   [ 'dns-tcp-responses-sent-ipv4', '+3t14', 1 ],
			   [ 'dns-tcp-responses-sent-ipv6', '+3t16', 1 ],
			   [ 'b-dns-tls-responses-sent-ipv4', '+3s14', 0 ],
			   [ 'b-dns-tls-responses-sent-ipv6', '+3s16', 0 ],
			   [ 'b-dns-https-responses-sent-ipv4', '+3h14', 0 ],
			   [ 'b-dns-https-responses-sent-ipv6', '+3h16', 0 ] 
			]);
    close ($out);

    #
    # 2.4 traffic-sizes
    #
    $out = new_output('traffic-sizes');
    foreach ([ 'udp-request-sizes' , '+4u0:', 288 ],
	     [ 'udp-response-sizes' , '+4u1:', 4096 ],
	     [ 'tcp-request-sizes' , '+4t0:', 288 ],
	     [ 'tcp-response-sizes' , '+4t1:', 4096 ],
	     [ 'any-request-sizes' , '+40:', 288 ],
	     [ 'any-response-sizes' , '+41:', 4096 ]) {
        my($yaml_field, $rssacfin_prefix, $max) = @$_;
        output_bins($out, $yaml_field, $rssacfin_prefix, 16, $max, 16);
    }
    close($out);

    #
    # 2.5 rcode-volume
    #
    $out = new_output('rcode-volume');
    my($queries) = 0;
    foreach (qw(+3u04 +3u06 +3t04 +3t06 +3s04 +3s06 +3h04 +3h06)) {
        $queries += $fin{$_} if (defined($fin{$_}));
    };
    if (defined($fin{'+50'})) {
        # Backwards compatibility: early versions of message_to_rssacint
	# summed ALL rcodes, not just response rcodes.  We correct for that
	# by assuming all queries were rcode 0 and adjusting down by the number of requests.
	# This bug was fixed 2016-05-25.
	die "rssacfin_to_rssacyaml: cannot process rssacint with mixed old and new rcodes (are you B on 2016-05-27?).\n"
	    if (defined($fin{'+51:0'}));
	$fin{'+50'} -= $queries;
        output_tails($out, 'rcodes', '+5');
    } else {
	# New world order: the input splits out rcodes for queries and replies.
        if ($version eq 'v2') {
            output_tails($out, 'rcodes', '+51:');
        } else {
            # for v3 and later, no rcodes toplevel field
            output_tails($out, undef, '+51:');
        };
    };
    close ($out);

    #
    # 2.6 sources
    #
    $out = new_output('unique-sources');
    my(@sources_things) = ([ 'num-sources-ipv4', '+64', 1 ] );
    push(@sources_things, [ 'num-sources-ipv6', '+66', 1 ]) if ($version ne 'v4');
    push(@sources_things, [ 'num-sources-ipv6-aggregate', '+6a', 1 ]);
    output_mapping($out, \@sources_things);
    close($out);

    #
    # bonus stats
    #
    my($extra_tag) = 'b-extra';
    if ($version =~ /(v2|v3|v4.*)/) {
        $extra_tag = 'extra';
    };
    $out = new_output($extra_tag);
    if (defined($license)) {
        die "rssacfin_to_rssacyaml: license starts with special character,\n\tplease retry with quoting\n"
            if ($license =~ /^[\-\[\]!&*:?,#|{}>@`]/);
        output_key_value($out, 'license', $license);
    };
    if (defined($fin{'=rfileseqno'})) {
	# Old-style was missing a site suffix (sigh, only one site).
        output_tails($out, 'fileseqno', '=rfileseqno', 'site');
    } else {
        output_tails($out, 'fileseqno', '=rfileseqno:', 'site');
    };
    output_mapping($out, [ [ 'start-time', '<ts', 1 ],
    		           [ 'end-time', '>te', 1 ] ]);
    my $duration = $fin{'>te'} - $fin{'<ts'};
    output_key_value($out, 'duration', sprintf("%.3f", $duration));
    my $queries_per_second = ($queries * 1.0) / $duration;
    output_key_value($out, 'queries-per-second', sprintf("%.2f", $queries_per_second));
    output_key_value($out, 'queries-per-period', $queries)
        if (!($short_version eq 'v2' || $short_version eq 'v3'));
    # queries to each service address
    output_tails($out, 'queries-to-service-address', '+s:', undef, $service_address_regexp);
    output_tails($out, 'replies-from-service-address', '+S:', undef, $service_address_regexp);
    close($out);
};
    
push (@ARGV, "-") if ($#ARGV == -1);
foreach (@ARGV) {
    process_file($_);
};

exit 0;

=head1 SEE ALSO

L<dnsanon(1)>,
L<message_to_rssacint(1)>,
L<rssacint_reduce(1)>,
L<rssacfin_to_rssacyaml(1)>


=head1 AUTHOR and COPYRIGHT

This program was written by John Heidemann.

Copyright (C) 2016-2024 University of Southern California.

This program is distributed under terms of the GNU general
public license, version 2.  See the file COPYING
with the distribution for details.

=cut


