#!/usr/bin/perl -w

#
# rssacint_prefixify
#
# Copyright (C) 2023 University of Southern California.
# All rights reserved.                                            
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License,
# version 2, as published by the Free Software Foundation.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#                                                                

=head1 NAME

rssacint_prefixify - summarize per-prefix statistics for summarized rssacint data

=head1 SYNOPSIS

Convert per-IP stats into prefix-level rssacint information.

=head1 DESCRIPTION

Rssacint_reduce is a generic tool that just does operations on records
(except for some legacy code).
This tool takes an rssacint contining per-IP stats (+6[46] records)
and emits rssacint records that are prefix summarized.
These are done as record type +7 and +8 for uniqueness and query counts.

The output of this program can then be sorted and run through rssacint_reduce
to give prefix stats.

=head1 OPTIONS

=over

=item B<--check-sort>

Verify that input is sorted.  (By default, off.)

=item B<-d>

Enable debugging output.

=item B<-v>

Enable verbose output.

=item B<--help>

Show help.

=item B<--man>

Show full manual.

=back

=head1 FILE FORMAT AND OPERATORS

See L<rssacint_reduce> for a description of the file format and operators.

Here we only pay attention to +6[46] records.

Other records are passed through unchanged.



=back

=head1 SAMPLE USAGE

=head2 Input

	#fsdb -F t key count
	+64:128.9.168.85	1
	+64:128.9.168.85	1
	+64:128.9.168.85	1
	+64:128.9.169.1	10
	<ts	1451192632.254226
	=rfileseqno:-	1
	>te	1451192652.008452
	# message_to_rssacint.pl --file-seqno=1

=head2 Command

    < $< LC_COLLATE=C sort -k 1,1 | ../rssacint_prefixify >$@

=head2 Output

	#fsdb -F t key count
	# message_to_rssacint.pl --file-seqno=1
	+64:128.9.168.85	1
	!74:128.9.168.	1
	+84:128.9.168.	1
	+64:128.9.168.85	1
	!74:128.9.168.	1
	+84:128.9.168.	1
	+64:128.9.168.85	1
	!74:128.9.168.	1
	+84:128.9.168.	1
	+64:128.9.169.1	10
	!74:128.9.169.1	1
	+84:128.9.169.1	10
	<ts	1451192632.254226
	=rfileseqno:-	1
	>te	1451192652.008452
	# rssacint_reduce.pl -

=cut

use strict;
use Pod::Usage;
use Getopt::Long;
use Net::IP;

Getopt::Long::Configure ("bundling");
pod2usage(2) if ($#ARGV >= 0 && $ARGV[0] eq '-?');
#my(@orig_argv) = @ARGV;
my($prog) = $0;
my $debug = undef;
my $verbose = undef;
my $check_sort = 0;
&GetOptions(
	'cache-easy!' => \$cache_easy,
	'check-sort!' => \$check_sort,
 	'help|?' => sub { pod2usage(1); },
	'man' => sub { pod2usage(-verbose => 2); },
	'd|debug+' => \$debug,   
        'v|verbose+' => \$verbose) or pod2usage(2);

my($in_schema) = "#fsdb -F t key count";
my($out_schema) = $in_schema;
my %unknown_ops;

binmode STDOUT, ":utf8";
print $out_schema . "\n";

######################################################################

# because Net::IP is way too slow
# copied from message_question_to_load
sub ip_to_hexip($) {
    my($ip) = @_;
    if ($ip =~ /:/) {
        my $head = '';
        my $tail = '';
        my $in_tail = undef;
        foreach (split(/:/, $ip)) {
            if (!$in_tail && $_ eq '') {
                $in_tail = 1;
                next;
            };
            my $part = substr("0000" . $_, -4);
            if ($in_tail) {
                $tail .= $part;
            } else {
                $head .= $part;
            };
        };
        my $mid = "0" x (32 - length($head) - length($tail));
        return $head . $mid . $tail;
    } else {
        # v4
        my(@f) = split(/\./, $ip);
        return undef if ($#f != 3);
        return sprintf("%02x%02x%02x%02x", @f);
    };
}


sub truncate_ip6($) {
    my($srcip) = @_;
#    # As nice as Net::IP is, it's slow to build objects, so we avoid
#    # it for IPv4.
#    my($srcip_ni) = new Net::IP($srcip);
#    if (!$srcip_ni) {
#	next if ($failed_ips > $MAX_FAILED_IPS);
#	print "+eip-fail:$msgid,$time,$srcip\t1\n";
#	return undef;
#    };
#    my $a_str = $srcip_ni->ip();
#    $a_str =~ s/....:....:....:....$/:/;
#    my $as = new Net::IP($a_str);
#    return $as->short();
    my($srcip_hex) = ip_to_hexip($srcip);
    # truncate to /64
    $srcip_hex =~ s/................$/0000000000000000/;
    # renormalize
    my @parts;
    foreach (unpack("(A4)*", $srcip_hex)) {
        s/^0*//;
        $_ = '0' if ($_ eq '');
        push(@parts, $_);
    };
    for (;;) {
        last if ($#parts == -1 || $parts[$#parts] ne '0');
        pop(@parts);
    };
    if ($#parts < 0) {
        $parts[0] = "::";
    } elsif ($#parts < 8) {
        $parts[$#parts] .= "::";
    };
    return join(":", @parts);
}

sub truncate_ip4($) {
    my($srcip) = @_;
    # IPv4 is much easier than IPv6
    # Assume input is in dotted quad.
    my(@parts) = split(/\./, $srcip);
    return undef if ($#parts != 3);
    $parts[3] = '';
    return join('.', @parts);
}


######################################################################

sub process_file($) {
    my($file) = @_;
    my($open_mode, $open_place) = ("<:utf8", $file);
    if ($file eq '-') {
	($open_mode, $open_place) = ("<&=", 0);
    };
    my $in;
    if (!open($in, $open_mode, $open_place)) {
	print ":eopen-fail\t$file\n";
	return;
    };
    my $last_sorting_key = undef;
    while (<$in>) {
	chomp;
        if (/^#/) {
            if (/^#fsdb/) {
                die "unexpected schema: $_\n"
                    if ($_ !~ /^$in_schema/);
		next;
            };
	    print "$_\n";;
            next;
        };
        my($key, $value) = split(/\t/);

	if ($check_sort) {
	    if (defined($last_sorting_key)) {
		die "input is not sorted, compare:\n\t$last_sorting_key\n\t$key\n"
		    if ($last_sorting_key gt $key);
	    };
	    # keep our own copy so as not get confused by the cache
	    $last_sorting_key = $key;
	};

	if ($key !~ /^.6/) {
            # pass through
            print "$_\n";
            next;
	};

	my($op, $section, $rest) = ($key =~ /^(.)(.)(.*)$/);
	if (!defined($op)) {
	    print ":ecannot-parse:$key\t1\n";
	    next;
	};
        if ($op ne '+') {
	    print ":einternal-error:expected-+-op-in:$key\t1\n";
	    next;
        };
        if ($section ne '6') {
	    print ":einternal-error:unknown-section-in:$key\t1\n";
	    next;
        };
        # pass through
        print "$_\n";

        # and compute prefix
        my($v46, $ip) = ($rest =~ /^(.):(.*)$/);
        if (!defined($ip)) {
	    print ":einternal-error:+6x-no-ip:$key\t1\n";
	    next;
        };
        
        my($prefix) = ($v46 eq '4' ? truncate_ip4($ip): truncate_ip6($ip);
        print "!7$v46:$prefix\t$value\n";
        print "+8$v46:$prefix\t1\n";


	#
	# if it's easy, keep it in the cache
	#
	if ($cache_easy && index($hard_ops, $op) == -1 && index($hard_sections, $section) == -1) {
	    # assert($matcher eq $key);
	    # assert($op ne '!')
	    if (defined($easy{$matcher})) {
		$easy{$matcher} = reduce_pair($op, $matcher, $easy{$matcher}, $value, undef, undef);
		die ("rssacint_reduce: internal error: reduce_pair on easy key $matcher returned undef.\n")
		    if (!defined($easy{$matcher}));
	    } else {
		$easy{$matcher} = $value;
	    };
	    next;
	};
	#
	# can we do it?
	#
	if ($last_matcher) {
            if ($matcher eq $last_matcher) {
		my $new_value = reduce_pair($op, $matcher, $last_value, $value, $last_unique, $unique);
		if (defined($new_value)) {
		    # success
		    $value = $new_value;
		} else {
		    # reduce fail, so output both
		    output_last($last_matcher, $last_value);
		};
            } else {
                # moved on¸ so dump saved value
		output_last($last_matcher, $last_value);
            };
	};
        $last_matcher = $matcher;
	$last_unique = $unique;
        $last_value = $value;
    };
    output_last($last_matcher, $last_value);
    close $in;
    print "# rssacint_reduce.pl $file\n";
}

push (@ARGV, "-") if ($#ARGV == -1);
foreach (@ARGV) {
    process_file($_);
};

exit 0;

=head1 SEE ALSO

L<message_to_rssacint(1)>,
L<rssacint_reduce(1)>


=head1 AUTHOR and COPYRIGHT

This program was written by John Heidemann.

Copyright (C) 2023 University of Southern California.

This program is distributed under terms of the GNU general
public license, version 2.  See the file COPYING
with the distribution for details.

=cut


