#!/usr/bin/perl -w

#
# message_to_rssacint
#
# Copyright (C) 2016-2025 University of Southern California.
# All rights reserved.                                            
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License,
# version 2, as published by the Free Software Foundation.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# 

=head1 NAME

message_to_rssacint - convert dnsanon messages to RSSAC intermediate format

=head1 SYNOPSIS

message_to_rssacint < foo.message.fsdb > foo.rssacint

=head1 DESCRIPTION

This program reads dnsanon message (or message_question) files and produces RSSAC
intermediate format, a simple key-value format that is all one needs
to compute RSSAC-002 statistics.

It runs in linear time and has constant memory usage.

The output format ("rssacint") is documented in L<rssacint_reduce(1)>.

=head1 OPTIONS

=over

=item B<--file-seqno>

File sequence number to pass along in output.  (Default: none.)
Format is either an integer (1), or site:integer (like lax:1)
or "comment".  If given as "comment", it extracts site:seqno from a comment
that matches "#.*dnsanon.*-f \d+-\d+-(\d+)\.([\-\w]+)"
or "#.*dnstapmq.* \d+-\d+-(\d+)\.([\-\w]+)".

=item B<--queries-each-second>

Report number of queries and responses, binned in each second.
(Default: off.)

=item B<--no-active-ranges> and B<--active-ranges>

Report time ranges traffic is received.  (Default: on.)

=item B<--transactions> and B<--no-transactions>

Report (or do not report) specific transactions,
to estimate transaction time  (Default: off.)

=item B<--no-cache-easy> and B<--cache-easy>

Do not cache (or do cache) of "easy" keys.  (Default: they are cached.)

=item B<--no-prefixes> and B<--prefixes>

Do not emit (or do emit) of +6a records for v6 /64s and +6b records for v4 /24s.
(Default: they are emitted.)

=item B<--no-count-basic> and B<--count-basic>

Do not report (or do report) basic RSSAC002 statsitics:
query and response sizes, rcodes.
Do not report (or do report) counts of client addresses.
(Default: on.)

=item B<--no-count-clients> and B<--count-clients>

Do not report (or do report) counts of client addresses.
(Default: on.)

=item B<--count-service-clients> and B<--no-count-service-clients>

Do not report (or do report) counts of clients per service address.
(Default: off; expensive.)

=item  B<--count-tlds> and B<--no-count-tlds>

Report (or do not report)
counts of normalized TLDs (truncate qname to the last component
and normalize case to lowercase)
as +m records (like +6 records).
(Default: off; expensive.)

=item  B<--legacy-service-field>

DEPRECATED.
When counting service addresses, output it as a +6 key
rather than a +d key.
(Default: off, outputing +d keys.)
(Note that using the +6 key mixes service IPs with client IPs.)

=item B<--mode> MODE

DEPRECATED.
Enable combinations fo features.
MODE is "rssacint" (the default).
Or "service", like rssacint, but with --count-service-clients --no-count-clients --no-count-basic, and --legacy-service-field.
Or "tld", like "rssacint" but with --count-service-clients and --count-tlds.
 
=item B<-d>

Enable debugging output.

=item B<-v>

Enable verbose output.

=item B<--help>

Show help.

=item B<--man>

Show full manual.

=back


=head1 OUTPUT

Output is in rssacint format, a odd format designed to be
suport some basic operations that allow us to compute some information
over data records.  The format is described in L<rssacint_reduce>;
but briefly each line consists of an one-character operator,
a key, and a value.  Sometimes the key has a subkey.

All RSSAC statistics are computations over all rows with the same key.

By default, we output the following information:

=over

=item B<+64:192.0.2.0>

For key "64", we count
how many queries are sent from a given IPv4 address (the subkey).
IPv4 addresses must be a consistent format
(typically dotted-quad format with no leading zeros).
(Per-IP data for num-sources-ipv4 from RSSAC002v2 section 2.6.)

=item B<+66:2001:db8::1>

Key "66" counts
how many queries are sent from a given IPv6 address (the subkey).
IPv6 addresses must be a consistent format
(typically IPv6 preferred form with zero replacement, form 2 in RFC-1884).
(Per-IP data for num-sources-ipv6 from RSSAC002v2 section 2.6.)

=item B<+6a:2001:db8::>

Key "6a" counts
how many queries are sent from a given IPv6 /64 prefix.
IPv6 prefixes must be a consistent format
(typically IPv6 preferred form with zero replacement for the low 64 bits,
form 2 in RFC-1884).
(Per-IP-prefix data for num-sources-ipv6-aggregate RSSAC002v2 section 2.6.)

=item B<+6b:192.0.2.>

Key "6b" counts
how many queries are sent from a given IPv4 /24 prefix.
This field is not actually used in RSSAC002.


=item B<+74:2001:db8::>, B<+76:192.0.2.>

Keys "74" and "76" count
how many unique IPs are sent from the given v4 or v6 prefix.

=item B<+3t04>, B<+3t14>,  B<+3t06>, B<+3t16>, B<+3u04>, B<+3u14>,  B<+3u06>, B<+3u16>

The 16 keys matching pattern "3[tush][01][46]" count
how many TCP (t), UDP (u), TLS (s), or HTTPS (h) 
queries were received (0) or responses were sent (1)
for with IPv4 (4) or IPv6 (6).
(RSSAC002v2 section 2.3.  Note that TLS and HTTP are extensions
as of RSSAC002v5.)

=item B<+4t0:100>, B<+4t1:100>, B<+4u0:100>, B<+4u1:100>

The 16 keys matching pattern "3[tush][01][46]" count
how many TCP (t), UDP (u), TLS (s), or HTTPS (h) 
queries were received (0) or responses were sent (1)
for with IPv4 (4) or IPv6 (6)
of a given size (the number after the :, a subkey).
We process all sizes, and leave binning to L<rssacfin_to_rssacyaml(1)>.
(RSSAC002v2 section 2.4.)

=item B<+50:2>, B<+51:2>

Keys "50" and "51" 
count how many
queries were received (0) or responses were sent (1)
with a given DNS reply code (where reply code is the value after the colon, a subkey).
(RSSAC002v2 section 2.5.)

=item B<&lt;ts>

The earliest timestamp (in Unix epoch seconds) seen.
(Extra, not in RSSAC002.)

=item B<&gt;ts>

The latest timestamp (in Unix epoch seconds) seen.
(Extra, not in RSSAC002.)

=item B<-ts>

A list of all seconds (in Unix epoch seconds) that see traffic.
(Extra, not in RSSAC002.)

=item B<=rfileseqno:site>

A list of all file sequence numbers (given as integers)
seen for a given site.
(Extra, not in RSSAC002.)

=item B<+d:192.0.2.0> and B<+Dd:192.0.2.0>

Keys "s" and "S"
count
the number of queries ("s") and replies ("S") going to
a service IP address given as a subkey.
(One must recognize IPv4 and IPv6 by the address format.)
(Extra, not in RSSAC002.)

=back



=head1 SAMPLE USAGE

=head2 Input

	#fsdb -F t msgid time srcip srcport dstip dstport protocol id qr opcode aa tc rd ra z ad cd rcode qdcount ancount nscount arcount edns_present edns_udp_size edns_extended_rcode edns_version edns_z msglen 
	1	1451192632.254226	128.9.168.85	39142	192.228.79.201	53	udp	21461	0	0	0	0	1	0	0	0	1	0	1	0	0	0	1	4096	0	0	0	40
	2	1451192632.255816	192.228.79.201	53	128.9.168.85	39142	udp	21461	1	0	0	0	1	0	0	0	0	0	1	0	13	15	1	4096	0	0	0	528
	3	1451192637.645691	128.9.168.85	59556	192.228.79.201	53	udp	63206	0	0	0	0	1	0	0	0	1	0	1	0	0	0	1	4096	0	0	0	40
	4	1451192637.647542	192.228.79.201	53	128.9.168.85	59556	udp	63206	1	0	0	0	1	0	0	0	0	0	1	0	13	15	1	4096	0	0	0	525
	5	1451192652.006988	128.9.168.85	42548	192.228.79.201	53	tcp	33197	0	0	0	0	1	0	0	0	1	0	1	0	0	0	1	4096	0	0	0	32
	6	1451192652.008452	192.228.79.201	53	128.9.168.85	42548	tcp	33197	1	0	0	0	1	0	0	0	0	0	1	0	6	12	1	4096	0	0	0	434

(Input may have additional fields: ipttl, rtt, name, type, class.  If
so, they are ignored.  However, parsing does I<not> use proper fsdb
libraries, so the order of the fields in the input scheme is
manditory.)

=head2 Command

    ./message_to_rssacint --file-seqno=1

=head2 Output

	#fsdb -F t key count
	+64:128.9.168.85	1
	+64:128.9.168.85	1
	+64:128.9.168.85	1
	+3t04	1
	+3t14	1
	+3u04	2
	+3u14	2
	+4t0:32	1
	+4t1:434	1
	+4u0:40	2
	+4u1:525	1
	+4u1:528	1
	+50:0	3
	+51:0	3
	-rt	1451192632,1451192637,1451192652
	<ts	1451192632.254226
	=rfileseqno:-	1
	>te	1451192652.008452
	# message_to_rssacint.pl --file-seqno=1


=back

=cut

use strict;
use Pod::Usage;
use Getopt::Long;
use Net::IP;

Getopt::Long::Configure ("bundling");
pod2usage(2) if ($#ARGV >= 0 && $ARGV[0] eq '-?');
my(@orig_argv) = @ARGV;
my($prog) = $0;
my $debug = undef;
my $verbose = undef;
my $cache_easy = 1;
my $count_basic = 1;
my $count_clients = 1;
my $count_service_clients = undef;
my $count_tlds = undef;
my $emit_prefixes = 1;
my $file_seqno = undef;
my $active_ranges = 1;
my $legacy_service_field = undef;
my $mode_str = 'rssacint';
my $queries_each_second = undef;
my $count_transactions = undef;
&GetOptions(
	'active-ranges!' => \$active_ranges,
	'cache-easy!' => \$cache_easy,
	'count-clients!' => \$count_clients,
	'count-service-clients!' => \$count_service_clients,
	'count-tlds!' => \$count_tlds,
	'file-seqno=s' => \$file_seqno,
	'legacy-service-field!' => \$legacy_service_field,
	'm|mode=s' => \$mode_str,
	'queries-each-second!' => \$queries_each_second,
	'transactions!' => \$count_transactions,
	'prefixes!' => \$emit_prefixes,
 	'help|?' => sub { pod2usage(1); },
	'man' => sub { pod2usage(-verbose => 2); },
	'd|debug+' => \$debug,   
        'v|verbose+' => \$verbose) or pod2usage(2);

$active_ranges = 1 if ($queries_each_second);

if ($mode_str eq 'rssacint') {
    # all defaults
} elsif ($mode_str eq 'service') {
    $count_basic = undef;
    $count_clients = undef;
    $count_transactions = undef;
    $count_service_clients = 1;
    $legacy_service_field = 1;
} elsif ($mode_str eq 'tld') {
    $count_clients = undef;
    $count_service_clients = undef;
    $count_tlds = 1;
} else {
    die "message_to_rssacint: unknown mode $mode_str\n";
};
my ($service_clients_field_name) = (defined($legacy_service_field) ? '6' : 'd');

my($out_schema) = "#fsdb -F t key count";
#
# Sigh, schemas have changed:
#
my(@in_schemas) =  (
    # 0: 2016-01: initial format, with misspelling
    '#fsdb msgid time srcip srcport dstip dstport protocol id qr opcode aa tc rd ra z ad cd rcode qdcount ancount nscount arcount edns_present edns_size ends_version ends_flags msglen name type class',
    # 1: 2016-03:   -F t, change edns
    '#fsdb -F t msgid time srcip srcport dstip dstport protocol id qr opcode aa tc rd ra z ad cd rcode qdcount ancount nscount arcount edns_present edns_udp_size edns_extended_rcode edns_version edns_z msglen name type class',
    # 2: 2021-03: add ipttl
    '#fsdb -F t msgid time srcip srcport dstip dstport protocol id qr opcode aa tc rd ra z ad cd rcode qdcount ancount nscount arcount edns_present edns_udp_size edns_extended_rcode edns_version edns_z msglen ipttl name type class',
    # 3: 2024-06: add rtt
    '#fsdb -F t msgid time srcip srcport dstip dstport protocol id qr opcode aa tc rd ra z ad cd rcode qdcount ancount nscount arcount edns_present edns_udp_size edns_extended_rcode edns_version edns_z msglen ipttl rtt name type class',
    # 2024-07: remove rtt (error)---reverts to 2021-03
    # 4: "message" format, sample data without name/type/class, but with ipttl rtt
    '#fsdb -F t msgid time srcip srcport dstip dstport protocol id qr opcode aa tc rd ra z ad cd rcode qdcount ancount nscount arcount edns_present edns_udp_size edns_extended_rcode edns_version edns_z msglen ipttl rtt',
    # 5: "message" format, sample data without name/type/class
    '#fsdb -F t msgid time srcip srcport dstip dstport protocol id qr opcode aa tc rd ra z ad cd rcode qdcount ancount nscount arcount edns_present edns_udp_size edns_extended_rcode edns_version edns_z msglen',
);
    
# error handling
my %unknown_ops;

binmode STDOUT, ":utf8";
print $out_schema . "\n";


my($failed_ips) = 0;
my $MAX_FAILED_IPS = 1000;

my(%easy) = ();
# cache stats counting
# For one file, I see 147 fastpath new seconds, 6165597 fastpath same updates, and 21074 slow paths.
# I.e., caching works very well.
#foreach (qw(x+s x+fi x+fs)) {
#    $easy{$_} = 0;
#}

# because Net::IP is way too slow
# copied from message_question_to_load
sub ip_to_hexip($) {
    my($ip) = @_;
    if ($ip =~ /:/) {
        my $head = '';
        my $tail = '';
        my $in_tail = undef;
        foreach (split(/:/, $ip)) {
            if (!$in_tail && $_ eq '') {
                $in_tail = 1;
                next;
            };
            my $part = substr("0000" . $_, -4);
            if ($in_tail) {
                $tail .= $part;
            } else {
                $head .= $part;
            };
        };
        my $mid = "0" x (32 - length($head) - length($tail));
        return $head . $mid . $tail;
    } else {
        # v4
        my(@f) = split(/\./, $ip);
        return undef if ($#f != 3);
        return sprintf("%02x%02x%02x%02x", @f);
    };
}


sub truncate_ip6($) {
    my($srcip) = @_;
#    # As nice as Net::IP is, it's slow to build objects, so we avoid
#    # it for IPv4.
#    my($srcip_ni) = new Net::IP($srcip);
#    if (!$srcip_ni) {
#	next if ($failed_ips > $MAX_FAILED_IPS);
#	print "+eip-fail:$msgid,$time,$srcip\t1\n";
#	return undef;
#    };
#    my $a_str = $srcip_ni->ip();
#    $a_str =~ s/....:....:....:....$/:/;
#    my $as = new Net::IP($a_str);
#    return $as->short();
    my($srcip_hex) = ip_to_hexip($srcip);
    # truncate to /64
    $srcip_hex =~ s/................$/0000000000000000/;
    # renormalize
    my @parts;
    foreach (unpack("(A4)*", $srcip_hex)) {
        s/^0*//;
        $_ = '0' if ($_ eq '');
        push(@parts, $_);
    };
    for (;;) {
        last if ($#parts == -1 || $parts[$#parts] ne '0');
        pop(@parts);
    };
    if ($#parts < 0) {
        $parts[0] = "::";
    } elsif ($#parts < 8) {
        $parts[$#parts] .= "::";
    };
    return join(":", @parts);
}

sub truncate_ip4($) {
    my($srcip) = @_;
    # IPv4 is much easier than IPv6
    # Assume input is in dotted quad.
    my(@parts) = split(/\./, $srcip);
    return undef if ($#parts != 3);
    $parts[3] = '';
    return join('.', @parts);
}

######################################################################
# Rangelist code.
# same code in message_to_rssacinc.pl and rssacint_reduce.pl

#
# Take the string form of a rangelist and break it into start and end arrays.
#
sub decompose_rangelist($) {
    my($rl_str) = $_[0];
    my(@ss, @es);
    foreach (split(/,/, $rl_str)) {
	my($s, $e) = split(/-/, $_);
	die "unparsable range $_\n" if (!defined($s));
	$e //= $s;
	push(@ss, $s);
	push(@es, $e);
    };
    return (\@ss, \@es);
			}

sub min($$) {
    return $_[0] < $_[1] ? $_[0] : $_[1];
}

#
# Take two range lists (format like: 1-2,4,6-7)
# and merge them.
#
sub merge_ranges($$;$) {
    my(@rangelists) = ($_[0], $_[1]);
    my($report_overlap_as_error) = $_[2];

    #
    # fast path
    # optimize appending a simple other on a ranged one
    # merge_ranges("1-2", "3") => "1-3"
    #
    my($one, $other) = (undef, undef);
    if ($rangelists[1] =~ /^\d+$/) {
	($one, $other) = (0, 1);
    } elsif ($rangelists[0] =~ /^\d+$/) {
	($one, $other) = (1, 0);
    };
    if (defined($other) && $rangelists[$one] =~ /\-(\d+)$/) {
	# can try to fastpath
	# one:  1-2   (or more complex)
	# other:    3 (hopefully)
	my($one_e) = $1;
	my($other_s) = $rangelists[$other];
        if ($one_e == $other_s) {
            # no change needed
	    # $easy{'x+fs'}++;
            if ($report_overlap_as_error) {
		print ":eoverlapping-regions\t$rangelists[$one]+$other_s\n" if ($report_overlap_as_error > 1);
		return $rangelists[$one] . "/e";
	    } else {
		return $rangelists[$one];
	    };
        } elsif ($one_e + 1 == $other_s) {
	    # $easy{'x+fi'}++;
            $rangelists[$one] =~ s/(\D?)(\d+)$/$1$other_s/;
            return $rangelists[$one];
    	};
	# fall through for slow path
    };
    # $easy{'x+s'}++;

    #
    # slow path
    #
    # Decompose comma-separated list into array of ranges (start and ends).
    #    
    my(@ss, @es);
    foreach (0..1) {
        ($ss[$_], $es[$_]) = decompose_rangelist($rangelists[$_]);
    };

    #
    # Count how many lists each range occurs in.
    # If there is overlap, make more intermediate ranges.
    #
    # On exit of this loop, we have ONE rangelist in an array, plus counts.
    #
    # (And ick: this code is ALL corner cases.)
    #
    my(@count, @s, @e);
  buildcount:
    while (1) {
	#
	# Check if either side has drained.
	#
	foreach $one (0, 1) {
	    # print "checking $one for emtpy, is $#{$ss[$one]}\n";
	    $other = 1 - $one;
	    if ($#{$ss[$one]} == -1) {
		push(@count, (1) x ($#{$ss[$other]} + 1));
		push(@s, @{$ss[$other]});
		push(@e, @{$es[$other]});
		last buildcount;
	    };
	};
	#
	# assert(have stuff left in both)
	#
	# Make $one be the one the starts first
	# (so we only have a million cases, not four million.)
	#
	my($new_count) = 1;
	if ($ss[0][0] < $ss[1][0]) {
	    ($one, $other) = (0, 1);
	} elsif ($ss[0][0] > $ss[1][0]) {
	    ($one, $other) = (1, 0);
	} else {
	    # both start at same time
	    $new_count = 2;
	    # $one becomes the one that ends first
	    if ($es[0][0] <= $es[1][0]) {
		($one, $other) = (0, 1);
	    } else {
		($one, $other) = (1, 0);
	    };
	};
	#
	# assert($lists[$one] starts first (or at same time))
	#
	my($consume_one) = undef;
	push(@count, $new_count);
	push(@s, $ss[$one][0]);
	if ($ss[$one][0] < $ss[$other][0]) {
	    # one starts first
	    if ($es[$one][0] < $ss[$other][0]) {
		# and ends before other
		# +----+
		#         +----+
		# or abutts other (in which case we will merge later)
		# +----+
		#       +----+
		push(@e, $es[$one][0]);
		$consume_one = 1;
	    } elsif ($es[$one][0] >= $ss[$other][0]) {
		# and overlaps with other
		# +----+
		#      +----+
		# or
		# +----+
		#    +----+
		push(@e, $ss[$other][0]-1);
		$ss[$one][0] = $ss[$other][0];
		$consume_one = 0;
	    } else {
		die "invariant violated: one $one starts first\n";
	    };
	} elsif ($ss[$one][0] == $ss[$other][0]) {
	    # start at same time
	    push(@e, $es[$one][0]);
	    $consume_one = 1;
	    if ($es[$one][0] < $es[$other][0]) {
		# but one ends first
		# +----+
		# +--------+
		$ss[$other][0] = $es[$one][0]+1;
	    } elsif ($es[$one][0] == $es[$other][0]) {
		# complete overlap
		# +----+
		# +----+
		#
		# so also consume other here:
		shift @{$ss[$other]};
		shift @{$es[$other]};
	    } else {
		die "invariant violated: one $one and other $other start at same time and other ends first\n";
	    };
	} else {
	    die "invariant violated: one $one starts after other $other\n";
	};
	if ($consume_one) {
	    shift @{$ss[$one]};
	    shift @{$es[$one]};
	};
    };

    #
    # We now have a clean, single rangelist in an array, with counts.
    #
    # Now concatinate adjacent ranges and report overlap.
    #
    my($out, $error_out) = ("", "");
    while ($#s != -1) {
	if ($count[0] == 2) {
	    if ($report_overlap_as_error) {
		print ":eoverlapping-regions\t$s[0]-$e[0]\n" if ($report_overlap_as_error > 1);
		$error_out = "/e";
	    };
	};
	# merge?
	if ($#s >= 1) {
	    if ($e[0]+1 >= $s[1]) {
		$s[1] = $s[0];
		shift @count;
		shift @s;
		shift @e;
		# no output
		next;
	    };
	};
	$out .= "," if ($out ne "");
	$out .= ($s[0] == $e[0] ? $s[0] : $s[0] . "-" . $e[0]);
	shift @count;
	shift @s;
	shift @e;
    };
    return $out . $error_out;
}

######################################################################

# same code in message_to_rssacinc.pl and rssacint_reduce.pl
sub reduce_pair($$$$;$$) {
    my($op, $matcher, $last_value, $value, $last_unique, $unique) = @_;
    # reduce!
    if ($op eq '+') {
        $value += $last_value;             
    } elsif ($op eq '!') {
	die "internal error: undef unique\n" if (!defined($unique));
	if (!defined($last_unique)) {
	    $value = 1;
	} else {
	    if ($unique ne $last_unique) {
		$value = $last_value + 1;
	    } else {
		$value = $last_value;
	    };
	};
    } elsif ($op eq '-') {
	# range
	$value = merge_ranges($last_value, $value);
    } elsif ($op eq '=') {
	# ranges with overlap reporting
	$value = merge_ranges($last_value, $value, 2);
    } elsif ($op eq '<') {
        # lexical comparision! (not numeric)
        $value = $last_value if ($last_value lt $value);
    } elsif ($op eq '>') {
        $value = $last_value if ($last_value gt $value);
    } elsif ($op eq 'q') {
	return undef;
    } else {
	# pass other operators through unchanged
	$unknown_ops{$op} //= 0;
	$unknown_ops{$op}++;
	return undef;
    };
    return $value;
}


######################################################################


sub cachable_output($$$) {
    my($op, $key, $value) = @_;

    if (!$cache_easy) {
	print "$key\t$value\n";
	return;
    };
    if (defined($easy{$key})) {
	$easy{$key} = reduce_pair($op, $key, $easy{$key}, $value);
    } else {
	$easy{$key} = $value;
    };
}

sub flush_cachable_output() {
    foreach (sort keys %easy) {
	print "$_\t$easy{$_}\n";
    };
}

#
# take a qname and produce a clean TLD
#
# input qname is binary
# (clean means: unicode-clean, no : (my thing),
# and normalize case to lowercase.)
#
sub name_to_tld($) {
    my($name) = @_;
    # all names must be terminated
    return '-' if (!defined($name) || $name !~ /\.$/);
    # keep just the last component
    $name =~ s/^.*\.([^\.]+\.)$/$1/;
    # encode : as \072, the octal encoding
    # This follows ldns_rdf2str's convention, for better or worse.
    # Note we assume that all other chars have ALREADY beeen so processed.
    $name =~ s/:/\\072/g;
    # fold case, but only for US ASCII, not all unicode (lc is too powerful)
    $name =~ tr/A-Z/a-z/;
    return $name;
}

sub process_file($) {
    my($file) = @_;
    my($open_mode, $open_place) = ("<:utf8", $file);
    if ($file eq '-') {
	($open_mode, $open_place) = ("<&=", 0);
    };
    my $in;
    if (!open($in, $open_mode, $open_place)) {
	print "+eopen-fail:$file\t1\n";
	return;
    };
    if (defined($file_seqno) && $file_seqno ne 'comment') {
	my(@parts) = split(/:/, $file_seqno);
	unshift(@parts, "-") if ($#parts < 1);
	die ("--file-seqno ($parts[1] is not digits or site:digits)\n")
	    if ($parts[1] !~ /^\d+/);
	# extra: keep track of what files we scanned, with site
        cachable_output("=", "=rfileseqno:$parts[0]", $parts[1]);
    };
    my($schema_version) = undef;
    while (<$in>) {
	chomp;
        if (/^#/) {
            if (!defined($schema_version) && /^#fsdb/) {
                $schema_version = undef;
                my $i = 0;
                for my $schema (@in_schemas) {
                    if ($_ =~ /^$schema/) {
                        $schema_version = $i;
                        last;
                    };
                    $i++;
                }
                die "unexpected schema: $_\n"
                    if (!defined($schema_version));
                print "# schema #$schema_version\n" if ($verbose);
		next;
            };
	    if ($file_seqno eq 'comment') {
		my($seqno, $site);
                if (/^#.*dnsanon .*-f \d+-\d+-(\d+)\.([\-\w-]+)/) {
                    ($seqno, $site) = ($1, $2);
                } elsif (/^#.*dnstapmq .*\d+-\d+-(\d+)\.([\-\w]+)\./) {
                    ($seqno, $site) = ($1, $2);
                    };
                if (defined($seqno)) {
                    $site //= '-';
		    cachable_output("=", "=rfileseqno:$site", $seqno);
		    next;
		};
	    };
	    print "$_\n";
            next;
        };
	my($msgid, $time, $srcip, $srcport, $dstip, $dstport, $protocol, $id, $qr, $opcode, $aa, $tc, $rd, $ra, $z, $ad, $cd, $rcode, $qdcount, $ancount, $nscount, $arcount, $edns_present, $edns_udp_size, $edns_extended_rcode, $edns_version, $edns_z, $msglen, $ipttl, $rtt, $name, $type, $class, $edns_flags);
        my(@f) = split(/\s+/);
        # This is the most crocked way to do fsdb parsing ever. :-(
        if ($schema_version == 0) {
            ($msgid, $time, $srcip, $srcport, $dstip, $dstport, $protocol, $id, $qr, $opcode, $aa, $tc, $rd, $ra, $z, $ad, $cd, $rcode, $qdcount, $ancount, $nscount, $arcount, $edns_present, $edns_udp_size, $edns_version, $edns_flags, $msglen, $name, $type, $class) = @f;
        } elsif ($schema_version == 1) {
	    ($msgid, $time, $srcip, $srcport, $dstip, $dstport, $protocol, $id, $qr, $opcode, $aa, $tc, $rd, $ra, $z, $ad, $cd, $rcode, $qdcount, $ancount, $nscount, $arcount, $edns_present, $edns_udp_size, $edns_extended_rcode, $edns_version, $edns_z, $msglen, $name, $type, $class) = @f;
        } elsif ($schema_version == 2) {
	    ($msgid, $time, $srcip, $srcport, $dstip, $dstport, $protocol, $id, $qr, $opcode, $aa, $tc, $rd, $ra, $z, $ad, $cd, $rcode, $qdcount, $ancount, $nscount, $arcount, $edns_present, $edns_udp_size, $edns_extended_rcode, $edns_version, $edns_z, $msglen, $ipttl, $name, $type, $class) = @f;
        } elsif ($schema_version == 3) {
	    ($msgid, $time, $srcip, $srcport, $dstip, $dstport, $protocol, $id, $qr, $opcode, $aa, $tc, $rd, $ra, $z, $ad, $cd, $rcode, $qdcount, $ancount, $nscount, $arcount, $edns_present, $edns_udp_size, $edns_extended_rcode, $edns_version, $edns_z, $msglen, $ipttl, $rtt, $name, $type, $class) = @f;
#            # A bug in dnsanon omits the tab after cd.  Fixed on 2025-09-25,
#            # but work around the problem in old data.
            # xxx: no this was a cut-and-paste problem.
#            if (length($cd) > 1) {
#                my($true_cd, $true_rcode) = ($cd =~ /^(.)(.*)$/);
#                ($cd, $rcode, $qdcount, $ancount, $nscount, $arcount, $edns_present, $edns_udp_size, $edns_extended_rcode, $edns_version, $edns_z, $msglen, $ipttl, $rtt, $name, $type, $class) = ($true_cd, $true_rcode, @f[17..32]);
#            };
        } elsif ($schema_version == 4) {
            ($msgid, $time, $srcip, $srcport, $dstip, $dstport, $protocol, $id, $qr, $opcode, $aa, $tc, $rd, $ra, $z, $ad, $cd, $rcode, $qdcount, $ancount, $nscount, $arcount, $edns_present, $edns_udp_size, $edns_extended_rcode, $edns_version, $edns_z, $msglen, $ipttl, $rtt) = @f;
        } elsif ($schema_version == 5) {
            ($msgid, $time, $srcip, $srcport, $dstip, $dstport, $protocol, $id, $qr, $opcode, $aa, $tc, $rd, $ra, $z, $ad, $cd, $rcode, $qdcount, $ancount, $nscount, $arcount, $edns_present, $edns_udp_size, $edns_extended_rcode, $edns_version, $edns_z, $msglen) = @f;
        } else {
            die "unknown schema\n";
        };

	# extra: keep track of measurement period
        cachable_output("<", "<ts", $time);
        cachable_output(">", ">te", $time);
	# extra: keep track of which seconds have activity
	my($time_secs) = undef;
	if ($active_ranges) {
	    if ($time =~ /^(\d+)(\.\d*)$/) {
	        $time_secs = $1;
		cachable_output("-", "-rt", $time_secs);
	    };
	};
        my($v46) = ($srcip =~ /\./ ? 4 : 6);
        # map TLS and DOx to different chars
        if ($protocol eq "tls" || $protocol eq "dot") {
            $protocol = "ssl";
        } elsif ($protocol eq "doh") {
            $protocol = "https";
        };
	my($short_protocol) = ($protocol =~ /^(.)/);
        #
        # rssac-002v2 section 2.3: number of queries
        # (we do this for every mode)
        #
        cachable_output("+", "+3$short_protocol$qr$v46", 1);
        # extra: queries per second
        if ($queries_each_second && defined($time_secs)) {
            # we don't split out v4 / v6
            cachable_output("+", "+3S$qr:$time_secs", 1);
        };
        if ($count_basic) {
            # rssac-002v2 section 2.4: query and response size distribution
            cachable_output("+", "+4$short_protocol$qr:$msglen", 1);
            my($final_rcode) = $rcode;
            $final_rcode = $edns_extended_rcode * 16 + $rcode if (defined($edns_extended_rcode) && $edns_extended_rcode ne '-');
            # rssac-002v2 section 2.5: rcode distribution (responses only, but we do both)
            cachable_output("+", "+5$qr:$final_rcode", 1);
        };
        if ($count_clients) {
            # rssac-002v2 section 2.6: unique sources
            if ($qr == 0) {
                print "+6$v46:$srcip\t1\n";
                if ($emit_prefixes) {
                    my($prefix) = ($v46 eq '6' ? truncate_ip6($srcip) : truncate_ip4($srcip));
                    print "+6" . ($v46 eq '6' ? 'a' : 'b') . ":$prefix\t1\n" if ($prefix);
                };
            };
        };
        # extra: transactions
        if ($count_transactions) {
                my($client_ip, $client_port) = ($qr == 0 ? ($srcip, $srcport) : ($dstip, $dstport));
                print "q$short_protocol$client_ip,$client_port,$id\t$qr,$time\n";
        };
        if ($count_service_clients) {
            #
            # custom for servicerssacint:
            # we want to count unique IPs to each service address
            #
            if ($qr == 0) {
                my $clean_dstip = $dstip;
                $clean_dstip =~ s/:/_/g;
                print "+$service_clients_field_name$v46$clean_dstip:$srcip\t1\n";
                if ($emit_prefixes) {
                    my($prefix) = ($v46 eq '6' ? truncate_ip6($srcip) : truncate_ip4($srcip));
                    print "+$service_clients_field_name" . ($v46 eq '6' ? 'a' : 'b') . "$clean_dstip:$prefix\t1\n" if ($prefix);
                };
            };
        };
        if ($count_tlds) {
            #
            # custom for tld_rssacint
            # we want to count clients ot each unique TLD.
            #
            if ($qr == 0) {
                my $clean_tld = name_to_tld($name);
                print "+m$v46$clean_tld:$srcip\t1\n";
                if ($emit_prefixes) {
                    my($prefix) = ($v46 eq '6' ? truncate_ip6($srcip) : truncate_ip4($srcip));
                    print "+m" . ($v46 eq '6' ? 'a' : 'b') . "$clean_tld:$prefix\t1\n" if ($prefix);
                };
            };
        };
        # extra: queries per service address
        # extra: replies per service address, added 2025-06-09
        if ($qr == 0) {
            cachable_output("+", "+s:$dstip", 1);
        } elsif ($qr == 1) {
            cachable_output("+", "+S:$srcip", 1);
        };
    };
    close $in;
    flush_cachable_output();
    print "+eip-fail:too-many\t$failed_ips\n" if ($failed_ips > $MAX_FAILED_IPS);
    print "# message_to_rssacint " . join(" ", @orig_argv) . "\n";
};

push (@ARGV, "-") if ($#ARGV == -1);
foreach (@ARGV) {
    process_file($_);
};

exit 0;

=head1 SEE ALSO

L<dnsanon(1)>,
L<message_to_rssacint(1)>,
L<rssacint_reduce(1)>,
L<rssacfin_to_rssacyaml(1)>


=head1 AUTHOR and COPYRIGHT

This program was written by John Heidemann.

Copyright (C) 2016-2025 University of Southern California.

This program is distributed under terms of the GNU general
public license, version 2.  See the file COPYING
with the distribution for details.

=cut


