#!/usr/bin/perl
# returns stats from an apache log file (ver 1.3.20 on Mandrake)
# Counts bums on seats and hits for visitors, robots and viri
# Gerry Patterson, May 2002
use Time::Local;
use Getopt::Std;
require "/usr/local/sbin/agent_id";
require "/usr/local/sbin/agent_data";

# options:
#   -a  print a list of agents.
#   -A  print a list of agents (using short name).
#   -d  print a list of referer domains with stats.
#   -f  print a list of referers with stats.
#   -i [input_file] read initial variables from [input_file].
#   -n  print a list of hosts (IP address) and visits.
#   -N  print a list of subnets and visits.
#   -o [output_file] write final value of variables to [output_file].
#   -p  print a list of pages with stats.
#   -r  print a list of robot agents.
#   -R  print a list of robot hits.
#   -s  print a summary of statistics (can be used to update HTML).
#   -v  print a list of visitor agents.
#   -V  print a list of visitor hits.
#   -w  print a list of hits from worms

# ------------------------------------------------------------------------

# Fill the global %Suspects hash from the confirmed rows of robot_suspects.
# Every row adds one (start, end) pair of epoch times to the flat list kept
# under that row's IP address, so an address can carry several intervals.
# NOTE(review): chomp_sql() is not defined in this part of the file; it is
# presumably supplied by one of the required files and returns one
# tab-separated string per row -- confirm.
sub LoadSuspects{
	my $query = "select ip_addr,start_time,end_time from robot_suspects where confirmed;";
	for my $row (chomp_sql($query)){
		my @cols = split(/\t/, $row);
		# convert both interval bounds before touching %Suspects
		my @window = map { time_psql($_) } @cols[1, 2];
		push(@{$Suspects{$cols[0]}}, @window);
	}
}

# ------------------------------------------------------------------------

# Report whether a hit falls inside a recorded suspect interval.
#   $_[0]  IP address, used as the key into the global %Suspects
#   $_[1]  time of the hit, compared numerically with the stored bounds
# %Suspects maps an address to a flat list of (start, end) pairs; both ends
# of an interval count as inside.  Returns 1 on a match, otherwise 0.
sub is_suspect{
	my ($ip, $when) = @_;
	my $windows = $Suspects{$ip};
	return(0) unless ($windows);
	my @bounds = @{$windows};
	# walk the flat list two entries at a time
	for (my $i = 0; $i < @bounds; $i += 2){
		my ($from, $to) = @bounds[$i, $i + 1];
		return(1) if ($from <= $when && $when <= $to);
	}
	return(0);
}

# ------------------------------------------------------------------------

# Convert a psql timestamp string to epoch seconds with timelocal().
#   $_[0]  text such as "2002-05-01 12:34:56+10"
# A trailing "+digits" zone suffix is discarded; the remaining date and time
# fields are then interpreted in the local timezone of this process.
sub time_psql{
	(my $stamp = $_[0]) =~ s/\+[0-9]+$//;
	my ($date, $clock) = (split(' ', $stamp))[0, 1];
	# Reversing year-month-day followed by hour:min:sec gives
	# (sec, min, hour, mday, mon, year), the order timelocal() takes.
	my @fields = reverse(split(/-/, $date), split(/:/, $clock));
	$fields[4]--;	# timelocal() counts months from 0
	return timelocal(@fields);
}

# ------------------------------------------------------------------------

# Format epoch seconds as a psql-style "YYYY-MM-DD HH:MM:SS" string, broken
# down in the local timezone.
#   $_[0]  epoch seconds
sub psql_time{
	my ($sec, $min, $hour, $mday, $mon, $year) = localtime($_[0]);
	# localtime() counts months from 0 and years from 1900
	return sprintf("%04d-%02d-%02d %02d:%02d:%02d",
		$year + 1900, $mon + 1, $mday, $hour, $min, $sec);
}

# ------------------------------------------------------------------------

# Split one line of an Apache 'combined' format log into its fields.
#   $_[0]  the raw log line, trailing newline still attached
# Returns the list
#   (remote IP, epoch time, request, status, bytes, referer, agent)
# or an empty list when the hit comes from an excluded address.
# Dies when the line fails the sanity check below.
# Reads the global %mth (month abbreviation => 0-based month number).
sub parse_log{
	my @w = split ( ' ', $_[0]);
	# First perform a sanity check
	# My log uses the 'combined' format as follows:
	# remotehost login authuser [date] "request" status bytes "Referer" "Agent"
	# where: remotehost = IP address
	#        [date]     = timestamp and tz (always +1000 or +1100 for VIC)
	#        login      = remote login as per RFC931 (always -)
	#        authuser   = authenticated username (always -)
	#        "request"  = request cmd sent from the remote agent
	#        status     = numeric status returned by apache
	#        bytes      = number of bytes transmitted
	#        "Referer"  = URL of the Referer
	#        "Agent"    = name of the remote user agent
	# This will have to be customised for individual sites
	die "Error insane at line $.:$_[0]" unless
		( $w[0] =~ /[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/ &&
		  $w[1] =~ /-/ && $w[2] =~ /-/ &&
		  $w[4] =~ /^\+1[01]00\]$/);
	# Exclude these subnets/addresses.  Each pattern is anchored so that it
	# matches the whole address (or the leading octets of a subnet); left
	# unanchored, /1\.2\.3\./ would also discard hosts such as 11.2.3.4 or
	# 211.2.3.77.  The third entry lies inside the second and is kept only
	# as a template for single-address exclusions.
	return() if ( $w[0] =~ /^127\.0\.0\.1$/     ||
		      $w[0] =~ /^1\.2\.3\./         ||
		      $w[0] =~ /^1\.2\.3\.1[01]$/ );
	# remove the '[' from the date and convert to timestamp with timelocal()
	$w[3] =~ s/^\[//;
	my @t = split(/:/,$w[3]);
	my @d = split( /\//,shift( @t) );
	# @t now holds (hour, min, sec) and @d holds (day, month name, year).
	# The four-digit year is passed as-is: timelocal() takes years above 999
	# literally, whereas year-1900 sends pre-2000 dates through its
	# two-digit rolling-century guess.
	my @Htime = (reverse(@t),$d[0],$mth{$d[1]},$d[2]);
	my $ltime = timelocal(@Htime);
	# Original HTML cmd, referer and agent are all enclosed in '"'
	@t = split( '"',$_[0]);
	# extract the status and size
	$t[2] =~ s/^\s+//;
	my @t1 = split(' ',$t[2]);
	# Make allowance for '"' embedded in query strings
	# (split on ' "' and remove trailing '"')
	my @t2 = split( ' "',$_[0]);
	chop $t2[2];		# closing '"' of the referer
	chomp $t2[3];		# newline after the agent ...
	chop $t2[3];		# ... then its closing '"'
	return($w[0],$ltime,$t[1],$t1[0],$t1[1],$t2[2],$t2[3]);
}

# ------------------------------------------------------------------------

# init variables
LoadSuspects();
# month abbreviation => 0-based month number (used by parse_log)
%mth = qw(Jan 0 Feb 1 Mar 2 Apr 3 May 4 Jun 5 Jul 6 Aug 7 Sep 8 Oct 9 Nov 10 Dec 11);
$virus_bum = $virus_hit = $robot_bum = $robot_hit = $visitor_bum = $visitor_hit = 0;
# -i and -o take an argument; getopt() treats every other switch as a
# boolean and sets the matching $opt_X variable.
getopt('io');
# require() loads a name directly only when it is absolute or starts with
# "./" or "../"; any other name is searched for in @INC, which has not
# contained "." since perl 5.26.  Anchor bare relative names to the current
# directory so that "-i stats.pl" loads the same file the -f test sees.
my $state_path = sub {
	my $name = $_[0];
	return ($name =~ m{^\.{0,2}/}) ? $name : "./$name";
};
if ( $opt_i){
	# initialise the counters from this file
	my $init_file = $state_path->($opt_i);
	require $init_file;
}
if ( $opt_o){
	# if the output file exists, initialise the counters from this file
	if (-f $opt_o){
		my $previous_file = $state_path->($opt_o);
		require $previous_file;
	}
}
# Main pass over the log: classify every hit as virus, robot or visitor and
# update the counters and the per-agent, per-IP, per-page and per-referer
# tallies printed by the report sections.
# NOTE(review): @agent_array, %AgentID and @RobotArray are not initialised
# in this part of the file; presumably one of the required files supplies
# them -- confirm.
while(<>){
	next unless (($IP,$htime,$cmd,$status,$bytes,$referer,$agent) = parse_log($_));
	print "$cmd\n" if ($opt_c);
	my @t = split( ' ',$cmd);
	# only GET and HEAD requests are counted
	next unless ($t[0] =~ /^GET$/i || $t[0] =~ /^HEAD$/i );
	# strip the scheme (http or https), the www part, the query part and a
	# trailing '/' from referer
	$referer =~ s/^https?:\/\///;
	$referer =~ s/^www\.//;
	$referer =~ s/\?.*$//;
	$referer =~ s/\/$//;
	$page = $t[1];
	# Construct a robot gang ID from the 24-bit subnet and the agent string
	$subnet = $IP;
	$subnet =~ s/\.[0-9]+$//;
	$GangID = $subnet . ":" . $agent;
	# Microsoft viri don't identify themselves (agent and referer is "-")
	if (length($agent) < 2){
		# it's a worm or an attack -- forget it (for now)
		# Some time (when I get round to it), I will put some code here
		# to analyse behaviour patterns of Microsoft worms
		$agent = "-";
		# assume it is a virus if agent is "-"
		$type = "virus";
		# a new bum on seat when this IP was last seen > 3000 seconds ago
		$virus_bum++ if ( $htime - $timeIP{$IP} > 3000 );
		$virus_hit++;
		print $_ if ($opt_w);
		# Record the hit and move on; falling through would count the
		# same hit a second time as a robot or a visitor.
		$timeIP{$IP} = $htime;
		$typeIP{$IP} = $type;
		next;
	}
	# Look up the numeric id of this agent, registering it on first sight.
	# The test is on definedness because 0 is a valid id (the first slot
	# of @agent_array) and must not trigger a second registration.
	$agent_id = $AgentID{$agent};
	unless (defined $agent_id){
		push (@agent_array, $agent);
		$agent_id = $AgentID{$agent} = $#agent_array;
	}
	# ignore these agents (put submission aids here)
	next if ( $agent =~ /^W3C_Validator\/1.183/ ||
		  $agent =~ /^W3C-checklink\/2.90/
							);
	if ($RobotArray[$agent_id] || is_suspect($IP,$htime)){
		$type = "robot";
		# Count a robot visit when this subnet/agent gang has been quiet
		# for more than 4800 seconds
		if ( $htime - $Gang{$GangID} > 4800){
			$robot_bum++;
			$robot_agent_bum{$agent}++;
			# make sure the agent has a hit entry
			$robot_agent_hit{$agent} += 0;
			print psql_time($htime) . " $IP\t$agent\n" if ($opt_r);
		}
		print $_ if ($opt_R);
		$Gang{$GangID} = $htime;
		$robot_agent_hit{$agent}++;
		$robot_hit++;
	}
	else {
		$type = "visitor";
		# Count bum on seat if the last hit for this IP is more than
		# 3000 seconds (50 minutes) old
		if ( $htime - $timeIP{$IP} > 3000 ){
			$visitor_bum++;
			$visitor_agent_bum{$agent}++;
			# make sure the agent has a hit entry
			$visitor_agent_hit{$agent} += 0;
			print psql_time($htime) . " $IP\t$agent\n" if ($opt_v);
		}
		# requests for icons and images are not counted as hits
		unless ( $page =~ /^\/icons/ || $page =~ /^\/images/){
			$visitor_hit++;
			$visitor_agent_hit{$agent}++;
			$visitor_ip_hit{$IP}++;
			$visitor_page_hit{$page}++;
			$visitor_ref_hit{$referer}++;
			print $_ if ($opt_V);
		}
	}
	# remember when this IP was last seen and how it was classified
	$timeIP{$IP} = $htime;
	$typeIP{$IP} = $type;
}
# This next section produces output according to the options ...

# --------------------------------------------------------------

# Write the variables to disk -- only do this when the log files are rolled.
# The file is a fragment of perl that assigns the six counters and ends in a
# true value, so that a later run can load it back with require.
if ( $opt_o){
	# Three-argument open on a lexical handle: the file name is never
	# parsed for a mode or pipe character.
	open(my $outs, '>', $opt_o) || die "Error opening $opt_o for output: $!\n";
	print {$outs} '$virus_bum='."$virus_bum; ".'$virus_hit='."$virus_hit;\n";
	print {$outs} '$robot_bum='."$robot_bum; ".'$robot_hit='."$robot_hit;\n";
	print {$outs} '$visitor_bum='."$visitor_bum; ".'$visitor_hit='."$visitor_hit;\n1;\n";
	# buffered write errors (e.g. a full disk) only surface at close
	close($outs) || die "Error closing $opt_o: $!\n";
}
# -a: list every full-length agent string with its statistics, robots first
# and visitors second.  Columns are bums on seats, hits and the agent string;
# each list is in sorted agent order.
if ($opt_a){
	my @sections = (
		["Robots:\n",   \%robot_agent_bum,   \%robot_agent_hit],
		["Visitors:\n", \%visitor_agent_bum, \%visitor_agent_hit],
	);
	foreach my $section (@sections){
		my ($heading, $bums, $hits) = @{$section};
		print $heading;
		foreach my $agent_name (sort keys %{$hits}){
			printf("%6d %6d %s\n", $bums->{$agent_name}, $hits->{$agent_name}, $agent_name);
		}
	}
}
# Grand totals are printed whenever -r or -v is given
if ($opt_r){
	print "Total robots: $robot_bum  hits: $robot_hit\n";
}
if ($opt_v){
	print "Total visitors: $visitor_bum  hits: $visitor_hit\n";
}
# -A: statistics per short agent name.  The long-name tallies are folded
# into the short names returned by which_browser() / which_robot() (both
# defined outside this part of the file), then each group is printed with
# the highest hit count first.
if ($opt_A){
	# visitors: fold long agent strings into short browser names
	for my $long_name (sort keys %visitor_agent_hit){
		$Agent = which_browser($long_name);
		$Visitor_hit{$Agent} += $visitor_agent_hit{$long_name};
		$Visitor_bum{$Agent} += $visitor_agent_bum{$long_name};
	}
	my @visitors = sort { $Visitor_hit{$b} <=> $Visitor_hit{$a} } keys %Visitor_hit;
	for my $short_name (@visitors){
		printf("Visitor %6d %6d %s\n", $Visitor_bum{$short_name}, $Visitor_hit{$short_name}, $short_name);
	}
	# robots: the same again with the robot name mapping
	for my $long_name (sort keys %robot_agent_hit){
		$Agent = which_robot($long_name);
		$Robot_hit{$Agent} += $robot_agent_hit{$long_name};
		$Robot_bum{$Agent} += $robot_agent_bum{$long_name};
	}
	my @robots = sort { $Robot_hit{$b} <=> $Robot_hit{$a} } keys %Robot_hit;
	for my $short_name (@robots){
		printf("Robot   %6d %6d %s\n", $Robot_bum{$short_name}, $Robot_hit{$short_name}, $short_name);
	}
}
# -p: pages ranked by hit count, busiest first; the visitor tallies are
# listed before the robot tallies.
if ($opt_p){
	my @visitor_pages = sort { $visitor_page_hit{$b} <=> $visitor_page_hit{$a} }
		keys %visitor_page_hit;
	for my $page_name (@visitor_pages){
		printf("Visitor %6d %s\n", $visitor_page_hit{$page_name}, $page_name);
	}
	my @robot_pages = sort { $robot_page_hit{$b} <=> $robot_page_hit{$a} }
		keys %robot_page_hit;
	for my $page_name (@robot_pages){
		printf("Robot   %6d %s\n", $robot_page_hit{$page_name}, $page_name);
	}
}
# -f: referers ranked by visitor hits, busiest first
if ($opt_f){
	my @ranked_refs = sort { $visitor_ref_hit{$b} <=> $visitor_ref_hit{$a} }
		keys %visitor_ref_hit;
	for my $ref (@ranked_refs){
		printf("%6d %s\n", $visitor_ref_hit{$ref}, $ref);
	}
}
# -d: referer hits summed per domain, busiest domain first.  The domain is
# whatever precedes the first '/' of the stored referer.
if ($opt_d){
	for my $ref (keys %visitor_ref_hit){
		$domain = (split(/\//, $ref))[0];
		$domain_ref{$domain} += $visitor_ref_hit{$ref};
	}
	my @ranked_domains = sort { $domain_ref{$b} <=> $domain_ref{$a} } keys %domain_ref;
	for my $name (@ranked_domains){
		printf("%6d %s\n", $domain_ref{$name}, $name);
	}
}
# -s: site summary (for update of HTML or whatever): one line per class of
# traffic giving bums on seats followed by hits.
if ($opt_s){
	my @rows = (
		["Virus   %6d %6d\n", $virus_bum,   $virus_hit],
		["Robot   %6d %6d\n", $robot_bum,   $robot_hit],
		["Visitor %6d %6d\n", $visitor_bum, $visitor_hit],
	);
	for my $row (@rows){
		printf($row->[0], $row->[1], $row->[2]);
	}
}
# -n: visitor hosts (IP addresses) ranked by hits, busiest first
if ($opt_n){
	my @ranked_hosts = sort { $visitor_ip_hit{$b} <=> $visitor_ip_hit{$a} }
		keys %visitor_ip_hit;
	for my $host (@ranked_hosts){
		printf("%-16s %6d\n", $host, $visitor_ip_hit{$host});
	}
}
# -N: visitor hits summed per subnet, busiest subnet first.  The subnet is
# the host address with its final ".number" component removed.
if ($opt_N){
	for my $host (keys %visitor_ip_hit){
		$subnet = $host;
		$subnet =~ s/\.[0-9]+$//;
		$subnet_hit{$subnet} += $visitor_ip_hit{$host};
	}
	my @ranked_subnets = sort { $subnet_hit{$b} <=> $subnet_hit{$a} } keys %subnet_hit;
	for my $net (@ranked_subnets){
		printf("%-16s %6d\n", $net, $subnet_hit{$net});
	}
}
