#!/usr/bin/perl
# Returns stats from an apache log file (ver 1.3.20 on Mandrake).
# Counts bums on seats and hits for visitors, robots and viri.
# Gerry Patterson, May 2002
#
# NOTE: this script uses package globals throughout (no 'use strict').
# The counter file written by -o is plain Perl that assigns to those globals
# by name when it is loaded again, so they must stay package variables.

use Time::Local;
use Getopt::Std;

require "/usr/local/sbin/agent_id";
require "/usr/local/sbin/agent_data";

# options:
#   -a                print a list of agents.
#   -A                print a list of agents (using short name).
#   -d                print a list of referer domains with stats.
#   -f                print a list of referers with stats.
#   -i [input_file]   read initial variables from [input_file].
#   -n                print a list of hosts (IP address) and visits.
#   -N                print a list of subnets and visits.
#   -o [output_file]  write final value of variables to [output_file].
#   -p                print a list of pages with stats.
#   -r                print a list of robot agents.
#   -R                print a list of robot hits.
#   -s                print a summary of statistics (can be used to update HTML).
#   -v                print a list of visitor agents.
#   -V                print a list of visitor hits.
#   -w                print a list of hits from worms.

# ------------------------------------------------------------------------
# Load the confirmed robot suspects into %Suspects.
# Each key is an IP address; the value is a flat list of epoch times
# (start, end, start, end, ...), one pair per row returned by the query.
# chomp_sql() is not defined in this file -- presumably it comes from one of
# the files required above and returns one tab-separated row per element
# (TODO confirm).
sub LoadSuspects {
    my @rows = chomp_sql("select ip_addr,start_time,end_time from robot_suspects where confirmed;");
    foreach my $row (@rows) {
        my ($ip, $start, $end) = split(/\t/, $row);
        push @{ $Suspects{$ip} }, time_psql($start), time_psql($end);
    }
}

# ------------------------------------------------------------------------
# Examine the given IP address and time to determine if on the suspect list.
#   $_[0]  IP address (key into %Suspects)
#   $_[1]  epoch time of the hit
# Returns 1 when the time falls inside any (start, end) window recorded for
# that address (bounds inclusive), otherwise 0.
sub is_suspect {
    my ($ip, $when) = @_;
    my $windows = $Suspects{$ip} or return 0;

    # Work on a copy so the stored windows are not consumed.
    my @pending = @$windows;
    while (my ($start, $end) = splice(@pending, 0, 2)) {
        return 1 if ($start <= $when && $when <= $end);
    }
    return 0;
}

# ------------------------------------------------------------------------
# Convert a psql timestamp string to system (epoch) time.
# Expects "YYYY-MM-DD HH:MM:SS", optionally followed by a "+NN" timezone
# offset, which is discarded; the value is interpreted as local time.
sub time_psql {
    my ($stamp) = @_;
    $stamp =~ s/\+[0-9]+$//;
    my ($date, $clock) = split(' ', $stamp);
    my ($year, $mon, $mday) = split(/-/, $date);
    my ($hour, $min, $sec)  = split(/:/, $clock);

    # timelocal() wants a zero-based month.
    return timelocal($sec, $min, $hour, $mday, $mon - 1, $year);
}

# ------------------------------------------------------------------------
# Convert a system (epoch) time to a psql format string
# ("YYYY-MM-DD HH:MM:SS", local time).
sub psql_time {
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime $_[0];
    return sprintf("%04d-%02d-%02d %02d:%02d:%02d",
                   $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
}

# ------------------------------------------------------------------------
# Parse one line of the access log.
#   $_[0]  the raw log line
# Returns the list
#   (IP, epoch time, request, status, bytes, referer, agent)
# or an empty list when the remote host is on the exclusion list.
# Dies when the line does not look like the expected 'combined' format.
# Relies on the global %mth (month name -> zero-based month number).
sub parse_log {
    my ($line) = @_;
    my @w = split(' ', $line);

    # First perform a sanity check
    # My log uses the 'combined' format as follows:
    # remotehost login authuser [date] "request" status bytes "Referer" "Agent"
    # where: remotehost = IP address
    #        [date]     = timestamp and tz (always +1000 or +1100 for VIC)
    #        login      = remote login as per RFC931 (always -)
    #        authuser   = authenticated username (always -)
    #        "request"  = request cmd sent from the remote agent
    #        status     = numeric status returned by apache
    #        bytes      = number of bytes transmitted
    #        "Referer"  = URL of the Referer
    #        "Agent"    = name of the remote user agent
    # This will have to be customised for individual sites
    die "Error insane at line $.:$line"
        unless ($w[0] =~ /[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/
             && $w[1] =~ /-/
             && $w[2] =~ /-/
             && $w[4] =~ /^\+1[01]00\]$/);

    # Exclude these subnets/addresses.
    # The patterns are anchored so that, for example, 11.2.3.4 or 227.0.0.1
    # are not excluded by accident.
    return ()
        if ($w[0] =~ /^127\.0\.0\.1$/
         || $w[0] =~ /^1\.2\.3\./
         || $w[0] =~ /^1\.2\.3\.1[01]$/);

    # Remove the '[' from the date and convert to timestamp with timelocal().
    # The field looks like dd/Mon/yyyy:HH:MM:SS.
    (my $stamp = $w[3]) =~ s/^\[//;
    my ($date, @hms) = split(/:/, $stamp);
    my ($mday, $mon_name, $year) = split(/\//, $date);
    my $ltime = timelocal(reverse(@hms), $mday, $mth{$mon_name}, $year - 1900);

    # Original HTML cmd, referer and agent are all enclosed in '"';
    # the status and size sit between the first and second quoted fields.
    # (split ' ' already skips the leading whitespace.)
    my @quoted = split('"', $line);
    my ($status, $bytes) = split(' ', $quoted[2]);

    # Make allowance for '"' embedded in query strings
    # (split on ' "' and remove trailing '"')
    my @fields = split(' "', $line);
    my ($referer, $agent) = @fields[2, 3];
    chop $referer;
    chomp $agent;
    chop $agent;

    return ($w[0], $ltime, $quoted[1], $status, $bytes, $referer, $agent);
}

# ------------------------------------------------------------------------
# init variables
LoadSuspects();
%mth = qw(Jan 0 Feb 1 Mar 2 Apr 3 May 4 Jun 5 Jul 6 Aug 7 Sep 8 Oct 9 Nov 10 Dec 11);
# ------------------------------------------------------------------------
# Main program: initialise the counters, read the log, print the reports.
#
# The six counters below (and the hashes filled by the main loop) are
# package globals on purpose: the file written by -o is plain Perl that
# assigns to the counters by name when it is loaded again.
$virus_bum = $virus_hit = $robot_bum = $robot_hit = $visitor_bum = $visitor_hit = 0;

# Seconds of silence after which another hit counts as a new "bum on seat".
my $VISIT_GAP = 3000;    # visitors and worms (50 minutes)
my $ROBOT_GAP = 4800;    # robot gangs (80 minutes)

# Load a counter file previously written by -o.
# The name is made absolute first: 'require' with a relative name searches
# @INC, and "." is no longer in @INC from Perl 5.26 on.
sub _load_counters {
    my ($file) = @_;
    require File::Spec;
    my $path = File::Spec->rel2abs($file);
    require $path;
}

# Return the keys of the hash referenced by $_[0], highest count first.
sub _keys_by_count {
    my ($count_of) = @_;
    return sort { $count_of->{$b} <=> $count_of->{$a} } keys %$count_of;
}

# Only -i and -o take an argument; getopt() accepts every other
# single-letter switch as a plain flag and sets the matching $opt_X.
getopt('io');

if ($opt_i) {
    # initialise the counters from this file
    _load_counters($opt_i);
}
if ($opt_o && -f $opt_o) {
    # if the output file exists, initialise the counters from this file
    _load_counters($opt_o);
}

while (<>) {
    my ($IP, $htime, $cmd, $status, $bytes, $referer, $agent) = parse_log($_)
        or next;    # empty list: the host is excluded
    print "$cmd\n" if ($opt_c);    # undocumented debug switch

    # Only GET and HEAD requests are counted.
    my ($method, $page) = split(' ', $cmd);
    next unless ($method =~ /^(?:GET|HEAD)$/i);

    # strip scheme, www part, query part and trailing '/' from referer
    $referer =~ s/^http:\/\///;
    $referer =~ s/^www\.//;
    $referer =~ s/\?.*$//;
    $referer =~ s/\/$//;

    # Construct a robot gang ID from the 24-bit subnet and the agent
    (my $subnet = $IP) =~ s/\.[0-9]+$//;
    my $GangID = $subnet . ":" . $agent;

    my $type;

    # Microsoft viri don't identify themselves (agent and referer is "-")
    if (length($agent) < 2) {
        # it's a worm or an attack -- forget it (for now)
        # Some time (when I get round to it), I will put some code here
        # to analyse behaviour patterns of Microsoft worms
        $agent = "-";
        # assume it is a virus if agent is "-"
        $type = "virus";
        $virus_bum++ if ($htime - $timeIP{$IP} > $VISIT_GAP);
        $virus_hit++;
        print $_ if ($opt_w);

        # Record the hit and stop here, so the same request is not counted
        # a second time as a visitor or robot hit further down.
        $timeIP{$IP} = $htime;
        $typeIP{$IP} = $type;
        next;
    }

    # Look up (or allocate) the numeric id of this agent.  The test must be
    # 'defined', not truth: id 0 is a valid index into @agent_array.
    my $agent_id = $AgentID{$agent};
    unless (defined $agent_id) {
        push(@agent_array, $agent);
        $agent_id = $AgentID{$agent} = $#agent_array;
    }

    # ignore these agents (put submission aids here)
    next if ($agent =~ /^W3C_Validator\/1\.183/
          || $agent =~ /^W3C-checklink\/2\.90/);

    # @RobotArray is not defined in this file -- presumably loaded from
    # agent_data and indexed by agent id (TODO confirm).
    if ($RobotArray[$agent_id] || is_suspect($IP, $htime)) {
        $type = "robot";
        # Count a bum on seat when this gang has been quiet for longer
        # than $ROBOT_GAP seconds.
        if ($htime - $Gang{$GangID} > $ROBOT_GAP) {
            $robot_bum++;
            $robot_agent_bum{$agent}++;
            $robot_agent_hit{$agent} += 0;
            print psql_time($htime) . " $IP\t$agent\n" if ($opt_r);
        }
        print $_ if ($opt_R);
        $Gang{$GangID} = $htime;
        $robot_agent_hit{$agent}++;
        $robot_hit++;
        $robot_page_hit{$page}++;    # feeds the robot half of the -p report
    }
    else {
        $type = "visitor";
        # Count bum on seat if the last hit for this IP is more than
        # $VISIT_GAP seconds ago.
        if ($htime - $timeIP{$IP} > $VISIT_GAP) {
            $visitor_bum++;
            $visitor_agent_bum{$agent}++;
            # make sure the agent shows up in the reports even when all of
            # its requests are for icons/images
            $visitor_agent_hit{$agent} += 0;
            print psql_time($htime) . " $IP\t$agent\n" if ($opt_v);
        }
        unless ($page =~ /^\/icons/ || $page =~ /^\/images/) {
            $visitor_hit++;
            $visitor_agent_hit{$agent}++;
            $visitor_ip_hit{$IP}++;
            $visitor_page_hit{$page}++;
            $visitor_ref_hit{$referer}++;
            print $_ if ($opt_V);
        }
    }

    $timeIP{$IP} = $htime;
    $typeIP{$IP} = $type;
}

# This next section produces output according to the options ...
# --------------------------------------------------------------

# Write the variables to disk -- only do this when the log files are rolled
if ($opt_o) {
    open(my $outs, '>', $opt_o) || die "Error opening $opt_o for output: $!\n";
    print $outs '$virus_bum=' . "$virus_bum; " . '$virus_hit=' . "$virus_hit;\n";
    print $outs '$robot_bum=' . "$robot_bum; " . '$robot_hit=' . "$robot_hit;\n";
    print $outs '$visitor_bum=' . "$visitor_bum; " . '$visitor_hit=' . "$visitor_hit;\n1;\n";
    # buffered write errors only surface at close
    close($outs) || die "Error closing $opt_o: $!\n";
}

# Print out a list of the (full length) agent strings and stats
if ($opt_a) {
    print "Robots:\n";
    foreach my $name (sort keys %robot_agent_hit) {
        printf "%6d %6d %s\n", $robot_agent_bum{$name}, $robot_agent_hit{$name}, $name;
    }
    print "Visitors:\n";
    foreach my $name (sort keys %visitor_agent_hit) {
        printf "%6d %6d %s\n", $visitor_agent_bum{$name}, $visitor_agent_hit{$name}, $name;
    }
}

print "Total robots: $robot_bum hits: $robot_hit\n" if ($opt_r);
print "Total visitors: $visitor_bum hits: $visitor_hit\n" if ($opt_v);

# Print out a list of the (short) agents and stats
if ($opt_A) {
    # Extract short agent name and stats for visitors from long name stats.
    # which_browser() and which_robot() are not defined in this file.
    my (%Visitor_hit, %Visitor_bum, %Robot_hit, %Robot_bum);
    foreach my $name (sort keys %visitor_agent_hit) {
        my $short = which_browser($name);
        $Visitor_hit{$short} += $visitor_agent_hit{$name};
        $Visitor_bum{$short} += $visitor_agent_bum{$name};
    }
    # print the list, in descending hits order
    foreach my $short (_keys_by_count(\%Visitor_hit)) {
        printf "Visitor %6d %6d %s\n", $Visitor_bum{$short}, $Visitor_hit{$short}, $short;
    }

    # Extract short agent name and stats for robots
    foreach my $name (sort keys %robot_agent_hit) {
        my $short = which_robot($name);
        $Robot_hit{$short} += $robot_agent_hit{$name};
        $Robot_bum{$short} += $robot_agent_bum{$name};
    }
    # print the list, in descending hits order
    foreach my $short (_keys_by_count(\%Robot_hit)) {
        printf "Robot %6d %6d %s\n", $Robot_bum{$short}, $Robot_hit{$short}, $short;
    }
}

if ($opt_p) {
    # print a list of pages with number of hits for each page
    foreach my $pg (_keys_by_count(\%visitor_page_hit)) {
        printf "Visitor %6d %s\n", $visitor_page_hit{$pg}, $pg;
    }
    foreach my $pg (_keys_by_count(\%robot_page_hit)) {
        printf "Robot %6d %s\n", $robot_page_hit{$pg}, $pg;
    }
}

if ($opt_f) {
    # print a list of referers with number of hits for each referer
    foreach my $ref (_keys_by_count(\%visitor_ref_hit)) {
        printf "%6d %s\n", $visitor_ref_hit{$ref}, $ref;
    }
}

if ($opt_d) {
    # extract hits from each domain from referer stats
    # (the domain is everything before the first '/')
    my %domain_ref;
    foreach my $ref (keys %visitor_ref_hit) {
        my $domain = (split(/\//, $ref))[0];
        $domain_ref{$domain} += $visitor_ref_hit{$ref};
    }
    # print a list of domains with number of hits for each domain
    foreach my $domain (_keys_by_count(\%domain_ref)) {
        printf "%6d %s\n", $domain_ref{$domain}, $domain;
    }
}

if ($opt_s) {
    # Print site stats (for update of HTML or whatever)
    printf "Virus %6d %6d\n", $virus_bum, $virus_hit;
    printf "Robot %6d %6d\n", $robot_bum, $robot_hit;
    printf "Visitor %6d %6d\n", $visitor_bum, $visitor_hit;
}

if ($opt_n) {
    # print a list of hosts with number of hits for each host
    foreach my $host (_keys_by_count(\%visitor_ip_hit)) {
        printf "%-16s %6d\n", $host, $visitor_ip_hit{$host};
    }
}

if ($opt_N) {
    # add up the visitor hits of every host in the same 24-bit subnet
    my %subnet_hit;
    foreach my $host (keys %visitor_ip_hit) {
        (my $net = $host) =~ s/\.[0-9]+$//;
        $subnet_hit{$net} += $visitor_ip_hit{$host};
    }
    # print a list of subnets with number of hits for each subnet
    foreach my $net (_keys_by_count(\%subnet_hit)) {
        printf "%-16s %6d\n", $net, $subnet_hit{$net};
    }
}