#!/usr/bin/perl
# Returns stats from an apache log file (ver 1.3.20 on Mandrake).
# Counts bums on seats (visits) and hits for visitors, robots and viri.
# Gerry Patterson, May 2002
#
# Reads 'combined' format log lines from the files named on the command
# line (or STDIN) and classifies each GET request as virus, robot or
# visitor.  Running totals can be carried between runs with -i / -o.

use strict;
use File::Spec;
use Time::Local;
use Getopt::Std;

# options:
#   -a  print a list of agents.
#   -A  print a list of agents (using short name).
#   -d  print a list of referer domains with stats.
#   -f  print a list of referers with stats.
#   -i [input_file]  read initial variables from [input_file].
#   -n  print a list of hosts (IP address) and visits.
#   -N  print a list of subnets and visits.
#   -o [output_file] write final value of variables to [output_file].
#   -p  print a list of pages with stats.
#   -r  print a list of robot agents.
#   -s  print a summary of statistics (can be used to update HTML).
#   -v  print a list of visitor agents.
#   -V  print a list of visitor hits.

# Option flags are filled in by Getopt::Std as package variables.
our ($opt_a, $opt_A, $opt_d, $opt_f, $opt_i, $opt_n, $opt_N, $opt_o,
     $opt_p, $opt_r, $opt_s, $opt_v, $opt_V);

# The six counters MUST stay package globals: the state files loaded via
# -i / -o are plain Perl that assigns to them by name.
our ($virus_bum, $virus_hit, $robot_bum, $robot_hit,
     $visitor_bum, $visitor_hit);

# init variables
my %mth = qw(Jan 0 Feb 1 Mar 2 Apr 3 May 4 Jun 5
             Jul 6 Aug 7 Sep 8 Oct 9 Nov 10 Dec 11);
$virus_bum = $virus_hit = $robot_bum = $robot_hit
           = $visitor_bum = $visitor_hit = 0;

# Seconds of silence after which the next hit counts as a new visit
# ("bum on seat").  NOTE(review): 3000 s is 50 minutes, although an earlier
# comment in this script described the window as 30 minutes -- the value
# is kept unchanged so existing totals stay comparable.
my $session_gap = 3000;

# Per-run tracking state.
my (%timeIP, %typeIP, %Gang);
my (%robot_agent_bum, %robot_agent_hit, %robot_page_hit);
my (%visitor_agent_bum, %visitor_agent_hit, %visitor_ip_hit,
    %visitor_page_hit, %visitor_ref_hit);

getopt('io');

# State files are resolved to absolute paths before being loaded:
# 'require' with a relative name searches @INC, which no longer contains
# the current directory on newer perls.
if ($opt_i) {
    # initialise the counters from this file
    my $init_path = File::Spec->rel2abs($opt_i);
    require $init_path;
}
if ($opt_o && -f $opt_o) {
    # if the output file exists then initialise the counters from it
    my $state_path = File::Spec->rel2abs($opt_o);
    require $state_path;
}

while (<>) {
    my @w = split(' ', $_);

    # First perform a sanity check
    # My log uses the 'combined' format as follows:
    # remotehost login authuser [date] "request" status bytes "Referer" "Agent"
    # where: remotehost = IP address
    #        [date]     = timestamp and tz (always +1000 or +1100 for VIC)
    #        login      = remote login as per RFC931 (always -)
    #        authuser   = authenticated username (always -)
    #        "request"  = request cmd sent from the remote agent
    #        status     = numeric status returned by apache
    #        bytes      = number of bytes transmitted
    #        "Referer"  = URL of the Referer
    #        "Agent"    = name of the remote user agent
    # This will have to be customised for individual sites
    die "Error insane at line $.:$_"
        unless ( $w[0] =~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/
              && $w[1] =~ /-/
              && $w[2] =~ /-/
              && $w[4] =~ /^\+1[01]00\]$/ );

    my $ip = $w[0];

    # Exclude these subnets/addresses (sample values -- customise).
    # Patterns are anchored so that e.g. 127.0.0.100 or 21.2.3.9 are not
    # excluded by accident.
    next if ( $ip =~ /^127\.0\.0\.1$/
           || $ip =~ /^1\.2\.3\./
           || $ip =~ /^1\.2\.3\.4[01]$/ );

    # remove the '[' from the date and convert to timestamp with timelocal()
    (my $stamp = $w[3]) =~ s/^\[//;
    my @hms = split(/:/, $stamp);            # dd/Mon/yyyy, hh, mm, ss
    my @dmy = split(/\//, shift(@hms));      # dd, Mon, yyyy
    my $htime = timelocal(reverse(@hms), $dmy[0], $mth{$dmy[1]}, $dmy[2] - 1900);

    # Original HTML cmd, referer and agent are all enclosed in '"'
    my @quoted = split('"', $_);
    my ($cmd, $referer, $agent) = @quoted[1, 3, 5];
    my @req = split(' ', $cmd);

    # strip scheme, www prefix, query part and trailing slash from referer
    $referer =~ s/^http:\/\///;
    $referer =~ s/^www\.//;
    $referer =~ s/\?.*$//;
    $referer =~ s/\/$//;

    # only GET requests are counted
    next unless ($req[0] =~ /^GET$/i);
    my $page = $req[1];

    # Construct a robot gang ID from the 24-bit subnet and the agent string
    (my $subnet = $ip) =~ s/\.[0-9]+$//;
    my $GangID = $subnet . ":" . $agent;

    # ignore these agents (put submission aids here)
    next if ( $agent =~ /^Submission_Aid 9.9/
           || $agent =~ /^W3C_Validator\/1.183/ );

    # add cloaked robot agents here (the gang ID above keeps the raw agent)
    $agent = 'larbin_unspecifiedBot' if ($agent =~ /larbin.*\@unspecified.mail/);

    # True when this IP has been silent for longer than the session gap
    # (an IP never seen before counts as silent since time 0).
    my $new_session = ($htime - ($timeIP{$ip} || 0) > $session_gap);

    my $type;
    # Microsoft viri don't identify themselves (agent and referer is "-")
    if ($w[$#w] eq '"-"') {
        # assume it is a virus if agent is "-"
        $type = "virus";
        $virus_bum++ if $new_session;
        $virus_hit++;
    }
    # well behaved robots should start with /robots.txt
    elsif ( ($new_session && $page =~ /^\/robots.txt/)
         || (!$new_session && $typeIP{$ip} eq "robot")
         || ($Gang{$GangID})
         # Add unruly robots to this list
         || $agent =~ /^bumblebee\@relevare.com/
         || $agent =~ /^ExactSeek Crawler/
         || $agent =~ /^Robozilla/
         || $agent =~ /^SplatSearch.com/ ) {
        $type = "robot";
        # A robot visit is counted per gang (subnet + agent), not per IP
        if ($htime - ($Gang{$GangID} || 0) > $session_gap) {
            $robot_bum++;
            $robot_agent_bum{$agent}++;
            $robot_agent_hit{$agent} += 0;   # make sure the key exists
            print "$stamp $ip\t$agent\n" if ($opt_r);
        }
        # put your hit exclusion list for robots here
        unless (
            # $page =~ /^\/icons/ ||
            # $page =~ /^\/images/ ||
            $page =~ /^\/robots.txt/ ||
            $page =~ /^\/download\/\?/ ) {
            $robot_hit++;
            $robot_agent_hit{$agent}++;
            $robot_page_hit{$page}++;
        }
        $Gang{$GangID} = $htime;
    }
    else {
        $type = "visitor";
        # Count bum on seat if the last hit for this IP is older than
        # the session gap
        if ($new_session) {
            $visitor_bum++;
            $visitor_agent_bum{$agent}++;
            $visitor_agent_hit{$agent} += 0; # make sure the key exists
        }
        unless ($page =~ /^\/icons/ || $page =~ /^\/images/) {
            $visitor_hit++;
            $visitor_agent_hit{$agent}++;
            $visitor_ip_hit{$ip}++;
            $visitor_page_hit{$page}++;
            $visitor_ref_hit{$referer}++;
            print $_ if ($opt_V);
        }
        print "$stamp $ip\t$agent\n" if ($new_session && $opt_v);
    }

    # remember when, and as what, this IP was last seen
    $timeIP{$ip} = $htime;
    $typeIP{$ip} = $type;
}

# This next section produces output according to the options ...
# --------------------------------------------------------------

# Write the variables to disk -- only do this when the log files are rolled.
# The file is itself Perl (ending in a true value) so it can be loaded
# again through -i or -o.
if ($opt_o) {
    open(my $out, '>', $opt_o)
        || die "Error opening $opt_o for output: $!\n";
    print $out "\$virus_bum=$virus_bum; \$virus_hit=$virus_hit;\n";
    print $out "\$robot_bum=$robot_bum; \$robot_hit=$robot_hit;\n";
    print $out "\$visitor_bum=$visitor_bum; \$visitor_hit=$visitor_hit;\n1;\n";
    # buffered write errors only surface at close
    close($out) || die "Error closing $opt_o: $!\n";
}

# Print out a list of the (full length) agent strings and stats
if ($opt_a) {
    print "Robots:\n";
    foreach my $x (sort keys %robot_agent_hit) {
        printf "%6d %6d %s\n", $robot_agent_bum{$x}, $robot_agent_hit{$x}, $x;
    }
    print "Visitors:\n";
    foreach my $x (sort keys %visitor_agent_hit) {
        printf "%6d %6d %s\n", $visitor_agent_bum{$x}, $visitor_agent_hit{$x}, $x;
    }
}

print "Total robots: $robot_bum hits: $robot_hit\n" if ($opt_r);
print "Total visitors: $visitor_bum hits: $visitor_hit\n" if ($opt_v);

# Print out a list of the (short) agents and stats
if ($opt_A) {
    my (%Visitor_hit, %Visitor_bum, %Robot_hit, %Robot_bum);

    # Extract short agent name and stats for visitors from long name stats
    foreach my $x (sort keys %visitor_agent_hit) {
        my $Agent;
        # MSIE browsers have the user_agent in the system string
        if ($x =~ /compatible; MSIE (\S+)/) {
            $Agent = "MSIE $1";
        }
        else {
            $Agent = (split(' ', $x))[0];
        }
        $Visitor_hit{$Agent} += $visitor_agent_hit{$x};
        $Visitor_bum{$Agent} += $visitor_agent_bum{$x};
    }
    # print the list, in descending hits order
    foreach my $x (sort { $Visitor_hit{$b} <=> $Visitor_hit{$a} } keys %Visitor_hit) {
        printf "Visitor %6d %6d %s\n", $Visitor_bum{$x}, $Visitor_hit{$x}, $x;
    }

    # Extract short agent name and stats for robots
    foreach my $x (sort keys %robot_agent_hit) {
        my $Agent;
        # MSIECrawlers have the user_agent in the system string
        if ($x =~ /MSIECrawler\)/) {
            $Agent = "MSIECrawler";
        }
        else {
            $Agent = (split(' ', $x))[0];
        }
        $Robot_hit{$Agent} += $robot_agent_hit{$x};
        $Robot_bum{$Agent} += $robot_agent_bum{$x};
    }
    # print the list, in descending hits order
    foreach my $x (sort { $Robot_hit{$b} <=> $Robot_hit{$a} } keys %Robot_hit) {
        printf "Robot %6d %6d %s\n", $Robot_bum{$x}, $Robot_hit{$x}, $x;
    }
}

if ($opt_p) {
    # print a list of pages with number of hits for each page
    foreach my $x (sort { $visitor_page_hit{$b} <=> $visitor_page_hit{$a} } keys %visitor_page_hit) {
        printf "Visitor %6d %s\n", $visitor_page_hit{$x}, $x;
    }
    foreach my $x (sort { $robot_page_hit{$b} <=> $robot_page_hit{$a} } keys %robot_page_hit) {
        printf "Robot %6d %s\n", $robot_page_hit{$x}, $x;
    }
}

if ($opt_f) {
    # print a list of referers with number of hits for each referer
    foreach my $x (sort { $visitor_ref_hit{$b} <=> $visitor_ref_hit{$a} } keys %visitor_ref_hit) {
        printf "%6d %s\n", $visitor_ref_hit{$x}, $x;
    }
}

if ($opt_d) {
    my %domain_ref;
    # extract hits from each domain from referer stats
    # (the domain is everything before the first '/' of the referer)
    foreach my $x (keys %visitor_ref_hit) {
        my $domain = (split(/\//, $x))[0];
        $domain_ref{$domain} += $visitor_ref_hit{$x};
    }
    # print a list of domains with number of hits for each domain
    foreach my $x (sort { $domain_ref{$b} <=> $domain_ref{$a} } keys %domain_ref) {
        printf "%6d %s\n", $domain_ref{$x}, $x;
    }
}

if ($opt_s) {
    # Print site stats (for update of HTML or whatever)
    printf "Virus %6d %6d\n", $virus_bum, $virus_hit;
    printf "Robot %6d %6d\n", $robot_bum, $robot_hit;
    printf "Visitor %6d %6d\n", $visitor_bum, $visitor_hit;
}

if ($opt_n) {
    # print a list of hosts with number of hits for each host
    foreach my $x (sort { $visitor_ip_hit{$b} <=> $visitor_ip_hit{$a} } keys %visitor_ip_hit) {
        printf "%-16s %6d\n", $x, $visitor_ip_hit{$x};
    }
}

if ($opt_N) {
    my %subnet_hit;
    # accumulate visitor hits per subnet (IP address minus its last octet)
    foreach my $x (keys %visitor_ip_hit) {
        (my $subnet = $x) =~ s/\.[0-9]+$//;
        $subnet_hit{$subnet} += $visitor_ip_hit{$x};
    }
    # print a list of subnets with number of hits for each subnet
    foreach my $x (sort { $subnet_hit{$b} <=> $subnet_hit{$a} } keys %subnet_hit) {
        printf "%-16s %6d\n", $x, $subnet_hit{$x};
    }
}