#!/usr/bin/perl
# suspects - search for crawlers
# Gerry Patterson, Nov 2002
#
# Reads Apache 'combined' format access log lines (files named on the
# command line, or STDIN), groups the hits by IP address into sessions and
# computes a bit-mapped "suspicion index" for each session.  Depending on
# the options, the suspects are listed on the console and/or SQL is
# generated to update the robot_suspects, webagents and webrobots tables.
#
# NOTE: this script deliberately runs without 'use strict'.  It shares many
# package globals (%AgentID, @AgentName, @RobotArray, @agent_array, the OTMP
# filehandle, chomp_sql(), opentmp_sql(), runtmp_sql(), which_browser(),
# which_OS(), which_robot() ...) with the two files required below.
# NOTE(review): those names are not defined in this file; they are presumed
# to come from agent_id / agent_data -- confirm.
use Time::Local;
use Getopt::Std;
require "/Mypath/agent_id";
require "/Mypath/agent_data";

# options:
#   -D  Debug level, where the level = suspicion index (or -1)
#   -l  print a list of suspects on the console
#   -u  run SQL to Update the agents data
#   -U  run SQL to Update the robots data (month end)

# init variables
%mth = qw(Jan 0 Feb 1 Mar 2 Apr 3 May 4 Jun 5 Jul 6 Aug 7 Sep 8 Oct 9 Nov 10 Dec 11);
@Mth = qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/;

# Page Types
$NULL  = 0;
$IMAGE = 1;
$BIN   = 2;
$ROBOT = 3;
$HTML  = 4;
$TXT   = 5;

# Referer Types
$ANON   = 0;
$LOCAL  = 1;
$REMOTE = 2;

# -D takes an argument; every other switch is treated as a boolean
getopt('D');
$opt_D += 0;

# ------------------------------------------------------------------------
# convert a time into (log format) time string
#   $_[0]   epoch seconds
#   returns "dd/Mon/yyyy:hh:mm:ss" in local time
sub time_str{
    my @T = localtime($_[0]);
    my $tm = sprintf "%02d/%s/%04d:%02d:%02d:%02d",$T[3],$Mth[$T[4]],
        $T[5]+1900,$T[2],$T[1],$T[0];
    return($tm);
}

# ------------------------------------------------------------------------
# convert a time to psql format string
#   $_[0]   epoch seconds
#   returns "yyyy-mm-dd hh:mm:ss" in local time
sub psql_time{
    my @T = localtime $_[0];
    return (sprintf "%04d-%02d-%02d %02d:%02d:%02d",$T[5]+1900,
        $T[4]+1,$T[3],$T[2],$T[1],$T[0]);
}

# ------------------------------------------------------------------------
# convert psql string to system time
#   $_[0]   "yyyy-mm-dd hh:mm:ss" with an optional trailing "+NN" offset
#   returns epoch seconds (the string is interpreted as local time)
sub time_psql{
    my $t = $_[0];
    $t =~ s/\+[0-9]+$//;                # drop a trailing "+NN" zone offset
    my @t = split(' ',$t);
    my @t1 = split (/-/,$t[0]);         # (year, month, day)
    my @t2 = split (/:/,$t[1]);         # (hour, min, sec)
    my @tt = (reverse(@t2),reverse(@t1));
    $tt[4]--;                           # timelocal() months are 0-based
    return timelocal(@tt);
}

# ------------------------------------------------------------------------
# extract a 16-bit subnet from a dotted quad
#   $_[0]   dotted quad, e.g. "10.20.30.40"
#   returns the first two octets, e.g. "10.20"
sub sub16{
    my @s = split(/\./,$_[0]);
    return("$s[0].$s[1]");
}

# ------------------------------------------------------------------------
# escape a value for use inside a single-quoted SQL string literal
#   $_[0]   raw value (undef is treated as "")
#   returns the value with backslashes and single quotes doubled
# Much of what this script writes into SQL is derived from the User-Agent
# header, which is attacker-controlled.  Doubling both characters keeps the
# literal closed whether or not the server treats backslash as an escape
# (with standard-conforming strings a backslash is then stored doubled,
# which is harmless).
sub _sql_escape{
    my $s = defined $_[0] ? $_[0] : "";
    $s =~ s/\\/\\\\/g;
    $s =~ s/'/''/g;
    return($s);
}

# ------------------------------------------------------------------------
# load the unconfirmed rows of robot_suspects into %NonSuspects
# Each row is expected as "ip_addr<TAB>start_time<TAB>end_time"; the result
# is a flat list of (start, end) epoch pairs per IP address.
sub LoadNonSuspects{
    my @t = chomp_sql("select ip_addr,start_time,end_time from robot_suspects where not confirmed;");
    foreach my $x(@t){
        my @w = split(/\t/,$x);
        my @a = (time_psql($w[1]),time_psql($w[2]));
        push (@{$NonSuspects{$w[0]}},@a);
    }
}

# ------------------------------------------------------------------------
# examine given IP addr, time to determine if exempt from suspect list
#   $_[0]   IP address
#   $_[1]   epoch seconds of the hit
#   returns 1 if the time lies inside any (start,end) window loaded for
#           this IP address, otherwise 0
sub non_suspect{
    return(0) unless ($NonSuspects{$_[0]});
    my @a = @{$NonSuspects{$_[0]}};
    while(@a){
        my $t1 = shift(@a);
        my $t2 = shift(@a);
        return(1) if ($t1 <= $_[1] && $_[1] <= $t2);
    }
    return(0);
}

# ------------------------------------------------------------------------
# split the hits of one IP address into sessions and rate each session
#   @_      list of array refs [time, agent_id, page_type, ref_type] for the
#           IP address held in the global $IP
#   returns a flat list of (suspicion, session_start, session_end) triples,
#           one triple per session
# A session is closed when the gap since the previous hit exceeds 14400
# seconds (4 hours) or when the last hit is reached.
# Suspicion bits:
#    1  the only agent seen in the session is a suspicious agent
#    2  more than 10 hits and either no images or nothing but images
#       (cleared again when the only agent is a text-only browser)
#    4  more than one HTML page, no images, every referer anonymous
#    8  robots.txt was requested
#   16  IP address already suspected, no images, every referer anonymous
#   32  more than 10 hits at a rate above 1.4 hits per second
# Side effects: updates the globals %SusIP, %HitIP, @SusAgent, @suspect and
# @non_suspect, and uses several scratch globals ($hits, @Ptype, ...).
sub FindCrawler{
    my $session_start;
    my @TD = @_;
    my @result;
    for (my $Ndx = 0; $Ndx <= $#TD; $Ndx++){
        my @v = @{$TD[$Ndx]};
        printf "%-15s %s %s %s\n", $IP,time_str($v[0]),$v[2],$v[3]
            if ($opt_D == -1);
        unless ($session_start){
            # first hit of a new session: reset the per-session counters
            $prev_time = $session_start = $v[0];
            @Ptype = (0,0,0,0,0,0);
            @Rtype = (0,0,0,0,0,0);
            $hits = 0;
            undef @session_agents;
            $suspicion = 0;
        }
        # NOTE(review): this early 'next' also skips the end-of-data check
        # below, so a session whose final hit is exempt is never pushed to
        # @result -- confirm whether that is intended.
        if (non_suspect($IP,$v[0])){
            $SusAgent[$v[1]] = 0;
            $non_suspect[$v[1]]++;
            next;
        }
        $session_agents[$v[1]]++;
        $hits++;
        $Ptype[$v[2]]++;
        $Rtype[$v[3]]++;
        $HitIP{$IP}++;
        if ( ( ($v[0] - $prev_time) > 14400) || $Ndx == $#TD){
            # round up the agent_ids
            my @Sagent = grep { $session_agents[$_] } 0 .. $#session_agents;
            # now, check whether previous session is a crawler suspect
            $duration = $prev_time - $session_start;
            $suspicion |= 1 if (@Sagent == 1 && $SusAgent[$Sagent[0]]);
            $suspicion |= 2
                if ($hits>10&&($Ptype[$IMAGE]==0 || $Ptype[$IMAGE]==$hits));
            $suspicion &= 253 if (@Sagent==1 && $TxtBrowser[$Sagent[0]]);
            $suspicion |= 4
                if ($Ptype[$HTML]>1 && $Ptype[$IMAGE]==0 && $Rtype[$ANON]==$hits);
            $suspicion |= 8 if ($Ptype[$ROBOT]);
            $suspicion |= 16
                if ($SusIP{$IP} && $Ptype[$IMAGE]==0 && $Rtype[$ANON]==$hits);
            # A burst of more than 10 hits inside a single second has a
            # duration of 0: treat it as an unbounded rate rather than
            # dying with "Illegal division by zero".
            $suspicion |= 32
                if ($hits > 10 && ($duration == 0 || ($hits/$duration) > 1.4));
            # include option to Debug suspicion index
            printf "%d %-15s %s %s %d %d %d %d\n", $suspicion, $IP,
                time_str($session_start),
                time_str($v[0]),$Ptype[$IMAGE],$hits,$SusAgent[$Sagent[0]],$Sagent[0]
                if ($suspicion & $opt_D || $opt_D == -1);
            push (@result,$suspicion,$session_start,$prev_time);
            undef $session_start;
            $SusIP{$IP} += $hits if ($suspicion);
            # credit the session's hits to each agent's (non_)suspect tally
            for my $i (0 .. $#session_agents){
                $suspect[$i] += $session_agents[$i]
                    if ($suspicion && $session_agents[$i]);
                $non_suspect[$i] += $session_agents[$i]
                    if ($suspicion == 0 && $session_agents[$i]);
            }
        }
        $prev_time = $v[0];
    }
    return (@result);
}

# ------------------------------------------------------------------------
# parse one line of the access log
#   $_[0]   the log line
#   returns (ip, epoch_time, request, status, bytes, referer, agent), or
#           the empty list when the address is on the exclusion list
#   dies    when the line fails the sanity check
sub parse_log{
    my $line = $_[0];
    my @w = split ( ' ', $line);
    # First perform a sanity check
    # My log uses the 'combined' format as follows:
    # remotehost login authuser [date] "request" status bytes "Referer" "Agent"
    # where: remotehost = IP address
    #        [date]     = timestamp and tz (always +1000 or +1100 for VIC)
    #        login      = remote login as per RFC931 (always -)
    #        authuser   = authenticated username (always -)
    #        "request"  = request cmd sent from the remote agent
    #        status     = numeric status returned by apache
    #        bytes      = number of bytes transmitted
    #        "Referer"  = URL of the Referer
    #        "Agent"    = name of the remote user agent
    # This will have to be customised for individual sites
    die "Error insane at line $.:$line" unless (
        $w[0] =~ /[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/ &&
        $w[1] =~ /-/ &&
        $w[2] =~ /-/ &&
        $w[4] =~ /^\+1[01]00\]$/);
    # Exclude these subnets/addresses
    # (anchored, so that e.g. 11.2.3.4 is not mistaken for 1.2.3.x)
    return() if (
        $w[0] =~ /^127\.0\.0\.1$/ ||
        $w[0] =~ /^1\.2\.3\./ ||
        $w[0] =~ /^1\.2\.3\.4[01]$/ );
    # remove the '[' from the date and convert to timestamp with timelocal()
    $w[3] =~ s/^\[//;
    my @t = split(/:/,$w[3]);               # (dd/Mon/yyyy, hh, mm, ss)
    my @d = split( /\//,shift( @t) );       # (dd, Mon, yyyy)
    my @Htime = (reverse(@t),$d[0],$mth{$d[1]},$d[2] - 1900);
    my $ltime = timelocal(@Htime);
    # Original HTML cmd, referer and agent are all enclosed in '"'
    @t = split( '"',$line);
    # extract the status and size
    $t[2] =~ s/^\s+//;
    my @t1 = split(' ',$t[2]);
    return($w[0],$ltime,$t[1],$t1[0],$t1[1],$t[3],$t[5]);
}

# ------------------------------------------------------------------------
LoadNonSuspects();

# Get a list of text-only browsers
@t = chomp_sql("select agent_id from webagents where text_only;\n");
foreach $i(@t){ $TxtBrowser[$i]++};

# Search for robot suspects
while(<>){
    next unless (($IP,$htime,$cmd,$status,$bytes,$referer,$agent) = parse_log($_));

    # classify the requested page (second word of the request)
    @t = split( ' ',$cmd);
    $page = $t[1];
    if (!$page){
        $page_type = $NULL;
    } elsif ($page =~ /\/icons\// || $page =~ /\/images\// ){
        $page_type = $IMAGE;
    } elsif ($page =~ /\.exe$/ || $page =~ /\.gz$/ ||
             $page =~ /\.msi$/ || $page =~ /\.zip$/){
        $page_type = $BIN;
    } elsif ($page =~ /robots\.txt$/ ){
        $page_type = $ROBOT;
    } elsif ($page =~ /\.txt$/ || $page =~ /-HOWTO$/ ){
        $page_type = $TXT;
    } else {
        $page_type = $HTML;
    }

    if (length($agent) < 2){
        # it's a worm or an attack -- forget it (for now)
        # Some time (when I get round to it), I will put some code here
        # to analyse behaviour patterns of Microsoft worms
        $agent = "-";
        next;
    }

    # Look up the agent id, allocating the next free one for a new agent.
    # The test must be 'defined': id 0 is a valid index and a plain truth
    # test would re-add agent 0 as a new agent on every later hit.
    $agent_id = $AgentID{$agent};
    unless (defined $agent_id){
        push (@agent_array, $agent);
        $agent_id = $AgentID{$agent} = $#agent_array;
    }

    # classify the referer
    if ($referer =~ /pgts\.com\.au/ || $referer =~ /^\//){
        $ref_type = $LOCAL;
    } elsif (length($referer) > 3){
        $ref_type = $REMOTE;
    } else {
        $ref_type = $ANON;
    }

    # per-agent totals: page hits, all hits, and the set of source IPs
    $Phit[$agent_id]++ unless ( $page_type == $NULL || $page_type == $IMAGE);
    $Thit[$agent_id]++;
    ${$AgentIP[$agent_id]}{$IP}++;

    # known robots only have their hit times recorded
    if ($RobotArray[$agent_id]){
        push (@{$AgentHit[$agent_id]},$htime);
        next;
    }

    if ($AgentHit[$agent_id]){
        # Alias the IP, if hit with same subnet/agent_string within 30 sec
        # This looks like a kludge, because it is a kludge -- too bad!
        # (only the most recent entries are needed, so read them directly
        # instead of copying both arrays on every log line)
        my $last_hit = $AgentHit[$agent_id][-1];
        my $last_ip  = $AliasIP[$agent_id][-1];
        if ( sub16($IP) eq sub16($last_ip) && ($htime - $last_hit) < 30){
            $IP = $last_ip;
        }
    }
    push (@{$AliasIP[$agent_id]},$IP);
    push (@{$AgentHit[$agent_id]},$htime);
    my @v = ($htime,$agent_id,$page_type,$ref_type);
    push (@{$IPdata{$IP}},\@v);
    next;
}

# look for suspicious agents
# An agent is suspicious when it made more than one hit and either never
# fetched a page or fetched nothing but pages; text-only browsers are exempt.
for $agent_id(0 .. $#Thit){
    next unless ($Thit[$agent_id] > 1);
    next if ($Thit[$agent_id] != $Phit[$agent_id] && $Phit[$agent_id] > 0);
    my $nIP = keys %{$AgentIP[$agent_id]};
    next if ($TxtBrowser[$agent_id]);
    printf "%5d %5d %5d %s\n",$Thit[$agent_id],$Phit[$agent_id],
        $nIP,$agent_array[$agent_id] if ($opt_D == -1);
    $SusAgent[$agent_id]++;
}
undef @AliasIP;

# now check for suspicious IPs
foreach $IP(sort keys %IPdata){
    my @t = @{$IPdata{$IP}};
    # check the IP address for signs of crawlers
    my @behave = FindCrawler(@t);
    # consolidate suspicious behaviour into the SusData array
    # (@SusData is a flat list of (IP, start, end) triples)
    if (@behave < 4){
        # a single session
        push (@SusData,$IP,$behave[1],$behave[2]) if ($behave[0]);
    } else {
        # several sessions: merge each run of consecutive suspicious
        # sessions into one entry
        undef $Start;
        for (my $i = 0; $i < $#behave; $i+=3){
            if ($behave[$i]){
                $Start = $behave[$i+1] unless ($Start);
                # last session: close the run at this session's end
                push (@SusData,$IP,$Start,$behave[$i+2]) if ($i > $#behave - 3);
            } else {
                # a clean session ends the run at its own start time
                push (@SusData,$IP,$Start,$behave[$i+1]) if ($Start);
                undef $Start;
            }
        }
    }
}

if ($opt_l){
    # print a list of suspects to the console
    for (my $i=0; $i < $#SusData; $i += 3){
        $IP = $SusData[$i];
        my $t1 = time_str($SusData[$i+1]);
        my $t2 = time_str($SusData[$i+2]);
        printf "%-15s %s %s\n", $IP,$t1,$t2;
    }
}

# produce SQL to replace the confirmed rows of robot_suspects
if ($opt_u || $opt_U){
    my $tmpfile = opentmp_sql();
    print OTMP "delete from robot_suspects where confirmed;\n";
    for (my $i=0; $i < $#SusData; $i += 3){
        $IP = _sql_escape($SusData[$i]);
        my $t1 = time_str($SusData[$i+1]);
        my $t2 = time_str($SusData[$i+2]);
        print OTMP "insert into robot_suspects values ('$IP','$t1','$t2','t');\n";
    }
    runtmp_sql($tmpfile);
}

# produce SQL to update the agents table
if ($opt_u || $opt_U){
    my $tmpfile = opentmp_sql();
    for my $i(0 .. $#agent_array){
        my $IPaddr = "";
        my $utime = "";
        my $ctime = "";
        # robots are counted by total hits, everything else by page hits
        my $hits = $Phit[$i] + 0;
        $hits = $Thit[$i] + 0 if ($RobotArray[$i]);
        next unless ($AgentIP[$i]);
        my %I = %{$AgentIP[$i]};
        $IPaddr = join(' ',sort keys %I);
        # too many addresses to list: store just the count, as "(N)"
        $IPaddr = sprintf('(%d)',keys(%I)+0) if (length($IPaddr)>2048);
        $IPaddr = _sql_escape($IPaddr);
        my @H = @{$AgentHit[$i]};
        next unless (@H);
        $utime = psql_time($H[$#H]);
        $ctime = psql_time($H[0]);
        die "Missing agent stats" unless ($utime);
        if ($i <= $#AgentName){
            # agent already has a name on record: update its row
            print OTMP "update webagents set hit00 = $hits,";
            print OTMP "ip_addr = '$IPaddr',update_date = now(),";
            print OTMP "last_visit = '$utime'";
            print OTMP "\nwhere agent_id = $i;\n";
            next;
        }
        # new agent: work out name/version/os and insert a row
        my $name = "";
        my $version = "";
        my $os = "";
        $name = which_browser($agent_array[$i]);
        if ($name =~ /(.*)\s+([0-9.]+)$/){
            $name = $1;
            $version = $2;
        }
        $os = which_OS($agent_array[$i]);
        my $agent_str = $agent_array[$i];
        $agent_str =~ s/\t/ /g;
        # everything below is derived from the (untrusted) User-Agent header
        $agent_str = _sql_escape($agent_str);
        $name      = _sql_escape($name);
        $version   = _sql_escape($version);
        $os        = _sql_escape($os);
        print OTMP "insert into webagents(agent_id,agent_string,name,version,os,";
        print OTMP "ip_addr,hit00,last_visit,create_date,update_date,robot_ind) ";
        print OTMP "values ($i,'$agent_str','$name','$version','$os',";
        print OTMP "'$IPaddr',$hits,'$utime','$ctime',now(),0);\n";
    }
    print OTMP "update webagents set text_only = 't' where name in ('Links','Lynx','Elinks','W3m');\n";
    runtmp_sql($tmpfile);
}

if ($opt_U){
    # Give eligible suspects a permanent entry in the robots file
    # NOTE: This usually runs at month-end
    # (and maybe manually when a few new and obvious robots visit)
    my $tmpfile = opentmp_sql();
    for my $i(0 .. $#agent_array){
        next if ($RobotArray[$i]);
        next if ($non_suspect[$i]);
        next unless ($suspect[$i]);
        my %I = %{$AgentIP[$i]};
        my @H = @{$AgentHit[$i]};
        # NOTE(review): $name1 is computed but never used afterwards; the
        # call is kept in case which_browser() has side effects -- confirm.
        my $name1 = $AgentName[$i];
        my $name2 = which_robot($agent_array[$i]);
        $name1 = which_browser($agent_array[$i]) unless ($name1);
        if ($name2 eq "unknown"){
            # next if (@H < 10);
            # skip agents where (almost) every hit came from a different IP
            next if (@H > 1 && ((keys(%I)+0)/(@H+0)) > 0.8);
            # Name an unknown robot "uYYMMDD" after its first hit.
            # localtime() needs epoch seconds (not the psql string) and
            # already returns the year as an offset from 1900.
            my @T = localtime($H[0]);
            $name2 = sprintf "u%02d%02d%02d",$T[5] % 100,$T[4]+1,$T[3];
        }
        my $version = "";
        my $email = "";
        my $url = "";
        # pick a contact URL and e-mail address out of the agent string
        my $x = $agent_array[$i];
        if ($x =~ /(http:\S*)/){
            $url = $1;
        } elsif ($x =~ /(www\.\S*)/) {
            $url = 'http://' . $1;
        }
        $url =~ s/\W+$//;
        $email = $1 if ($x =~ /(\S+\@\S+)/);
        $email =~ s/\W+$//;
        $email =~ s/^\W+//;
        if ($name2 =~ /(.*)\s+([0-9.]+)$/){
            $name2 = $1;
            $version = $2;
        }
        # everything below is derived from the (untrusted) User-Agent header
        $email   = _sql_escape($email);
        $url     = _sql_escape($url);
        $name2   = _sql_escape($name2);
        $version = _sql_escape($version);
        print OTMP "insert into webrobots values(";
        print OTMP "$i,'$email','$url','f','t',now(),now());\n";
        print OTMP "update webagents set robot_ind = -1,name = '$name2',version = '$version',update_date = now() where agent_id = $i;\n";
    }
    # NOTE(review): the call below is disabled in the original, so these
    # statements are generated but not run from here -- presumably they are
    # reviewed and applied by hand; confirm.
    # runtmp_sql($tmpfile);
}