#!/usr/bin/perl
# qstr - print a list of search phrases from the logfile

use Time::Local;
use Getopt::Std;

# Month abbreviation (as written by Apache) -> zero-based month for timelocal()
%mth = qw(Jan 0 Feb 1 Mar 2 Apr 3 May 4 Jun 5 Jul 6 Aug 7 Sep 8 Oct 9 Nov 10 Dec 11);

# No switch takes an argument; every switch seen becomes a boolean $opt_X.
# This file reads $opt_d (dump each decoded parameter) and $opt_e (report
# referers whose search phrase could not be determined).
getopt('');

=google srch parameters

    q               query
    as_q            All Words
    as_epq          Exact Phrase
    as_oq           Any Words
    as_eq           Except Words
    as_ft           Include (i) or exclude (e) filetype
    as_qdr          Restrict date on returns
                    all, m3 = 3 months, m6 = 6 months, y = 12 months
    as_occt         Where search term must occur
                    any, title, body, url, links
    start           start
    num             Num results per page
    safe            Safe Search (exclude adult content)
    as_sitesearch   Include/Exclude site
    hl              host language
    sl              source language

Other search engines either hand off to Google or use their own parameters.
These have been included in the subroutine split_query()

see http://www.pgts.com.au/pgtsj/pgtsj0307a.html

=cut

# ------------------------------------------------------------------------
# parse_log(line)
#
# Split one line of an Apache 'combined' format log into its fields.
#
# My log uses the 'combined' format as follows:
#   remotehost login authuser [date] "request" status bytes "Referer" "Agent"
# where: remotehost = IP address
#        [date]     = timestamp and tz (always +1000 or +1100 for VIC)
#        login      = remote login as per RFC931 (always -)
#        authuser   = authenticated username (always -)
#        "request"  = request cmd sent from the remote agent
#        status     = numeric status returned by apache
#        bytes      = number of bytes transmitted
#        "Referer"  = URL of the Referer
#        "Agent"    = name of the remote user agent
#
# Returns the list (remotehost, epoch_time, request, status, bytes, referer,
# agent).  Trailing fields may be undef when the line has no quoted referer
# or agent.  Returns an empty list when the timestamp cannot be parsed, so
# that callers can write "next unless (...) = parse_log($line)".
#
# The timezone offset in the log is ignored: the timestamp is interpreted in
# the local timezone of the machine running this script.
sub parse_log {
    my $line = $_[0];
    return unless defined $line;

    my @w = split(' ', $line);
    return unless defined $w[3];

    # dd/Mon/yyyy:hh:mm:ss, with the leading '[' still attached
    my ($mday, $mon, $year, $hour, $min, $sec) =
        $w[3] =~ m{^\[?(\d{1,2})/([A-Za-z]{3})/(\d{4}):(\d{1,2}):(\d{1,2}):(\d{1,2})$};
    return unless defined $sec && exists $mth{$mon};

    # timelocal() croaks on out-of-range values (e.g. 31 Feb); treat those
    # lines as unparseable rather than aborting the whole run.  A four digit
    # year is passed as-is, which Time::Local interprets as the actual year.
    my $ltime = eval { timelocal($sec, $min, $hour, $mday, $mth{$mon}, $year) };
    return unless defined $ltime;

    # Original HTML cmd, referer and agent are all enclosed in '"'
    my @quoted = split('"', $line);

    # extract the status and size, which sit between request and referer
    my @status_size;
    if (defined $quoted[2]) {
        (my $between = $quoted[2]) =~ s/^\s+//;
        @status_size = split(' ', $between);
    }

    # Make allowance for '"' embedded in query strings
    # (split on ' "' and remove trailing '"')
    my @t2 = split(' "', $line);
    my ($referer, $agent) = @t2[2, 3];
    for my $field ($referer, $agent) {
        # $field aliases the variable; also drops a trailing CR/LF
        $field =~ s/"\s*\z// if defined $field;
    }

    return ($w[0], $ltime, $quoted[1], $status_size[0], $status_size[1],
            $referer, $agent);
}

# ------------------------------------------------------------------------
# convert time to "yyyy-mm-dd hh:mm:ss" format string
#
# Takes an epoch time and formats it in the local timezone.
sub psql_time {
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime $_[0];
    return sprintf("%04d-%02d-%02d %02d:%02d:%02d",
                   $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
}

# ------------------------------------------------------------------------
# make a query string more readable -- substitute hex chars and remove '+'
#
# Takes a URL-encoded value; returns it with '+' turned into spaces, %XX
# escapes decoded to the corresponding byte, and surrounding whitespace
# removed.
sub make_readable {
    my $s = defined $_[0] ? $_[0] : '';
    $s =~ tr/+/ /;
    $s =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
    $s =~ s/^\s+//;
    $s =~ s/\s+$//;
    return $s;
}

# ------------------------------------------------------------------------
# Engine specific parms (other than 'q', 'query' and 'key')
# Keyed by the engine name as set in $engine by the main loop.
%engine_specific = qw(
    Altavista   aqa
    AOL         userQuery
    Cometway    keywords
    Googlealert u
    Netscape    Keywords
    Rediff      MT
    Shockwave   qkw
    Virgilio    qs
    Vonna       k
    Webferret   wf
    Websearch   qkw
    Yahoo       p
);

# ------------------------------------------------------------------------
# split the query string into parameters (warning: recursive for Google 'prev')
#
# split_query(url)
#
# Takes a referer URL (or a decoded Google 'prev' value) and returns the
# search phrase found in it, or "" when none is recognised.
#
# Globals read:    $engine, $base_url, $opt_d, $opt_e, %engine_specific
# Globals written: $page   (for Google cache hits and image referrals the
#                           page is taken from the query, not the request)
#                  $engine (switched to "Yahoo" when the URL contains
#                           'yahoo', or to the MetaEngine parameter)
sub split_query {
    my %srch_parm;
    my $q = defined $_[0] ? $_[0] : '';

    # Webferret separates with commas: rewrite as wf=phrase&tail=...
    if ($engine eq 'Webferret') {
        $q =~ s/wf,/wf=/;
        $q =~ s/,/&tail=/;
    }

    my @w = split(/\&/, $q);
    $w[0] = '' unless defined $w[0];

    # Everything before the first '?' is the URL; the remainder of the first
    # element is the first parameter.  With no '?' at all, the element is
    # kept whole both as the URL and as a candidate parameter.
    my $url;
    my $qmark = index($w[0], '?');
    if ($qmark >= 0) {
        $url  = substr($w[0], 0, $qmark);
        $w[0] = substr($w[0], $qmark + 1);
    }
    else {
        $url = $w[0];
    }

    # $engine does not change inside the loop, so look this up once
    my $engine_key = defined $engine ? $engine_specific{$engine} : undef;

    foreach my $x (@w) {
        next unless ($x =~ /^([A-Za-z_]+)=(.*)\z/s);
        my ($k, $raw) = ($1, $2);
        my $v = make_readable($raw);
        next unless $v;
        print "$k\t$v\n" if ($opt_d);

        # Google cache hit: q=cache:<id>:<host/path> <search words>
        if ($engine eq "Google" && $k eq "q"
            && $v =~ /^cache:[_0-9A-Za-z-]+:(\S+)/) {
            my $p = $1;
            # drop the cache prefix and the whitespace that followed it
            $v =~ s/^cache:[_0-9A-Za-z-]+:\S+\s*//;
            $page = 'http://' . $p;
            $page = substr($page, length($base_url))
                if (index($page, $base_url) == 0);
        }

        # if we find a parameter 'q', look no further ...
        return $v
            if ($k eq "q" || (defined $engine_key && $k eq $engine_key));
        $srch_parm{$k} = $v;
    }

    return $srch_parm{as_q}   if ($srch_parm{as_q});
    return $srch_parm{as_epq} if ($srch_parm{as_epq});

    $engine = "Yahoo" if ($url =~ /yahoo/);
    if ($engine eq "Yahoo") {
        return $srch_parm{p}  if ($srch_parm{p});
        return $srch_parm{va} if ($srch_parm{va});
        return $srch_parm{vp} if ($srch_parm{vp});
    }
    elsif ($engine eq "Altavista") {
        return $srch_parm{aqb} if ($srch_parm{aqb});
        return $srch_parm{aqp} if ($srch_parm{aqp});
    }
    # Countback if GUI browser fetches image file from toolbar
    elsif ($engine eq "Google" && $srch_parm{prev} && $srch_parm{imgrefurl}
           && index($srch_parm{imgrefurl}, $base_url) == 0) {
        $page = substr($srch_parm{imgrefurl}, length($base_url));
        return (split_query($srch_parm{prev}));
    }
    # check prev parameter, for other Google operations (like translate)
    elsif ($engine eq "Google" && $srch_parm{prev}) {
        return (split_query($srch_parm{prev}));
    }
    else {
        # common or obvious parameters for engines other than Google:
        return $srch_parm{key}       if ($srch_parm{key});
        return $srch_parm{query}     if ($srch_parm{query});
        return $srch_parm{ask}       if ($srch_parm{ask});
        return $srch_parm{searchfor} if ($srch_parm{searchfor});
        return $srch_parm{qry}       if ($srch_parm{qry});
    }

    return $srch_parm{qry}
        if ($srch_parm{cmd} eq "qry" && $srch_parm{submit} eq "Google Search");

    if ($srch_parm{MetaEngine} && $srch_parm{MetaTopic}) {
        $engine = $srch_parm{MetaEngine};
        return ($srch_parm{MetaTopic});
    }

    # Nothing recognised: with -e, dump what was seen so that new engines
    # or parameters can be added.
    if ($opt_e) {
        print "--------------------------------------------------\n";
        print "$_[0]\n";
        print "Search Engine: $engine\n";
        foreach my $x (keys %srch_parm) {
            print "\t$x\t$srch_parm{$x}\n";
        }
    }
    return ("");
}

# ------------------------------------------------------------------------
# Main ...
# Usage: qstr [-d] [-e] base_url log_file
#
# Reads an Apache 'combined' log and prints one tab-separated line per
# search-engine referral: time, engine, client IP, page, search phrase.
#   -d  (handled in split_query) print every decoded referer parameter
#   -e  print nothing here; split_query reports referers it could not decode
#
# $base_url, $engine and $page are deliberately package globals because
# split_query() reads and updates them.
die "usage: $0 base_url log_file\n" unless (@ARGV == 2);

$base_url = shift(@ARGV);
$base_url =~ s/\/$//;
die "Missing Base: $base_url" unless ($base_url =~ m#^http://#);
die "Cannot Access Log file $ARGV[0]" unless (-r $ARGV[0]);

# Patterns tried in order against the part of the referer before the first
# '?'.  The first one that matches supplies the engine name in $1.
my @engine_patterns = (
    qr/(google\w*)/,
    qr/(altavista)/,
    qr/(yahoo)/,
    qr/(ask\.com)/,
    qr/(aol)/,
    qr/(looksmart)/,
    qr/\W(ask)\W/,
    qr/\W(lycos)\W/,
    qr/\W(netscape)\W/,
    qr/\W(optusnet)\W/,
    qr/\W(cometway)\W/,
    qr/\W(ixquick)\W/,
    qr/\W(cnn)\W/,
    qr/\W(websearch)\.com/,
    qr/\W(webferret)\W/,
    qr/\W(\w+)\.com\/.*search/,
    qr/search\.(\w+)/,
);

# Previous record printed, used to suppress repeats
my ($Phtime, $Pengine, $PIP, $Ppage, $Pquery) = (0, '', '', '', '');

while (my $line = <>) {
    my @fields = parse_log($line);
    next unless @fields;
    my ($IP, $htime, $cmd, $status, $bytes, $referer, $agent) = @fields;
    next unless defined $referer;

    # Page requested: the request line without the method and protocol.
    # Must be set before split_query(), which may replace it.
    $page = defined $cmd ? $cmd : '';
    $page =~ s/^GET //;
    $page =~ s/ HTTP.*$//;

    undef $engine;
    if ($referer =~ /Google\+Search/) {
        $engine = "Google";
    }
    elsif ($referer =~ m#^http://search\.msn\.com#) {
        $engine = "MSN";
    }
    elsif ($referer =~ m#^http://search\.ninemsn\.com\.au#) {
        $engine = "nineMSN";
    }
    elsif ($referer =~ /^(.*?)\?/s) {
        # Only referers with a query string can carry a search phrase
        my $site = $1;
        my $name;
        foreach my $re (@engine_patterns) {
            if ($site =~ $re) {
                $name = $1;
                last;
            }
        }
        next unless defined $name;

        # Short names read better as acronyms (AOL, CNN, ASK)
        $engine = ucfirst($name);
        $engine = uc($engine) if (length($engine) < 4);
    }
    else {
        next;
    }

    my $query = split_query($referer);
    next unless $query;

    my $ptime = psql_time($htime);

    # Skip a repeat of the previous record within ten minutes
    next if ($Pengine eq $engine
             && $PIP eq $IP
             && $Ppage eq $page
             && $Pquery eq $query
             && ($htime - $Phtime) < 600);

    # With -e only the undecodable referers are of interest
    next if ($opt_e);

    print "$ptime\t$engine\t$IP\t$page\t$query\n";
    ($Phtime, $Pengine, $PIP, $Ppage, $Pquery) =
        ($htime, $engine, $IP, $page, $query);
}