#!/usr/bin/perl
# qstr - print a list of search phrases from the logfile
use Time::Local;
use Getopt::Std;
# Month abbreviation (as written in the log timestamp) -> zero-based month
# number, which is what parse_log() hands to timelocal().
@mth{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = (0 .. 11);
# No switch is declared as taking an argument, so getopt() treats every
# switch as a boolean flag and sets $opt_<letter>; this file reads $opt_d
# and $opt_e.
getopt('');

=google srch parameters
    q             query
    as_q          All Words
    as_epq        Exact Phrase
    as_oq         Any Words
    as_eq         Except Words
    as_ft         Include (i) or exclude (e) filetype
    as_qdr        Restrict date on returns
                    all, m3 - 3 months, m6 = 6 months, y = 12 months
    as_occt       Where search term must occur
                    any, title, body, url, links
    start         start
    num           Num results per page
    safe          Safe Search (exclude adult content)
    as_sitesearch Include/Exclude site
    hl            host language
    sl            source language

    Other search engines either hand off to Google or use their own
    parameters. These have been included in the subroutine split_query()

    see http://www.pgts.com.au/pgtsj/pgtsj0307a.html

=cut

# ------------------------------------------------------------------------

# Parse one line of an Apache access log.
#
# My log uses the 'combined' format as follows:
# remotehost login authuser [date] "request" status bytes "Referer" "Agent"
# where: remotehost = IP address
#        [date]     = timestamp and tz (always +1000 or +1100 for VIC)
#        login      = remote login as per RFC931 (always -)
#        authuser   = authenticated username (always -)
#        "request"  = request cmd sent from the remote agent
#        status     = numeric status returned by apache
#        bytes      = number of bytes transmitted
#        "Referer"  = URL of the Referer
#        "Agent"    = name of the remote user agent
#
# Argument: $_[0] - one raw log line (a trailing newline is allowed)
# Returns:  (remotehost, epoch_time, request, status, bytes, referer, agent)
#           or an empty list when the line has no usable timestamp or no
#           quoted request, so that callers can write
#               next unless ((...) = parse_log($line));
#           referer and agent are undef when the line does not carry them.
# Uses the file-level %mth table (month abbreviation -> 0..11).
sub parse_log{
        my $line = $_[0];
        return () unless (defined $line);
        my @w = split(' ', $line);
        # need at least: remotehost login authuser [date
        return () unless (@w >= 4);
        # remove the '[' from the date, leaving dd/Mon/yyyy:hh:mm:ss
        (my $stamp = $w[3]) =~ s/^\[//;
        return () unless ($stamp =~
                m{\A(\d{1,2})/([A-Za-z]{3})/(\d{4}):(\d{1,2}):(\d{1,2}):(\d{1,2})\z});
        my ($mday,$mon,$year,$hour,$min,$sec) = ($1,$2,$3,$4,$5,$6);
        return () unless (exists $mth{$mon});
        # timelocal() croaks on out-of-range fields (day 32, hour 25 ...);
        # trap that so one corrupt line cannot abort the whole run.
        # A four digit year is taken literally by timelocal().
        my $ltime;
        {
                local $@;
                $ltime = eval { timelocal($sec,$min,$hour,$mday,$mth{$mon},$year) };
        }
        return () unless (defined $ltime);
        # Original HTML cmd, referer and agent are all enclosed in '"'
        my @t = split('"', $line);
        # need the quoted request and the status/size text that follows it
        return () unless (@t >= 3);
        # extract the status and size
        my ($status,$bytes) = split(' ', $t[2]);
        # Make allowance for '"' embedded in query strings
        # (split on ' "' and remove the trailing '"')
        my @t2 = split(' "', $line);
        my ($referer,$agent) = @t2[2,3];
        foreach my $field ($referer,$agent) {
                next unless (defined $field);
                # drop the line ending first, then only a genuine closing quote
                $field =~ s/\s+$//;
                $field =~ s/"$//;
        }
        return($w[0],$ltime,$t[1],$status,$bytes,$referer,$agent);
}

# ------------------------------------------------------------------------

# Format an epoch time as a "yyyy-mm-dd hh:mm:ss" string in local time.
# Argument: $_[0] - seconds since the epoch
sub psql_time{
        my ($sec,$min,$hour,$mday,$mon,$year) = localtime($_[0]);
        # localtime() gives a zero-based month and years counted from 1900
        return sprintf("%04d-%02d-%02d %02d:%02d:%02d",
                       $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
}

# ------------------------------------------------------------------------

# Turn a URL-encoded query value into readable text: '+' becomes a space,
# each %XX hex escape becomes the character it encodes, and surrounding
# whitespace is trimmed.
# Argument: $_[0] - the encoded value
# Returns:  the decoded, trimmed string
sub make_readable {
        # copy first: the caller may pass a regex special variable
        my ($text) = @_;
        for ($text) {
                # '+' must be translated before %2B is decoded to a literal '+'
                s/\+/ /g;
                s/\%([0-9A-Fa-f][0-9A-Fa-f])/chr(hex($1))/eg;
                s/^\s+//;
                s/\s+$//;
        }
        return($text);
}

# ------------------------------------------------------------------------

# split the query string into parameters (warning: recursive for Google 'prev')
#
# Argument: $_[0] - a referer URL (on recursion: the decoded 'prev' value)
# Returns:  the search phrase, or a false value ("" or undef) when none
#           could be identified
# Globals:  reads  $engine, $base_url, $opt_d, $opt_e
#           writes $page   (Google cache hits and image 'imgrefurl' hits)
#           writes $engine (set to "Yahoo" when the URL part contains
#                           'yahoo', or to the MetaEngine parameter)
# With $opt_d every decoded key/value pair is printed; with $opt_e a
# referer that yields no phrase is dumped for inspection.
sub split_query {
        # Engine specific parms (other than 'q', 'query' and 'key')
        my %engine_specific = qw(  Altavista aqa
                                AOL       userQuery
                                Cometway  keywords
                                Googlealert u
                                Netscape  Keywords
                                Rediff    MT
                                Shockwave qkw
                                Virgilio  qs
                                Vonna     k
                                Webferret wf
                                Websearch qkw
                                Yahoo     p );
        my %srch_parm;
        my $q = $_[0];
        if ($engine eq 'Webferret'){
                # rewrite 'wf,<a>,<b>' as 'wf=<a>&tail=<b>' so the generic
                # key=value parser below can handle it
                $q =~ s/wf,/wf=/;
                $q =~ s/,/&tail=/;
        }
        my @w = split(/\&/,$q);
        # everything before the '?' is the URL, the rest is the first parameter
        my $qmark = index($w[0],'?');
        my $url = substr($w[0],0,$qmark);
        $w[0] = substr($w[0],$qmark+1);
        # parameter name this engine uses for the phrase, if it has its own
        my $own_parm = defined($engine) ? $engine_specific{$engine} : undef;
        foreach my $x(@w) {
                next unless ($x =~ /^([A-Za-z_]+)=(.*)\z/s);
                # copy the captures before any other regex can overwrite them
                my ($k,$raw) = ($1,$2);
                my $v = make_readable($raw);
                next unless $v;
                print "$k\t$v\n" if ($opt_d);
                # Google cache view: q is 'cache:<id>:<url> <search terms>';
                # the cached url is the page that was really looked at
                if ($engine eq "Google" && $k eq "q" && $v =~ /^cache:[_0-9A-Za-z-]+:(\S+)/){
                        my $p = $1;
                        # keep only the search terms, without leading blanks
                        $v =~ s/^cache:[_0-9A-Za-z-]+:\S+\s*//;
                        $page = 'http://' . $p;
                        $page = substr($page,length($base_url)) if(index($page,$base_url) == 0);
                }
                # if we find a parameter 'q', look no further ...
                return $v if($k eq "q" || (defined($own_parm) && $k eq $own_parm));
                $srch_parm{$k} = $v;
        }
        return $srch_parm{as_q} if ($srch_parm{as_q});
        return $srch_parm{as_epq} if ($srch_parm{as_epq});
        $engine = "Yahoo" if ($url =~ /yahoo/);
        if ($engine eq "Yahoo") {
                return $srch_parm{p} if ($srch_parm{p});
                return $srch_parm{va} if ($srch_parm{va});
                return $srch_parm{vp} if ($srch_parm{vp});
        }
        elsif ($engine eq "Altavista") {
                return $srch_parm{aqb} if ($srch_parm{aqb});
                return $srch_parm{aqp} if ($srch_parm{aqp});
        }
        # Countback if GUI browser fetches image file from toolbar
        elsif ($engine eq "Google" && $srch_parm{prev} && $srch_parm{imgrefurl} &&
            index($srch_parm{imgrefurl},$base_url) == 0){
                $page = substr($srch_parm{imgrefurl},length($base_url));
                return (split_query($srch_parm{prev}));
        }
        # check prev parameter, for other Google operations (like translate)
        elsif ($engine eq "Google" && $srch_parm{prev} ){
                return (split_query($srch_parm{prev}));
        }
        else {
                # common or obvious parameters for engines other than Google:
                return $srch_parm{key}          if($srch_parm{key});
                return $srch_parm{query}        if($srch_parm{query});
                return $srch_parm{ask}          if($srch_parm{ask});
                return $srch_parm{searchfor}    if($srch_parm{searchfor});
                return $srch_parm{qry}          if($srch_parm{qry});
        }
        return $srch_parm{qry} if ($srch_parm{cmd} eq "qry" &&
                                   $srch_parm{submit} eq "Google Search");
        if ($srch_parm{MetaEngine} && $srch_parm{MetaTopic}){
                $engine = $srch_parm{MetaEngine};
                return($srch_parm{MetaTopic});
        }
        # nothing recognised: with -e, dump what was seen
        if ($opt_e){
                print "--------------------------------------------------\n";
                print "$_[0]\n";
                print "Search Engine: $engine\n";
                foreach my $x(keys %srch_parm){
                        print "\t$x\t$srch_parm{$x}\n";
                }
        }
        return ("");
}

# ------------------------------------------------------------------------

# Main ...
# Usage: qstr [-d] [-e] base_url log_file
#   -d  print every decoded key/value pair seen by split_query()
#   -e  dump referers from which no phrase could be extracted and
#       suppress the normal listing
# Normal output is one tab separated line per search hit:
#   time, engine, IP, page, query
die "usage: $0 [-d] [-e] base_url log_file\n" unless (@ARGV == 2);
$base_url = shift(@ARGV);
$base_url =~ s/\/$//;
die "Missing Base: $base_url" unless ($base_url =~ m#^http://#);
die "Cannot Access Log file $ARGV[0]" unless (-r $ARGV[0]);
while (<>) {
        my $query;
        next unless (($IP,$htime,$cmd,$status,$bytes,$referer,$agent) = parse_log($_));
        # requested page = request line without the method and protocol
        $page = $cmd;
        $page =~ s/^GET //;
        $page =~ s/ HTTP.*$//;
        undef $engine;
        if ($referer =~ /Google\+Search/) {
                $engine = "Google";
        }
        elsif ($referer =~ m#^http://search\.msn\.com#){
                $engine = "MSN";
        }
        elsif ($referer =~ m#^http://search\.ninemsn\.com\.au#){
                # the phrase is extracted once, below, as for every other
                # engine (split_query has side effects and debug output)
                $engine = "nineMSN";
        }
        elsif ((my $qpos = index($referer,'?')) >= 0){
                # guess the engine from the part of the referer before the '?'
                $engine = substr($referer,0,$qpos);
                next unless ($engine =~ /(google\w*)/ || $engine =~ /(altavista)/ ||
                    $engine =~ /(yahoo)/ || $engine =~ /(ask\.com)/ || $engine =~ /(aol)/ ||
                    $engine =~ /(looksmart)/      || $engine =~ /\W(ask)\W/ ||
                    $engine =~ /\W(lycos)\W/      || $engine =~ /\W(netscape)\W/ ||
                    $engine =~ /\W(optusnet)\W/   || $engine =~ /\W(cometway)\W/ ||
                    $engine =~ /\W(ixquick)\W/    || $engine =~ /\W(cnn)\W/ ||
                    $engine =~ /\W(websearch)\.com/  || $engine =~ /\W(webferret)\W/ ||
                    $engine =~ /\W(\w+)\.com\/.*search/  ||
                    $engine =~ /search\.(\w+)/ );
                # $1 comes from whichever pattern matched first
                $engine = ucfirst($1);
                # short names read better in capitals (AOL, CNN, ASK)
                $engine = uc($engine) if (length($engine) < 4);
        }
        else {
                next;
        }
        next unless ($query = split_query($referer) );
        $ptime = psql_time($htime);
        # skip a repeat of the previously printed hit within 10 minutes
        next if ($Pengine eq $engine && $PIP eq $IP && $Ppage eq $page &&
                 $Pquery eq $query && ($htime - $Phtime) < 600 );
        # with -e only the diagnostics from split_query() are wanted
        next if ($opt_e);
        print "$ptime\t$engine\t$IP\t$page\t$query\n";
        ($Phtime,$Pengine,$PIP,$Ppage,$Pquery) = ($htime,$engine,$IP,$page,$query);
}
