#!/usr/bin/perl
# pagehit - a perl script which Generates page ranking HTML
# ------------------------------------------------------------------------
# This perl script generates the HTML source code for the page ranking
# pages on my site. Pages are ranked according to the number of hits
# registered since 01-Jun-2002, so older pages have an advantage. When
# reading this source code you might also wish to view the HTML source
# code that this script produces. Here are three examples of page ranking
# URLs:
#       http://www.pgts.com.au/pgtsj/pgtsj0211a.html
#       http://www.pgts.com.au/download/download_stats.html
#       http://www.pgts.com.au/download/humour/humour_stats.html
#
# Normally I do not put many comments in my scripts. I usually only add
# enough to remind myself what I was thinking at the time I wrote it. This
# probably goes back to a deep seated prejudice that I have against
# interpreters. (See: http://www.pgts.com.au/pgtsj/pgtsj0205a.html) In the
# past I was always concerned about over-burdening an interpreter with too
# many comments. These concerns may have been relevant twenty years ago,
# but today's hardware has capacity to spare. So for the purpose of
# explaining how this script works I have included a plethora of comments.
# This is experimental, since I usually keep documentation (if there is
# any) separate from source code. Hopefully this will be sufficient to
# serve as documentation for this script.
#
# This code is site specific. When it gets down to the nitty gritty, it is
# unlikely that anyone will write a generic script that deals with
# generation of web pages. The task is, by its nature, unique for each
# business and tends to be highly customised. Still, it might give you
# ideas for your own site.
#
# I apologise to those of you who still prefer 80 character console mode
# systems. I still work at a text only console, but I have recently taken
# to using Linux distributions, which have a very nice font that results in
# a console that is 100 characters wide. This script won't look very
# pretty on a console that is restricted to 80 characters. You GUI people
# don't have to worry about those previous two sentences, however if you
# are fond of your mouse and X, you might like to try a recent version of
# Konqueror, which will detect the perl shebang (#!/usr/bin/perl), and
# display this page in pretty syntax-specific colours.
#
# I have to warn you that if you are going to comprehend this code, you
# will need a working knowledge of basic perl regular expressions. You can
# learn about perl regular expressions from the "perlre" man page.
#
# If you want to contact me, try the following URL:
#       http://www.pgts.com.au/page04.html
#
# Gerry Patterson (November Edition PGTS Journal)
# ------------------------------------------------------------------------

use strict;
use warnings;

# We need this for "basename" (extract base file name from filespec)
use File::Basename;

# This is the location of the master copy
my $pageSRC = "/My_cronfiles/pagehit.html";

# This is the location of the (apache) DocumentRoot
my $WWWroot = "/MyDocumentRoot";

# This is the leading portion of every URL on the site. It is stripped
# from URLs read out of the master copy and put back when rows are written.
my $siteURL = "http://www.pgts.com.au";

# These are the two marker comments that bracket the page ranking data in
# the master copy.
# NOTE(review): the literal marker text could not be recovered from the
# copy of this script that was reviewed (both marker patterns had been
# reduced to the empty pattern). The two values below are PLACEHOLDERS and
# must be changed to the marker comments actually present in $pageSRC. The
# script dies rather than printing a page with no data if they are not
# found.
my $startMarker = '<!-- start of page ranking data -->';
my $endMarker   = '<!-- end of page ranking data -->';

# array of the months (note: xxx - should never happen)
my @Mth = qw/xxx Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/;

# The four hashes that hold the cell data for each row, keyed on the URL
# (with the http://domain portion stripped).
my (%hit, %title, %author, %date);

# ------------------------------------------------------------------------
# This loop reads stdin. This is the output from the hits script which is
# used to parse the logfile.
# The hits perl scripts produces three columns:
#       Col 1 is the type (Visitor/Robot)
#       Col 2 is the number of hits
#       Col 3 is the page (with the http://domain portion stripped)
# ------------------------------------------------------------------------
while (my $line = <>) {
    # When we get to the list of Robots, we're finished ...
    last if $line =~ /^Robot/;

    # Also, I'm not interested in how many visitors bookmarked me
    next if $line =~ /favicon\.ico/;
    chomp $line;

    # This should not happen, since looking at robots.txt is usually
    # considered 'Robot' behaviour -- I left it, just in case ...
    next if $line =~ /robots\.txt$/;

    # standard awk-style split ...
    my @F = split ' ', $line;

    # A line without all three columns has no page to count. Without this
    # check an empty page name would test (and count) the DocumentRoot
    # directory itself.
    next unless @F >= 3;
    my ($hits, $page) = @F[1, 2];

    # I am only interested in pages that exist on my server ...
    next unless -r "$WWWroot$page";

    # Also I am only interested in lines that have hits > 0
    next unless $hits =~ /^\d+$/ && $hits > 0;

    # If it's a directory, make sure there is a trailing '/'
    $page .= "/" if -d "$WWWroot$page";

    # Get rid of '//' (usually it's a typo - but it still works). Whole
    # runs of slashes are collapsed, so '///' also becomes '/'.
    $page =~ s{/+}{/}g;

    # If we reach this statement - it's a valid page, so count it.
    $hit{$page} += $hits;
}

# ------------------------------------------------------------------------
# Now the HTML source from the master file, $pageSRC. This contains the
# page ranking data between two marker comments as follows:
#
#       (start marker comment - see $startMarker)
#       ... Data Goes Here ...
#       (end marker comment - see $endMarker)
#
# The data that goes in between the two markers has all been created with
# this script, so we can rely on a reliable standard format. The data
# consists of table rows. Each row has four cells as follows:
#       1. Number of hits
#       2. Page Title/Description (actually a URL)
#       3. Author (or Category)
#       4. Date Published (for articles - creation date for other documents)
#
# This next piece of code opens $pageSRC and reads each line, storing it
# in the array @HTMLsrc until it reaches the first marker. It then parses
# each row, extracting the contents of the cells. The URL is extracted
# from cell number 2, where it appears as a quoted attribute value inside
# a tag. The front portion of the URL is stripped off and the remaining
# string is used as a hash key, which is stored in the variable $url.
# There are four hashes, one for each cell, %hit, %title, %author and
# %date. When all the data has been extracted (i.e. we reach the second
# marker), the lines are once more pushed into @HTMLsrc.
# ------------------------------------------------------------------------
open(my $pagesrc_fh, '<', $pageSRC) or die "Cannot open $pageSRC: $!";

my @HTMLsrc;
my $found    = 0;    # non-zero while we are between the two markers
my $sawStart = 0;
my $sawEnd   = 0;

while (my $line = <$pagesrc_fh>) {
    # start of data marker ...
    if ($line =~ /\Q$startMarker\E/) {
        $found++;
        $sawStart = 1;
    }

    # end of data marker ...
    if ($line =~ /\Q$endMarker\E/) {
        $found  = 0;
        $sawEnd = 1;
    }

    # if we are not processing data, push it and loop
    # (the start marker line itself is never pushed; the end marker is)
    unless ($found) {
        push @HTMLsrc, $line;
        next;
    }
    chomp $line;

    # split on the end of data tag for each cell (</td>)
    my @sf = split /<\/td>/, $line;

    # if there is no URL, forget it ...
    next unless defined $sf[1] && $sf[1] =~ /\Q$siteURL\E(\S+)"/;
    my $url = $1;

    # Now remove HTML tags from each field
    for my $cell (@sf) {
        $cell =~ s/<[#-;A-z ="\!\.\/\?]*>//g;
    }

    # Something has gone wrong if we don't have a URL
    next unless $url;

    # Load the four hashes with cell data. Only the digits of the first
    # cell are used, so stray whitespace cannot upset the addition.
    my ($oldHits) = $sf[0] =~ /(\d+)/;
    $hit{$url}   += defined $oldHits ? $oldHits : 0;
    $title{$url}  = $sf[1];
    $author{$url} = $sf[2];
    $date{$url}   = $sf[3];
}
close $pagesrc_fh;

# Without both markers no data would have been read and none would be
# written, so stop before anything is printed to stdout.
die "Data markers not found in $pageSRC - check \$startMarker/\$endMarker\n"
    unless $sawStart && $sawEnd;

# ------------------------------------------------------------------------
# This next section of code loops through the @HTMLsrc array, printing
# each line to stdout, until we reach the second marker. Then we generate
# a data row for each key in the %hit hash. Each URL string arrives from
# the hits script with the leading portion of the URL stripped. If we are
# going to examine the actual file we need to add the DocumentRoot as a
# prefix. Note if you use aliased directories this algorithm will fail.
#
# This section of the algorithm is specific to my site. There are a few
# unique aspects to the structure of my site and this script relies on
# this structure. These are as follows:
#
# 1. The articles for "The PGTS Journal" are all published in the "/pgtsj"
#    folder. This also serves as the archive folder. This means that after
#    an article is published it does not have to be moved to an archive
#    folder. Because it already is in the archive folder. What changes is
#    a moving IndexIgnore directive in the .htaccess for the pages in the
#    current edition for the /pgtsj folder. After the issue moves to next
#    month this directive is rotated up to the next issue (see naming
#    convention next).
#
# 2. The articles in the journal all have a name in the format
#    pgtsjYYMMx.html. Where YY is the year, MM is the month and x is an
#    alphanumeric letter (starting at 'a'). This means that the date of
#    publication can be inferred from the name. Of course sometimes I miss
#    the intended publication date (1st of the month).
#
# 3. As each issue rolls to the new month the front page that was the
#    current page is moved into the archive with a new name in the format
#    pgtsjYYMM.html and linked into the archives. The articles that it
#    references remain in the same location. This means that search
#    engines (like Google) don't get upset because files are moved around.
#    And yet on the face of it the articles "appear" to have been moved
#    into the archive area.
#
# 4. The descriptions published in the .htaccess files have a single
#    unique description for each page. The entries in the humour section are
#    an exception, however. They have a description which corresponds to
#    the humour "Category".
#
# With this knowledge the script can assume that a page that has "hits",
# but is not contained in the masterfile (i.e. there is an entry in %hit
# but not in %title) is a relatively new file. It looks first for the
# description in the .htaccess file. If it doesn't find it, then it sets
# the description to the "filename" (i.e. basename).
# ------------------------------------------------------------------------
foreach my $html (@HTMLsrc) {
    # look for the second marker (1st one has been omitted from @HTMLsrc)
    if ($html =~ /\Q$endMarker\E/) {
        # Now print the data for the page stats, starting with the first
        # marker so that the next run can find the data again
        print "$startMarker\n";

        # sort the array in reverse hits order (highest first); pages with
        # equal hits are ordered by URL so the output is repeatable
        foreach my $url (sort { $hit{$b} <=> $hit{$a} or $a cmp $b } keys %hit) {
            # full filespec is Document Root + URL
            my $fspec = "$WWWroot$url";

            # get the basename (fileparse leaves a trailing '/' on $path)
            my ($name, $path) = fileparse($fspec);

            # is a new file?
            unless ($title{$url}) {
                # yes it is, look for entry in .htaccess
                if (open(my $htaccess_fh, '<', "$path/.htaccess")) {
                    # The file name is quoted (\Q..\E) so that characters
                    # such as '.' or '+' in it are matched literally.
                    my $descRE = qr/^AddDescription\s*"(.*)"\s*\Q$name\E$/;
                    my @t = grep { /$descRE/ } <$htaccess_fh>;
                    close $htaccess_fh;

                    # only trust the description if it is unambiguous
                    $title{$url} = $1 if @t == 1 && $t[0] =~ /$descRE/;
                }

                # If it's a directory set the description to the URL,
                # otherwise fall back to basename
                # as with all these, this can be edited later ...
                $title{$url} = $url  if -d $fspec;
                $title{$url} = $name unless $title{$url};

                if (-f $fspec && $name =~ /^pgtsj([0-9]*)/) {
                    # derive the date from the filename
                    # for articles in the Journal. This is only done when
                    # the name really carries YYMM with a valid month;
                    # otherwise the date is taken from the file below.
                    my ($yy, $mm) = $1 =~ /^(\d\d)(\d\d)/;
                    if (defined $mm && $mm >= 1 && $mm <= 12) {
                        $date{$url} = "01-$Mth[$mm]-20$yy";
                    }

                    # G. Patterson is default author for journal
                    $author{$url} = 'G. Patterson';
                }
                # humour files have category humour
                elsif (-f $fspec && $path =~ /\/humour\/$/) {
                    $title{$url}  = $name;
                    $author{$url} = "Humour";
                }
            }
            $title{$url} = '' unless defined $title{$url};
            $title{$url} =~ s/\s+$//;

            unless ($author{$url}) {
                # Directories have the default category "Directory"
                if (-d $fspec) {
                    $author{$url} = "Directory";
                    $date{$url}   = '-';
                }
            }
            # anything still without an author gets an empty cell
            $author{$url} = '' unless defined $author{$url};

            # Is the date missing?
            unless ($date{$url}) {
                my @s = stat($fspec);
                if (@s) {
                    # use the modification time of the file
                    my @t = localtime($s[9]);
                    $date{$url} = sprintf "%02d-%s-%04d",
                        $t[3], $Mth[$t[4] + 1], $t[5] + 1900;
                }
                else {
                    # the page is listed in the master copy but the file
                    # can no longer be examined - don't invent a 1970 date
                    $date{$url} = '-';
                }
            }

            # print the row
            # NOTE(review): the row markup was reconstructed from the
            # parsing code above (cells closed by </td>, the full URL
            # followed by a double quote in cell 2). Any extra attributes
            # the original rows carried are unknown - confirm against the
            # master copy.
            printf "<tr><td>%d</td><td><a href=\"%s%s\">%s</a></td>"
                 . "<td>%s</td><td>%s</td></tr>\n",
                $hit{$url}, $siteURL, $url, $title{$url},
                $author{$url}, $date{$url};
        }
    }

    # print the line from @HTMLsrc
    print $html;
}

# ------------------------------------------------------------------------
# I have not included any notifications about copyright or Public Licenses
# etc. That's because the script is customised for my own site. It would
# need considerable modification to work at another site. Still the
# principles could be applied to most businesses.
#
# In actual fact, I didn't put all these comments into the script. I
# simply included a special comment that has format as follows:
#
#       include file_name
#
# And when the script is copied to my publication area, it is sent via
# another perl script which then replaces such statements with all the
# statements in "file_name", prepending a "#" to each line. It also makes
# a few changes to directories ... You didn't think that I really would
# use names like /MyDocumentRoot did you? The main advantage of keeping
# the documentation in separate text files was that it allowed me to
# format it, change it about easily and spell check it.
#
# Or, maybe it was just that I still have this prejudice about comments
# and interpreters ...
# ------------------------------------------------------------------------