#!/usr/local/bin/perl
# spits out some stats about web access by reading an "access_log" file.
# log_suck - Kees Cook 1997
# Name of this machine; referrers on this host are not counted as external
# links.  Backticks return the command output including its trailing
# newline, which must be removed or the name can never match inside a URL.
$HOST=`hostname`;
$HOST = "" unless defined $HOST;	# command could not be started
chomp($HOST);

# Report thresholds.  For each report an entry is printed only when its
# count is strictly greater than *_MIN and its rank is within *_TOP
# (a *_TOP of 0 means no limit).
$SITE_MIN = 10;		# a site is shown only with more hits than this
$SITE_TOP = 30;		# how many top sites to show, 0=all
$BROWSER_MIN = 10;	# a browser is shown only with more hits than this
$BROWSER_TOP = 20;	# how many top browsers to show, 0=all
$HIT_MIN = 0;		# a page is shown only with more hits than this
$HIT_TOP = 50;		# how many top pages to show, 0=all
$IMAGE_MIN = 0;		# an image is shown only with more hits than this
$IMAGE_TOP = 50;	# how many top images to show, 0=all
$FROM_MIN = 0;		# a referring page is shown only with more hits than this
$FROM_TOP = 50;		# how many top referring pages to show, 0=all
$TOSS_IMAGES=1;		# 1 = count .gif/.jpg/.jpeg requests as images, not pages

# Read the access log and tally, for every successful (status 200) GET:
#   %sites    - hits per masked client address
#   %browsers - hits per browser name (version stripped)
#   %pages    - hits per page
#   %images   - hits per image (only when $TOSS_IMAGES is 1)
#   %links    - hits per external referrer
#   $total    - bytes sent
# Line shape accepted by the active pattern:
#   host "browser" x x [date:time zone] "GET path proto" status bytes

$total = 0;	# so the summary prints 0 rather than nothing for an empty log

# Local host name prepared for use inside a pattern: trailing whitespace
# (e.g. the newline left by backticks) removed, metacharacters escaped so
# that "." only matches a literal dot.
($local_host = $HOST) =~ s/\s+$//;
$local_host = quotemeta($local_host);

open(LINE,"</etc/httpd/logs/access_log")
	or die "Cannot open /etc/httpd/logs/access_log: $!\n";
while (<LINE>) {
	# Older log format, which also carried the referrer ("from") field:
#        /^(\S+)\s+\"([^\"]*)\"\s+\S+\s+\S+\s+\[([^:]*):(\S+)\s+\S+\]\s+\"([^\"]*)\"\s+->\s+\"GET\s+(\S+)\s+\S+\"\s+(\S+)\s+(\S+)$/;
	# Skip lines that do not match (non-GET requests, malformed entries)
	# instead of reading capture variables from a match that did not happen.
	next unless /^(\S+)\s+\"([^\"]*)\"\s+\S+\s+\S+\s+\[([^:]*):(\S+)\s+\S+\]\s+\"GET\s+(\S+)\s+\S+\"\s+(\S+)\s+(\S+)$/;
	($machine, $browser, $date, $time, $to, $result, $bytes) =
		($1, $2, $3, $4, $5, $6, $7);

	# The active format has no referrer field, so there is nothing to
	# record as an external link.  (The older format supplied it as the
	# fifth capture.)
	$from = "";

	# Decode %XX escapes.  Only real hex pairs are decoded; "C" is the
	# unsigned-byte format, so values above 0x7F need no wrap-around.
	$from =~ s/%([0-9A-Fa-f]{2})/pack("C",hex($1))/ge;
	$to =~ s/%([0-9A-Fa-f]{2})/pack("C",hex($1))/ge;

	# Group clients: a.b.c.d -> a.b.c.*   host.dom.ain -> *.dom.ain
	if ($machine =~ /^\d+\./) {
		$machine =~ s/\.\d+$/\.*/;
	}
	else {
		$machine =~ s/^[^\.]+\./*\./;
	}

	# Keep only the browser name, dropping "/version ..." and the rest.
	$browser =~ s/\/.*$//;

	# 0 = page, 1 = image, 2 = counter request (counted as neither)
	$image=0;
	if (($TOSS_IMAGES==1 && $to =~ /\.(gif|jpg|jpeg)$/i)) {
		$image = 1;
	}
	if ($to =~ /^\/counter\//) {
		$image = 2;
	}

	# Collapse every search-engine result URL to "http://engine/*" so all
	# queries from one engine are counted together.
	if ($from =~ /^http:\/\/([^\.]*\.?altavista\.digital\.com|[^\.]*\.?yahoo\.com|[^\.]*\.?excite\.com|[^\.]*\.?hotbot\.com|[^\.]*\.?webcrawler\.com|[^\.]*\.?alta-vista\.com)\//i) {
		$from = $& . "*";
		$from =~ tr/A-Z/a-z/;
	}

	if ($result==200) {
		$sites{$machine}+=1;
		$browsers{$browser}+=1;
		$pages{$to}+=1 if ($image==0);
		$images{$to}+=1 if ($image==1);
		# Count a referrer only when there is one and it is not this
		# host or the loopback address.
		$links{$from}+=1
			if ($from ne "" &&
			    $from !~ /^http:\/\/$local_host\// &&
			    $from !~ /^http:\/\/127\.0\.0\.1\//);
		# The size field is not always a number (e.g. "-").
		$total+=$bytes if ($bytes =~ /^\d+$/);
	}
}
close(LINE);

print "Total bytes sent: $total\n";

# Sites report: client groups ordered by descending hit count.  A row is
# printed when it ranks within $SITE_TOP (0 = no limit) and has more than
# $SITE_MIN hits.
print "\n";
print "Top $SITE_TOP " unless ($SITE_TOP==0);
print "Sites:\n";
my $site_rank = 0;
for my $site_name (sort {$sites{$b} <=> $sites{$a}} keys %sites) {
	$site_rank++;
	next if ($SITE_TOP!=0 && $site_rank>$SITE_TOP);
	next unless ($sites{$site_name}>$SITE_MIN);
	printf("%6d %s\n",$sites{$site_name},$site_name);
}

# Browsers report: browser names ordered by descending hit count.  A row is
# printed when it ranks within $BROWSER_TOP (0 = no limit) and has more than
# $BROWSER_MIN hits.
print "\n";
if ($BROWSER_TOP!=0) {
	print "Top $BROWSER_TOP ";
}
print "Browsers:\n";
my @browsers_by_hits = sort {$browsers{$b} <=> $browsers{$a}} keys %browsers;
for my $pos (0 .. $#browsers_by_hits) {
	my $browser_name = $browsers_by_hits[$pos];
	my $browser_rank = $pos + 1;
	my $in_top = ($BROWSER_TOP==0 || $browser_rank<=$BROWSER_TOP);
	if ($in_top && $browsers{$browser_name}>$BROWSER_MIN) {
		printf("%6d %s\n",$browsers{$browser_name},$browser_name);
	}
}

# Page hits report: pages ordered by descending hit count.  A row is printed
# when it ranks within $HIT_TOP (0 = no limit) and has more than $HIT_MIN
# hits.
print "\n";
print "Top $HIT_TOP " unless ($HIT_TOP==0);
print "Hits:\n";
my @pages_by_hits = sort {$pages{$b} <=> $pages{$a}} keys %pages;
my $page_rank = 0;
while (@pages_by_hits) {
	my $page = shift(@pages_by_hits);
	$page_rank++;
	my $count = $pages{$page};
	next unless ($HIT_TOP==0 || $page_rank<=$HIT_TOP);
	next unless ($count>$HIT_MIN);
	printf("%6d %s\n",$count,$page);
}

# Image hits report: images ordered by descending hit count.  A row is
# printed when it ranks within $IMAGE_TOP (0 = no limit) and has more than
# $IMAGE_MIN hits.
print "\n";
print "Top $IMAGE_TOP " if ($IMAGE_TOP!=0);
print "Image Hits:\n";
$i=0;
foreach $hit (sort {$images{$b} <=> $images{$a}} keys %images) {
	$i++;
	# Filter on the image threshold; the page threshold ($HIT_MIN)
	# belongs to the page report only.
	printf("%6d %s\n",$images{$hit},$hit)
		if (($IMAGE_TOP==0 || $i<=$IMAGE_TOP) &&
			$images{$hit}>$IMAGE_MIN);
}

# External links report: referrers ordered by descending hit count.  A row
# is printed when it ranks within $FROM_TOP (0 = no limit) and has more than
# $FROM_MIN hits.  A final blank line ends the output.
print "\n";
if ($FROM_TOP!=0) {
	print "Top $FROM_TOP ";
}
print "External Links:\n";
my $link_rank = 0;
for my $referrer (sort {$links{$b} <=> $links{$a}} keys %links) {
	$link_rank++;
	my $beyond_top = ($FROM_TOP!=0 && $link_rank>$FROM_TOP);
	unless ($beyond_top || $links{$referrer}<=$FROM_MIN) {
		printf("%6d %s\n",$links{$referrer},$referrer);
	}
}

print "\n";
