Harry Rose

Home » Perl » Webgraph

WebGraph

I always thought it would be interesting to do this, turns out it generates some pretty crazy graphs (see here for a shot I took).

This generates a lot of edges (and nodes) very quickly depending on the sites it finds, so it can take a fair bit of computing to draw the resulting graph. I advise you not to start on wikipedia otherwise you'll never get off it :)

Anyway, here's the code.

#!/usr/bin/perl 
#
#
use LWP::UserAgent;
use File::MkTemp;
use XML::XPath;
use URI;
use Data::Dumper;
use HTML::Tidy;
 
my %seen;
my $startAt;
my $startDepth;
my %madeLinks;
 
if ($#ARGV >= 1)
{
    $startDepth = shift(@ARGV);
    $startAt = shift(@ARGV);
}
else
{
    print STDERR "Usage:\n\t$0 <depth> <starting url>\n";
    exit(1);
}
 
 
print STDERR "Beginning Crawl at $startAt, maximum depth of $depth\n";
 
print STDOUT "strict digraph test \n{\n";
makeGraph($startDepth,$startAt);
print STDOUT "}";
 
sub getHostFromURL
{
    my $url = shift(@_);
 
    my @host = $url =~ m`://([^?#/]*)`;
 
    return $host[0];
}
 
sub makeGraph
{
    my $depth = shift(@_);
    my $url = shift(@_);
    my $spaces = "\t"x($startDepth-$depth);
    if($depth < 1)
    {
        return;
    }
    my @links = getLinksOnPage($url);
    for my $link (@links)
    {
        my $thisHost = &getHostFromURL($url);
        my $nextHost = &getHostFromURL($link);
        if($madeLinks{$thisHost}->{$nextHost} != 1)
        {
            $madeLinks{$thisHost}->{$nextHost} = 1;
            print STDOUT qq($spaces"$thisHost" -> "$nextHost"\n);
        }
        if($seen{$link} != 1)
        {
            makeGraph($depth-1, $link);
        }
    }
}
 
sub getLinksOnPage
{
    my $url = shift(@_);
    $seen{$url} = 1;
    $userAgent = LWP::UserAgent->new;
    $userAgent->agent("Harrys Web-Graphing Spider");
    $userAgent->timeout(10);
    
    my $request = HTTP::Request->new(GET=>"$url");
    $request->header(Accept => "text/html xml/* text/*");
    my $response = $userAgent->request($request);
    $baseURL = $url;
    my @baseTag = $response->content() =~ m/(\<base .+\>)/i;
    if($baseTag ne "")
    {
        $baseURL = $baseTag =~ m/href=["']([^"']+)["']/g
    }
    my @tmplinks = $response->content() =~ m/href=["']([^"'#]+)["']/g;
    
    my @links;
    foreach my $a (@tmplinks)
    {
        my $absurl = URI->new_abs($a,$baseURL)->as_string;
 
        push(@links,$absurl);
    }
    return @links;
}