WebGraph
I always thought it would be interesting to do this, turns out it generates some pretty crazy graphs (see here for a shot I took).
This generates a lot of edges (and nodes) very quickly depending on the sites it finds, so it can take a fair bit of computing to draw the resulting graph. I advise you not to start on wikipedia otherwise you'll never get off it :)
Anyway, here's the code.
#!/usr/bin/perl # # use LWP::UserAgent; use File::MkTemp; use XML::XPath; use URI; use Data::Dumper; use HTML::Tidy; my %seen; my $startAt; my $startDepth; my %madeLinks; if ($#ARGV >= 1) { $startDepth = shift(@ARGV); $startAt = shift(@ARGV); } else { print STDERR "Usage:\n\t$0 <depth> <starting url>\n"; exit(1); } print STDERR "Beginning Crawl at $startAt, maximum depth of $depth\n"; print STDOUT "strict digraph test \n{\n"; makeGraph($startDepth,$startAt); print STDOUT "}"; sub getHostFromURL { my $url = shift(@_); my @host = $url =~ m`://([^?#/]*)`; return $host[0]; } sub makeGraph { my $depth = shift(@_); my $url = shift(@_); my $spaces = "\t"x($startDepth-$depth); if($depth < 1) { return; } my @links = getLinksOnPage($url); for my $link (@links) { my $thisHost = &getHostFromURL($url); my $nextHost = &getHostFromURL($link); if($madeLinks{$thisHost}->{$nextHost} != 1) { $madeLinks{$thisHost}->{$nextHost} = 1; print STDOUT qq($spaces"$thisHost" -> "$nextHost"\n); } if($seen{$link} != 1) { makeGraph($depth-1, $link); } } } sub getLinksOnPage { my $url = shift(@_); $seen{$url} = 1; $userAgent = LWP::UserAgent->new; $userAgent->agent("Harrys Web-Graphing Spider"); $userAgent->timeout(10); my $request = HTTP::Request->new(GET=>"$url"); $request->header(Accept => "text/html xml/* text/*"); my $response = $userAgent->request($request); $baseURL = $url; my @baseTag = $response->content() =~ m/(\<base .+\>)/i; if($baseTag ne "") { $baseURL = $baseTag =~ m/href=["']([^"']+)["']/g } my @tmplinks = $response->content() =~ m/href=["']([^"'#]+)["']/g; my @links; foreach my $a (@tmplinks) { my $absurl = URI->new_abs($a,$baseURL)->as_string; push(@links,$absurl); } return @links; }