# Listing 4.
#!/usr/bin/perl -w
use strict;
use diagnostics;
use LWP::RobotUA;
use HTML::LinkExtor;
use URI::URL;
# Where do we want to start, and who operates this robot?  The start URL
# in this listing is empty and the contact address that LWP::RobotUA
# requires is missing, so both are taken from the command line.
my ($origin, $contact) = @ARGV;
die "Usage: $0 start-url contact-email\n"
    unless defined $origin  && length $origin
        && defined $contact && length $contact;

# Create two hashes, one for URLs we have seen and
# another for those we haven't
my %already_retrieved = ();
my %to_be_retrieved = ($origin => 1);

# Create a hash in which we will deposit URLs we
# wish to use (not filled in by this listing)
my %interesting_urls = ();

# Create a new user agent, identified by its name and by the
# operator's e-mail address
my $ua = LWP::RobotUA->new("ATF/1.0", $contact);

# Keep going for as long as any URL is still waiting in the queue.
# Testing the hash itself (rather than the truth of the next key)
# means a false-looking key cannot end the crawl early.
while (%to_be_retrieved) {
    # Take an arbitrary URL from the queue
    my ($url) = keys %to_be_retrieved;

    # Create a new HTTP request
    my $request = HTTP::Request->new('GET', $url);

    # Move this URL from %to_be_retrieved to
    # %already_retrieved
    delete $to_be_retrieved{$url};
    $already_retrieved{$url} = 1;

    # Indicate what we are retrieving
    print "Retrieving $url...";

    # Hand $request to $ua, and get an HTTP
    # response back in return
    my $response = $ua->request($request);

    # Complete printout
    print ".\n";

    # If there was a problem, send a report to
    # STDERR and move on to the next URL; a failed
    # response has no document worth scanning for links
    if (!$response->is_success) {
        print STDERR qq{Error retrieving "$url":},
            $response->status_line, "\n";
        next;
    }

    # Create an instance of HTML::LinkExtor,
    # making links relative to $url
    my $parser = HTML::LinkExtor->new(\&callback, $url);

    # Parse the output; callback() queues each new link it is handed
    $parser->parse($response->content);
    $parser->eof;
}
# Define our callback, which is passed a scalar (the tag name) and
# a hash (the tag's attribute names and values).  It is the handler
# given to HTML::LinkExtor->new.  For an anchor tag whose href is an
# http: or https: URL that has not been retrieved yet, the URL is added
# to %to_be_retrieved; every other tag or attribute is ignored.
# Returns nothing.
sub callback {
    # Get the tag and its associated attributes
    my ($tag, %attributes) = @_;

    # We only care about anchor tags
    return unless $tag eq "a";

    # Get the URL for this anchor.  Only the href attribute is a
    # link; an anchor without one (e.g. a named target) is skipped.
    my $url = $attributes{href};
    return unless defined $url;

    # Ignore non-HTTP URLs (both http: and https: are accepted)
    return unless $url =~ m/^https?:/i;

    # Indicate that we should retrieve this
    # URL in the future
    $to_be_retrieved{$url} = 1
        unless $already_retrieved{$url};

    return;
}