Listing 4. download-recursively.pl
#!/usr/bin/perl -w
use strict;
use diagnostics;
use LWP::RobotUA;
use HTML::LinkExtor;
use URI::URL;
# Where do we want to start?
my $origin = "https://www.lerner.co.il/";
# Create two hashes, one for URLs we have seen and
# another for those we haven't
my %already_retrieved = ();
my %to_be_retrieved = ($origin => 1);
# Create a hash in which we will deposit URLs we
# wish to use
my %interesting_urls = ();
# Create a new user agent
my $ua = new LWP::RobotUA "ATF/1.0",
'reuven@lerner.co.il';
$ua->delay(0);
while (my $url = (keys %to_be_retrieved)[0])
{
# Create a new HTTP request
my $request = new HTTP::Request('GET', $url);
# Move this URL from %to_be_retrieved to
# %already_retrieved
delete $to_be_retrieved{$url};
$already_retrieved{$url} = 1;
# Indicate what we are retrieving
print "Retrieving $url...";
# Hand $request to $ua, and get an HTTP
# response back in return
my $response = $ua->request($request);
# Complete printout
print ".\n";
# If there was a problem, send a report to
# STDERR
if (!$response->is_success)
{
print STDERR qq{Error retrieving "$url":},
$response->status_line , "\n";
}
# Create an instance of HTML::LinkExtor,
# making links relative to $url
my $parser = HTML::LinkExtor->new(\&callback,
$url);
# Parse the output
$parser->parse($response->content);
}
#---------------------------------------------
# Define our callback, which is passed scalar and
# a hash
sub callback
{
# Get the tag and its associated attributes
my ($tag, %attributes) = @_;
# We only care about anchor tags
return unless ($tag eq "a");
# Iterate through the attributes
foreach my $name (sort keys %attributes)
{
# If this is a link, then put the URL in the
# queue if ($name eq "href")
{
# Get the URL for this anchor
my $url = $attributes{$name};
# Ignore non-HTTP URLs
next unless ($url =~ m/^https:/);
# Indicate that we should retrieve this
# URL in the future
$to_be_retrieved{$url} = 1
unless $already_retrieved{$url};
}
}
}