Last week I got a large book "Perl Cookbook". It mentions an useful module HTML::LinkExtor in the book, seems handy to use. Right now, I just wanted to crawl some docs from MVS-OE archive webpage, so I wrote a small script that can demo how to use the module.
use LWP:: Simple ;use HTML:: LinkExtor ;use URI:: URL ;
binmode STDOUT , ':utf8' ;my $url = "http://www2.marist.edu/htbin/wlvindex?mvs-oe" ;my $base = "http://www2.marist.edu/htbin" ;my $ref_links = extract_link ( $url , "" , "a" , "href" );foreach ( @$ref_links ){my $sub_url = $_ ;print "Parsing sub url: " . $sub_url . "\n" ;my $thread_links = extract_link ( $sub_url , $base , "a" , "href" );foreach ( @$thread_links ){print "GET\n" ;print $_ . "\n" ;get ( $_ );}}
sub extract_link (){my $url = shift ;my $base = shift ; # base URLmy $mytag = shift ; # specified html tag name, such as a, form ...my $attr_name = shift ; # link pattern
$base =~ s/\/$//g ;
$ua = LWP:: UserAgent -> new or dir $! ;
# Set up a callback that collect image linksmy @links = ();
sub callback {my ( $tag , %attr ) = @_ ;return if $tag ne $mytag ; # we only look closer at <img ...>push ( @links , $attr { $attr_name });}
# Make the parser. Unfortunately, we don't know the base yet# (it might be different from $url)my $p = HTML:: LinkExtor -> new ( \& callback );
# Request document and parse it as it arrivesmy $res = $ua -> request ( HTTP:: Request -> new ( GET => $url ),sub { $p -> parse ( $_ [ 0 ])}) or die $! ;
# Expand all image URLs to absolute ones$base or $base = $res -> base ;@links = map { $_ = url ( $_ , $base ) -> abs ; } @links ;
return \ @links ;}