# 自己写的第一个玩具Perl爬虫程序 — my first toy Perl crawler, written mainly to get a feel for Perl's convenience (and power?).
#!/usr/bin/perl
use warnings;
use strict;
use 5.010;
use LWP::UserAgent;
use HTML::LinkExtor;
use Thread;
use threads;
use threads::shared;
#my $feedLink = "http://www.google.com.hk/webhp?rls=ig";
# Seed URL: the crawl starts from this single page.
my $feedLink = "http://www.163.com";
# Build and return the per-page callback handed to HTML::LinkExtor.
#   $crawledURLs     - hashref to the shared "seen" set (URL => 1)
#   $urlsToBeCrawled - arrayref to the shared work queue (also the condvar)
# The returned closure records each new absolute http:// anchor link in the
# seen set, pushes it onto the queue and wakes one waiting worker.
# NOTE(review): the m{^http://} filter deliberately excludes https:// links,
# matching the original behavior — confirm whether that is still intended.
sub linkHandler
{
    my ($crawledURLs, $urlsToBeCrawled) = @_;
    # Explicit `return` — the original relied on the anonymous sub being the
    # last evaluated expression, which breaks silently if code is added below.
    return sub
    {
        my ($tag, %attrs) = @_;
        # Only anchor tags that actually carry an href are interesting.
        return unless $tag eq 'a' && exists $attrs{href};
        my $link = $attrs{href};
        # lock() follows references, so this locks the shared hash itself.
        lock($crawledURLs);
        if ($link =~ m{^http://} && !exists $crawledURLs->{$link})
        {
            $crawledURLs->{$link} = 1;
            lock($urlsToBeCrawled);
            push @{$urlsToBeCrawled}, $link;
            # Wake one worker blocked in cond_wait on the queue.
            cond_signal($urlsToBeCrawled);
        }
    };
}
# Worker loop: repeatedly pull a URL off the shared work queue, fetch it,
# save the page to disk and feed discovered links back into the queue.
#   $crawledURLs     - hashref to the shared "seen" set
#   $urlsToBeCrawled - arrayref to the shared work queue (also the condvar)
#   $pageCnt         - starting page count. NOTE(review): passed by VALUE, so
#                      each thread counts privately — the 10000 cap is per
#                      thread, not global. Confirm whether a shared cap was
#                      intended; fixing it needs \$pageCnt in the caller too.
sub doCrawlWebs
{
    my ($crawledURLs, $urlsToBeCrawled, $pageCnt) = @_;
    my $crawler = LWP::UserAgent->new;
    $crawler->agent("Mozilla/5.0");    # was "Mazilla" — typo in the UA string
    CRAWL: while (1)
    {
        my $url;
        {
            # cond_wait atomically releases and re-acquires the queue lock,
            # so blocking here does not starve the producers.
            lock($urlsToBeCrawled);
            cond_wait($urlsToBeCrawled) while @{$urlsToBeCrawled} == 0;
            $url = pop @{$urlsToBeCrawled};
            # Stop once the page budget is spent.  The original used a bare
            # `continue;`, which dies at runtime outside a when-block.
            last CRAWL if ++$pageCnt >= 10000;
        }
        print "Thr ", threads->tid, " is crawling $url\n";
        # Fetch before touching the disk so failed requests leave no file.
        my $response = $crawler->get($url);
        if (!$response->is_success)
        {
            print "failed to fetch $url: ", $response->status_line, "\n";
            next CRAWL;
        }
        # Prefix the file with the thread id: each worker keeps its own
        # $pageCnt, so a bare "$pageCnt.html" would collide across threads.
        my $fileName = threads->tid . "-$pageCnt.html";
        open my $fileHandle, '>', $fileName or do
        {
            print "failed to open $fileName: $!\n";
            next CRAWL;
        };
        print {$fileHandle} $response->content;
        # Buffered write errors only surface at close — check it.
        close $fileHandle or print "failed to close $fileName: $!\n";
        # Parse the page and push any new http:// links onto the queue.
        my $linkExtractor = HTML::LinkExtor->new(linkHandler($crawledURLs, $urlsToBeCrawled));
        $linkExtractor->parse($response->content);
    }
    return;
}
# Entry point: seed the shared work queue with the URLs given as arguments,
# start 30 worker threads running doCrawlWebs, and wait for them to finish.
sub crawlWebs
{
    my @urlsToBeCrawled : shared = @_;    # shared work queue / condvar
    my %crawledURLs     : shared;          # set of URLs ever queued (dedup)
    my $pageCnt         : shared = 0;
    # NOTE(review): $pageCnt enters @args by VALUE, so every worker receives
    # an independent copy and sharing it here has no effect.  Passing
    # \$pageCnt instead would need a matching change in doCrawlWebs.
    my @args = (\%crawledURLs, \@urlsToBeCrawled, $pageCnt);
    # threads->create replaces the deprecated Thread->new interface.
    my @crawlerThrs = map { threads->create(\&doCrawlWebs, @args) } 1 .. 30;
    $_->join foreach @crawlerThrs;
    return;
}
# Kick off the crawl from the single seed URL; blocks until all workers exit.
crawlWebs($feedLink);