# 自己写的第一个玩具Perl爬虫程序 — my first toy Perl crawler, written mainly to get a feel for Perl's convenience (and power?).
#!/usr/bin/perl
use warnings;
use strict;
use 5.010;
use LWP::UserAgent;
use HTML::LinkExtor;
use Thread;
use threads;
use threads::shared;
#my $feedLink = "http://www.google.com.hk/webhp?rls=ig";
# Seed URL: the crawl starts from this single page.
my $feedLink = "http://www.163.com";
# Build and return the per-page callback handed to HTML::LinkExtor.
#   $crawledURLs     - hashref to the shared "seen" set (URL => 1)
#   $urlsToBeCrawled - arrayref to the shared work queue (also the condvar)
# The returned closure records each new absolute http:// anchor link in the
# seen set, pushes it onto the queue and wakes one waiting worker.
# NOTE(review): the m{^http://} filter deliberately excludes https:// links,
# matching the original behavior — confirm whether that is still intended.
sub linkHandler
{
    my ($crawledURLs, $urlsToBeCrawled) = @_;
    # Explicit `return` — the original relied on the anonymous sub being the
    # last evaluated expression, which breaks silently if code is added below.
    return sub
    {
        my ($tag, %attrs) = @_;
        # Only anchor tags that actually carry an href are interesting.
        return unless $tag eq 'a' && exists $attrs{href};
        my $link = $attrs{href};
        # lock() follows references, so this locks the shared hash itself.
        lock($crawledURLs);
        if ($link =~ m{^http://} && !exists $crawledURLs->{$link})
        {
            $crawledURLs->{$link} = 1;
            lock($urlsToBeCrawled);
            push @{$urlsToBeCrawled}, $link;
            # Wake one worker blocked in cond_wait on the queue.
            cond_signal($urlsToBeCrawled);
        }
    };
}
# Worker loop: repeatedly pull a URL off the shared work queue, fetch it,
# save the page to disk and feed discovered links back into the queue.
#   $crawledURLs     - hashref to the shared "seen" set
#   $urlsToBeCrawled - arrayref to the shared work queue (also the condvar)
#   $pageCnt         - starting page count. NOTE(review): passed by VALUE, so
#                      each thread counts privately — the 10000 cap is per
#                      thread, not global. Confirm whether a shared cap was
#                      intended; fixing it needs \$pageCnt in the caller too.
sub doCrawlWebs
{
    my ($crawledURLs, $urlsToBeCrawled, $pageCnt) = @_;
    my $crawler = LWP::UserAgent->new;
    $crawler->agent("Mozilla/5.0");    # was "Mazilla" — typo in the UA string
    CRAWL: while (1)
    {
        my $url;
        {
            # cond_wait atomically releases and re-acquires the queue lock,
            # so blocking here does not starve the producers.
            lock($urlsToBeCrawled);
            cond_wait($urlsToBeCrawled) while @{$urlsToBeCrawled} == 0;
            $url = pop @{$urlsToBeCrawled};
            # Stop once the page budget is spent.  The original used a bare
            # `continue;`, which dies at runtime outside a when-block.
            last CRAWL if ++$pageCnt >= 10000;
        }
        print "Thr ", threads->tid, " is crawling $url\n";
        # Fetch before touching the disk so failed requests leave no file.
        my $response = $crawler->get($url);
        if (!$response->is_success)
        {
            print "failed to fetch $url: ", $response->status_line, "\n";
            next CRAWL;
        }
        # Prefix the file with the thread id: each worker keeps its own
        # $pageCnt, so a bare "$pageCnt.html" would collide across threads.
        my $fileName = threads->tid . "-$pageCnt.html";
        open my $fileHandle, '>', $fileName or do
        {
            print "failed to open $fileName: $!\n";
            next CRAWL;
        };
        print {$fileHandle} $response->content;
        # Buffered write errors only surface at close — check it.
        close $fileHandle or print "failed to close $fileName: $!\n";
        # Parse the page and push any new http:// links onto the queue.
        my $linkExtractor = HTML::LinkExtor->new(linkHandler($crawledURLs, $urlsToBeCrawled));
        $linkExtractor->parse($response->content);
    }
    return;
}
# Entry point: seed the shared work queue with the URLs given as arguments,
# start 30 worker threads running doCrawlWebs, and wait for them to finish.
sub crawlWebs
{
    my @urlsToBeCrawled : shared = @_;    # shared work queue / condvar
    my %crawledURLs     : shared;          # set of URLs ever queued (dedup)
    my $pageCnt         : shared = 0;
    # NOTE(review): $pageCnt enters @args by VALUE, so every worker receives
    # an independent copy and sharing it here has no effect.  Passing
    # \$pageCnt instead would need a matching change in doCrawlWebs.
    my @args = (\%crawledURLs, \@urlsToBeCrawled, $pageCnt);
    # threads->create replaces the deprecated Thread->new interface.
    my @crawlerThrs = map { threads->create(\&doCrawlWebs, @args) } 1 .. 30;
    $_->join foreach @crawlerThrs;
    return;
}
# Kick off the crawl from the single seed URL; blocks until all workers exit.
crawlWebs($feedLink);