用Perl脚本来抓去 天涯论坛 的帖子,呵呵 看帖子方便多了[网络爬虫]

功能分析:

  给定一个 天涯论坛 的帖子URL链接地址,获取从当前地址开始的所有整个 主题帖子的内容。

  可以选定包含 非楼主回复内容

1, 先看看天涯论坛帖子中 我们所关心的文档结构:

<body>
 <div class="pagewrap">
   <h1><span>xxx</span>
       <span>标题</span></h1>
   <div class="function">
      <div class="info">点击数 回复数</div>
	  <!-- 或者没有下面的tag-->
	  <div class="pages" id="pageDivTop">
	   <form>
	         <input type="hidden" name="idArticleslist" value="所有的帖子地址ID">
	   </form>
	  </div>
   <!--楼主信息-->
   <table cellspacing="0" cellpadding="0" id="firstAuthor"><td><a>作者</a>发表日期</td></table>
   <div class="allpost" id="pContentDiv">
       <!--楼主贴-->
      <div class="post" _app="eleshare">
	  <!--回复人信息-->
	  <table bgcolor="#f5f9fa" width="100%"><font size="-1" color="green"><a>作者</a>发表日期</font></table>
	  <!--回复人帖子-->
	  <div class="post" _app="eleshare"></div>
 </div>
</body>

2,用HTML::TreeBuilder,LWP::Simple,HTML::Element,Encode库 来实现Reference :CPAN

3,一个文件crawl.pl,使用实例:

 Perl  crawl.pl http://www.tianya.cn/publicforum/content/funinfo/1/2854941.shtml  >ylbg.txt

 Perl  crawl.pl http://www.tianya.cn/publicforum/content/no05/1/204253.shtml > yslt.txt

4, crawl.pl文件源代码,呵呵写的比较挫 还挺见谅啊 下载

【注意 引用处的 $$ 被CSDN替换为$了,CSDN编辑器太烂,都不支持Perl】

use strict;
use warnings;
use HTML::TreeBuilder;
use LWP::Simple;
use Encode;
no utf8; #pragma tells Perl to switch back to treating the source text as literal bytes in the current lexical scope.

#check to print the content of the others
# if 1, print the others, else 0, not to print
my $other=1;
$other=0 if (@ARGV > 1);

my ($url)=@ARGV; 
if (@ARGV <1)
{  
  print "Input an URL\n";  
  exit;
} 
  # http://www.baidu.com
if($url !~/^http:/)
{
  $url="http://$url";
}
  
my ($pagewrap,
    $title,
    $auther,#auther
	$clicks,#click number
	$totalPage,
	@idArticleslist,#all Articile ID list 
	$root  #HTML document root element
    );
&PrintPage($url);

sub InitRootfromURl()
{
  my ($url)=@_;
  my $content=get($url);
  #print "Get HTML document\n";
  die "Couldn't get the URL $url!" unless defined $content;
  $root = HTML::TreeBuilder->new_from_content($content);
  $root->eof( );  # done parsing for this tree
}

sub PrintPage()
{
#http://www.tianya.cn/publicforum/content/feeling/1/1323274.shtml
#urlPattern is :
#http://www.tianya.cn/publicforum/content/feeling/1
#posfitx is:
#shtml
  
    my ($link)=@_;
	my ( $urlPattern,#link pattern
	     $posfitx,#the link postfix
	     $CurrID #Input URL'ID
	    );
    my @tmp=split(/\//,$link);
    $urlPattern=join('/',@tmp[0..@tmp-2]);
    ($CurrID,$posfitx)=split(/\./,$tmp[@tmp-1]);
    @tmp=();
     
    &InitRootfromURl($link);
    &InitFromfirstLink(\$root);
	return unless ($idArticleslist[0]);
	if ($CurrID ne $idArticleslist[0])	
	{
	  $link="$urlPattern/$idArticleslist[0].$posfitx";
	  &InitRootfromURl($link);
      &InitFromfirstLink(\$root);
	  $root->delete(); #the first page will be deleted.
	}
    
	{#This infor below got from the first page.
	  print "title:";&toGBK(\$title) if ($title);
      print "\nauther:";&toGBK(\$auther) if ($auther);
      print "\nclicks:";&toGBK(\$clicks) if ($clicks);
      if ($totalPage){ print "\n";&toGBK(\$totalPage); print "\n";}
	}
	
	#Print the content infor of the all posts
	print "正文:\n";
	
	my ($i,$start)=(0,0);
    foreach my $id (@idArticleslist)
    { 
      $i++;
	  $start=1 if ( !$start && ($id eq $CurrID));
	  
	  if($start)
	  { 
	    print "\nPage $i\n";
	    $link="$urlPattern/$id.$posfitx";
	    &InitRootfromURl($link);
        &ParseAllPost(\$root);
		$root->delete();;# destroy the document
	  }
	}
}

sub toGBK()
{ 
  my ($content)=@_;
  $$content=~ s/牋牋/  /g;
  #print $$content;return 0; #If your OS is English ,please comment it
  my @chars=split(//,$$content);
  foreach my $char (@chars) 
  {
     print encode("gbk",$char);
  } 
}
sub printContent()
{
  my ($content)=@_;
  $$content =~ s/\s+$//g;
  $$content =~ s/TML::Element=.+$//;
  &toGBK($content);
}
sub printReplier()
{
  my ($writer)=@_;
  return if ($$writer =~ /^\s*$/);
  $$writer =~ s/\s+//;
  my $dil="-" x length(Encode::encode_utf8($$writer));
  print  "\n$dil\n|";
  &toGBK($writer);
  print  "|\n$dil\n";
}

#=====
#print content info
sub ParsePost()
{
  my ($node)=@_;

  foreach my $n ($$node->content_list())
  {
     if ( $n =~ /^HTML::Element=/)
	 {
	   next if (! (my $tag=$n->tag()));
	   #print "$tag\n";
	   if ( $tag eq 'br'){print "\n"; next;}
	   next if ( ($tag eq 'div') && ($n->attr('class')) && ($n->attr('class') eq 'post-jb'));
	   next if ( $tag eq 'img');
	   next if ( $tag eq 'span');
	   next if ( $tag eq 'u');
	   &ParsePost(\$n);
	 }

	 {
	   &printContent(\$n);
	 }
  }
}
#Parse allpost
#=====
#print content info
sub ParseAllPost()
{
  my ($node)=@_; #allpost,HTML::Element
  my ($tag,$attr,@poster,$writer,$isShow);
  
  return unless ($$node);
  
  my @a=$$node->look_down(_tag=>'div',class=>'allpost',id=>'pContentDiv');
  return unless ($a[0]);
  $isShow=1;
  foreach my $n ($a[0]->content_list())
  {
    next if ( $n !~ /^HTML::Element=/);
	next if (! ($tag=$n->tag()));
	#parse the content of writer
	if ( $isShow && ($tag eq 'div'))
	{
	  next if (!($attr=$n->attr('class')));
	  next if ($attr ne 'post');
      &ParsePost(\$n);
	}
	#parse the writer
	if ( $tag eq 'table')
	{
	   @poster = $n->look_down(_tag=>'a',target=>'_blank');
	   $writer=$poster[0]->as_text();
	     
	   if (!$other)
	   {
	     if ($writer eq $auther){$isShow =1;}
         else {$isShow = 0;}	 
	   }
	   if ($isShow)
	   {
	      &printReplier(\$writer);
	   }
	}
  }
  #$$node->delete();#Delete all elements
}

#Initialize the auther info and link info
#Parse pagewrap and, find title, auther ,and all links
sub InitFromfirstLink()
{
  my ($node)=@_;
  my ($tag,$attr,@tmp,@a,$b);

  #find the 'pagewrap' tag
  return unless ($$node);
  @tmp = $$node->look_down(_tag=>'div',class=>'pagewrap');
  
  return unless ($tmp[0]);
  foreach my $n ($tmp[0]->content_list())
  {
     next if ( $n !~ /^HTML::Element=/);
	 next if (! ($tag=$n->tag()));
	 if ( $tag eq 'h1')
	 {
	   @tmp = $n->look_down(_tag=>'span');
	   next unless ($tmp[1]);
	   $title=$tmp[1]->as_text();
	   @tmp=();
	   next;
	 }
	 if ( $tag eq 'table')
	 {
	    @tmp = $n->look_down(_tag=>'a');
		next unless ($tmp[0]);
	    $auther=$tmp[0]->as_text();
		@tmp=();
	    next;
	 }
	 next if (!($attr=$n->attr('class')));
	 if ( ($tag eq 'div') && ( $attr eq 'function'))
	 {
	    @tmp = $n->content_list();
		
		{
		  @a=();
		  @a=$n->look_down(_tag=>'span');
		  if ( @a > 0)
		  {$totalPage=$a[0]->as_text();}
		  @a=();
		}
		
		if ($tmp[0]->attr('class') eq 'info')
		{
		   @a=split(/ /,$tmp[0]->as_text());
		   $clicks=$a[1]; 
		   @a=();
		}else{ 
		 #print "error:",$tmp[0]->dump(); next;
		}
		
		{#get  idArticleslist
		 @a=();
		 @a=$n->look_down(_tag=>'input',name=>'idArticleslist');
		 if ( @a > 0){
		   $b=$a[0]->attr('value');
		   @idArticleslist=split(/,/,$b);
		}
		 @a=();
		}
    	 #print $tmp[1]->dump();
	    @tmp=();
		next;
	 }
	 if ( ($tag eq 'div') && ( $attr eq 'allpost'))
	 {
	 	 #$$node->detach;
		 last;
	 }
  }
}


=cut
天涯帖子结构
<body>
 <div class="pagewrap">
   <h1><span>xxx</span>
       <span>标题</span></h1>
   <div class="function">
      <div class="info">点击数 回复数</div>
	  <!-- 或者没有下面的tag-->
	  <div class="pages" id="pageDivTop">
	   <form>
	         <input type="hidden" name="idArticleslist" value="所有的帖子地址ID">
	   </form>
	  </div>
   <!--楼主信息-->
   <table cellspacing="0" cellpadding="0" id="firstAuthor"><td><a>作者</a>发表日期</td></table>
   <div class="allpost" id="pContentDiv">
       <!--楼主贴-->
      <div class="post" _app="eleshare">
	  <!--回复人信息-->
	  <table bgcolor="#f5f9fa" width="100%"><font size="-1" color="green"><a>作者</a>发表日期</font></table>
	  <!--回复人帖子-->
	  <div class="post" _app="eleshare"></div>
 </div>
</body>
=cut


评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值