功能分析:
给定一个 天涯论坛 的帖子URL链接地址,获取从当前地址开始的所有整个 主题帖子的内容。
可以选定包含 非楼主回复内容
1, 先看看天涯论坛帖子中 我们所关心的文档结构:
<body>
<div class="pagewrap">
<h1><span>xxx</span>
<span>标题</span></h1>
<div class="function">
<div class="info">点击数 回复数</div>
<!-- 或者没有下面的tag-->
<div class="pages" id="pageDivTop">
<form>
<input type="hidden" name="idArticleslist" value="所有的帖子地址ID">
</form>
</div>
<!--楼主信息-->
<table cellspacing="0" cellpadding="0" id="firstAuthor"><td><a>作者</a>发表日期</td></table>
<div class="allpost" id="pContentDiv">
<!--楼主贴-->
<div class="post" _app="eleshare">
<!--回复人信息-->
<table bgcolor="#f5f9fa" width="100%"><font size="-1" color="green"><a>作者</a>发表日期</font></table>
<!--回复人帖子-->
<div class="post" _app="eleshare"></div>
</div>
</body>
2,用HTML::TreeBuilder,LWP::Simple,HTML::Element,Encode库 来实现Reference :CPAN
3,一个文件crawl.pl,使用实例:
Perl crawl.pl http://www.tianya.cn/publicforum/content/funinfo/1/2854941.shtml >ylbg.txt
Perl crawl.pl http://www.tianya.cn/publicforum/content/no05/1/204253.shtml > yslt.txt
4, crawl.pl文件源代码,呵呵,写的比较挫 还挺见谅啊 下载
【注意 引用处的 $$ 被CSDN替换为$了,CSDN编辑器太烂,都不支持Perl】
use strict;
use warnings;
use HTML::TreeBuilder;
use LWP::Simple;
use Encode;
no utf8; #pragma tells Perl to switch back to treating the source text as literal bytes in the current lexical scope.
#check to print the content of the others
# if 1, print the others, else 0, not to print
my $other=1;
$other=0 if (@ARGV > 1);
my ($url)=@ARGV;
if (@ARGV <1)
{
print "Input an URL\n";
exit;
}
# http://www.baidu.com
if($url !~/^http:/)
{
$url="http://$url";
}
my ($pagewrap,
$title,
$auther,#auther
$clicks,#click number
$totalPage,
@idArticleslist,#all Articile ID list
$root #HTML document root element
);
&PrintPage($url);
sub InitRootfromURl()
{
my ($url)=@_;
my $content=get($url);
#print "Get HTML document\n";
die "Couldn't get the URL $url!" unless defined $content;
$root = HTML::TreeBuilder->new_from_content($content);
$root->eof( ); # done parsing for this tree
}
sub PrintPage()
{
#http://www.tianya.cn/publicforum/content/feeling/1/1323274.shtml
#urlPattern is :
#http://www.tianya.cn/publicforum/content/feeling/1
#posfitx is:
#shtml
my ($link)=@_;
my ( $urlPattern,#link pattern
$posfitx,#the link postfix
$CurrID #Input URL'ID
);
my @tmp=split(/\//,$link);
$urlPattern=join('/',@tmp[0..@tmp-2]);
($CurrID,$posfitx)=split(/\./,$tmp[@tmp-1]);
@tmp=();
&InitRootfromURl($link);
&InitFromfirstLink(\$root);
return unless ($idArticleslist[0]);
if ($CurrID ne $idArticleslist[0])
{
$link="$urlPattern/$idArticleslist[0].$posfitx";
&InitRootfromURl($link);
&InitFromfirstLink(\$root);
$root->delete(); #the first page will be deleted.
}
{#This infor below got from the first page.
print "title:";&toGBK(\$title) if ($title);
print "\nauther:";&toGBK(\$auther) if ($auther);
print "\nclicks:";&toGBK(\$clicks) if ($clicks);
if ($totalPage){ print "\n";&toGBK(\$totalPage); print "\n";}
}
#Print the content infor of the all posts
print "正文:\n";
my ($i,$start)=(0,0);
foreach my $id (@idArticleslist)
{
$i++;
$start=1 if ( !$start && ($id eq $CurrID));
if($start)
{
print "\nPage $i\n";
$link="$urlPattern/$id.$posfitx";
&InitRootfromURl($link);
&ParseAllPost(\$root);
$root->delete();;# destroy the document
}
}
}
sub toGBK()
{
my ($content)=@_;
$$content=~ s/牋牋/ /g;
#print $$content;return 0; #If your OS is English ,please comment it
my @chars=split(//,$$content);
foreach my $char (@chars)
{
print encode("gbk",$char);
}
}
sub printContent()
{
my ($content)=@_;
$$content =~ s/\s+$//g;
$$content =~ s/TML::Element=.+$//;
&toGBK($content);
}
sub printReplier()
{
my ($writer)=@_;
return if ($$writer =~ /^\s*$/);
$$writer =~ s/\s+//;
my $dil="-" x length(Encode::encode_utf8($$writer));
print "\n$dil\n|";
&toGBK($writer);
print "|\n$dil\n";
}
#=====
#print content info
sub ParsePost()
{
my ($node)=@_;
foreach my $n ($$node->content_list())
{
if ( $n =~ /^HTML::Element=/)
{
next if (! (my $tag=$n->tag()));
#print "$tag\n";
if ( $tag eq 'br'){print "\n"; next;}
next if ( ($tag eq 'div') && ($n->attr('class')) && ($n->attr('class') eq 'post-jb'));
next if ( $tag eq 'img');
next if ( $tag eq 'span');
next if ( $tag eq 'u');
&ParsePost(\$n);
}
{
&printContent(\$n);
}
}
}
#Parse allpost
#=====
#print content info
sub ParseAllPost()
{
my ($node)=@_; #allpost,HTML::Element
my ($tag,$attr,@poster,$writer,$isShow);
return unless ($$node);
my @a=$$node->look_down(_tag=>'div',class=>'allpost',id=>'pContentDiv');
return unless ($a[0]);
$isShow=1;
foreach my $n ($a[0]->content_list())
{
next if ( $n !~ /^HTML::Element=/);
next if (! ($tag=$n->tag()));
#parse the content of writer
if ( $isShow && ($tag eq 'div'))
{
next if (!($attr=$n->attr('class')));
next if ($attr ne 'post');
&ParsePost(\$n);
}
#parse the writer
if ( $tag eq 'table')
{
@poster = $n->look_down(_tag=>'a',target=>'_blank');
$writer=$poster[0]->as_text();
if (!$other)
{
if ($writer eq $auther){$isShow =1;}
else {$isShow = 0;}
}
if ($isShow)
{
&printReplier(\$writer);
}
}
}
#$$node->delete();#Delete all elements
}
#Initialize the auther info and link info
#Parse pagewrap and, find title, auther ,and all links
sub InitFromfirstLink()
{
my ($node)=@_;
my ($tag,$attr,@tmp,@a,$b);
#find the 'pagewrap' tag
return unless ($$node);
@tmp = $$node->look_down(_tag=>'div',class=>'pagewrap');
return unless ($tmp[0]);
foreach my $n ($tmp[0]->content_list())
{
next if ( $n !~ /^HTML::Element=/);
next if (! ($tag=$n->tag()));
if ( $tag eq 'h1')
{
@tmp = $n->look_down(_tag=>'span');
next unless ($tmp[1]);
$title=$tmp[1]->as_text();
@tmp=();
next;
}
if ( $tag eq 'table')
{
@tmp = $n->look_down(_tag=>'a');
next unless ($tmp[0]);
$auther=$tmp[0]->as_text();
@tmp=();
next;
}
next if (!($attr=$n->attr('class')));
if ( ($tag eq 'div') && ( $attr eq 'function'))
{
@tmp = $n->content_list();
{
@a=();
@a=$n->look_down(_tag=>'span');
if ( @a > 0)
{$totalPage=$a[0]->as_text();}
@a=();
}
if ($tmp[0]->attr('class') eq 'info')
{
@a=split(/ /,$tmp[0]->as_text());
$clicks=$a[1];
@a=();
}else{
#print "error:",$tmp[0]->dump(); next;
}
{#get idArticleslist
@a=();
@a=$n->look_down(_tag=>'input',name=>'idArticleslist');
if ( @a > 0){
$b=$a[0]->attr('value');
@idArticleslist=split(/,/,$b);
}
@a=();
}
#print $tmp[1]->dump();
@tmp=();
next;
}
if ( ($tag eq 'div') && ( $attr eq 'allpost'))
{
#$$node->detach;
last;
}
}
}
=cut
天涯帖子结构
<body>
<div class="pagewrap">
<h1><span>xxx</span>
<span>标题</span></h1>
<div class="function">
<div class="info">点击数 回复数</div>
<!-- 或者没有下面的tag-->
<div class="pages" id="pageDivTop">
<form>
<input type="hidden" name="idArticleslist" value="所有的帖子地址ID">
</form>
</div>
<!--楼主信息-->
<table cellspacing="0" cellpadding="0" id="firstAuthor"><td><a>作者</a>发表日期</td></table>
<div class="allpost" id="pContentDiv">
<!--楼主贴-->
<div class="post" _app="eleshare">
<!--回复人信息-->
<table bgcolor="#f5f9fa" width="100%"><font size="-1" color="green"><a>作者</a>发表日期</font></table>
<!--回复人帖子-->
<div class="post" _app="eleshare"></div>
</div>
</body>
=cut