sohu社区(http://club.sohu.com/main.php),经常有一些不错的连载的帖子,少则几千个回复,多则可以上万个,可是最有用最想看的楼主的帖子也就几十个,从这些成千上万的帖子中,一个个找搂主的帖子实在困难重重,不光手累眼累,主要是时间浪费不起!
提了好几次意见,要求增加“只看楼主”的功能,但现在还是没有。
没办法只有自己动手了,写了这个Perl程序,自动抓取所有楼主的帖子保存下来,轻轻松松的欣赏。
注:执行Perl程序需要Perl解释器,其下载地址:(http://www.activestate.com/Products/Download/Download.plex?id=ActivePerl)
,在左边找到《Windows》条目,其下面的《AS package》或者《MSI》任意一个下载安装就行了!
下面是抓取帖子的Perl程序,记事本保存为1.pl,双击打开或右键`打开方式`->`Perl解释器`。
#-----------------------------------------------------
#! perl
use warnings;
use strict;
use IO::Socket;
use Tk;
my $url = '';
my $saveFilePath = 'C:/'; #这里文件路径是用来存储抓取的内容
my $saveFileName = '';
my $username = ''; #可以不抓楼主的,在这里给出用户呢称
#------------------------------------------------------
my $nullurl1='1.帖子已经被删除';
my $nullurl2='2.URL或者参数错误';
#-----------------GUI----------------------------------
my $top = new MainWindow(-title=>"Please Copy-Paste:");
$top->Label(-text=>"http://")->pack(-side=>'left',-expand=>0);
my $text = $top->Text(qw(-height 1))->pack(-side=>'left',-expand=>0);
$top->Button(-text=>"Go",-command=>sub{$url=$text->Contents;$top->destroy;})->pack();
MainLoop;
#------------------------------------------------------
sub main
{
#print "Please Copy-Paste:/nhttp://" unless($url);
#$url=<STDIN> unless($url);chomp($url);
$url='http://'.$url if ($url !~ m!^http://!);print "/n"x5,"Prase... $url";
my $tempFileName=join('',localtime);open(F,">$saveFilePath$tempFileName")||die("Can't open file $!");print F "$url/n/n";
my ($urlbeg,$urlend) = $url =~ /(.*-)/d(-.*)/;die("URL error!/t$url/n") unless($urlbeg || $urlend);
for(my $page=0; parsePage($urlbeg.$page.$urlend,/*F); $page+=12){}
$saveFileName.=".htm";close F;rename("$saveFilePath$tempFileName","$saveFilePath$saveFileName");
print '-'x34,"/nParse Success!< $saveFilePath$saveFileName >/n",'-'x34;
# Tk::MessageBox->new(-message=>"Parse Success!< $saveFilePath$saveFileName >");
}main;
sub parsePage{
my ($urlarg,$F) = @_;
my ($host,$file) = $urlarg =~ m!http://([^/]+)(/[^/#]*)!;
die "Host error/n" unless ($host);
die "File error/n" unless ($file);
if ($host)
{
my $socket = IO::Socket::INET->new(PeerAddr => $host,PeerPort => 'HTTP(80)');
return 1 unless( defined($socket) );
print $socket "GET $file HTTP/1.0/n/n";
$_=<$socket>;return 0 if (/404 Not Found/);
my ($finder,$effectiveUrl)=(0,0);
while( my $line=<$socket> )
{
$line =~ s//r//g;
if (!$finder){return 0 if($line =~ /$nullurl1/);}#|| $line =~ /$nullurl2);
$saveFileName=$1 if("" eq $saveFileName && $line=~//<title/>(.*)/<//title/>/);
if ($line =~ /.*/<script src=(http:.*)/>/<//script.*/)
{
($finder,$effectiveUrl) = (1,$1);
my ($host,$file) = $effectiveUrl =~ m!http://([^/]+)(/[^/#]*)!;
if ($host)
{
print "Parse... $file";
my $socket1 = IO::Socket::INET->new(PeerAddr => $host,PeerPort => 'HTTP(80)');
next unless( defined($socket1));
print $socket1 "Get $effectiveUrl HTTP/1.0/n/n";
my ($finded,$content,$inputdate)=(0);
while( <$socket1> )
{
$_ =~ s//r//g;
$content = $1 if (/.*var.* body_.*=/'(.*)/'/);
return (print "Server:Access control configuration prevents your request"),0 if (/403 Forbidden/i);
$username=$1 if (!$username && /var nickname_.*=/'(.*)/'/);
$finded=1 if (/var nickname_.*=/'$username/'/);
$inputdate=$1,print "/t[Post: $1]/n" if (/var inputdate_.*=/'(.*)/'/);
}
print $F "$inputdate/t$effectiveUrl/n/n$content/n/n/n" if ($finded);
}
}
}
}
return 1;
} # parsePage