使用开源爬虫和wget爬取网页都被robots.txt给block了。
自己模拟浏览器行为写了段脚本,这次你封不了我了吧!哈哈!
#!/usr/bin/perl
# Crawler driver: reads a list of URLs (one per line) from the file named
# in $ARGV[0], fetches each page via getWebPage(), and writes each response
# body to out_<n>.  Sleeps between requests to throttle the crawl.
use strict;
use warnings;
use LWP::UserAgent;
use HTTP::Request::Common;              # NOTE(review): duplicate import kept —
use HTTP::Request::Common qw(POST);     # removing either could narrow exports

# Base name for the per-URL output files (out_1, out_2, ...).
my $outputPageName = "out";
my $cnt = 0;

# Lexical filehandle, 3-arg open, and $! in the message so failures are
# diagnosable (original used a bareword handle and 2-arg open).
open(my $url_list, '<', $ARGV[0])
    or die "open file failure: $ARGV[0]: $!";

while (my $line = <$url_list>) {
    ++$cnt;
    print $line;

    # Normalize the line: the original regexes had lost their backslashes
    # (s//n|/r/ /ig) and did not compile as intended.
    $line =~ s/[\r\n]/ /g;    # CR/LF -> space
    $line =~ s/\s+$//;        # strip trailing whitespace

    if ($line ne '') {        # skip blank lines; 'ne' so a literal "0" still passes
        my $content = getWebPage($line);

        # ${...} delimits the variable name: the original ">$outputPageName_$cnt"
        # interpolated the *undefined* variable $outputPageName_ (underscore is a
        # legal identifier character), so files were misnamed.
        open(my $out, '>', "${outputPageName}_$cnt")
            or die "cannot write ${outputPageName}_$cnt: $!";
        print {$out} $content;
        sleep(10);            # be polite between requests
        close($out) or die "close ${outputPageName}_$cnt: $!";
    }
}

# Original line was garbled ("Total Record:/$"); report the line count.
print "Total Record: $cnt\n";
close($url_list);
# getWebPage($url) -> page body as a string.
#
# Fetches $url with a browser-like header set.  On HTTP failure it warns and
# returns an empty string rather than terminating: the original called exit,
# so a single bad URL aborted the entire remaining crawl.
sub getWebPage{
    my ($url) = @_;

    my $agent = LWP::UserAgent->new;

    my @header = (
        'User-Agent'      => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; iCafeMedia; .NET CLR 2.0.50727; CIBA)',
        'Accept'          => '*/*',
        # Bug fix: the original sent 'gzip, deflate' as Accept-Charset.
        # Compression negotiation belongs in Accept-Encoding.
        'Accept-Encoding' => 'gzip, deflate',
        'Accept-Language' => 'zh-cn',
        'Cookie'          => 'Tango_UserReference=38D8FF1624305B16496E9808; MTCCK=1; _csuid=48feeef505683659; cookmcnt=999; CID=1459382; cookMemberName=YunFan; cookMemberID=61448; savedEmail=liyunfan@genscriptcorp.com; DLDExec=OK; __utma=232384002.1655516880.1231991960.1231994793.1232000250.3; __utmb=232384002; __utmc=232384002; __utmz=232384002.1231991960.1.1.utmccn=(direct)|utmcsr=(direct)|utmcmd=(none)'
    );

    print $url;

    my $res = $agent->get($url, @header);
    if ($res->is_success) {
        # decoded_content un-gzips the body when the server honours
        # Accept-Encoding; ->content would hand back raw compressed bytes.
        my $content = $res->decoded_content;
        print $content;
        return $content;
    }

    # Best-effort: report the failure and let the caller continue with the
    # next URL instead of exiting the whole script.
    warn "fetch failed for $url: " . $res->status_line . "\n";
    return "";
}