其中还需要添加一些获取图片的代码;另外,本人机器上 vim 显示乱码的问题尚未解决。
#!/usr/bin/perl
use LWP;
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Headers;
use HTTP::Response;
use utf8;
use Encode;
use URI::Escape;
use URI::URL;
use Data::Dumper;
use Cwd;
use strict;
use warnings;
use HTML::TreeBuilder::XPath;

# Mirror the text of every article of one CSDN blog onto disk.
#
# Layout produced, relative to the directory the script is started in:
#   blog.html                      scratch copy of the blog front page
#   blog/<category>/category.html  scratch copy of the category index
#   blog/<category>/page.html      scratch copy of the last list page fetched
#   blog/<category>/ts.html        scratch copy of the last article fetched
#   blog/<category>/<title>        article title followed by its body text
#
# Every page is parsed from the decoded response held in memory; the scratch
# files are still written, but nothing is read back from them, so parsing no
# longer depends on an unflushed file handle.

my $site_root = "http://blog.csdn.net";
my $blog_url  = "http://blog.csdn.net/lyj1101066558";

my $ua = LWP::UserAgent->new;
$ua->agent("Mozilla/5.0 (Windows NT 6.1; rv:30.0) Gecko/20100101 Firefox/30.0");

my $blog_tree = fetch_tree( $ua, $blog_url, "blog.html", "blog" );
if ($blog_tree) {
    # Every link on the front page whose href mentions "category" is taken as
    # a category index URL.
    my @category = grep { defined $_ && /category/ }
                   map  { scalar $_->attr('href') }
                   $blog_tree->find_by_tag_name('a');

    # Category display names, used as directory names.
    # NOTE(review): names and links are paired purely by position, which
    # assumes the page lists them in the same order - confirm against the
    # live markup.
    my @Types = $blog_tree->findvalues('/html/body//div[@id="panel_Category"]/ul[@class="panel_body"]/li');
    $blog_tree->delete;

    ensure_dir("blog") or die "cannot create blog directory: $!";
    chdir "blog"       or die "cannot enter blog directory: $!";

    # Perl's chdir does not maintain $ENV{OLDPWD}, so remember the absolute
    # path ourselves in order to come back after each category.
    my $blog_dir = getcwd();

    my @type_dirs = map { safe_name( $Types[$_], "category_" . ( $_ + 1 ) ) } 0 .. $#Types;
    for my $dir (@type_dirs) {
        ensure_dir($dir) or warn "cannot create directory $dir: $!\n";
    }

    for my $cg ( 0 .. $#category ) {
        # Fall back to a generated name when there are more links than names.
        my $dir = defined $type_dirs[$cg]
            ? $type_dirs[$cg]
            : safe_name( undef, "category_" . ( $cg + 1 ) );

        unless ( ensure_dir($dir) && chdir($dir) ) {
            warn "skipping category $category[$cg]: cannot enter $dir: $!\n";
            next;
        }

        download_category( $ua, $site_root . $category[$cg] );

        sleep(5);    # be polite to the server between categories
        chdir $blog_dir or die "cannot return to $blog_dir: $!";
    }
}

# Fetch $url, keep a UTF-8 scratch copy of the decoded body in $cache_file and
# return a parsed HTML::TreeBuilder::XPath tree, or nothing if the request
# failed. $label only appears in error messages. Dies if the scratch file
# cannot be written. The caller should call ->delete on the returned tree.
sub fetch_tree {
    my ( $ua, $url, $cache_file, $label ) = @_;

    my $res = $ua->get($url);
    unless ( $res->is_success ) {
        warn "GET $url failed: " . $res->status_line . "\n";
        return;
    }

    my $html = $res->decoded_content;
    unless ( defined $html ) {
        warn "GET $url: response body could not be decoded\n";
        return;
    }

    # "or" (not "||") so that the check applies to open() itself.
    open my $fh, '>:encoding(UTF-8)', $cache_file
        or die "open $label file failed:$!";
    print {$fh} $html;
    close $fh or die "close $label file failed:$!";

    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse_content($html);
    return $tree;
}

# Download every list page of one category into the current directory.
sub download_category {
    my ( $ua, $cate_url ) = @_;

    my $tree = fetch_tree( $ua, $cate_url, "category.html", "category" )
        or return;

    # Pager text. The element id really is spelled "papelist" in the XPath.
    my @page_info = $tree->findvalues('/html/body//div[@id="papelist"]/span');
    $tree->delete;

    # The pager is expected to hold two numbers, the second one being the
    # number of pages. Whole numbers are captured (the previous pattern kept
    # only the final digit, so "12" was read as "2").
    my $page_count = 1;
    if ( @page_info && $page_info[0] =~ /([0-9]+)[^0-9]+([0-9]+)/ ) {
        $page_count = $2;
    }

    for my $page ( 1 .. $page_count ) {
        download_page( $ua, $cate_url . "/" . $page );
        sleep(3);    # pause between list pages
    }
    return;
}

# Download every article listed on one list page into the current directory.
sub download_page {
    my ( $ua, $page_url ) = @_;

    my $tree = fetch_tree( $ua, $page_url, "page.html", "page" )
        or return;

    my @titles = $tree->findvalues('/html/body//span[@class="link_title"]');

    # Links that point at an article detail page of this blog.
    my @detail_links = grep { defined $_ && /lyj1101066558.*details.\d+$/ }
                       map  { scalar $_->attr('href') }
                       $tree->find_by_tag_name('a');
    $tree->delete;

    for my $i ( 0 .. $#titles ) {
        # Only every other matching link is used: the n-th title is paired
        # with link number 2n.
        # NOTE(review): presumably each list entry links to its article
        # twice - confirm against the live markup.
        my $link = $detail_links[ 2 * $i ];
        if ( defined $link ) {
            save_article( $ua, $site_root . $link, $titles[$i], $i + 1 );
        }
        else {
            warn "no article link found for entry " . ( $i + 1 ) . " of $page_url\n";
        }
        sleep(3);    # pause between articles
    }
    return;
}

# Fetch one article and write "<title>\n<body text>" to a file named after
# the title. $index is only used to build a fallback file name.
sub save_article {
    my ( $ua, $url, $title, $index ) = @_;

    my $tree = fetch_tree( $ua, $url, "ts.html", "article" )
        or return;

    my @chunks = $tree->findvalues('/html/body//div[@id="article_content"]/div');

    # Articles whose text is not wrapped in child <div>s would otherwise be
    # saved empty; fall back to the text of the content container itself.
    @chunks = $tree->findvalues('/html/body//div[@id="article_content"]')
        unless @chunks;
    $tree->delete;

    # Three-argument open: the scraped title can never be interpreted as an
    # open mode, and path separators are neutralised by safe_name().
    my $file = safe_name( $title, "article_" . $index );
    my $out;
    unless ( open $out, '>:encoding(UTF-8)', $file ) {
        warn "open article file $file failed: $!\n";
        return;
    }

    print {$out} trim($title), "\n";
    print {$out} $_, "\n" for @chunks;
    close $out or warn "close article file $file failed: $!\n";
    return;
}

# Return $text without leading/trailing whitespace (undef becomes '').
sub trim {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/\A\s+//;
    $text =~ s/\s+\z//;
    return $text;
}

# Turn scraped text into a single path component, returned as UTF-8 bytes:
# surrounding whitespace is trimmed, path separators and control characters
# become "_", and $fallback is used when nothing usable is left.
sub safe_name {
    my ( $text, $fallback ) = @_;
    $text = trim($text);
    $text =~ s{[/\\\x00-\x1f]}{_}g;
    $text = $fallback if $text eq '' || $text =~ /\A\.\.?\z/;
    return encode( 'UTF-8', $text );
}

# True if $dir already is a directory or could be created.
sub ensure_dir {
    my ($dir) = @_;
    return 1 if -d $dir;
    return mkdir $dir;
}