新浪博客、百度博客搬家(PHP代码)
/**
* $Id blogmove.php
*
* 博客搬家
*
* @author skillCoding $date 2011-09-14
*/
include 'Snoopy.class.php';
include 'function_crawling.php';
/**
 * Crawls a Sina (blog.sina.com.cn) or Baidu (hi.baidu.com) blog and collects
 * its posts as plain arrays.
 *
 * Relies on the Snoopy HTTP client and on the helper functions html_format(),
 * crawl_match() and links_filter(), none of which are defined in this class.
 *
 * NOTE(review): several crawl_match() patterns below look truncated (the HTML
 * tag text that should sit between the leading "/" and the trailing "<\>"
 * appears to be missing). They are kept verbatim; confirm them against the
 * real page markup before relying on the extracted fields.
 */
class BlogMove{
    /** Snoopy client used for every page fetch and link fetch. */
    public $snoopy;
    /** Not read or written by any method of this class. */
    public $links = array();
    /** Post URLs collected by get_blog_list(). */
    public $blog_links = array();
    /** Posts collected by the last move_*() call. */
    public $blogs = array();
    /** Link-matching rules: 'page' => list pages, 'blog' => post pages. */
    public $needle_rule = array();

    /**
     * Creates the Snoopy client.
     *
     * A real __construct() is required: PHP 8 no longer treats a method named
     * after the class as a constructor, which would leave $snoopy unset.
     */
    public function __construct(){
        $this->snoopy = new Snoopy();
    }

    /**
     * Legacy PHP 4-style constructor name, kept so explicit calls keep working.
     */
    public function BlogMove(){
        $this->__construct();
    }

    /**
     * Migrates a Sina blog.
     *
     * @param String $url Blog home page, must start with http://blog.sina.com.cn/<name>
     * @return array|string List of posts (keys: title, dateline, link, content),
     *                      or 'Invalid blog address' / 'No Blog' on failure.
     */
    public function move_sina($url){
        if(!preg_match('/^http:\/\/blog\.sina\.com\.cn\/\w+/', $url)){
            return 'Invalid blog address';
        }
        // Start from a clean slate so an earlier move_*() call on this
        // instance cannot leak its links or posts into this result.
        $this->blog_links = array();
        $this->blogs = array();

        $str = $this->fetch_html($url);
        // Locate the first page of the article list on the home page.
        if(!preg_match_all('/http:\/\/blog\.sina\.com\.cn\/s\/articlelist_\d+_0_1\.html/', $str, $m)){
            return 'Invalid blog address';
        }
        $url = $m[0][0];
        $this->needle_rule = array(
            'page'=>array('s\/articlelist_\d+_0_\d+','\.html$'),
            'blog'=>array('\/s\/blog_','\.html$')
        );
        $this->get_blog_list($url);
        if(!$this->blog_links){
            return 'No Blog';
        }
        foreach ($this->blog_links as $link){
            $str = $this->fetch_html($link);
            // Title. NOTE(review): pattern looks truncated, kept verbatim.
            $titlearr = crawl_match($str, '/
<\>/');
            $title = strip_tags($titlearr[0][0]);
            // Date. NOTE(review): pattern looks truncated, kept verbatim.
            $datearr = crawl_match($str, '/
(.*?)<\>/');
            $date = strip_tags($datearr[0][0]);
            // The date is stripped of its surrounding parentheses.
            $date = str_replace(array('(',')'), '', $date);
            // Body. NOTE(review): pattern looks truncated, kept verbatim.
            $needle = 'id="sina_keyword_ad_area2"';
            $contentarr = crawl_match($str, '/
<\>/',$needle);
            $content = $contentarr[0];
            // Remove the leftover attribute text of the container tag.
            $content = trim(str_replace(array($needle,'class="articalcontent ">'), '', $content));
            $blog = array(
                'title'=>$title,
                'dateline'=>$date,
                'link'=>$link,
                'content'=>$content,
            );
            array_push($this->blogs, $blog);
        }
        return $this->blogs;
    }

    /**
     * Migrates a Baidu blog.
     *
     * @param String $url Blog address, must look like http://hi.baidu.com/<name>/blog
     *                    (no trailing slash).
     * @return array|string List of posts (keys: title, dateline, link, content),
     *                      or 'Invalid blog address' / 'No Blog' on failure.
     */
    public function move_baidu($url){
        if(!preg_match('/^http:\/\/hi\.baidu\.com\/(.*?)\/blog$/', $url)){
            return 'Invalid blog address';
        }
        // Start from a clean slate so an earlier move_*() call on this
        // instance cannot leak its links or posts into this result.
        $this->blog_links = array();
        $this->blogs = array();

        // First page of the post index.
        $url .= '/index/0';
        $this->needle_rule = array(
            'page'=>array('\/blog\/index\/','\d+$'),
            'blog'=>array('\/blog\/item\/','\.html$')
        );
        $this->get_blog_list($url);
        if(!$this->blog_links){
            return 'No Blog';
        }
        foreach ($this->blog_links as $link){
            $str = $this->fetch_html($link);
            // Title. NOTE(review): pattern looks truncated, kept verbatim.
            $titlearr = crawl_match($str, '/
(.*?)<\>/');
            $title = $titlearr[1][1];
            // Date. NOTE(review): pattern looks truncated, kept verbatim.
            $datearr = crawl_match($str, '/
(.*?)<\>/');
            $date = $datearr[1][0];
            // Body. NOTE(review): pattern looks truncated, kept verbatim.
            $needle = array('id="blog_text"','class="cnt"');
            $contentarr = crawl_match($str, '/
<\>/',$needle);
            $content = $contentarr[0];
            // Remove the leftover attribute text and the closing ">" of the
            // container tag.
            $content = trim(str_replace($needle, '', $content));
            $content = ltrim($content,'>');
            $blog = array(
                'title'=>$title,
                'dateline'=>$date,
                'link'=>$link,
                'content'=>$content,
            );
            array_push($this->blogs, $blog);
        }
        return $this->blogs;
    }

    /**
     * Collects post links into $this->blog_links, using $this->needle_rule.
     *
     * Links found are appended to whatever $this->blog_links already holds.
     *
     * @param String $url First list page of the blog
     */
    public function get_blog_list($url){
        // Pagination links found on the first list page.
        $page_links = links_filter($this->fetch_links($url), $this->needle_rule['page']);
        if(!$page_links){
            // No pagination found: treat the given URL as the only list page.
            // Assigning (rather than array_push) also covers a non-array
            // falsy result from links_filter().
            $page_links = array($url);
        }
        foreach ($page_links as $page){
            // Post links on the current list page.
            $blog_linkarr = links_filter($this->fetch_links($page), $this->needle_rule['blog']);
            if(is_array($blog_linkarr)){
                $this->blog_links = array_merge($this->blog_links, $blog_linkarr);
            }
        }
    }

    /**
     * Fetches a page, clears the Snoopy result buffer and returns the page
     * passed through html_format().
     */
    private function fetch_html($url){
        $this->snoopy->fetch($url);
        $html = $this->snoopy->results;
        $this->snoopy->results = null;
        return html_format($html);
    }

    /**
     * Fetches the links of a page via Snoopy::fetchlinks(), clears the Snoopy
     * result buffer and returns the raw result.
     */
    private function fetch_links($url){
        $this->snoopy->fetchlinks($url);
        $links = $this->snoopy->results;
        $this->snoopy->results = null;
        return $links;
    }
}
<\>
<\>
<\>
新浪博客、百度博客搬家(PHP代码)
// Example usage. Each call returns an array of posts or an error string.
$blog = new BlogMove();
// No trailing slash: move_baidu() only accepts addresses ending in "/blog".
$return = $blog->move_baidu("http://hi.baidu.com/blog/blog");
// Use a fresh instance per blog so links and posts collected for the Baidu
// blog are not carried over into the Sina result.
$blog = new BlogMove();
$return = $blog->move_sina("http://blog.sina.com.cn/sunsan");
新浪博客、百度博客搬家(PHP代码)