新浪博客、百度博客搬家(PHP代码)
/**
* $Id blogmove.php
*
* 博客搬家
*
* @author skillCoding $date 2011-09-14
*/
include 'Snoopy.class.php';
include 'function_crawling.php';
/**
 * Crawls a Sina or Baidu blog and collects its posts so they can be imported elsewhere.
 *
 * Relies on the Snoopy HTTP client and on the helper functions html_format(),
 * crawl_match() and links_filter(), which are defined outside this class
 * (presumably in the included function_crawling.php — verify).
 *
 * Each move_*() call returns either a list of posts, each shaped as
 * array('title' => ..., 'dateline' => ..., 'link' => ..., 'content' => ...),
 * or a plain error string ('Invalid blog address' / 'No Blog').
 */
class BlogMove{
    /** HTTP client used for every page and link fetch. */
    public $snoopy;
    /** Not used by any method of this class; kept so existing readers of the property keep working. */
    public $links = array();
    /** URLs of the individual posts discovered by get_blog_list(). */
    public $blog_links = array();
    /** Posts extracted by the most recent move_*() call. */
    public $blogs = array();
    /** Link-matching rules: 'page' selects list pages, 'blog' selects post pages. */
    public $needle_rule = array();

    /**
     * Creates the HTTP client.
     *
     * A PHP 5+ style constructor is required: PHP 8 no longer treats a method
     * named after the class as a constructor, which would leave $snoopy unset.
     */
    function __construct(){
        $this->snoopy = new Snoopy();
    }

    /**
     * Legacy PHP 4-style entry point, kept only so explicit calls to it
     * (e.g. parent::BlogMove() in a subclass) keep working.
     */
    function BlogMove(){
        $this->__construct();
    }

    /**
     * Collects all posts of a Sina blog.
     *
     * @param String $url Blog home page, must start with http://blog.sina.com.cn/<name>
     * @return array|string List of posts, or 'Invalid blog address' / 'No Blog'
     */
    function move_sina($url){
        if(!preg_match('/^http:\/\/blog\.sina\.com\.cn\/\w+/', $url)){
            return 'Invalid blog address';
        }
        $str = $this->fetch_html($url);
        // The home page must link to the first page of the article directory.
        if(!preg_match('/http:\/\/blog\.sina\.com\.cn\/s\/articlelist_\d+_0_1\.html/', $str, $m)){
            return 'Invalid blog address';
        }
        // Start from a clean slate so an earlier move_*() call cannot leak into this result.
        $this->reset_results();
        $this->needle_rule = array(
            'page'=>array('s\/articlelist_\d+_0_\d+','\.html$'),
            'blog'=>array('\/s\/blog_','\.html$')
        );
        $this->get_blog_list($m[0]);
        if(!$this->blog_links){
            return 'No Blog';
        }
        foreach($this->blog_links as $link){
            $str = $this->fetch_html($link);
            // NOTE(review): the three patterns below look truncated — the HTML tags they
            // matched appear to have been stripped when this code was published. They are
            // kept at their current value; restore the real selectors before relying on them.
            // Title
            $titlearr = crawl_match($str, "/\n/");
            $title = strip_tags($titlearr[0][0]);
            // Date (shown in parentheses on the page, which are removed here)
            $datearr = crawl_match($str, "/\n(.*?)/");
            $date = strip_tags($datearr[0][0]);
            $date = str_replace(array('(',')'), '', $date);
            // Body
            $needle = 'id="sina_keyword_ad_area2"';
            $contentarr = crawl_match($str, "/\n/", $needle);
            $content = $contentarr[0];
            $content = trim(str_replace(array($needle,'class="articalcontent ">'), '', $content));
            $this->blogs[] = array(
                'title'=>$title,
                'dateline'=>$date,
                'link'=>$link,
                'content'=>$content,
            );
        }
        return $this->blogs;
    }

    /**
     * Collects all posts of a Baidu blog.
     *
     * @param String $url Blog address of the form http://hi.baidu.com/<name>/blog
     *                    (a single trailing slash is tolerated)
     * @return array|string List of posts, or 'Invalid blog address' / 'No Blog'
     */
    function move_baidu($url){
        if(!preg_match('/^http:\/\/hi\.baidu\.com\/(.*?)\/blog\/?$/', $url)){
            return 'Invalid blog address';
        }
        // Drop an optional trailing slash so the list URL does not end up with "//".
        $url = rtrim($url, '/') . '/index/0';
        // Start from a clean slate so an earlier move_*() call cannot leak into this result.
        $this->reset_results();
        $this->needle_rule = array(
            'page'=>array('\/blog\/index\/','\d+$'),
            'blog'=>array('\/blog\/item\/','\.html$')
        );
        $this->get_blog_list($url);
        if(!$this->blog_links){
            return 'No Blog';
        }
        foreach($this->blog_links as $link){
            $str = $this->fetch_html($link);
            // NOTE(review): the three patterns below look truncated — the HTML tags they
            // matched appear to have been stripped when this code was published. They are
            // kept at their current value; restore the real selectors before relying on them.
            // Title
            $titlearr = crawl_match($str, "/\n(.*?)/");
            $title = $titlearr[1][1];
            // Date
            $datearr = crawl_match($str, "/\n(.*?)/");
            $date = $datearr[1][0];
            // Body
            $needle = array('id="blog_text"','class="cnt"');
            $contentarr = crawl_match($str, "/\n/", $needle);
            $content = $contentarr[0];
            $content = trim(str_replace($needle, '', $content));
            // Remove the '>' left over from the opening tag once its attributes are gone.
            $content = ltrim($content, '>');
            $this->blogs[] = array(
                'title'=>$title,
                'dateline'=>$date,
                'link'=>$link,
                'content'=>$content,
            );
        }
        return $this->blogs;
    }

    /**
     * Discovers post URLs starting from a list page and appends them to $blog_links.
     *
     * Expects $needle_rule['page'] and $needle_rule['blog'] to be set by the caller.
     * Existing entries in $blog_links are kept; this method only appends.
     *
     * @param String $url First list page of the blog
     */
    function get_blog_list($url){
        // Pagination links found on the first list page.
        $page_links = links_filter($this->fetch_links($url), $this->needle_rule['page']);
        if(!$page_links){
            // No pagination found: the blog has a single list page.
            $page_links = array($url);
        }
        foreach($page_links as $page){
            // Post links on the current list page.
            $found = links_filter($this->fetch_links($page), $this->needle_rule['blog']);
            // Skip empty or non-array results so array_merge() never receives a non-array.
            if(is_array($found) && $found){
                $this->blog_links = array_merge($this->blog_links, $found);
            }
        }
    }

    /** Clears the per-run collections filled by get_blog_list() and the move_*() methods. */
    private function reset_results(){
        $this->blog_links = array();
        $this->blogs = array();
    }

    /** Fetches a page, clears the client's result buffer and returns the page passed through html_format(). */
    private function fetch_html($url){
        $this->snoopy->fetch($url);
        $html = $this->snoopy->results;
        $this->snoopy->results = null;
        return html_format($html);
    }

    /** Fetches the links of a page via Snoopy::fetchlinks() and clears the client's result buffer. */
    private function fetch_links($url){
        $this->snoopy->fetchlinks($url);
        $links = $this->snoopy->results;
        $this->snoopy->results = null;
        return $links;
    }
}
新浪博客、百度博客搬家(PHP代码)
// Usage example: each call returns a list of posts or an error string.
// The Baidu address must end in "/blog" with no trailing slash, otherwise
// move_baidu() reports 'Invalid blog address'.
$blog = new BlogMove();
$return = $blog->move_baidu("http://hi.baidu.com/blog/blog");
// Use a fresh instance per blog so links collected for one site are not reprocessed for the other.
$blog = new BlogMove();
$return = $blog->move_sina("http://blog.sina.com.cn/sunsan");
新浪博客、百度博客搬家(PHP代码)