基于dede的一个自定义采集器

原创 2015年11月20日 21:28:12

搞不懂dede的采集器,有些东西难以满足需求,于是写了这个

<?php

/**
 * 
 * 采集绝想日记网文章
 * @author 秦仙游 <dab1117@163.com>
 * @version 1.0
 * @package suibiwu.com
 */
/* 处理页面最大请求时间为无限 */
set_time_limit(0);
header('Content-Type:text/html;charset=utf8');
require_once("simple_html_dom.php");
require_once("./framework/tool/MySQLDBTool.class.php");
require_once("./framework/tool/ImageTool.class.php");

/* key为需要采集的id号 */
$arr = array();

$arr['24'] = 10196; 
$arr['21'] = 10197; 
$GLOBALS['prefix'] = 'http://www.juexiang.com';

if (count($arr) > 0) {
    foreach ($arr as $key => $value) {
        echo "<h2 style='color:red;'>{$key}分类</h2>";

        $typeid = $key;
        /* 副栏目号 */
        $typeid2 = '';
        $base_url = 'http://www.juexiang.com/list/' . $value;
        for ($i = 6; $i <= 8; $i++) {
            $list_url = $base_url . '?p=' . $i;
            $items = getArcticleList($list_url);
            echo "<h3>第{$i}页</h3>";
            foreach ($items as $k => $v) {
                $id = $v['id'];
                console("获取到id号:$id");
                myFlush();
                sleep(1);
                $obj = getArticle('detail/' . $id);
                console("获取到文档信息:$id");
                $obj['typeid'] = $typeid;
                $obj['typeid2'] = $typeid2;
                sleep(0);
                $info = htmlToText(postToDede($obj)) . "\r\n";
                console("执行结果:$info");
                echo '<hr/>';
                myFlush();
            }
            /* 防盗链设置,虽然可能没有效果 */
            sleep(3);
        }
    }
}

/**
 * 格式化输出
 * @param type $msg
 */
function console($msg) {

    echo "<p style='line-height:20px; font-size:12px; line-height:20px;'>{$msg}</p>";
}

/* 刷新缓存 */

function myFlush() {
    ob_flush();
    flush();
}

/**
 * 获取文章详情
 * @param type $id
 * @return type
 */
function getArticle($id) {

    $obj = pickOne($id);
    $body_without_html = htmlToText($obj['body']);
    $obj['description'] = mb_substr($body_without_html, '0', 150, 'utf8');


    $obj['source'] = '绝想日记网';
    /* 设置文章属性 */
    if (!empty($obj['litpic'])) {
        $obj['flag'] = ',f,p';
    }
    if (rand(1, 3) == 2)
        $obj['flag'] = ',c';
    if (rand(1, 3) == 2)
        $obj['flag'] = ',h';
    if (rand(1, 3) == 2)
        $obj['flag'] = ',s';

    $obj['flag'] = ltrim($flag['flag'], ',');
    $obj['flag'] = explode(',', $flag);
    $obj['keywords'] = '随笔坞';
    $obj['qianbian'] = rand(0, 20);
    $obj['zhichi'] = rand(0, 300);
    $obj['zhaoma'] = rand(0, 70);
    $obj['gaoxiao'] = rand(0, 80);
    $obj['chedan'] = rand(0, 100);
    $obj['bujie'] = rand(0, 200);
    $obj['chijing'] = rand(0, 50);
    $obj['henbang'] = rand(0, 200);
    return $obj;
}

//var_dump($obj);
//echo saveToDatabase($obj) . "\r\n";

function postToDede($obj) {

    /* 构造表单数据 */
    $data = array(
        'channelid' => '1',
        'dopost' => 'save',
        'title' => $obj['title'],
        'shorttitle' => '',
        'redirecturl' => '',
        'tags' => '',
        'weight' => '1',
        'picname' => '',
        'litpic' => '',
        'source' => $obj['source'],
        'writer' => $obj['writer'],
        'typeid' => $obj['typeid'],
        'typeid2' => $obj['typeid2'],
        'keywords' => '',
        'autokey' => '1',
        'desciption' => '',
        'qianbian' => $obj['qianbian'],
        'zhichi' => $obj['zhichi'],
        'zhaoma' => $obj['zhaoma'],
        'gaoxiao' => $obj['gaoxiao'],
        'chedan' => $obj['chedan'],
        'bujie' => $obj['bujie'],
        'chijing' => $obj['chijing'],
        'henbang' => $obj['henbang'],
        'music' => $obj['music'],
        'dede_addonfields' => 'qianbian,int;zhichi,int;zhaoma,int;gaoxiao,int;chedan,int;bujie,int;chijing,int;henbang,int;music,text',
        'remote' => '1',
        'dellink' => '1',
        'autolitpic' => '1',
        'needwatermark' => '1',
        'sptype' => 'hand',
        'spsize' => 'voteid',
        'body' => $obj['body'],
        'voteid' => '',
        'notpost' => '0',
        'click' => $obj['views'],
        'sortup' => '0',
        'color' => '',
        'arcrank' => '',
        'money' => '0',
        'pubdate' => date('Y-m-d H:i:s', $obj['senddate']),
        'ishtml' => '0',
        'filename' => '',
        'templet' => '',
        'imageField.x' => '24',
        'imageField.x' => '14'
    );
    if (isset($obj['flag'])) {
        $index = 0;
        foreach ($obj['flag'] as $key => $value) {
            $data['flag[' . $index . ']'] = $value;
            $index++;
        }
    }
    $url = 'http://sx.cc/administrator/article_add.php';
    $pro = curl_init();
    curl_setopt($pro, CURLOPT_URL, $url);
    curl_setopt($pro, CURLOPT_POST, true);
    curl_setopt($pro, CURLOPT_RETURNTRANSFER, 1);
    /* cookie字符串,需替换成自己的 */
    curl_setopt($pro, CURLOPT_COOKIE, 'menuitems=1_1%2C2_1%2C3_1%2C4_1; Hm_lvt_86f43783acc56b0c8abb5bb039edc763=1447468152; lastCid=12; lastCid__ckMd5=32334e2dceed96e5; PHPSESSID=3i9hfpit8h8gvoho4mbptou1h7; DedeUserID=1; DedeUserID__ckMd5=21656f81551e2194; DedeLoginTime=1447951987; DedeLoginTime__ckMd5=e8b68eb16c46c0a4; dede_vote_2365=1; ENV_GOBACK_URL=%2Fadministrator%2Fcontent_list.php%3Fchannelid%3D1');
    curl_setopt($pro, CURLOPT_REFERER, 'http://sx.cc/administrator/article_add.php?channelid=1&cid=0');
    curl_setopt($pro, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36');
    curl_setopt($pro, CURLOPT_POSTFIELDS, $data);

    $result = curl_exec($pro);
    unset($pro);
    return $result;
}

/**
 * 保存数据库到数据库 不可用,最好的方式还是提交到dede
 * @param array $obj 文章对象
 */
function saveToDatabase($obj) {
    $option = array(
        'host' => 'localhost',
        'port' => '3306',
        'user' => 'root',
        'password' => 'sa123',
        'charset' => 'utf8',
        'database' => 'dedecmsv57utf8sp1',
        'prefix' => 'dede_'
    );
    $db = MySQLDBTool::getInstance($option);
    $sql = "select count(*) from dede_archives where title='{$obj['title']}'";
    $count = $db->executeScalar($sql);
    if ($count > 0)
        return false;
    $sql1 = "INSERT INTO dede_arctiny(typeid,typeid2,arcrank,channel,senddate,sortrank,`mid`) VALUES ('{$obj["typeid"]}','{$obj["typeid2"]}',0,1,'{$obj["senddate"]}','{$obj["sortrank"]}',1)";
    $id = $db->last_insert_id($sql1);
    if ($id > 0) {
        $sql2 = "INSERT INTO dede_archives(id,typeid,typeid2,sortrank,flag,ismake,channel,arcrank,click,money,title,shorttitle,color,writer,source,litpic,"
                . "pubdate,senddate,`mid`,keywords,lastpost,scores,goodpost,badpost,voteid,notpost,description,filename,dutyadmin,tackid,mtype,weight) VALUES"
                . "({$id},'{$obj['typeid']}','{$obj['typeid2']}','{$obj['sortrank']}','{$obj['flag']}','-1','1','0','{$obj['views']}','0','{$obj['title']}','','','{$obj['writer']}',"
                . "'{$obj['source']}','{$obj['litpic']}','{$obj['pubdate']}','{$obj['pubdate']}','1','{$obj['keywords']}','0','0','0','0','0','0','{$obj['description']}','','1','0','0','0')";
        $sql2_r = $db->exec($sql2);

        $sql3 = "INSERT INTO dede_addonarticle(aid,typeid,body,redirecturl,templet,userip,qianbian,zhichi,zhaoma,gaoxiao,chedan,bujie,chijing,henbang,music) VALUES "
                . "({$id},{$obj['typeid']},'{$obj['body']}','','','127.0.0.1','{$obj['qianbian']}','{$obj['zhichi']}','{$obj['zhaoma']}','{$obj['gaoxiao']}','{$obj['chedan']}','{$obj['bujie']}','{$obj['chijing']}','{$obj['henbang']}','{$obj['music']}')";
        $sql3_r = $db->exec($sql3);
        if ($sql3_r < 0 or $sql2_r < 0) {
            $db->exec('delete from dede_archives where id=' . $id);
            $db->exec('delete from dede_addonarticle where aid=' . $id);
            $db->exec('delete from dede_arctiny where id=' . $id);
            return false;
        }
        return true;
    } else {
        return false;
    }
}

/**
 * 取得没有html文档的字符串
 * @param type $html_str
 * @return type
 */
function htmlToText($html_str) {
    return preg_replace('/\s+/', '', preg_replace('/&nbsp;/s', '', preg_replace('/<.*>/sU', '', $html_str)));
}

/**
 *  dede中的sortrank计算方法增加天数 既然是提交,就不需要这个方法了
 *
 * @param     int  $ntime  当前时间
 * @param     int  $aday   增加天数
 * @return    int 计算后的时间
 */
function AddDay($ntime, $aday) {
    $dayst = 3600 * 24;
    $oktime = $ntime + ($aday * $dayst);
    return $oktime;
}

/**
 * 获取单个列表页全部的文章列表
 * @param int $url 列表页链接地址
 */
function getArcticleList($url) {
    $dom = new simple_html_dom;
    $dom->load_file($url);
    $i = 0;

    $items = $dom->find('.left .item .arttitle');
    foreach ($items as $k => $v) {
        $res[$i]['href'] = $GLOBALS['prefix'] . $v->children[0]->attr['href'];
        $res[$i]['title'] = $v->children[0]->innertext;
        preg_match('/(\d+)\.html/isU', $res[$i]['href'], $temp_1);
        $res[$i]['id'] = $temp_1[1];
        $i++;
    }

    /* 释放资源 */
    $dom->clear();
    unset($dom);
    return $res;
}

/**
 * 获取一篇文章信息绝想
 * @param int $id 日记id
 */
function pickOne($id) {
    $url = $GLOBALS['prefix'] . "/{$id}.html";
    echo $url;
    $dom = new simple_html_dom;
    $dom->load_file($url);

    $postHeader_title = $dom->find('.left h1[0]');
    $pubtime = $dom->find('.pubtime');
    $heart = $dom->find('.week a');
    $info = $dom->find('.author a');
    $content = $dom->find('.content');
    $views = $dom->find('.views b');

    $writer = $info[0]->innertext;
    $title = $postHeader_title[0]->innertext;
    $senddate = $pubdate = strtotime($pubtime[0]->innertext);
    $body = $content[0]->innertext;
    $views = $views[0]->innertext;

    /* 释放资源 */
    $dom->clear();
    unset($dom);
    $mp3 = preg_match_all('/\?mp3=(.+\.mp3).*autostart/Us', $body, $res);

    if ($mp3 and isset($res) and count($res) > 0) {
        $music = $res[1][0];
    }

    /* 去除超链接和开头的空白段落 */
    $body = preg_replace('/<a.*>(.*)<\/a>/isU', '$1', preg_replace('/<div.*>(.*)<\/div>/isU', '', preg_replace('/^<p>\s*<\/p>/', '', preg_replace('/\s*<style.+<\/style>/s', '', $body))));

    /* 本地化图片 */
    saveImages($body, $res_img);
    $obj = array('title' => $title, 'writer' => $writer, 'body' => $body, 'pubdate' => $pubdate, 'senddate' => $senddate, 'views' => $views);
    $obj['sortrank'] = AddDay($senddate, 0);
    /* 处理背景音乐 */
    $obj['music'] = isset($music) ? $music : '';
    return $obj;
}

/**
 * 生成缩略图并打水印
 * @param string $file 文件全名
 * @return string 缩略图名字
 */
function makeThumb($file) {
    $image_tool = new ImageTool($file);
    $new_name = $image_tool->makeThumb(300, 300);
    $image_tool->waterMark('./upload/cklogo.png');
    return $new_name;
}

/**
 * 本地化远程图片并返回图片列表
 * @param string $html_str 要存储图片的路径
 * @param string $predix 要查找图片的前缀
 */
function saveImages(& $html_str, & $result) {
    $reg = '/<img.+src\s*=\s*[\'"]\s*(.+\.(\w{3,5}))\s*[\'"].*>/iU';
//    $html_str = preg_replace_callback($reg, 'savePic', $html_str);
    $html_str = preg_replace_callback($reg, 'noHttp', $html_str);
    preg_match_all($reg, $html_str, $result);
}

/* 判断图片是否是http绝对路径开头 */

function noHttp($matches) {
    $image_url = $matches[1];
    if (strpos($image_url, '/') === false) {
        $image_url = $GLOBALS['prefix'] . $image_url;
    }
    return '<center><img src="' . $image_url . '" style="max-width:680px;"/></center>';
}

/**
 * 保存图片到本地
 * @param type $matches
 * @return type
 */
function savePic($matches) {
    $error_img = '/upload/yimo.png';
    $filename = uniqid() . '.' . $matches[2];
    if (!downloadPicture($matches[1], './uploads/cj/', $filename)) {
        $src = $error_img;
    } else {
        $src = 'http://eshop.cc/uploads/cj/' . $filename;
    }
    return "<img style=\"max-width:500px\" src=\"$src\"/>";
}

/**
 * 下载远程图片
 * @param string $image_url 图片保存地址
 * @param string $save_path 图片保存路径
 * @param string $filename 图片文件名
 * @return string 成功返回文件字节数,失败返回false
 */
function downloadPicture($image_url, $save_path, $filename = 'rand') {

    $image_url = (!strpos($image_url, 'http://')) ? $GLOBALS['prefix'] . $image_url : $image_url;

    if (substr($save_path, -1) == '/' || substr($save_path, -1) == '\\') {
        $save_path = rtrim($save_path, '/');
        $save_path = rtrim($save_path, '\\');
    }

    if (!is_dir($save_path)) {
        if (!mkdir($save_path, 0, true))
            die('创建目录失败');
    }
    if ($filename == 'rand') {
        $filename = uniqid();
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $image_url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_COOKIE, 'juexiangssid=acnjkmock336p1ofl7sbgf2p97');
    curl_setopt($ch, CURLOPT_REFERER, 'http://www.juexiang.com/');
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36');
    $img = curl_exec($ch);
    curl_close($ch);
    unset($ch);
    return file_put_contents($save_path . DIRECTORY_SEPARATOR . $filename, $img);
}
版权声明:转载请注明来源地址。

相关文章推荐

Python3网络爬虫(十四):跟股神巴菲特学习炒股之财务报表入库(MySQL)

网上有很多《跟巴菲特学看上市公司财务报表》诸如此类的文章,仁者见仁智者见智。本文重点不在于,如何分析财务报表,而是如何获得财务报表,为后续的方便分析做准备!

Python3《机器学习实战》学习笔记(六):Logistic回归基础篇之梯度上升算法

转载请注明作者和出处: http://blog.csdn.net/c406495762 机器学习知乎专栏:https://zhuanlan.zhihu.com/ml-jack CSDN博客专栏:h...

Python3网络爬虫(十一):爬虫黑科技之让你的爬虫程序更像人类用户的行为(代理IP池等)

转载请注明作者和出处:http://blog.csdn.net/c406495762 运行平台: Windows Python版本: Python3.x IDE: Sublime text3 ...

Python3网络爬虫(十二):初识Scrapy之再续火影情缘

《火影忍者》不是已经完结了吗?《火影忍者》是完结了,但是鸣人儿子的故事才刚刚开始,《博人传之火影忍者新时代》正在热播中。因此,我又开始追动漫了,虽然现在不会像儿时那样激动到上蹿下跳,但是我依然喜欢看,...

AndroidManifest.xml

Every application must have an AndroidManifest.xml file (with precisely that name) in its root direc...

Python3网络爬虫(十三):王者荣耀那些事!(Fiddler之手机APP爬取)

我之前的爬虫博客,爬的都是网页的信息,什么下载小说啊,下载动漫啊,下载帅哥图、妹子图啊。玩这些东西的时候,你想过爬取手机APP里面的东西吗?

Python3网络爬虫(九):使用Selenium爬取百度文库word文章

转载请注明作者和出处: http://blog.csdn.net/c406495762 运行平台: Windows Python版本: Python3.x IDE: Sublime text3前...

掠夺者—自己写新闻采集器(3)

六、GetWebNews 采集WEB新闻,这是整个掠夺者的核心了,也是比较麻烦的地方。这里给出的是采集**网新闻的例子,采集后可以将所有与该网站相关的文字、注释等过滤掉,仅留新闻正文和图片,用到了大量...

Python3网络爬虫(十):这个帅哥、肌肉男横行的世界(爬取帅哥图)

最近,有关注我爬虫教程的朋友说,希望我可以出个爬取图片的教程。那么,今天就谈一谈如何爬取图片吧!今天咱就不爬妹子图了,咱爬《帅哥图》!
内容举报
返回顶部
收藏助手
不良信息举报
您举报文章:深度学习:神经网络中的前向传播和反向传播算法推导
举报原因:
原因补充:

(最多只允许输入30个字)