话不多说,上代码.
require_once __DIR__.'/../autoloader.php';
use phpspider\core\phpspider;
use phpspider\core\db;
//下面这个注释删了就跑不起来.....
/* Do NOT delete this comment */
/* 不要删除这段注释 */
// Extraction rule set #1: one entry per field, each giving the field name,
// the XPath selector used to locate it, and its 'required' flag.
// This set targets pages built around the "bread" / "arc_title" /
// "arc_content" containers.
$fields_one = [
    // Keywords, taken from the keywords <meta> tag
    ['name' => 'keywords', 'selector' => "//meta[@name='keywords']/@content", 'required' => false],
    // Description, taken from the description <meta> tag
    ['name' => 'description', 'selector' => "//meta[@name='description']/@content", 'required' => false],
    // Category name: second-to-last link of the breadcrumb
    ['name' => 'cate_name', 'selector' => "//div[@class='bread']/a[last()-1]", 'required' => true],
    // Article title
    ['name' => 'title', 'selector' => "//div[@class='arc_title']//h1", 'required' => true],
    // Publication time
    ['name' => 'time', 'selector' => "//div[@class='arc_title']//span[1]", 'required' => true],
    // Article body
    ['name' => 'content', 'selector' => "//div[@class='arc_content']", 'required' => true],
];
// Extraction rule set #2: same six fields as rule set #1, but with selectors
// for pages built around the "crumbs_wrap" / "container" / "container_text"
// containers.
$fields_two = [
    // Keywords, taken from the keywords <meta> tag
    ['name' => 'keywords', 'selector' => "//meta[@name='keywords']/@content", 'required' => false],
    // Description, taken from the description <meta> tag
    ['name' => 'description', 'selector' => "//meta[@name='description']/@content", 'required' => false],
    // Category name: last link inside the breadcrumb wrapper
    ['name' => 'cate_name', 'selector' => "//div[@class='crumbs_wrap']//a[last()]", 'required' => true],
    // Article title
    ['name' => 'title', 'selector' => "//div[@class='container']//h1", 'required' => true],
    // Publication time
    ['name' => 'time', 'selector' => "//div[@class='container']//span[@class='time']", 'required' => true],
    // Article body
    ['name' => 'content', 'selector' => "//div[@class='container_text']", 'required' => true],
];
// Crawler configuration handed to the phpspider constructor.
$configs = [
    'name' => '特玩',
    'log_show' => false,
    'log_file' => 'tewan.log',
    'log_type' => 'warn,error,debug',
    // Number of crawler tasks
    'tasknum' => 3,
    //'save_running_state' => true,
    // Main domain(s)
    'domains' => [
        'www.te5.com',
    ],
    // Entry URLs
    'scan_urls' => [
        // Latest news
        'http://www.te5.com/news/',
        // Strategy guide roundup
        //'http://www.te5.com/news/shouyouzx/',
        // Game news
        //'http://www.te5.com/news/youxixinwen/',
        // Popular games -- for now only crawl the data under this one category list
        //'http://www.te5.com/news/danji/'
    ],
    // List-page URL matching rules
    'list_url_regexes' => [
        //"/news/(.*)/list_\d+_\d+.html"
    ],
    // Detail-page URL matching rules.
    // The dot before "html" is escaped so it only matches a literal "."
    // (unescaped, it would match any character, e.g. "/news/123xhtml").
    'content_url_regexes' => [
        '/news/\d+\.html',
        '/news/\d+/\d+\.html',
    ],
    // NOTE(review): hard-coded local credentials and a database name that
    // looks left over from another example -- confirm before real use.
    'db_config' => [
        'host' => '127.0.0.1',
        'port' => 3306,
        'user' => 'root',
        'pass' => 'root',
        'name' => 'qiushibaike',
    ],
    // Swap in $fields_two to crawl again with the second rule set
    'fields' => $fields_one,
];
// Instantiate the spider with $configs.
$spider = new phpspider($configs);

// Crawl start => database initialisation.
$spider->on_start = function ($phpspider) {
    // Alternative (disabled): generate the list-page URLs directly and queue them.
    // for ($i = 0; $i <= 100; $i++) {
    //     $phpspider->add_url("http://www.te5.com/news/danji/list_1880_{$i}.html");
    // }

    // Register the 'default' connection from the 'db_config' section of the
    // configuration, then initialise the db helper.
    db::set_connect('default', $phpspider->get_config("db_config"));
    db::_init();
};
// Detail page => field extracted => post-process the field value.
// Every field is returned untouched except 'content', which is JSON-encoded
// (keeping Unicode unescaped), compressed with gzcompress() and finally
// base64-encoded.
$spider->on_extract_field = function ($fieldname, $data, $page) {
    if ($fieldname != 'content') {
        return $data;
    }

    $json = json_encode($data, JSON_UNESCAPED_UNICODE);
    $compressed = gzcompress($json);

    return base64_encode($compressed);
};
// Detail page => final extracted data => persist.
// Builds the row to store from the extracted fields and returns $data
// unchanged.
$spider->on_extract_page = function ($page, $data) {
    // 'keywords' and 'description' are configured with 'required' => false,
    // so they may be absent; fall back to '' rather than reading an undefined
    // index and handing null to strip_tags().
    $keywords = isset($data['keywords']) ? $data['keywords'] : '';
    $description = isset($data['description']) ? $data['description'] : '';

    $savedata = [
        'keywords' => strip_tags($keywords),            // keywords
        'description' => strip_tags($description),      // description
        'cate_name' => strip_tags($data['cate_name']),  // category name
        'title' => strip_tags($data['title']),          // title
        'create_time' => $data['time'],                 // publication time
        'content' => $data['content'],                  // content
        'status' => 1,                                  // status
    ];

    // Only proceed when the extracted content field is not empty.
    if ($savedata['content'] !== '') {
        // NOTE(review): the insert is disabled, yet the message below still
        // reports a successful save -- re-enable it once the table exists.
        //db::insert("news", $savedata);
        echo '数据库已保存!';
    }

    return $data;
};
// List-page callback: print $page for debugging.
$spider->on_list_page = function ($page, $content, $phpspider) {
    // var_dump() writes straight to the output and returns nothing, so it
    // cannot be concatenated into an echo (the dump would appear before the
    // label). Emit the label first, then the dump.
    echo '采集列表页:';
    var_dump($page);
};

// Run the spider.
$spider->start();
代码啥意思就不多讲了,反正注释都在代码里面,关于phpspider,用过的都说屌.
要使用好PHPspider,就要掌握好几个知识点:
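1.XPath:fields 里的 selector 用它来定位节点、提取元素,很方便.
2.正则表达式:配置 list_url_regexes、content_url_regexes 匹配 URL 的时候要用到.
3.命令行:脚本需要在命令行中执行.
4.代码开头的 /* Do NOT delete this comment */ 和 /* 不要删除这段注释 */ 这两行注释别删:作者在里面弄了个正则匹配的彩蛋代码,如果你删了,脚本就会运行不起来.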