前段时间做了一个爬取学校信息并展示的小软件,爬取内容包括学校官网、教学管理系统、招生就业信息等,其中用到了 QueryList 库、翻页爬虫、定时爬虫、插入数据库等。
不熟悉 QueryList 的可以看一下我的这篇博客《QueryList 数据爬虫入门》。
<?php
namespace app\crawler\controller;
use QL\QueryList;
use think\Controller;
use think\Db;
use think\Request;
class Getnews extends Controller
{
/**
 * Prefix the src attribute of image tags with a base URL.
 *
 * Only <img> tags whose src value ends in .jpg, .jpeg, .png, .gif or .bmp
 * (case-insensitive) are rewritten. The base URL is concatenated in front
 * of the existing src value as-is; every other part of the tag (remaining
 * attributes and the closing bracket) is left untouched. A src that is
 * already absolute (has a scheme such as "http:" / "data:", or starts
 * with "//") is not modified.
 *
 * @param string $content HTML fragment to rewrite.
 * @param string $url     Base URL prepended to each matching src value.
 * @return string The rewritten HTML; the input is returned unchanged if the regex engine fails.
 */
public function addImg_header($content="",$url="http://www.cdutcm.edu.cn/Upload/")
{
    // Group 1: everything from "<img" up to and including "src=".
    // Group 2: the opening quote character (reused to match the closing one).
    // Group 3: the src value, which must end in a known image extension.
    // "\ssrc" requires whitespace before the attribute name so "data-src" is not matched.
    $pattern = '/(<img\b[^>]*?\ssrc\s*=\s*)([\'"])([^\'"]*?\.(?:jpe?g|png|gif|bmp))\2/i';

    $rewritten = preg_replace_callback($pattern, function ($match) use ($url) {
        // Leave absolute and protocol-relative URLs alone.
        if (preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $match[3])) {
            return $match[0];
        }
        return $match[1] . $match[2] . $url . $match[3] . $match[2];
    }, $content);

    // preg_replace_callback() yields null on a PCRE error; fall back to the input.
    return $rewritten === null ? $content : $rewritten;
}
//爬取学校官网信息
/**
 * Crawl three news sections of www.cdutcm.edu.cn and store unseen articles
 * in the "wx_newlist" table.
 *
 * Each section is fetched from "https://www.cdutcm.edu.cn/<slug>_<page>" and
 * every article whose title is not yet in the table is fetched in detail
 * and inserted. Article HTML is stored as scraped. Progress messages are
 * echoed to the output.
 *
 * @return void
 */
public function qulist()
{
    set_time_limit(0); // Crawling can exceed the default 30-second execution limit.

    // List-page slug => value written to the "type" column.
    $sections = [
        'xwsd' => '综合新闻',
        'xxyw' => '学校要闻',
        'xshd' => '学术活动',
    ];

    foreach ($sections as $slug => $type) {
        $this->crawlNewsSection($slug, $type);
    }
}

/**
 * Crawl the list pages of one news section and insert articles not yet stored.
 *
 * @param string $slug  Section slug used in the list-page URL (e.g. "xwsd").
 * @param string $type  Value written to the "type" column for this section.
 * @param int    $pages Number of list pages to walk, starting at page 1.
 * @return void
 */
private function crawlNewsSection($slug, $type, $pages = 1)
{
    for ($page = 1; $page <= $pages; $page++) {
        $items = QueryList::get('https://www.cdutcm.edu.cn/' . $slug . '_' . $page)
            ->rules([
                // Link target and link text of every entry in the article list.
                "url" => array("#content-wrap ul>li>a", "href"),
                "title" => array("#content-wrap ul>li>a", "text"),
            ])
            ->query()->getData();

        foreach ($items as $item) {
            $title = $item['title'];

            // The title is used as the de-duplication key.
            $existing = Db::name("wx_newlist")->where("article_title", "=", $title)->find();
            if (!empty($existing)) {
                echo "数据库已有此数据";
                continue;
            }

            echo "开始获取<<{$title}>>的详情\n";
            $detail = $this->getData("https://www.cdutcm.edu.cn{$item['url']}");

            // Skip entries whose detail page did not match the scraping rules
            // instead of failing on a missing offset.
            $first = isset($detail[0]) ? $detail[0] : [];
            if (!isset($first['content'])) {
                echo "未获取到<<{$title}>>的详情,跳过\n";
                continue;
            }

            $inserted = Db::name("wx_newlist")->insert([
                'article_title'   => $title,
                'article_date'    => isset($first['date']) ? $first['date'] : null,
                'article_content' => $first['content'],
                'type'            => $type,
            ]);
            if (!empty($inserted)) {
                echo "已插入数据";
            }
        }
    }
}
/**
 * Fetch an article detail page and extract its date and body HTML.
 *
 * @param string $url Absolute URL of the article page.
 * @return mixed The result of QueryList's getData(); callers read row 0 and
 *               its "date" and "content" keys.
 */
function getData($url)
{
    // Extraction rules: field name => [CSS selector, what to read from the match].
    $detailRules = [
        // Publication date: text of the second <span> in the article header meta block.
        "date" => array("#content-wrap .article div.article-hd.text-center div.meta>span:nth-child(2)", 'text'),
        // Article body as raw HTML.
        "content" => array("#content-wrap .article div.article-bd", "html"),
    ];

    $page = QueryList::get($url);
    $result = $page->rules($detailRules)->query()->getData();

    return $result;
}
//爬取招生就业信息
/**
 * Crawl the employment-news listing of zsjy.cdutcm.edu.cn and store unseen
 * articles in the "wx_getjob" table.
 *
 * The first listing page has its own URL; further pages follow the
 * ".../pager/<n>" pattern. Progress messages and each assembled row are
 * written to the output.
 *
 * @return void
 */
public function getjob()
{
    // The first listing page does not follow the pager URL pattern.
    $this->crawlJobList('http://zsjy.cdutcm.edu.cn/News/2002');

    for ($page = 1; $page < 2; $page++) {
        $this->crawlJobList('http://zsjy.cdutcm.edu.cn/News/Index/2002/pager/' . $page);
    }
}

/**
 * Crawl one employment-news listing page and insert articles not yet stored.
 *
 * @param string $listUrl Absolute URL of the listing page.
 * @return void
 */
private function crawlJobList($listUrl)
{
    $items = QueryList::get($listUrl)
        ->rules([
            // Link target, link text and date of every entry in the list.
            "url" => array(".wrap div>.list div>p>a", "href"),
            "title" => array(".wrap div>.list div>p>a", "text"),
            "date" => array(".wrap div>.list div>p>span:nth-child(1)", "text"),
        ])
        ->query()->getData();

    foreach ($items as $item) {
        $title = $item['title'];

        // The title is used as the de-duplication key.
        $existing = Db::name("wx_getjob")->where("article_title", "=", $title)->find();
        if (!empty($existing)) {
            echo "数据库已有此数据";
            continue;
        }

        echo "开始获取<<{$title}>>的详情\n";

        // The scheme is required; without it the request has no resolvable host.
        $detailUrl = "http://zsjy.cdutcm.edu.cn{$item['url']}";
        $detail = QueryList::get($detailUrl)
            ->rules([
                // Article body as raw HTML.
                "content" => array("div.clear", "html"),
            ])
            ->query()->getData();

        // Skip entries whose detail page did not match the scraping rule.
        $first = isset($detail[0]) ? $detail[0] : [];
        if (!isset($first['content'])) {
            echo "未获取到<<{$title}>>的详情,跳过\n";
            continue;
        }

        $row = [
            'article_title'   => $title,
            'article_date'    => $item['date'],
            'article_content' => $first['content'],
            'type'            => '就业快讯',
        ];
        print_r($row);

        $inserted = Db::name("wx_getjob")->insert($row);
        if (!empty($inserted)) {
            echo "已插入数据";
        } else {
            echo "插入失败";
        }
    }
}
//爬取教务公告
/**
 * Crawl the academic-affairs notice list of jwc.cdutcm.edu.cn and store
 * unseen notices in the "wx_newlist" table with type "教务公告".
 *
 * Progress messages and each assembled row are written to the output.
 *
 * @return void
 */
public function adujwc()
{
    $items = QueryList::get('http://jwc.cdutcm.edu.cn/list-782803923682.aspx')
        ->rules([
            // Reference selector for one entry:
            // #content > div.right.container > div.list > li:nth-child(1) > span.left_link > a
            "url" => array("#content div.list>li>span>a", "href"),
            "title" => array("#content div.list>li>span>a", "text"),
            "date" => array("#content div.list>li span.time", "text"),
        ])
        ->query()->getData();

    foreach ($items as $item) {
        $title = $item['title'];

        // The title is used as the de-duplication key.
        $existing = Db::name("wx_newlist")->where("article_title", "=", $title)->find();
        if (!empty($existing)) {
            echo "数据库已有{$title}<br>";
            continue;
        }

        echo "开始获取<<{$title}>>的详情\n";

        $detailUrl = "http://jwc.cdutcm.edu.cn{$item['url']}";
        $detail = QueryList::get($detailUrl)
            ->rules([
                // Reference selector: #content > div.right.container > div.new_detail
                // The third element is passed to QueryList as a filter on ".margin_b_10".
                "content" => array("#content div.new_detail", "html", "-.margin_b_10"),
            ])
            ->query()->getData();

        // Skip entries whose detail page did not match the scraping rule.
        $first = isset($detail[0]) ? $detail[0] : [];
        if (!isset($first['content'])) {
            echo "未获取到详情{$title}<br>";
            continue;
        }

        $row = [
            'article_title'   => $title,
            'article_date'    => $item['date'],
            'article_content' => $first['content'],
            'type'            => '教务公告',
        ];
        print_r($row);

        $inserted = Db::name("wx_newlist")->insert($row);
        if (!empty($inserted)) {
            echo "已插入数据{$title}<br>";
        } else {
            echo "插入失败{$title}<br>";
        }
    }
}
}