爬取学校官网信息公告并存入数据库

前段时间做了一个爬取学校信息并展示的小软件,爬取内容包括学校官网、教学管理系统、招生就业信息等,其中用到了 QueryList 库、翻页爬虫、定时爬虫、插入数据库等技术。
不熟悉 QueryList 的读者可以先看一下我的这篇博客《QueryList 数据爬虫入门》。

<?php

namespace app\crawler\controller;

use QL\QueryList;
use think\Controller;
use think\Db;
use think\Request;

/**
 * Crawler controller: scrapes article lists from the school's web sites with
 * QueryList and inserts the articles that are not stored yet.
 *
 * An article counts as "already stored" when the target table contains a row
 * with the same `article_title`.
 */
class Getnews extends Controller
{
    /**
     * Prefix the `src` attribute of <img> tags in an HTML fragment with a base URL.
     *
     * Only sources ending in .jpg/.jpeg/.png/.gif/.bmp are rewritten. Sources
     * that already carry a scheme (`http:`, `data:` ...) or are
     * protocol-relative (`//host/...`) are left alone, because prefixing them
     * would produce an invalid URL. All other attributes and the closing `>`
     * of the tag are preserved.
     *
     * @param string $content HTML fragment to rewrite.
     * @param string $url     Base URL that is concatenated in front of each source.
     * @return string The rewritten HTML; the input unchanged if the regex engine fails.
     */
    public function addImg_header($content="",$url="http://www.cdutcm.edu.cn/Upload/")
    {
        // 1: everything up to the opening quote, 2: the quote, 3: the relative image path.
        $pattern = '/(<img\b[^>]*?\ssrc\s*=\s*)(["\'])(?![a-z][a-z0-9+.\-]*:|\/\/)([^"\'>]*?\.(?:jpe?g|png|gif|bmp))\2/i';

        // A callback is used so that `$` or `\` inside $url is never
        // interpreted as a back-reference.
        $rewritten = preg_replace_callback($pattern, function ($m) use ($url) {
            return $m[1] . $m[2] . $url . $m[3] . $m[2];
        }, $content);

        // preg_replace_callback() returns null on a PCRE error.
        return $rewritten === null ? $content : $rewritten;
    }

    /**
     * Crawl the three news sections of the school's main site
     * (www.cdutcm.edu.cn) into the `wx_newlist` table.
     *
     * @return void
     */
    public function qulist()
    {
        set_time_limit(0);  // crawling can outlast the default execution time limit

        // Image sources are rewritten only for the first section, the only
        // one for which the original code called addImg_header().
        $this->crawlNewsSection('xwsd', '综合新闻', true);
        $this->crawlNewsSection('xxyw', '学校要闻');
        $this->crawlNewsSection('xshd', '学术活动');
    }

    /**
     * Fetch the publication date and the HTML body of one article page on
     * the main site.
     *
     * @param string $url Absolute URL of the article page.
     * @return mixed Whatever QueryList's getData() yields for the rules below;
     *               callers read `[0]['date']` and `[0]['content']` from it.
     */
    public function getData($url)
    {
        $data = QueryList::get($url)
            ->rules([
                // Text of the second <span> in the article header's meta line.
                "date" => array("#content-wrap .article div.article-hd.text-center div.meta>span:nth-child(2)", 'text'),
                // HTML of the article body.
                "content" => array("#content-wrap .article div.article-bd", "html"),
            ])
            ->query()->getData();
        return $data;
    }

    /**
     * Crawl the employment news list of zsjy.cdutcm.edu.cn into `wx_getjob`.
     *
     * The first list page has its own URL; the following pages use the
     * `/pager/<n>` form.
     *
     * @return void
     */
    public function getjob()
    {
        // First page, which does not follow the pager URL scheme.
        $this->crawlJobListing('http://zsjy.cdutcm.edu.cn/News/2002');

        // Pager pages; the bound is kept as in the original (page 1 only).
        for ($i = 1; $i < 2; $i++) {
            $this->crawlJobListing('http://zsjy.cdutcm.edu.cn/News/Index/2002/pager/' . $i);
        }
    }

    /**
     * Crawl the academic-affairs announcement list of jwc.cdutcm.edu.cn into
     * `wx_newlist`.
     *
     * @return void
     */
    public function adujwc()
    {
        $items = QueryList::get('http://jwc.cdutcm.edu.cn/list-782803923682.aspx')
            ->rules([
                // Link, title and date of every entry in the list.
                "url" => array("#content div.list>li>span>a", "href"),
                "title" => array("#content div.list>li>span>a", "text"),
                "date" => array("#content div.list>li span.time", "text"),
            ])
            ->query()->getData();

        foreach ($items as $item) {
            $title = $item['title'];
            if ($this->articleExists("wx_newlist", $title)) {
                echo "数据库已有{$title}<br>";
                continue;
            }

            echo "开始获取<<{$title}>>的详情\n";
            $detail = QueryList::get("http://jwc.cdutcm.edu.cn{$item['url']}")
                ->rules([
                    // Article body with the `.margin_b_10` elements removed.
                    "content" => array("#content div.new_detail", "html","-.margin_b_10"),
                ])
                ->query()->getData();

            // Skip instead of storing an empty article when the page did not match.
            if (!isset($detail[0]['content'])) {
                echo "插入失败{$title}<br>";
                continue;
            }

            $row = [
                'article_title' => $title,
                'article_date' => $item['date'],
                'article_content' => $detail[0]['content'],
                'type' => '教务公告',
            ];
            print_r($row);

            $inserted = Db::name("wx_newlist")->insert($row);
            if (!empty($inserted)) {
                echo "已插入数据{$title}<br>";
            } else {
                echo "插入失败{$title}<br>";
            }
        }
    }

    /**
     * Crawl one news section of the main site and store its new articles
     * in `wx_newlist`.
     *
     * @param string $slug      Section path prefix; list pages are `<slug>_<page>`.
     * @param string $type      Value stored in the `type` column.
     * @param bool   $fixImages Whether to run the body through addImg_header().
     * @param int    $pages     Number of list pages to crawl, starting at 1.
     * @return void
     */
    private function crawlNewsSection($slug, $type, $fixImages = false, $pages = 1)
    {
        for ($page = 1; $page <= $pages; $page++) {
            $items = QueryList::get('https://www.cdutcm.edu.cn/' . $slug . '_' . $page)
                ->rules([
                    // Link and title of every entry in the list.
                    "url" => array("#content-wrap ul>li>a", "href"),
                    "title" => array("#content-wrap  ul>li>a", "text")
                ])
                ->query()->getData();

            foreach ($items as $item) {
                $title = $item['title'];
                if ($this->articleExists("wx_newlist", $title)) {
                    echo "数据库已有此数据";
                    continue;
                }

                echo "开始获取<<{$title}>>的详情\n";
                $detail = $this->getData("https://www.cdutcm.edu.cn{$item['url']}");

                // Skip instead of storing an empty article when the page did not match.
                if (!isset($detail[0]['content'])) {
                    echo "获取详情失败";
                    continue;
                }

                $content = $detail[0]['content'];
                if ($fixImages) {
                    // Rewrite the HTML string, not the whole result set.
                    $content = $this->addImg_header($content);
                }

                $row = [
                    'article_title' => $title,
                    'article_date' => isset($detail[0]['date']) ? $detail[0]['date'] : null,
                    'article_content' => $content,
                    'type' => $type,
                ];

                $inserted = Db::name("wx_newlist")->insert($row);
                if (!empty($inserted)) {
                    echo "已插入数据";
                }
            }
        }
    }

    /**
     * Crawl one list page of the employment site and store its new articles
     * in `wx_getjob`.
     *
     * @param string $listUrl Absolute URL of the list page.
     * @return void
     */
    private function crawlJobListing($listUrl)
    {
        $items = QueryList::get($listUrl)
            ->rules([
                // Link, title and date of every entry in the list.
                "url" => array(".wrap div>.list div>p>a", "href"),
                "title" => array(".wrap div>.list div>p>a", "text"),
                "date" => array(".wrap div>.list div>p>span:nth-child(1)", "text"),
            ])
            ->query()->getData();

        foreach ($items as $item) {
            $title = $item['title'];
            if ($this->articleExists("wx_getjob", $title)) {
                echo "数据库已有此数据";
                continue;
            }

            echo "开始获取<<{$title}>>的详情\n";
            // Explicit scheme, matching the list URLs; the link itself is
            // used as a host-relative path, as before.
            $detail = QueryList::get("http://zsjy.cdutcm.edu.cn{$item['url']}")
                ->rules([
                    // HTML of the article body.
                    "content" => array("div.clear", "html"),
                ])
                ->query()->getData();

            // Skip instead of storing an empty article when the page did not match.
            if (!isset($detail[0]['content'])) {
                echo "插入失败";
                continue;
            }

            $row = [
                'article_title' => $title,
                'article_date' => $item['date'],
                'article_content' => $detail[0]['content'],
                'type' => '就业快讯',
            ];
            print_r($row);

            $inserted = Db::name("wx_getjob")->insert($row);
            if (!empty($inserted)) {
                echo "已插入数据";
            } else {
                echo "插入失败";
            }
        }
    }

    /**
     * Tell whether a row with the given `article_title` exists in a table.
     *
     * @param string $table Table name as passed to Db::name().
     * @param string $title Article title to look up.
     * @return bool
     */
    private function articleExists($table, $title)
    {
        $found = Db::name($table)->where("article_title", "=", $title)->find();
        return !empty($found);
    }


}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值