php 抓取 百度、微博、搜狗热门关键词

/**
     * Fetch the Baidu hot keywords and store up to ten of them in the
     * `udc_out_hotwords` table as `how_type` 4 with `howid` 1..10.
     *
     * When rows of that type already exist they are updated in place,
     * otherwise the new rows are inserted in one batch.
     */
    public function index(){
        $data = $this->getBaiduHotKeyWord();
        // The feed can be unreachable or unparsable; never index into a non-array
        if (!is_array($data) || !$data) {
            Log::write('Baidu hot keywords: nothing fetched, table left unchanged');
            return;
        }
        $data = array_values($data);
        // The feed may return fewer than ten entries
        $total = min(10, count($data));
        $now = date('Y-m-d H:i:s');

        $saveData = [];
        for ($i = 0; $i < $total; $i++) {
            $saveData[] = [
                'howid' => $i + 1,
                'how_type' => 4,
                'how_words' => $data[$i],
                'sort' => $i + 1,
                // Encode the keyword so spaces, '&', '#' etc. cannot break the query string
                'url' => 'https://www.baidu.com/s?wd=' . urlencode($data[$i]),
                'create_time' => $now,
                'update_time' => $now,
            ];
        }

        $model = M('udc_out_hotwords');
        $where = ['how_type' => 4];
        if ($model->where($where)->select()) {
            // Rows of this type already exist: refresh each slot by its howid
            foreach ($saveData as $row) {
                $where['howid'] = $row['howid'];
                // howid is the lookup key; create_time must survive a refresh
                unset($row['howid'], $row['create_time']);
                $model->where($where)->save($row);
            }
        } else {
            $model->addAll($saveData);
        }

        // Only log when the model actually reported something
        $error = $model->getError();
        if ($error) {
            Log::write($error);
        }
    }

    /**
     * Fetch the Sogou hot keywords and store up to ten of them in the
     * `udc_out_hotwords` table as `how_type` 1 with `howid` 21..30.
     *
     * For the first three parsed rows the keyword is taken from cell 2 and
     * cut to eight characters; for every other row it is taken from cell 1.
     */
    public function souGou(){
        $html = $this->getUrlContent("http://top.sogou.com/hot/shishi_1.html?fr=tph_righ");
        $rows = $this->getSouGouHtml($html);
        if (!is_array($rows) || !$rows) {
            Log::write('Sogou hot keywords: nothing parsed, table left unchanged');
            return;
        }

        // Build the keyword list without a by-reference loop (no dangling reference)
        $data = [];
        foreach (array_values($rows) as $position => $cells) {
            if (count($data) >= 10) {
                break;
            }
            $cellIndex = $position < 3 ? 2 : 1;
            if (!is_array($cells) || !isset($cells[$cellIndex])) {
                // Row does not have the expected cells; skip it instead of storing null
                continue;
            }
            $data[] = $position < 3 ? mb_substr($cells[2], 0, 8) : $cells[1];
        }
        if (!$data) {
            Log::write('Sogou hot keywords: no usable rows, table left unchanged');
            return;
        }

        $now = date('Y-m-d H:i:s');
        $saveData = [];
        foreach ($data as $i => $word) {
            $saveData[] = [
                'howid' => $i + 1 + 20,
                'how_type' => 1,
                'how_words' => $word,
                'sort' => $i + 1,
                // Encode the keyword so it cannot break the query string
                'url' => 'https://www.sogou.com/sogou?query=' . urlencode($word),
                'create_time' => $now,
                'update_time' => $now,
            ];
        }

        $model = M('udc_out_hotwords');
        $where = ['how_type' => 1];
        if ($model->where($where)->select()) {
            // Rows of this type already exist: refresh each slot by its howid
            foreach ($saveData as $row) {
                $where['howid'] = $row['howid'];
                // howid is the lookup key; create_time must survive a refresh
                unset($row['howid'], $row['create_time']);
                $model->where($where)->save($row);
            }
        } else {
            $model->addAll($saveData);
        }

        // Only log when the model actually reported something
        $error = $model->getError();
        if ($error) {
            Log::write($error);
        }
    }


    /**
     * Fetch the Weibo hot keywords and store up to ten of them in the
     * `udc_out_hotwords` table as `how_type` 2 with `howid` 11..20.
     *
     * The first two parsed table rows are dropped, then the keyword is the
     * first whitespace-separated token of cell 1 of each remaining row.
     */
    public function wei(){
        $html = $this->getUrlContent("https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6");
        $table = $this->getTable($html);
        if (!is_array($table)) {
            Log::write('Weibo hot keywords: nothing parsed, table left unchanged');
            return;
        }
        $table = array_slice($table, 2); // drop the leading rows that are not ranking entries

        $data = [];
        foreach (array_column($table, '1') as $cell) {
            // Split on whitespace directly; the old "replace with '+' then explode on '+'"
            // also cut keywords that legitimately contain a '+'
            $parts = preg_split('/\s+/', trim($cell), 2);
            if (!is_array($parts) || $parts[0] === '') {
                continue;
            }
            $data[] = $parts[0];
            if (count($data) >= 10) {
                break;
            }
        }
        if (!$data) {
            Log::write('Weibo hot keywords: no usable rows, table left unchanged');
            return;
        }

        $now = date('Y-m-d H:i:s');
        $saveData = [];
        foreach ($data as $i => $word) {
            $saveData[] = [
                'howid' => $i + 1 + 10,
                'how_type' => 2,
                'how_words' => $word,
                'sort' => $i + 1,
                // Encode the keyword so '#', '&', spaces etc. cannot break the query string
                'url' => 'https://s.weibo.com/weibo?q=' . urlencode($word),
                'create_time' => $now,
                'update_time' => $now,
            ];
        }

        $model = M('udc_out_hotwords');
        $where = ['how_type' => 2];
        if ($model->where($where)->select()) {
            // Rows of this type already exist: refresh each slot by its howid
            foreach ($saveData as $row) {
                $where['howid'] = $row['howid'];
                // howid is the lookup key; create_time must survive a refresh
                unset($row['howid'], $row['create_time']);
                $model->where($where)->save($row);
            }
        } else {
            $model->addAll($saveData);
        }

        // Only log when the model actually reported something
        $error = $model->getError();
        if ($error) {
            Log::write($error);
        }
    }


    /**
     * Download the Baidu top-10 feed and extract the keyword text of each row.
     *
     * The first `<table>...</table>` fragment of the response is parsed as
     * GBK-declared XML and the text of `td > a` is collected for every `tr`
     * under `tbody`.
     *
     * @return array List of keyword strings; empty when the feed cannot be
     *               downloaded or parsed.
     */
    private function getBaiduHotKeyWord()
    {
        // Always return an array, even when nothing could be extracted
        $keyArray = [];

        $templateRss = file_get_contents('http://top.baidu.com/rss_xml.php?p=top10');
        if ($templateRss === false) {
            return $keyArray;
        }

        if (preg_match('/<table>(.*)<\/table>/is', $templateRss, $_description)) {
            $templateRss = $_description[0];
            // Bare ampersands are not valid XML, so escape them before parsing
            $templateRss = str_replace("&", "&amp;", $templateRss);
        }
        $templateRss = "<?xml version=\"1.0\" encoding=\"GBK\"?>" . $templateRss;

        $xml = simplexml_load_string($templateRss);
        if ($xml === false || !isset($xml->tbody->tr)) {
            return $keyArray;
        }

        foreach ($xml->tbody->tr as $temp) {
            if (!empty($temp->td->a)) {
                $keyArray[] = trim((string) $temp->td->a);
            }
        }
        return $keyArray;
    }



    /**
     * Fetch the body of a URL with cURL.
     *
     * @param string $url Address to download.
     * @return string|false Response body, or false when the transfer failed.
     */
    private function getUrlContent($url){
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1 )");
        // Callers parse the result as HTML, so keep the response headers out of it
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // Bound the request so an unresponsive site cannot hang the job
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
        curl_setopt($ch, CURLOPT_TIMEOUT, 15);

        $output = curl_exec($ch);
        if ($output === false) {
            Log::write('getUrlContent failed for ' . $url . ': ' . curl_error($ch));
        }
        curl_close($ch);
        return $output;
    }


    /**
     * Turn the first bare `<table>...</table>` in an HTML string into a list
     * of rows, each row being a list of cell texts.
     *
     * Only a `<table>` tag without attributes is matched. All markup inside
     * the cells is stripped and ordinary spaces are removed; other whitespace
     * such as newlines is kept, apart from doubled newlines inside a row.
     *
     * @param string $html HTML document.
     * @return array List of rows (each a list of strings); empty when no
     *               table is found.
     */
    private function getTable($html) {
        $td_array = [];
        if (!is_string($html) || !preg_match("/<table>[\s\S]*?<\/table>/i", $html, $match)) {
            return $td_array;
        }
        $table = $match[0];

        // Drop the opening tags, then turn the closing tags into split markers
        $table = preg_replace("'<table[^>]*?>'si", "", $table);
        $table = preg_replace("'<tr[^>]*?>'si", "", $table);
        $table = preg_replace("'<td[^>]*?>'si", "", $table);
        $table = str_replace(["</tr>", "</td>"], ["{tr}", "{td}"], $table);

        // Strip every remaining HTML tag
        $table = preg_replace("'<[/!]*?[^<>]*?>'si", "", $table);

        // The former "'([rn])[s]+'" replacement was a whitespace pattern that had
        // lost its backslashes: it deleted letters ("ns", "rs", ...) from the
        // cell text and never touched whitespace, so it is removed.
        $table = str_replace(" ", "", $table);

        $rows = explode('{tr}', $table);
        array_pop($rows); // whatever follows the last row marker is not a row

        foreach ($rows as $tr) {
            $tr = str_replace("\n\n", "", $tr);
            $td = explode('{td}', $tr);
            array_pop($td); // whatever follows the last cell marker is not a cell
            $td_array[] = $td;
        }
        return $td_array;
    }

    /**
     * Turn the first `<ul class="pub-list">...</ul>` in an HTML string into a
     * list of rows, one per `<li>`, each row being the list of its `<span>`
     * texts.
     *
     * All markup inside the spans is stripped and ordinary spaces are
     * removed; other whitespace such as newlines is kept, apart from doubled
     * newlines inside a row.
     *
     * @param string $html HTML document.
     * @return array List of rows (each a list of strings); empty when the
     *               list is not found.
     */
    private function getSouGouHtml($html){
        $td_array = [];
        if (!is_string($html) || !preg_match("/<ul class=\"pub-list\">[\s\S]*?<\/ul>/i", $html, $match)) {
            return $td_array;
        }
        $table = $match[0];

        // Drop the opening tags, then turn the closing tags into split markers
        $table = preg_replace("'<ul[^>]*?>'si", "", $table);
        $table = preg_replace("'<li[^>]*?>'si", "", $table);
        $table = preg_replace("'<span[^>]*?>'si", "", $table);
        $table = str_replace(["</li>", "</span>"], ["{tr}", "{td}"], $table);

        // Strip every remaining HTML tag
        $table = preg_replace("'<[/!]*?[^<>]*?>'si", "", $table);

        // The former "'([rn])[s]+'" replacement was a whitespace pattern that had
        // lost its backslashes: it deleted letters ("ns", "rs", ...) from the
        // text and never touched whitespace, so it is removed.
        $table = str_replace(" ", "", $table);

        $rows = explode('{tr}', $table);
        array_pop($rows); // whatever follows the last row marker is not a row

        foreach ($rows as $tr) {
            $tr = str_replace("\n\n", "", $tr);
            $td = explode('{td}', $tr);
            array_pop($td); // whatever follows the last cell marker is not a cell
            $td_array[] = $td;
        }
        return $td_array;
    }

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值