PHP的curl_init采集网页数据实例教程

最新推荐文章于 2021-04-09 05:35:00 发布

打杂人

最新推荐文章于 2021-04-09 05:35:00 发布

阅读量2.9k

点赞数

PHP+MYSQL 专栏收录该内容

208 篇文章 0 订阅

订阅专栏

 
<?php
 
// Step 1: create the cURL handle.
$curl = curl_init();

// Step 2: configure the request in a single call.
//   CURLOPT_URL            - the page to download
//   CURLOPT_RETURNTRANSFER - make curl_exec() return the body instead of printing it
//   CURLOPT_HEADER         - leave the response headers out of the returned data
curl_setopt_array($curl, array(
    CURLOPT_URL            => "http://blog.snsgou.com",
    CURLOPT_RETURNTRANSFER => 1,
    CURLOPT_HEADER         => 0,
));

// Step 3: run the request; $output receives the HTML document
// (curl_exec() yields false when the transfer fails).
$output = curl_exec($curl);

// Step 4: release the cURL handle.
curl_close($curl);
 
?>

curl_setopt 中的 CURLOPT_URL、CURLOPT_RETURNTRANSFER 等参数，请参考 PHP 文档手册，里面有详细说明！
现在已经得到 $output 的内容，接下来使用正则表达式匹配出你需要的内容即可。

 
<?php
 
/**
 * Minimal page scraper.
 *
 * Downloads a page, cuts out the region between two marker strings and
 * extracts the anchors (URL + link text) found in a piece of HTML.
 */
class Gather {

    /** @var string Public scratch string; none of the methods in this class touch it. */
    public $pagestring = '';

    /** @var mixed Whatever the global $db held at construction time; unused by the methods in this class. */
    private $db;

    /**
     * Stores the global $db variable on the instance (null when no such global exists).
     */
    public function __construct() {
        global $db;
        $this->db = $db;
    }

    /**
     * Downloads the document at $url.
     *
     * Uses the cURL extension when it is loaded (following redirects, body
     * only) and falls back to file_get_contents() otherwise.
     *
     * @param string $url Address to fetch; surrounding whitespace is ignored.
     * @return string Trimmed response body, or '' when the URL is empty or the
     *                transfer fails.
     */
    public function getUrlFile($url) {
        $url = trim((string) $url);

        // Nothing to fetch: avoid handing an empty path to cURL / file_get_contents().
        if ($url === '') {
            return '';
        }

        if (extension_loaded('curl')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            $content = curl_exec($ch);
            curl_close($ch);
        } else {
            $content = file_get_contents($url);
        }

        // Both back ends signal failure with false; report that as an empty
        // document instead of pushing a boolean through trim().
        if (!is_string($content)) {
            return '';
        }

        return trim($content);
    }

    /**
     * Extracts every <a ...>text</a> anchor from a piece of HTML.
     *
     * @param string $code HTML fragment to scan.
     * @return array Two parallel lists: 'name' holds the link texts and 'url'
     *               the matching href values. Both are empty when nothing
     *               matches or the regular expression engine reports an error.
     */
    public function get_all_url($code) {
        $arr = array();
        $found = preg_match_all(
            '/<a.+?href=["|\']?([^>"\' ]+)["|\']?\s*[^>]*>([^>]+)<\/a>/is',
            (string) $code,
            $arr
        );

        // On a PCRE failure the capture groups are not populated.
        if ($found === false || !isset($arr[1], $arr[2])) {
            return array('name' => array(), 'url' => array());
        }

        return array('name' => $arr[2], 'url' => $arr[1]);
    }

    /**
     * Returns the part of $str that lies between two marker strings.
     *
     * The result starts right after the first occurrence of $start and runs
     * up to the next occurrence of either marker (or to the end of the
     * string when neither follows).
     *
     * @param string $str   Text to cut from.
     * @param string $start Opening marker; surrounding whitespace is ignored.
     * @param string $end   Closing marker; surrounding whitespace is ignored.
     * @return string $str unchanged when either marker is empty, '' when
     *                $start does not occur in $str, otherwise the region
     *                described above.
     */
    public function get_sub_content($str, $start, $end) {
        $start = trim((string) $start);
        $end = trim((string) $end);

        if ($start === '' || $end === '') {
            return $str;
        }

        $afterStart = explode($start, (string) $str);

        // Start marker not present: there is no region to return.
        if (!isset($afterStart[1])) {
            return '';
        }

        $beforeEnd = explode($end, $afterStart[1]);

        return $beforeEnd[0];
    }

    /**
     * Debug helper: prints var_dump($var) inside a styled <div>/<pre> box.
     *
     * @param mixed $var Value to dump.
     * @return void
     */
    public function vd($var) {
        echo "<div style=\"border:1px solid #ddd;background:#F7F7F7;padding:5px 10px;\">\r\n";
        echo "<pre style=\"font-family:Arial,Vrinda;font-size:14px;\">\r\n";
        var_dump($var);
        echo "\r\n</pre>\r\n";
        echo "</div>";
    }

}
 
 
 
?>

 
<?php
 
// Directory of this script, normalised to forward slashes.
define('ROOT_PATH', str_replace('\\', '/', dirname(__FILE__)));

// The Gather class has to be defined before it is instantiated below; enable
// this include when the class lives in its own file next to this script.
//include ROOT_PATH."/Gather.class.php";

// Remove the execution time limit: remote downloads may be slow.
set_time_limit(0);

// NOTE(review): presumably chosen to match the encoding of the scraped pages - confirm.
header("Content-type: text/html; charset=gb2312");

// ---- Stage 1: list page -> article links ---------------------------------

// List page to scrape.
$url = 'http://news.163.com/special/00013C0O/guojibjtj_03.html';

// Create the scraper.
$gather = new Gather();

// Download the list page.
$html = $gather->getUrlFile($url);

// Markers delimiting the region of the page that holds the article list.
$start = '<div class="bd clearfix">';
$end = '<div class="pages-1 mt25">';

// Collect the URL and title of every link inside that region.
$code = $gather->get_sub_content($html, $start, $end);
$newsAry = $gather->get_all_url($code);

// Show what was found.
$gather->vd($newsAry);

// ---- Stage 2: first article -> body text ---------------------------------

// Only continue when stage 1 actually produced a link; otherwise the lookup
// below would read an undefined offset and try to download an empty URL.
if (isset($newsAry['url'][0]) && $newsAry['url'][0] !== '') {
    $tarGetUrl = $newsAry['url'][0];

    // Download the article page.
    $html = $gather->getUrlFile($tarGetUrl);

    // Markers delimiting the article body.
    $start = '<div id="endText">';
    $end = '<span class="cDGray right" style="white-space:nowrap;">';

    // Cut the article body out of the page.
    $code = $gather->get_sub_content($html, $start, $end);

    // Markup embedded in the body that should not be kept (an iframe and a
    // trailing icon link).
    $killHtml = '<iframe src="http://g.163.com/r?site=netease&affiliate=news&cat=article&type=tvscreen200x300&location=1" width="200" height="300" frameborder="no" border="0" marginwidth="0" marginheight="0" scrolling="no"></iframe>';
    $killHtml2 = '<a href="http://news.163.com/"><img src="http://img1.cache.netease.com/cnews/img07/end_i.gif" alt="netease" width="12" height="11" border="0" class="icon" /></a>';

    // Strip both fragments; str_replace() applies the searches in order.
    $code = str_replace(array($killHtml, $killHtml2), "", $code);

    $gather->vd($code);
} else {
    $gather->vd('No article link was found in the list region; nothing to fetch.');
}
 
?>