一个用于获取网页内容的 cURL 类(采集专用)

一般用于采集,防止 IP 被封禁之后程序仍在继续运行。



<?php

    // Load the cURL wrapper class. require_once is used because the script
    // cannot continue without it, and it must not be declared twice.
    require_once "Curl.class.php";

    // Create the HTTP client.
    $curl = new Curl();

    // Fetch the page; get() returns the response body, or false on failure
    // (for example when the HTTP status code is not 200).
    $con = $curl->get("http://www.lampbrother.net");

    if ( $con === false || $con === null )
    {
        // Report the failure explicitly instead of silently printing nothing.
        trigger_error('Curl::get() failed for http://www.lampbrother.net', E_USER_WARNING);
    }
    else
    {
        echo $con;
    }

    // Exercise: scrape the instructors page.

 ?>
<?php
 
/**
 * Wget Curl driver core.
 *
 * A small wrapper around the PHP cURL extension that fetches one URL or a
 * batch of URLs (GET or POST) through the curl_multi API. Per-request
 * settings (options, headers, IP override, cookies, referer, POST data) are
 * cleared after every get()/post() call; the user agent is kept.
 *
 * @author     jonwang(jonwang@myqee.com)
 * @category   MyQEE
 * @package    System
 * @subpackage Core
 * @copyright  Copyright (c) 2008-2012 myqee.com
 * @license    http://www.myqee.com/license.html
 */
class Curl
{

    /**
     * Details of the most recent request(s).
     *
     * After a multi-URL call: array keyed by URL, each entry holding
     * 'code', 'data', 'header' and 'time'. After a single-URL call: that one
     * entry itself.
     *
     * @var array
     */
    protected $http_data = array();

    /**
     * User-Agent string; when empty the incoming request's agent is reused.
     *
     * @var string
     */
    protected $agent;

    /**
     * Cookies to send: an array of name => value pairs or a ready-made string.
     *
     * @var array|string
     */
    protected $cookies;

    /**
     * Referer header value.
     *
     * @var string
     */
    protected $referer;

    /**
     * IP address to connect to instead of resolving the URL's host name.
     *
     * @var string
     */
    protected $ip;

    /**
     * Extra request header lines ("Name: value").
     *
     * @var array
     */
    protected $header = array();

    /**
     * Extra cURL options, keyed by CURLOPT_* constant.
     *
     * @var array
     */
    protected $_option = array();

    /**
     * POST bodies keyed by the URL they belong to.
     *
     * @var array
     */
    protected $_post_data = array();

    /**
     * Maximum number of transfers running at the same time; 0 means unlimited.
     *
     * @var int
     */
    protected $multi_exec_num = 100;

    const ERROR_HOST = '请求的URL错误';

    const ERROR_GET = 'GET请求错误';

    const ERROR_POST = 'POST请求错误';

    public function __construct()
    {

    }

    /**
     * Set the User-Agent string sent with requests.
     *
     * @param string $agent
     * @return Curl
     */
    public function set_agent($agent)
    {
        $this->agent = $agent;
        return $this;
    }

    /**
     * Set the cookies sent with the next request.
     *
     * @param array|string $cookies name => value pairs, or a raw cookie string
     * @return Curl
     */
    public function set_cookies($cookies)
    {
        $this->cookies = $cookies;
        return $this;
    }

    /**
     * Set the Referer sent with the next request.
     *
     * @param string $referer
     * @return Curl
     */
    public function set_referer($referer)
    {
        $this->referer = $referer;
        return $this;
    }

    /**
     * Set the IP address to connect to for the next request.
     *
     * The host part of each URL is replaced by this IP and the original host
     * is sent in a Host header.
     *
     * @param string $ip
     * @return Curl
     */
    public function set_ip($ip)
    {
        $this->ip = $ip;
        return $this;
    }

    /**
     * Set a cURL option for the next request.
     *
     * CURLOPT_HTTPHEADER values are appended to the header list instead of
     * replacing it; every other option is stored as given.
     *
     * @param int   $key   CURLOPT_* constant
     * @param mixed $value option value (a header line or list of header lines for CURLOPT_HTTPHEADER)
     * @return Curl
     */
    public function set_option($key, $value)
    {
        if ( $key===CURLOPT_HTTPHEADER )
        {
            # Cast so a single header string is accepted as well as a list
            $this->header = array_merge($this->header, (array)$value);
        }
        else
        {
            $this->_option[$key] = $value;
        }
        return $this;
    }

    /**
     * Set the maximum number of transfers that run concurrently.
     *
     * @param int $num 0 means unlimited
     * @return Curl
     */
    public function set_multi_max_num($num=0)
    {
        $this->multi_exec_num = (int)$num;
        return $this;
    }

    /**
     * Submit data with POST; supports several URLs at once.
     *
     *   $urls = array
     *   (
     *     'http://www.baidu.com/',
     *     'http://mytest.com/url',
     *     'http://www.abc.com/post',
     *   );
     *   $data = array
     *   (
     *      array('k1'=>'v1','k2'=>'v2'),
     *      array('a'=>1,'b'=>2),
     *      'aa=1&bb=3&cc=3',
     *   );
     *   $curl->post($urls,$data);
     *
     * With several URLs, $vars is matched to $url by array key and array
     * bodies are encoded with http_build_query(). With a single URL, $vars is
     * handed to cURL unchanged.
     *
     * @param string|array $url
     * @param string|array $vars
     * @param int $timeout timeout in seconds, 60 by default
     * @return string|array|false body (single URL) or array of bodies keyed by URL; false on failure
     */
    public function post($url, $vars, $timeout = 60)
    {
        # POST mode
        $this->set_option( CURLOPT_HTTPHEADER, array('Expect:') );
        $this->set_option( CURLOPT_POST, true );

        if ( is_array($url) )
        {
            $myvars = array();
            # A separate loop variable is required here: reusing $url would
            # overwrite the URL list and only the last URL would be requested.
            foreach ( $url as $k => $one_url )
            {
                if ( isset($vars[$k]) )
                {
                    if ( is_array($vars[$k]) )
                    {
                        $myvars[$one_url] = http_build_query($vars[$k]);
                    }
                    else
                    {
                        $myvars[$one_url] = $vars[$k];
                    }
                }
            }
        }
        else
        {
            $myvars = array($url => $vars);
        }
        $this->_post_data = $myvars;

        return $this->get($url, $timeout);
    }

    /**
     * Fetch data with GET; supports several URLs at once.
     *
     * Per-request settings are cleared once the transfer has finished.
     *
     * @param string|array $url
     * @param int $timeout timeout in seconds
     * @return string|array|false body (single URL) or array of bodies keyed by URL; false on failure
     */
    public function get($url, $timeout = 10)
    {
        $getone = !is_array($url);
        $urls   = $getone ? array($url) : $url;

        # Start from a clean slate so records of earlier calls do not get
        # mixed with the records of this one.
        $this->http_data = array();

        $data = $this->request_urls($urls, $timeout);

        $this->clear_set();

        if ( !$getone )
        {
            return $data;
        }

        $this->http_data = isset($this->http_data[$url]) ? $this->http_data[$url] : null;

        # A transfer that never completed is reported as false, as documented
        return isset($data[$url]) ? $data[$url] : false;
    }

    /**
     * Create and configure a cURL handle for one URL.
     *
     * @param string $url URL; a value without "://" is resolved against the host of $_SERVER['SCRIPT_URI']
     * @param int $timeout timeout in seconds
     * @return resource cURL handle as returned by curl_init()
     */
    public function _create($url,$timeout)
    {
        if ( false===strpos($url, '://') )
        {
            # Relative URL: prefix scheme and host of the running script when known
            $base = '';
            if ( isset($_SERVER["SCRIPT_URI"]) && preg_match('#^(http(?:s)?\://[^/]+/)#', $_SERVER["SCRIPT_URI"] , $m) )
            {
                $base = $m[1];
            }
            $the_url = $base.ltrim($url,'/');
        }
        else
        {
            $the_url = $url;
        }

        # Work on a copy so the Host header added below belongs to this
        # handle only and does not pile up across URLs.
        $header_lines = $this->header;

        if ($this->ip)
        {
            # An IP is set: connect to it and pass the real host in a Host header
            if ( preg_match('#^(http(?:s)?)\://([^/\:]+)(\:[0-9]+)?/#', $the_url.'/',$m) )
            {
                $header_lines[] = 'Host: '.$m[2];
                # The port group is absent from $m when the URL has no port
                $port = isset($m[3]) ? $m[3] : '';
                $the_url = $m[1].'://'.$this->ip.$port.'/'.(string)substr($the_url,strlen($m[0]));
            }
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $the_url);
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);

        if ( preg_match('#^https://#i', $the_url) )
        {
            # NOTE(review): peer and host verification are switched off for
            # every https URL, which allows man-in-the-middle attacks. Kept
            # for backward compatibility; callers can re-enable verification
            # through set_option().
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        }

        if ( $this->cookies )
        {
            # http_build_query() only accepts arrays; a string is sent as given
            $cookie = is_array($this->cookies) ? http_build_query($this->cookies, '', ';') : (string)$this->cookies;
            curl_setopt($ch, CURLOPT_COOKIE, $cookie);
        }

        if ( $this->referer )
        {
            curl_setopt($ch, CURLOPT_REFERER, $this->referer);
        }

        if ( $this->agent )
        {
            curl_setopt($ch, CURLOPT_USERAGENT, $this->agent);
        }
        elseif ( array_key_exists('HTTP_USER_AGENT', $_SERVER) )
        {
            # Fall back to the user agent of the incoming request
            curl_setopt($ch, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']);
        }

        foreach ( $this->_option as $k => $v )
        {
            curl_setopt($ch, $k, $v);
        }

        if ( $header_lines )
        {
            $header = array();
            foreach ($header_lines as $item)
            {
                # Avoid duplicate headers: the last line given for a name wins.
                # Header names are case-insensitive, so the key is lower-cased.
                if (preg_match('#(^[^:]*):.*$#', $item,$m))
                {
                    $header[strtolower($m[1])] = $item;
                }
            }
            curl_setopt($ch, CURLOPT_HTTPHEADER, array_values($header));
        }

        # Attach POST data. It is stored under the URL the caller passed in,
        # which differs from $the_url once an IP or base URL was applied.
        if (isset($this->_post_data[$url]))
        {
            curl_setopt($ch , CURLOPT_POSTFIELDS , $this->_post_data[$url]);
        }
        elseif (isset($this->_post_data[$the_url]))
        {
            curl_setopt($ch , CURLOPT_POSTFIELDS , $this->_post_data[$the_url]);
        }

        return $ch;
    }

    /**
     * Fetch several URLs concurrently with curl_multi.
     *
     * At most $multi_exec_num transfers run at once; the remaining URLs wait
     * in a queue and are started as running transfers finish. Details of each
     * response are stored in $http_data.
     *
     * @see http://cn.php.net/manual/en/function.curl-multi-exec.php#88453
     * @param array $urls
     * @param int $timeout timeout in seconds for each transfer
     * @return array keyed by URL: body on HTTP 200, false on any other status, null if the transfer never completed
     */
    protected function request_urls($urls, $timeout = 10)
    {
        # Remove duplicates
        $urls = array_unique($urls);

        if (!$urls)return array();

        $mh = curl_multi_init();

        # Handles currently attached to the multi handle, keyed by URL
        $listener_list = array();

        # Return value
        $result = array();

        # Number of transfers started so far
        $list_num = 0;

        # URLs waiting for a free slot
        $multi_list = array();
        foreach ( $urls as $url )
        {
            if ( $this->multi_exec_num>0 && $list_num>=$this->multi_exec_num )
            {
                # Over the limit: queue the URL. Its handle is created only
                # when a slot becomes free, so no handle is left unclosed.
                $multi_list[] = $url;
            }
            else
            {
                $current = $this->_create($url, $timeout);
                curl_multi_add_handle($mh, $current);
                $listener_list[$url] = $current;
                $list_num++;
            }

            $result[$url] = null;
            $this->http_data[$url] = null;
        }
        unset($current);

        $running = null;

        # Number of finished transfers
        $done_num = 0;

        do
        {
            while ( ($execrun = curl_multi_exec($mh, $running)) == CURLM_CALL_MULTI_PERFORM );
            if ( $execrun != CURLM_OK ) break;

            # Set when a queued URL is started during this round
            $added = false;

            while ( false!==($done = curl_multi_info_read($mh)) )
            {
                $done_url = array_search($done['handle'], $listener_list, true);
                if ( false===$done_url ) continue;

                # Read the response
                $this->http_data[$done_url] = $this->get_data(curl_multi_getcontent($done['handle']), $done['handle']);

                if ( $this->http_data[$done_url]['code'] != 200 )
                {
                    $result[$done_url] = false;
                }
                else
                {
                    # Response body
                    $result[$done_url] = $this->http_data[$done_url]['data'];
                }

                # Detach from the multi handle first, then close
                curl_multi_remove_handle($mh, $done['handle']);
                curl_close($done['handle']);

                # Drop from the listener list
                unset($listener_list[$done_url]);
                $done_num++;

                # Start the next queued URL, if any
                if ( $multi_list )
                {
                    $current_url = array_shift($multi_list);
                    $current = $this->_create($current_url, $timeout);
                    curl_multi_add_handle($mh, $current);
                    $listener_list[$current_url] = $current;
                    unset($current);

                    $list_num++;
                    $added = true;
                }
            }

            if ($done_num>=$list_num)break;

            if ( !$added )
            {
                # Nothing is running and nothing was started: no further
                # result can arrive, so stop instead of looping forever.
                if ( $running<1 ) break;

                # Wait for socket activity instead of spinning the CPU;
                # -1 means select is unavailable, so sleep briefly instead.
                if ( curl_multi_select($mh, 1.0)===-1 )
                {
                    usleep(1000);
                }
            }

        } while (true);

        # Release handles that are still attached after an early exit
        foreach ( $listener_list as $left )
        {
            curl_multi_remove_handle($mh, $left);
            curl_close($left);
        }

        # Close the multi handle
        curl_multi_close($mh);

        return $result;
    }

    /**
     * Return the details recorded for the most recent request(s).
     *
     * @return array|null see $http_data
     */
    public function get_result_data()
    {
        return $this->http_data;
    }

    /**
     * Misspelled alias of get_result_data(), kept for backward compatibility.
     *
     * @return array|null
     */
    public function get_resut_data()
    {
        return $this->get_result_data();
    }

    /**
     * Split a raw response (headers followed by body) into its parts.
     *
     * @param string $data raw response as returned by curl_multi_getcontent()
     * @param resource $ch the finished cURL handle
     * @return array with keys 'code' (HTTP status), 'data' (body), 'header' (list of header lines), 'time' (total time in seconds)
     */
    protected function get_data($data, $ch)
    {
        $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);

        $result = array();
        $result['code']   = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $result['data']   = substr($data, $header_size);
        $result['header'] = explode("\r\n", substr($data, 0, $header_size));
        $result['time']   = curl_getinfo($ch, CURLINFO_TOTAL_TIME);

        return $result;
    }

    /**
     * Reset the per-request settings; the user agent is kept.
     */
    protected function clear_set()
    {
        $this->_option = array();
        $this->header = array();
        $this->ip = null;
        $this->cookies = null;
        $this->referer = null;
        $this->_post_data = array();
    }
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值