我一直在用 PHP 写爬虫程序,使用 cURL 扩展进行抓取。每次 cURL 的初始化设置都过于冗长,所以把初始化过程封装成一个类,方便以后直接使用。
有什么问题欢迎通过邮件交流:jiangyeziwh@gmail.com。转载请注明出处,谢谢。
代码如下(文件名CurlUtil.class.php):
<?php
/**
* Created by JetBrains PhpStorm.
* User: jiangyeziwh@gmail.com
* Date: 12-7-28
* Time: 下午1:27
* 单行抓取类
* To change this template use File | Settings | File Templates.
*/
/**
 * Thin wrapper around a single cURL handle for page fetching.
 *
 * All request options are applied once, in the constructor, and are then
 * reused by every getHtml() call made on the same instance. The following
 * options are always set:
 *   - an "Accept-Language:zh-CN,zh;q=0.8" request header,
 *   - the response is returned as a string instead of being printed,
 *   - a total timeout of $_timeout seconds,
 *   - redirects are followed, up to 5 hops.
 */
class CurlUtil
{
    /**
     * The underlying cURL handle (a resource before PHP 8.0, an object from
     * PHP 8.0 on); null once the instance has been destroyed.
     */
    private $_curl;

    /** Total request timeout in seconds, passed to CURLOPT_TIMEOUT. */
    private $_timeout = 30;

    /**
     * Create the cURL handle and apply the request options.
     *
     * An optional argument is skipped when it is '', null or false. The
     * comparison is strict so that the result does not depend on how the
     * running PHP version compares numbers with the empty string.
     *
     * @param string       $refer_str      Value for the Referer header.
     * @param string       $user_agent_str Value for the User-Agent header.
     * @param string|array $post_data_str  Request body handed to CURLOPT_POSTFIELDS.
     *                                     Because it is set on the shared handle, every
     *                                     getHtml() call on this instance sends it
     *                                     (libcurl switches to POST when it is set).
     * @param string       $cookie_str     Cookie file name, resolved relative to the
     *                                     directory that contains this class file. The
     *                                     same file is used for reading and for saving
     *                                     cookies.
     * @param int|bool     $is_need_head   Passed to CURLOPT_HEADER: when truthy the
     *                                     response headers are included in the output.
     *
     * @throws RuntimeException When the cURL handle cannot be created.
     */
    public function __construct($refer_str = '', $user_agent_str = '', $post_data_str = '', $cookie_str = '', $is_need_head = 0)
    {
        $this->_curl = curl_init();
        if ($this->_curl === false) {
            // Fail once, clearly, instead of letting every curl_setopt() below
            // complain about an invalid handle.
            throw new RuntimeException('Unable to initialise a cURL handle');
        }

        if (self::isProvided($refer_str)) {
            curl_setopt($this->_curl, CURLOPT_REFERER, $refer_str);
        }
        if (self::isProvided($user_agent_str)) {
            curl_setopt($this->_curl, CURLOPT_USERAGENT, $user_agent_str);
        }
        if (self::isProvided($post_data_str)) {
            curl_setopt($this->_curl, CURLOPT_POSTFIELDS, $post_data_str);
        }
        if (self::isProvided($cookie_str)) {
            // Normalise Windows directory separators, then use one file both as
            // the cookie source and as the place cookies are written back to.
            $cookie_path = str_replace('\\', '/', dirname(__FILE__)) . '/' . $cookie_str;
            curl_setopt($this->_curl, CURLOPT_COOKIEFILE, $cookie_path);
            curl_setopt($this->_curl, CURLOPT_COOKIEJAR, $cookie_path);
        }

        curl_setopt($this->_curl, CURLOPT_HTTPHEADER, array('Accept-Language:zh-CN,zh;q=0.8'));
        curl_setopt($this->_curl, CURLOPT_HEADER, $is_need_head);
        curl_setopt($this->_curl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($this->_curl, CURLOPT_TIMEOUT, $this->_timeout);
        curl_setopt($this->_curl, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($this->_curl, CURLOPT_MAXREDIRS, 5);
    }

    /**
     * Release the cURL handle.
     *
     * curl_close() is only called while the handle is a real resource (PHP < 8.0);
     * on PHP 8.0+ it has no effect, and dropping the reference is enough. The guard
     * also keeps the destructor safe when the constructor failed to create a handle.
     */
    public function __destruct()
    {
        if (is_resource($this->_curl)) {
            curl_close($this->_curl);
        }
        $this->_curl = null;
    }

    /**
     * Fetch a URL with the options configured in the constructor.
     *
     * @param string $url Address to request.
     *
     * @return string|false The response (preceded by the headers when the instance
     *                      was built with a truthy $is_need_head), or false when the
     *                      transfer failed; see getError() for the reason.
     */
    public function getHtml($url)
    {
        curl_setopt($this->_curl, CURLOPT_URL, $url);
        return curl_exec($this->_curl);
    }

    /**
     * Describe why the last cURL operation on this handle failed.
     *
     * @return string The cURL error message, or '' when no error occurred.
     */
    public function getError()
    {
        return curl_error($this->_curl);
    }

    /**
     * Tell whether an optional constructor argument was actually supplied.
     *
     * @param mixed $value Argument to inspect.
     *
     * @return bool False for '', null and false; true for anything else.
     */
    private static function isProvided($value)
    {
        return $value !== '' && $value !== null && $value !== false;
    }
}
?>