在网络爬虫抓取信息的过程中,如果抓取频率超过了网站设置的阈值,将会被禁止访问。通常,网站的反爬虫机制都是依据IP来标识爬虫的。
1. 使用代理IP,在IP被封掉之前或者封掉之后迅速换掉该IP,这种做法主要需要大量稳定的代理IP,代理IP有免费的,但是不稳定。这里的技巧是循环使用,在一个IP没有被封之前,就换掉,过一会再换回来。这样就可以使用相对较少的IP进行大量访问。讯代理首页每10分钟更新的免费代理,发现还是挺好的,于是就先把讯代理首页的免费代理先爬下来(10分钟爬一次数据缓存起来,缓存10分钟失效),再用爬下来的代理循环使用去爬其他网站的东西。
2. 使用VPN,VPN跟代理作用类似,只是技术上稍有差别,本质是一样的。
3. 使用有大规模云采集集群的软件工具,比如八爪鱼
4.抓取的时候控制访问频次,抓取后(sleep)休息下再抓
5.对于动态获取IP的路由器,重启后自动换IP
以上几种方式中,优先推荐使用代理IP,当然八爪鱼采集器也支持使用代理IP,或者使用云采集平台。
1、简单一点的可以在header伪造X-FORWARDED-FOR,并伪造referer,代码如下:
// Forge the client address through the X-Forwarded-For and Client-IP headers.
// The placeholder must be a syntactically valid IPv4 address: an octet above
// 255 (such as "333") would be discarded by a server that validates the header.
// 203.0.113.0/24 is an address block reserved for documentation examples.
curl_setopt($ch, CURLOPT_HTTPHEADER, array('X-FORWARDED-FOR:203.0.113.4', 'CLIENT-IP:203.0.113.4'));
// Forge the Referer header.
curl_setopt($ch, CURLOPT_REFERER, "http://www.baidu.com");
2、上面的方法大多数情况下能糊弄过去,但也有网站能抓到真实IP。这时就要使用代理IP,麻烦在于你需要有一个有效的代理IP和端口号,有的还需要用户名密码,可以把收集到的可用代理整理成一个代理数据库。代码如下:
// Choose one proxy address at random from the $ips pool.
$ip = $ips[array_rand($ips)];
// Route the request through that proxy; the credentials are only needed
// when the proxy requires authentication.
curl_setopt_array($ch, array(
    CURLOPT_PROXY        => $ip,
    CURLOPT_PROXYUSERPWD => 'user:pass',
));
另外还有一种情况,就是用浏览器可以访问,用curl就是不行,发现对方检查了useragent,如果没有就认为是抓取等非法来源,那么我们就自己在header加上useragent,代码如下:
// Send a browser-like User-Agent (Chrome 20 on Windows NT 6.1) with the request.
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11");
测试伪造信息info.php
<?php
/**
 * Report the address the current request claims to come from.
 *
 * Looks at the Client-IP header, then the X-Forwarded-For header, then
 * REMOTE_ADDR, and returns the first one that is non-empty. The header
 * values are returned verbatim, without validation.
 *
 * @return string The first non-empty candidate, or "无法获取IP" when none is set.
 */
function getClientIp()
{
    // Lookup order matters: request headers take precedence over REMOTE_ADDR.
    $candidates = array("HTTP_CLIENT_IP", "HTTP_X_FORWARDED_FOR", "REMOTE_ADDR");
    foreach ($candidates as $key) {
        if (!empty($_SERVER[$key])) {
            return $_SERVER[$key];
        }
    }
    return "无法获取IP";
}
// Print the detected client IP. The value may come straight from request
// headers, so it is HTML-escaped before being written into the page.
echo "IP: ".htmlspecialchars(getClientIp(), ENT_QUOTES, 'UTF-8')."<br/>";
// The Referer header is optional: fall back to an empty string when it is
// absent instead of reading an undefined array index.
$referer = isset($_SERVER["HTTP_REFERER"]) ? $_SERVER["HTTP_REFERER"] : '';
echo "referer: ".htmlspecialchars($referer, ENT_QUOTES, 'UTF-8');
调用fake.php,请求info.php
<?php
// Request info.php with forged Client-IP / X-Forwarded-For and Referer headers.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, "http://localhost/info.php");
curl_setopt($ch, CURLOPT_HTTPHEADER, array('X-FORWARDED-FOR:8.8.8.8', 'CLIENT-IP:8.8.8.8')); // forged IP
curl_setopt($ch, CURLOPT_REFERER, "http://www.gosoa.com.cn/"); // forged referer (stray trailing space removed)
curl_setopt($ch, CURLOPT_HEADER, 1); // include the response headers in the output
// CURLOPT_RETURNTRANSFER is not set, so curl_exec() prints the response
// itself and returns only a success flag.
$out = curl_exec($ch);
if ($out === false) {
    // Report the failure instead of ending silently with no output.
    echo "Error: ".curl_error($ch);
}
curl_close($ch);
没有负载均衡(反向代理)时,使用REMOTE_ADDR获取真实IP;REMOTE_ADDR无法通过请求头伪造,只能通过代理IP来改变。有负载均衡时,使用HTTP_X_FORWARDED_FOR获取IP。
完整例子
<?php
/**
 * Fetch a URL while presenting a randomized fake client IP and User-Agent.
 *
 * The forged headers come from randFakeIP() and the User-Agent from
 * randFakeUserAgent(); the Referer is fixed to http://www.baidu.com.
 * Redirects are followed and the transfer is limited to 300 seconds.
 *
 * @param string $url Address to request.
 * @return string|false The response body, or false on failure (the cURL
 *                      error message is echoed in that case).
 */
function task($url)
{
    $curl = curl_init();
    curl_setopt_array($curl, [
        CURLOPT_URL            => $url,
        CURLOPT_HTTPHEADER     => randFakeIP(),         // forged Client-IP / X-Forwarded-For
        CURLOPT_USERAGENT      => randFakeUserAgent(),  // forged browser identity
        CURLOPT_REFERER        => "http://www.baidu.com", // forged referring page
        CURLOPT_FOLLOWLOCATION => 1,
        // NOTE(review): certificate verification is disabled, as in the
        // original; consider enabling it unless the targets are known to
        // have broken certificates.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => 2,
        CURLOPT_TIMEOUT        => 300, // upper bound so a stalled transfer cannot hang forever
        CURLOPT_HEADER         => 0,   // do NOT include response headers in the result
        CURLOPT_RETURNTRANSFER => 1,   // return the body as a string instead of printing it
    ]);

    $tmpInfo = curl_exec($curl);
    if (curl_errno($curl)) {
        echo "Error: ".curl_error($curl);
        $tmpInfo = false;
    }
    // Release the handle on both the success and the error path.
    curl_close($curl);

    return $tmpInfo;
}
/**
 * Build forged Client-IP and X-Forwarded-For header lines.
 *
 * A random address is drawn from one of the IPv4 ranges below (described by
 * the original author as domestic, i.e. Chinese, address space) and the same
 * address is used for both headers.
 *
 * @return string[] Two header lines: "CLIENT-IP:<ip>" and "X-FORWARDED-FOR:<ip>".
 */
function randFakeIP()
{
    // Inclusive [start, end] bounds as signed 32-bit integers; addresses
    // above 127.255.255.255 therefore appear as negative numbers.
    $ranges = [
        [607649792, 608174079],     // 36.56.0.0   - 36.63.255.255
        [1038614528, 1039007743],   // 61.232.0.0  - 61.237.255.255
        [1783627776, 1784676351],   // 106.80.0.0  - 106.95.255.255
        [2035023872, 2035154943],   // 121.76.0.0  - 121.77.255.255
        [2078801920, 2079064063],   // 123.232.0.0 - 123.235.255.255
        [-1950089216, -1948778497], // 139.196.0.0 - 139.215.255.255
        [-1425539072, -1425014785], // 171.8.0.0   - 171.15.255.255
        [-1236271104, -1235419137], // 182.80.0.0  - 182.92.255.255
        [-770113536, -768606209],   // 210.25.0.0  - 210.47.255.255
        [-569376768, -564133889],   // 222.16.0.0  - 222.95.255.255
    ];

    // First pick a range, then an address inside it.
    list($start, $end) = $ranges[mt_rand(0, count($ranges) - 1)];
    $ip = long2ip(mt_rand($start, $end));

    return [
        'CLIENT-IP:'.$ip,
        'X-FORWARDED-FOR:'.$ip,
    ];
}
/**
 * Pick a random desktop browser User-Agent string.
 *
 * The array keys are human-readable labels only; the function returns one of
 * the values, chosen uniformly by entry (several entries share the same
 * string, so those strings are proportionally more likely).
 *
 * @return string A User-Agent header value.
 */
function randFakeUserAgent()
{
    $agentArray = [
        // Desktop (PC) User-Agent strings.
        // Relabelled: this string is Chrome 20 on Windows, not Safari on Mac.
        "Chrome 20.0 – Windows" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "safari 5.1 – Windows" => "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Firefox 38esr" => "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "IE 11" => "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        // Fixed: the closing parenthesis was missing, which made the string malformed.
        "IE 9.0" => "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "IE 8.0" => "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "IE 7.0" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "IE 6.0" => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Firefox 4.0.1 – MAC" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Firefox 4.0.1 – Windows" => "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera 11.11 – MAC" => "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera 11.11 – Windows" => "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Chrome 17.0 – MAC" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "傲游(Maxthon)" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "腾讯TT" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "世界之窗(The World) 2.x" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "世界之窗(The World) 3.x" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "360浏览器" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "搜狗浏览器 1.x" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Avant" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Green Browser" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    ];

    // array_rand() returns a random key (label); look up its User-Agent string.
    return $agentArray[array_rand($agentArray)];
}