PHP 常用的几种采集方法

/**
 * Fetch a URL with cURL and return the raw response.
 *
 * CURLOPT_HEADER is enabled, so the returned string contains the response
 * header block followed by the body.
 *
 * @param string $url Address to request.
 * @return string|null The raw response, or null when the transfer failed,
 *                     the response was empty, or the HTTP status was 404.
 */
function curl_get($url) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 1);

    $result = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);

    // Release the handle before returning. The earlier version returned on
    // the success path first and therefore never reached curl_close().
    curl_close($ch);

    if ($code != '404' && $result) {
        return $result;
    }

    return null;
}
/**
 * Extract the href targets of all anchor tags found in a page.
 *
 * The hrefs are returned exactly as written in the markup (relative links
 * are not resolved).
 *
 * @param string $spider_page_result HTML text to scan.
 * @param string $base_url           Currently unused; kept so existing
 *                                   callers continue to work.
 * @return array|null List of href values in document order, or null when no
 *                    anchor with an href was found.
 */
function get_page_urls($spider_page_result, $base_url) {
    // The previous pattern was truncated (it began in the middle of a
    // character class and had an unmatched ")"), so it could not compile.
    // This pattern matches <a ... href=VALUE ...> case-insensitively, with
    // VALUE optionally quoted, and captures VALUE as group 1.
    $pattern = '/<a\b[^>]*?\bhref\s*=\s*[\'"]?([^>\'"\s]*)[^>]*>/i';

    $get_url_result = preg_match_all($pattern, $spider_page_result, $out);

    if ($get_url_result) {
        return $out[1];
    }

    return null;
}
/**
 * Fetch a page while presenting the request as coming from Baiduspider.
 *
 * Sends a Baiduspider User-Agent together with X-FORWARDED-FOR / CLIENT-IP
 * headers carrying a fixed address. On failure the error message is echoed
 * and nothing is returned.
 *
 * @param string $url Address to request.
 * @return string|null Response body on success; null after echoing an error
 *                     message when the transfer failed.
 */
function _GetContent( $url ){
    $ch = curl_init();
    $ip = '220.181.108.91';  // address advertised as the Baidu spider
    $timeout = 15;           // connect timeout, in seconds

    curl_setopt($ch, CURLOPT_URL, $url);
    // 0 means no limit on the total transfer time; only the connect phase
    // is bounded (see CURLOPT_CONNECTTIMEOUT below).
    curl_setopt($ch, CURLOPT_TIMEOUT, 0);
    // Spoofed client-IP headers.
    curl_setopt($ch, CURLOPT_HTTPHEADER, array('X-FORWARDED-FOR:'.$ip.'', 'CLIENT-IP:'.$ip.''));
    // Spoofed Baiduspider User-Agent.
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);

    $content = curl_exec($ch);

    if ($content === false) {
        // Read the error details before the handle is released.
        // Error 28 is cURL's "operation timed out" code.
        if (curl_errno($ch) == 28) {
            $error = '访问目标地址超时';
        } else {
            $error = curl_error($ch);
        }
        curl_close($ch);
        echo $error;
        return null;
    }

    // The handle was previously never closed on either path.
    curl_close($ch);
    return $content;
}
	
/**
 * Send a form POST to $url and return the (possibly rewritten) response body.
 *
 * The upstream Content-Type is forwarded to the client via header(). Unless
 * nochange_url() reports a match for that content type against the global
 * $nochange setting, the body is passed through change_link() and regstr()
 * (all three helpers are defined elsewhere).
 *
 * @param string $url  Target address; also sent as Referer and User-Agent.
 * @param array  $data Form fields to submit.
 * @return string|false Response body, or false when the transfer failed
 *                      (as returned by curl_exec()).
 */
function post($url, $data = array())
{
	global $nochange;

	// Build an application/x-www-form-urlencoded body. Keys and values are
	// URL-encoded so that "&", "=", spaces and nested arrays survive; the
	// previous hand-rolled concatenation sent them raw.
	$body = is_array($data) ? http_build_query($data, '', '&') : (string) $data;

	$ch = curl_init();
	// NOTE(review): TLS peer/host verification is disabled here, which
	// permits man-in-the-middle interception; confirm this is intentional.
	curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
	curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
	curl_setopt($ch, CURLOPT_URL, $url);
	curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
	curl_setopt($ch, CURLOPT_REFERER, $url);
	// NOTE(review): the User-Agent is set to the URL itself; this looks
	// unintended but is kept as-is because the desired value is not known.
	curl_setopt($ch, CURLOPT_USERAGENT, $url);
	curl_setopt($ch, CURLOPT_HEADER, false);
	curl_setopt($ch, CURLOPT_HTTPHEADER, array('CLIENT-IP:' . get_rand_ip(), 'X-FORWARDED-FOR:' . get_rand_ip()));
	curl_setopt($ch, CURLOPT_TIMEOUT, 30);
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
	curl_setopt($ch, CURLOPT_POST, 1);
	curl_setopt($ch, CURLOPT_POSTFIELDS, $body);

	$rs = curl_exec($ch);
	$info = curl_getinfo($ch);
	curl_close($ch);

	$content_type = $info['content_type'];
	header('content-type:' . $content_type);

	// Use the declared global $nochange (as get() does); the previous code
	// passed an undefined $nochange_url variable here.
	if (nochange_url($content_type, $nochange) < 1) {
		$rs = change_link($rs);
		$rs = regstr($rs);
	}

	return $rs;
}

/**
 * Fetch $url with a search-engine crawler User-Agent, using a cache.
 *
 * A cache hit (cache('r', $url)) is served directly. Otherwise the page is
 * fetched with cURL when it is available and $user_curl == "1", or with
 * file_get_contents() as a fallback. Unless nochange_url() reports a match
 * for the content type (against $nochange or the image extension list), the
 * body is passed through change_link() and regstr(). Responses that do not
 * match the image list are written back to the cache. The Content-Type is
 * forwarded to the client via header() in every case.
 *
 * Globals read: $user_curl, $user_agent (engine selector: baidu, google,
 * yahoo or bing), $user_client (mobile, pc or auto), $nochange.
 * cache(), is_mobile(), get_rand_ip(), nochange_url(), change_link() and
 * regstr() are defined elsewhere.
 *
 * @param string $url Address to fetch.
 * @return mixed Response body (possibly rewritten), or the cached body.
 */
function get($url)
{
	global $user_curl, $user_agent, $user_client, $nochange;

	$cache = cache('r', $url);
	if ($cache) {
		// Read only the two fields this function stores, instead of
		// extract()-ing arbitrary cached keys into the local scope.
		$content_type = isset($cache['content_type']) ? $cache['content_type'] : null;
		$rs = isset($cache['rs']) ? $cache['rs'] : null;
		header('content-type:' . $content_type);
		return $rs;
	}

	if (function_exists('curl_init') && $user_curl == "1") {
		// Crawler User-Agent strings per engine. An array entry offers a
		// mobile and a desktop variant; a string entry has a single variant.
		$spider_agents = array(
			'baidu' => array(
				'mobile' => 'Mozilla/5.0 (Linux;u;Android 4.2.2;zh-cn;) AppleWebKit/534.46 (KHTML,like Gecko) Version/5.1 Mobile Safari/10600.6.3 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
				'pc' => 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
			),
			'google' => array(
				'mobile' => 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
				'pc' => 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
			),
			'yahoo' => 'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)',
			'bing' => array(
				'mobile' => 'Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; bingbot/2.0; http://www.bing.com/bingbot.htm)',
				'pc' => 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)',
			),
		);

		// Resolve into a local variable. Writing the resolved string back
		// into the global $user_agent (as was done before) destroyed the
		// engine selector, so later calls fell back to the default.
		$agent_string = $spider_agents['baidu']['pc']; // default for unknown selectors
		if (is_string($user_agent) && isset($spider_agents[$user_agent])) {
			$entry = $spider_agents[$user_agent];
			if (is_array($entry)) {
				// Mobile variant when forced, or when auto-detecting and the
				// visitor is on a mobile device; desktop variant otherwise.
				$wants_mobile = $user_client == "mobile" || (is_mobile() && $user_client == "auto");
				$agent_string = $wants_mobile ? $entry['mobile'] : $entry['pc'];
			} else {
				$agent_string = $entry;
			}
		}

		$ch = curl_init();
		// NOTE(review): TLS peer/host verification is disabled here, which
		// permits man-in-the-middle interception; confirm this is intentional.
		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
		curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
		curl_setopt($ch, CURLOPT_URL, $url);
		curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
		curl_setopt($ch, CURLOPT_REFERER, $url);
		curl_setopt($ch, CURLOPT_USERAGENT, $agent_string);
		curl_setopt($ch, CURLOPT_HEADER, false);
		curl_setopt($ch, CURLOPT_HTTPHEADER, array('CLIENT-IP:' . get_rand_ip(), 'X-FORWARDED-FOR:' . get_rand_ip()));
		curl_setopt($ch, CURLOPT_TIMEOUT, 30);
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
		$rs = curl_exec($ch);
		$info = curl_getinfo($ch);
		curl_close($ch);
		$content_type = $info['content_type'];
	} else {
		$rs = file_get_contents($url);

		// Previously $content_type was left undefined on this path. Recover
		// it from the response headers PHP exposes after an HTTP stream
		// read; the last Content-Type header wins (relevant after redirects).
		$content_type = null;
		if (isset($http_response_header) && is_array($http_response_header)) {
			foreach ($http_response_header as $header_line) {
				if (stripos($header_line, 'content-type:') === 0) {
					$content_type = trim(substr($header_line, strlen('content-type:')));
				}
			}
		}
	}

	$not_image = nochange_url($content_type, "jpg|jpeg|gif|png|bmp") < 1;

	if ($not_image && nochange_url($content_type, $nochange) < 1) {
		$rs = change_link($rs);
		$rs = regstr($rs);
	}

	if ($not_image) {
		$cache = array('content_type' => $content_type, 'rs' => $rs);
		cache('w', $url, $cache);
	}

	header('content-type:' . $content_type);
	return $rs;
}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值