该程序以最接近真实浏览器的方式浏览页面,可以作为一个完美的采集器使用!
1.使用http1.1协议发送请求;
2.支持keep-alive的持久连接模式;
3.支持chunked方式的分块传输;
4.自定义请求头部;
5.支持Cookie;
6.支持自动跳转,包括301/302和html或wml页面的跳转,不支持js的;
7.单线程,根据需要可以用多进程模拟多线程;
8.不解析css;
9.模拟文件缓存
<?php
/*
* randbrush.php
* by chenshuanj
* last modified time: 2011-8-15
*/
/**
 * Minimal socket-based HTTP/1.1 client that imitates how a browser loads a page.
 *
 * It sends a request over a raw socket, follows Location / meta-refresh / WML
 * redirects, keeps a per-domain cookie string, remembers URLs the server marked
 * as cacheable, and (via browse()) also requests the stylesheets, scripts and
 * images referenced by the page. Only plain "http://" URLs are supported; links
 * using any other scheme are skipped rather than requested.
 */
class randbrush
{
    /** @var resource|null Socket to the current host (or proxy); null when no connection is open. */
    public $fp;
    /** @var string Cookie-jar key of the current host: the host name without a leading "www.". */
    public $domain;
    /** @var string Host of the most recent request; used when a URL carries no host of its own. */
    public $host;
    /** @var string Directory of the most recent request path; base for resolving relative URLs. */
    public $path;
    /** @var int 1 = reuse the connection between requests (keep-alive), 0 = one connection per request. */
    public $alive = 0;
    /** @var array Extra request headers, name => value. */
    public $headers = array();
    /** @var array Cookies per domain, stored as a ready-to-send "a=1; b=2" string. */
    public $cookies = array();
    /** @var array List of "host/path?query" keys the server marked cacheable; grows without bound. */
    public $caches = array();
    /** @var int Port of the most recent request. */
    public $port = 80;
    /** @var string Proxy address; empty means connect directly. */
    public $proxy_ip = '';
    /** @var string|int Proxy port; empty means use the target port. */
    public $proxy_port = '';
    /** @var int 1 = "browser mode": also follow WML / meta-refresh redirects found in the body. */
    public $mode = 1;
    /** @var int Maximum depth of nested redirects followed for a single request. */
    public $max_redirects = 10;

    /** @var int Current redirect nesting depth. */
    private $redirects = 0;
    /** @var int Target port the open socket was created for. */
    private $connport = 0;
    /** @var array|null host/port/path of the page last loaded by browse(). */
    private $base = null;

    /**
     * Ensure a socket to $host:$port is available in $this->fp.
     *
     * In keep-alive mode an open connection to the same host and port is reused;
     * otherwise any previous socket is closed and a new one is opened (through the
     * proxy when one is configured).
     *
     * @param string $host Target host name.
     * @param int    $port Target port.
     * @return bool True when a usable socket is available.
     */
    public function opensocket($host, $port)
    {
        $port = (int)$port;
        // is_resource() is false for a closed stream, unlike a plain truthiness test.
        $reusable = $this->alive == 1
            && is_resource($this->fp)
            && $this->host == $host
            && $this->connport == $port;
        $this->host = $host;
        if ($reusable) {
            return true;
        }
        $this->closesocket();
        $target = $this->proxy_ip ? $this->proxy_ip : $host;
        $targetPort = $this->proxy_port ? (int)$this->proxy_port : $port;
        $fp = fsockopen($target, $targetPort);
        if (!$fp) {
            return false;
        }
        $this->fp = $fp;
        $this->connport = $port;
        return true;
    }

    /** Close the current socket, if any, and forget it so it is never reused or closed twice. */
    private function closesocket()
    {
        if (is_resource($this->fp)) {
            fclose($this->fp);
        }
        $this->fp = null;
    }

    /** Read exactly $length bytes from $fp, looping because one fread() on a socket may return less. */
    private function readbytes($fp, $length)
    {
        $buffer = '';
        $remaining = (int)$length;
        while ($remaining > 0 && !feof($fp)) {
            $part = fread($fp, $remaining);
            if ($part === false || $part === '') {
                break; // EOF or timeout: do not spin
            }
            $buffer .= $part;
            $remaining -= strlen($part);
        }
        return $buffer;
    }

    /**
     * Read one complete HTTP response (headers plus body) from $fp.
     *
     * The body is delimited by chunked transfer coding when announced, otherwise by
     * Content-Length. With neither present it is read until the peer closes the
     * connection, but only when $until_close is true or the response itself says
     * "Connection: close". A chunked body is returned with the chunk framing removed.
     *
     * @param resource $fp          Stream positioned at the start of a response.
     * @param bool     $until_close Treat end-of-stream as the body delimiter when no length is given.
     * @return string Raw header block (including the blank line) followed by the decoded body.
     */
    public function read_alive_data($fp, $until_close = false)
    {
        $chunked = false;
        $length = null;
        $results = '';

        // Header block: ends at the first empty line; stop at EOF instead of looping forever.
        while (($line = fgets($fp)) !== false) {
            $results .= $line;
            if ($line === "\r\n") {
                break;
            }
            if (preg_match('/^Content-Length:\s*(\d+)/i', $line, $m)) {
                $length = (int)$m[1];
            } elseif (preg_match('/^Transfer-Encoding:.*\bchunked\b/i', $line)) {
                $chunked = true;
            } elseif (preg_match('/^Connection:\s*close\b/i', $line)) {
                $until_close = true;
            }
        }

        if ($chunked) {
            while (($line = fgets($fp)) !== false) {
                // Chunk header is "<hex size>[;extensions]".
                $field = explode(';', trim($line), 2);
                $hex = trim($field[0]);
                if (!preg_match('/^[0-9a-f]+$/i', $hex)) {
                    break; // malformed framing
                }
                $size = (int)hexdec($hex);
                if ($size == 0) {
                    // Last chunk: swallow optional trailers and the final blank line so a
                    // following keep-alive response starts at its status line.
                    while (($line = fgets($fp)) !== false && $line !== "\r\n") {
                        // discard trailer line
                    }
                    break;
                }
                $results .= $this->readbytes($fp, $size);
                fgets($fp); // CRLF that terminates the chunk data
            }
        } elseif ($length !== null) {
            $results .= $this->readbytes($fp, $length);
        } elseif ($until_close) {
            while (!feof($fp)) {
                $part = fread($fp, 8192);
                if ($part === false || $part === '') {
                    break;
                }
                $results .= $part;
            }
        }
        return $results;
    }

    /**
     * Merge every Set-Cookie header found in a response header block into the cookie jar.
     *
     * A cookie is filed under its Domain attribute (leading dot removed) or, without
     * one, under the current domain. A cookie sent with an empty value is removed.
     *
     * @param string $str Response header block.
     * @return void
     */
    public function setcookie($str)
    {
        // Line-anchored so the last header (which has no trailing CRLF after the split) also matches.
        if (!preg_match_all('/^Set-Cookie:[ \t]*(.*?)\r?$/im', (string)$str, $matches)) {
            return;
        }
        foreach ($matches[1] as $header) {
            $attrs = explode(';', $header);
            $pair = trim(array_shift($attrs));
            if (strpos($pair, '=') === false) {
                continue;
            }
            list($name, $value) = explode('=', $pair, 2);
            if ($name === '') {
                continue;
            }

            $domain = (string)$this->domain;
            foreach ($attrs as $attr) {
                if (preg_match('/^\s*Domain\s*=\s*(\S+)\s*$/i', $attr, $d)) {
                    $domain = ltrim($d[1], '.');
                }
            }

            // Work on a name => value map so names are matched exactly, never as substrings.
            $jar = array();
            if (!empty($this->cookies[$domain])) {
                foreach (explode('; ', $this->cookies[$domain]) as $item) {
                    if ($item === '') {
                        continue;
                    }
                    $kv = explode('=', $item, 2);
                    $jar[$kv[0]] = isset($kv[1]) ? $kv[1] : '';
                }
            }
            if ($value === '') {
                unset($jar[$name]);
            } else {
                $jar[$name] = $value;
            }

            $items = array();
            foreach ($jar as $k => $v) {
                $items[] = $k . '=' . $v;
            }
            if ($items) {
                $this->cookies[$domain] = implode('; ', $items);
            } else {
                unset($this->cookies[$domain]);
            }
        }
    }

    /**
     * Turn a URL taken from a page or header into something sendsocket() can request.
     *
     * "&amp;" entities are decoded. Absolute "http://" URLs and root-relative paths are
     * returned unchanged, protocol-relative "//host/..." URLs get an "http:" prefix, and
     * everything else is resolved against $this->path with "." and ".." segments removed.
     *
     * @param string $url URL as found in the document.
     * @return string Absolute URL or absolute path.
     */
    public function checkurl($url)
    {
        $url = str_replace('&amp;', '&', (string)$url);
        if ($url === '') {
            return '/';
        }
        if (stripos($url, 'http://') === 0) {
            return $url;
        }
        if ($url[0] == '/') {
            return substr($url, 0, 2) == '//' ? 'http:' . $url : $url;
        }

        // Keep query/fragment aside so dots and slashes inside them are not treated as path segments.
        $tail = '';
        $cut = strcspn($url, '?#');
        if ($cut < strlen($url)) {
            $tail = substr($url, $cut);
            $url = substr($url, 0, $cut);
        }

        $parts = explode('/', $this->path . '/' . $url);
        $stack = array();
        foreach ($parts as $segment) {
            if ($segment === '' || $segment === '.') {
                continue;
            }
            if ($segment === '..') {
                array_pop($stack); // popping an empty stack clamps at the root
                continue;
            }
            $stack[] = $segment;
        }
        $resolved = '/' . implode('/', $stack);
        $last = end($parts);
        if ($stack && ($last === '' || $last === '.' || $last === '..')) {
            $resolved .= '/'; // the reference named a directory
        }
        return $resolved . $tail;
    }

    /**
     * Post-process a raw response: connection state, cookies, redirects and cache marks.
     *
     * @param string $results Raw response (headers, blank line, body).
     * @param string $script  Path plus query that was requested; used for the cache key.
     * @return mixed The body, or whatever the followed redirect finally returned.
     */
    public function checkresult($results, $script)
    {
        $data = explode("\r\n\r\n", (string)$results, 2);
        $head = $data[0];
        $body = isset($data[1]) ? $data[1] : '';

        if ($this->alive == 1 && preg_match('/^Connection:[ \t]*close[ \t]*\r?$/im', $head)) {
            $this->closesocket();
        }

        $this->setcookie($head);

        // 301/302: anchored at line start so "Content-Location:" is not mistaken for it.
        if (preg_match('/^Location:[ \t]*(\S.*?)[ \t]*\r?$/im', $head, $matches)) {
            return $this->follow($matches[1], $body);
        }

        // Remember URLs served with "Cache-Control: ... max-age=<positive number>".
        if (preg_match('/^Cache-Control:.*?max-age=(\d+)/im', $head, $matches) && (int)$matches[1] > 0) {
            $this->caches[] = $this->host . $script;
        }

        if ($this->mode == 1) {
            // Images cannot contain page-level redirects.
            if (stripos($head, 'Content-Type: image/') !== false) {
                return $body;
            }
            /* In-page redirects; when several are present the later pattern wins:
               WML:  <card ... ontimer="hello.html" ...>
                     <card ... onenterforward="hello.html" ...>
                     <onevent type="onenterforward"><go href="hello.html"/></onevent>
               HTML: <meta http-equiv="refresh" content="5;url=hello.html">
            */
            $url = '';
            $cards = $this->matchattr('card', '(?:ontimer|onenterforward)', $body);
            if ($cards) {
                $url = $cards[0];
            }
            if (preg_match('/<onevent\s+type=["\'](?:onenterforward|ontimer)["\']>\s*<go\s+href=["\']?([^>"\'\s]+?)\s*[^>]*>/isU', $body, $matches)) {
                $url = $matches[1];
            }
            if (preg_match('/<meta\s+http-equiv=["\']?refresh["\']?\s+content=["\']?\d+\s*;\s*url=([^>"\'\s]+?)\s*[^>]*>/isU', $body, $matches)) {
                $url = $matches[1];
            }
            if ($url) {
                return $this->follow($url, $body);
            }
        }
        return $body;
    }

    /** Follow a redirect target unless it is unsupported or the nesting limit is reached; then return $fallback. */
    private function follow($url, $fallback)
    {
        if (!$this->fetchable($url) || $this->redirects >= $this->max_redirects) {
            return $fallback;
        }
        $this->redirects++;
        $result = $this->sendsocket($this->checkurl($url));
        $this->redirects--;
        return $result;
    }

    /** Tell whether a link can be requested by this client: not a bare fragment and no scheme other than http. */
    private function fetchable($url)
    {
        $url = trim((string)$url);
        if ($url === '' || $url[0] == '#') {
            return false;
        }
        if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $url, $m)) {
            return strtolower($m[1]) == 'http';
        }
        return true;
    }

    /**
     * Send one HTTP request and return the processed response body.
     *
     * @param string $url     Absolute URL, or a path to request from the current host.
     * @param string $way     HTTP method.
     * @param string $Referer Referer header value; omitted when empty.
     * @param string $data    Request body, sent as form data when $way is POST.
     * @return mixed Body string; null when the URL is marked cached or the write failed;
     *               false when the URL is unusable or the connection could not be opened.
     */
    public function sendsocket($url, $way = 'GET', $Referer = '', $data = '')
    {
        $parts = parse_url((string)$url);
        if ($parts === false) {
            return false;
        }
        $host = !empty($parts['host']) ? $parts['host'] : $this->host;
        if (!$host) {
            return false;
        }
        if (!empty($parts['port'])) {
            $port = (int)$parts['port'];
        } elseif (!empty($parts['host']) && $this->host && $parts['host'] != $this->host) {
            $port = 80; // a different host must not inherit the previous host's port
        } else {
            $port = (int)$this->port;
        }
        $this->domain = preg_replace('/^www\./i', '', $host);
        $this->port = $port;

        $path = !empty($parts['path']) ? $parts['path'] : '/';
        if ($path[0] != '/') {
            $path = '/' . $path;
        }
        // Directory part of the path; unlike dirname() this keeps "/a/b" for "/a/b/".
        $dir = substr($path, 0, (int)strrpos($path, '/'));
        $this->path = ($dir === '' || $dir === false) ? '/' : $dir;

        $script = $path;
        if (isset($parts['query']) && $parts['query'] !== '') {
            $script .= '?' . $parts['query'];
        }

        // URLs previously marked cacheable are not requested again.
        if (in_array($host . $script, $this->caches, true)) {
            return null;
        }
        if (!$this->opensocket($host, $port)) {
            return false;
        }

        $authority = $host . ($port != 80 ? ':' . $port : '');
        // A proxy needs the absolute URL in the request line.
        $target = ($this->proxy_ip ? 'http://' . $authority : '') . $script;
        $q = "$way $target HTTP/1.1\r\n";
        $q .= "Host: $authority\r\n";
        if ($this->headers) {
            foreach ($this->headers as $k => $v) {
                $q .= "$k: $v\r\n";
            }
        }
        if ($way == 'POST') {
            $q .= "Content-Type: application/x-www-form-urlencoded\r\n";
            $q .= 'Content-Length: ' . strlen($data) . "\r\n";
        }
        $q .= $this->alive == 0 ? "Connection: close\r\n" : "Connection: keep-alive\r\n";
        if (!empty($Referer)) {
            $q .= "Referer: $Referer\r\n";
        }
        if (!empty($this->cookies[$this->domain])) {
            $q .= 'Cookie: ' . $this->cookies[$this->domain] . "\r\n";
        }
        $q .= "\r\n";
        $q .= $data;

        if (!fputs($this->fp, $q)) {
            $this->closesocket(); // force a fresh connection next time
            return null;
        }

        // One reader for both modes, so chunked bodies are decoded in close mode as well.
        $results = $this->read_alive_data($this->fp, $this->alive == 0);
        if ($this->alive == 0) {
            $this->closesocket();
        }
        return $this->checkresult($results, $script);
    }

    /** Collect the value of attribute $attr from every <$tag ...> element in $str. */
    private function matchattr($tag, $attr, $str)
    {
        // [^>]* keeps the match inside a single tag; /U makes the quantifiers ungreedy.
        $pattern = '/(<' . $tag . '\s+[^>]*' . $attr . '=["\']?)([^>"\'\s]+?)(\s*[^>]*>)/isU';
        if (!preg_match_all($pattern, (string)$str, $found)) {
            return array();
        }
        return $found[2];
    }

    /**
     * URLs of <link href> and <script src> elements.
     *
     * @param string $str HTML source.
     * @return array
     */
    public function matchheadlink($str)
    {
        return array_merge(
            $this->matchattr('link', 'href', $str),
            $this->matchattr('script', 'src', $str)
        );
    }

    /**
     * URLs of <img src> elements.
     *
     * @param string $str HTML source.
     * @return array
     */
    public function matchimgsrc($str)
    {
        return $this->matchattr('img', 'src', $str);
    }

    /**
     * URLs of <a href> elements.
     *
     * @param string $str HTML source.
     * @return array
     */
    public function matchurl($str)
    {
        return $this->matchattr('a', 'href', $str);
    }

    /**
     * Reorder URLs so that those not starting with "http://" come first.
     *
     * @param array $arr List of URLs.
     * @return array Same URLs: relative ones first, absolute ones after, original order within each group.
     */
    public function setsort($arr)
    {
        $relative = $absolute = array();
        foreach ($arr as $v) {
            if (strpos($v, 'http://') !== 0) {
                $relative[] = $v;
            } else {
                $absolute[] = $v;
            }
        }
        return array_merge($relative, $absolute);
    }

    /** Resolve $url against the page last loaded by browse() and return it as an absolute http URL. */
    private function absoluteurl($url)
    {
        if ($this->base) {
            // Requests for other resources move $this->path; resolve against the page itself.
            $this->path = $this->base['path'];
        }
        $url = $this->checkurl($url);
        if (stripos($url, 'http://') === 0 || !$this->base || !$this->base['host']) {
            return $url;
        }
        $port = (int)$this->base['port'];
        return 'http://' . $this->base['host'] . ($port && $port != 80 ? ':' . $port : '') . $url;
    }

    /** Request every supported URL in $urls, for its side effects (cookies, cache marks) only. */
    private function fetchassets($urls, $referer)
    {
        if (!$urls) {
            return;
        }
        if ($this->alive == 1) {
            // Same-host URLs first so the open connection is reused before switching hosts.
            $urls = $this->setsort($urls);
        }
        foreach ($urls as $u) {
            if ($this->fetchable($u)) {
                $this->sendsocket($this->absoluteurl($u), 'GET', $referer);
            }
        }
    }

    /**
     * Load a page the way a browser would: the document itself, then its stylesheets,
     * scripts and images.
     *
     * @param string $url     Page URL.
     * @param string $Referer Referer for the page request.
     * @return string Page source with HTML comments removed; '' when the page could not be loaded.
     */
    public function browse($url, $Referer = '')
    {
        $str = $this->sendsocket($url, 'GET', $Referer);
        // Where the page finally came from (after redirects): base for its relative links.
        $this->base = array('host' => $this->host, 'port' => $this->port, 'path' => $this->path);

        if (!is_string($str)) {
            $str = '';
        }
        if ($str !== '') {
            // Drop comments so commented-out markup is not fetched.
            $clean = preg_replace('/<!--.*?-->/s', '', $str);
            if ($clean !== null) {
                $str = $clean;
            }
            $this->fetchassets($this->matchheadlink($str), $url);
            $this->fetchassets($this->matchimgsrc($str), $url);
        }
        if ($this->alive == 1) {
            $this->closesocket();
        }
        $this->path = $this->base['path'];
        return $str;
    }

    /**
     * Random walk: load $url, then up to $deep times follow a randomly chosen link of the
     * page just loaded.
     *
     * @param string $url     Start page.
     * @param array  $headers Request headers; replaces $this->headers.
     * @param int    $deep    Maximum number of links to follow.
     * @return string Source of the last page loaded.
     */
    public function rand_brush($url, $headers = array(), $deep = 3)
    {
        $this->headers = $headers;
        $str = $this->browse($url);
        while ($deep > 0) {
            $hrefs = array();
            foreach ($this->matchurl($str) as $href) {
                if ($this->fetchable($href)) {
                    $hrefs[] = $href;
                }
            }
            if (!$hrefs) {
                break;
            }
            $Referer = $url;
            // Valid indexes are 0 .. count-1.
            $url = $this->absoluteurl($hrefs[mt_rand(0, count($hrefs) - 1)]);
            $str = $this->browse($url, $Referer);
            $deep--;
        }
        return $str;
    }
}
// Demo run: load one page with Firefox-like request headers, then dump the cookies collected.
$h = new randbrush();
$h->headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0';
$h->headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';

$str = $h->browse('http://www.baidu.com');
// Uncomment the next line to print the page source as well.
//echo $str;
print_r($h->cookies);
?>