修正了处于第一组时没有上一组的问题。实现了一次配置、自动化采集,无需人工监控;程序会自动分类地把HTML网页采集到对应的分类目录,之后对HTML网页进行过滤和提取就比较简单了。
<?php
// Declare the response as UTF-8 HTML. header() only takes effect when it runs
// before any output has been sent to the client.
header("content-type: text/html; charset=utf-8");
/**
 * Socket-based HTTP client that crawls a paginated HTML gallery.
 *
 * Starting from the URL passed to init(), every page answered with status 200
 * is stored as ./download/<group>/<page>.txt. After each page the crawler
 * follows the "下一页" (next page) link found inside <ul class="image">; when
 * that link points back at the current page (last page of a group) it follows
 * the group link found inside <ul class="page"> instead.
 *
 * Progress messages are echoed as HTML while the crawl is running.
 */
class HttpWrap
{
    /** Upper bound, in bytes, for a single fread() on the socket. */
    const READ_CHUNK = 8192;

    /** Connect and read timeout in seconds. */
    public $timeout = 10;
    /** Human-readable description of the last noteworthy event or error. */
    public $status = '';
    /** Host name of the URL currently being fetched. */
    public $host;
    /** TCP port; only overridden when the URL carries an explicit port. */
    public $port = 80;
    private $conn;
    private $path;
    private $query = '';
    private $url;
    private $scheme;
    public $http_method = 'GET';
    public $http_version = "HTTP/1.1";
    public $agent = "Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0";
    public $accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*";
    /** Any non-empty value asks the server for a gzip-compressed response. */
    public $gzip = "gzip";
    public $referer;
    /** Either a ready-made cookie string or an array of name => value pairs. */
    public $cookie;
    public $submit_type = "application/x-www-form-urlencoded";
    private $accept_language = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    public $connection = "keep-alive";
    private $cmd_line;
    private $header;
    public $post_content;
    private $redirect;
    private $is_gzip = false;
    private $is_chunked = false;
    private $has_content_length = false;
    /** URLs already requested during the current init() run (url => true). */
    private $visited = array();
    /** Status code of the last response, as the string taken from the status line. */
    public $response_num;
    /** Raw header lines of the last response. */
    public $response_header;
    /** Content-Length of the last response, or the number of body bytes received. */
    public $response_body_length = 0;
    /** Decoded (de-chunked, gunzipped) body of the last 200 response. */
    public $response_body;
    /** Relative "next page" link of the last saved page, or false when there is none. */
    public $roll_link;
    /** Path of the file the last page was saved to. */
    public $filename;

    /**
     * Crawl starting at $url until no further page or group link is found.
     *
     * The crawl is iterative, so its depth is not limited by the call stack,
     * and it stops as soon as a URL would be requested a second time, which
     * ends redirect loops and back-and-forth jumps between two groups.
     *
     * @param string $url Absolute URL of the first page to fetch.
     * @return void
     */
    public function init($url)
    {
        $this->visited = array();
        $current = $url;

        while (is_string($current) && $current !== '' && !isset($this->visited[$current])) {
            $this->visited[$current] = true;
            $current = $this->crawl($current);
        }
    }

    /**
     * Fetch one URL and work out which URL has to be fetched next.
     *
     * @param string $url Absolute URL to request.
     * @return string|null Next URL to request, or null when the crawl is over.
     */
    private function crawl($url)
    {
        $this->resetResponseState();
        $this->url = $url;

        $parts = parse_url($url);
        if (!is_array($parts) || empty($parts['host'])) {
            $this->status = "无效的URL";
            return null;
        }

        $this->host = $parts['host'];
        $this->path = isset($parts['path']) ? $parts['path'] : '/';
        $this->query = isset($parts['query']) ? $parts['query'] : '';
        $this->scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
        if (!empty($parts['port'])) {
            $this->port = $parts['port'];
        }

        if (!$this->connect()) {
            return null;
        }
        $this->sendRequest();

        // Redirects are only followed when they stay on the current host.
        if ($this->redirect) {
            if (!preg_match("#^http://" . preg_quote($this->host, '#') . "#i", $this->redirect)) {
                return null;
            }
            $target = parse_url($this->redirect);
            $targetPath = (is_array($target) && isset($target['path'])) ? $target['path'] : '';
            $this->referer = $this->host . "/" . $targetPath;
            return $this->redirect;
        }

        if (!$this->roll_link) {
            return null;
        }

        $next_url = substr($this->url, 0, strrpos($this->url, '/') + 1) . $this->roll_link;
        $currentPage = strtolower(trim(basename($this->url, '.html')));
        $nextPage = strtolower(trim(basename($next_url, '.html')));
        if ($currentPage !== $nextPage) {
            return $next_url;
        }

        // The "next page" link of the last page points at the page itself:
        // move on to the next group.
        echo str_repeat(" ", 2048);
        echo "<font color='red'>对下一组网页进行采集</font><br />";
        $next_group = $this->getNextGroup($this->filename);
        sleep(1);
        return $next_group;
    }

    /**
     * Forget everything learned from the previous request/response so that a
     * stale redirect, next-page link or header block cannot leak into this one.
     *
     * @return void
     */
    private function resetResponseState()
    {
        $this->header = '';
        $this->redirect = null;
        $this->roll_link = false;
        $this->is_gzip = false;
        $this->is_chunked = false;
        $this->has_content_length = false;
        $this->response_num = null;
        $this->response_header = array();
        $this->response_body_length = 0;
        $this->response_body = null;
    }

    /**
     * Open the TCP connection to $this->host:$this->port.
     *
     * @return bool True when the socket is open; otherwise false with
     *              $this->status describing the failure.
     */
    private function connect()
    {
        $errno = 0;
        $errstr = '';
        $this->conn = fsockopen($this->host, $this->port, $errno, $errstr, $this->timeout);
        if ($this->conn) {
            // Bound every later read so a silent server cannot block the crawl.
            stream_set_timeout($this->conn, max(1, (int) $this->timeout));
            $this->status = '链接成功';
            return true;
        }

        switch ($errno) {
            case -3:
                $this->status = "创建socket链接失败";
                break;
            case -4:
                $this->status = "dns查询失败";
                break;
            case -5:
                $this->status = "链接被拒绝或超时";
                break;
            default:
                $this->status = "创建连接失败";
                break;
        }
        return false;
    }

    /**
     * Close the socket of the current request, if it is still open.
     *
     * @return void
     */
    private function closeConnection()
    {
        if (is_resource($this->conn)) {
            fclose($this->conn);
        }
        $this->conn = null;
    }

    /**
     * Send the request for the current URL, parse the response headers and,
     * for a 200 response, save the body to disk and look for the next-page link.
     *
     * @return void
     */
    private function sendRequest()
    {
        if (!$this->conn) {
            return;
        }
        if (empty($this->path)) {
            $this->path = "/";
        }

        $target = $this->path;
        if ($this->query !== '') {
            $target .= '?' . $this->query;
        }
        $this->cmd_line = $this->http_method . " " . $target . " " . $this->http_version . "\r\n";
        $this->header = $this->buildRequestHeaders();

        $request = $this->cmd_line . $this->header . $this->post_content;
        $len = strlen($request);
        if ($len != fwrite($this->conn, $request, $len)) {
            $this->status = "发送请求failed";
            $this->closeConnection();
            return;
        }

        $this->readResponseHeaders();

        $body = null;
        if ($this->response_num == 200) {
            $body = $this->readResponseBody();
        }
        $this->closeConnection();

        if ($body !== null) {
            $this->storePage($body);
        }
    }

    /**
     * Build the header block of the request, terminated by the empty line.
     *
     * @return string
     */
    private function buildRequestHeaders()
    {
        $headers = '';

        if (!empty($this->host)) {
            $hostValue = $this->host;
            if ((int) $this->port !== 80) {
                $hostValue .= ':' . $this->port;
            }
            $headers .= "Host: " . $hostValue . "\r\n";
        }
        if (!empty($this->agent)) {
            $headers .= "User-Agent: " . $this->agent . "\r\n";
        }
        if (!empty($this->accept)) {
            $headers .= "Accept: " . $this->accept . "\r\n";
        }
        if (!empty($this->gzip)) {
            // Only advertise gzip when the response can actually be decoded.
            if (function_exists("gzdecode")) {
                $headers .= "Accept-encoding: gzip\r\n";
            } else {
                $this->status = "不支持压缩";
            }
        }
        if (!empty($this->referer)) {
            $headers .= "Referer: " . $this->referer . "\r\n";
        }
        if (!empty($this->accept_language)) {
            $headers .= "Accept-Language: " . $this->accept_language . "\r\n";
        }
        if (!empty($this->cookie)) {
            if (is_array($this->cookie)) {
                $pairs = array();
                foreach ($this->cookie as $key => $val) {
                    $pairs[] = $key . "=" . urlencode($val);
                }
                $headers .= "Cookie: " . implode(";", $pairs) . "\r\n";
            } else {
                $headers .= "Cookie: " . $this->cookie . "\r\n";
            }
        }
        if (!empty($this->submit_type)) {
            $headers .= "Content-Type: " . $this->submit_type . "\r\n";
        }
        if (!empty($this->post_content)) {
            $headers .= "Content-length: " . strlen($this->post_content) . "\r\n";
        }
        if (!empty($this->connection)) {
            $headers .= "Connection: " . $this->connection . "\r\n";
        }

        return $headers . "\r\n";
    }

    /**
     * Read the status line and the header lines up to the empty line, recording
     * status code, redirect target, body length and transfer/content encoding.
     *
     * @return void
     */
    private function readResponseHeaders()
    {
        while (($line = fgets($this->conn, self::READ_CHUNK)) !== false) {
            // An empty line ends the header section.
            if (rtrim($line, "\r\n") === '') {
                break;
            }

            if (preg_match('|^HTTP/\S*\s(.*?)\s|', $line, $status)) {
                $this->response_num = $status[1];
            }

            if (preg_match('#^(?:Location|URI):\s*(.+)$#i', trim($line), $matches)) {
                $location = $matches[1];
                if (!preg_match('#://#', $location)) {
                    // Relative target: prepend scheme, host and port.
                    $this->redirect = "http://" . $this->host . ":" . $this->port;
                    $this->redirect .= (substr($location, 0, 1) === '/') ? $location : "/" . $location;
                } else {
                    $this->redirect = $location;
                }
            }

            if (preg_match('#^Content-Encoding:\s*gzip#i', $line)) {
                $this->is_gzip = true;
            }
            if (preg_match('#^Transfer-Encoding:.*\bchunked\b#i', $line)) {
                $this->is_chunked = true;
            }
            if (preg_match('#^Content-Length:\s*(\d+)#i', $line, $length)) {
                $this->response_body_length = (int) $length[1];
                $this->has_content_length = true;
            }

            $this->response_header[] = $line;
        }
    }

    /**
     * Read the response body using the framing announced in the headers:
     * chunked coding first, then Content-Length, otherwise until the server
     * closes the connection or the read times out.
     *
     * @return string Raw (possibly still compressed) body bytes.
     */
    private function readResponseBody()
    {
        if ($this->is_chunked) {
            $body = $this->readChunkedBody();
        } elseif ($this->has_content_length) {
            $body = $this->readBytes($this->response_body_length);
        } else {
            $body = $this->readBytes();
        }

        if (!$this->has_content_length) {
            $this->response_body_length = strlen($body);
        }
        return $body;
    }

    /**
     * Read up to $limit bytes from the socket, or everything when $limit is null.
     *
     * @param int|null $limit Number of bytes wanted; null reads until EOF.
     * @return string Bytes received; shorter than $limit on EOF, error or timeout.
     */
    private function readBytes($limit = null)
    {
        $data = '';
        while ($limit === null || strlen($data) < $limit) {
            $want = self::READ_CHUNK;
            if ($limit !== null) {
                $want = (int) min($want, $limit - strlen($data));
            }
            $piece = fread($this->conn, $want);
            if ($piece === false || $piece === '') {
                break;
            }
            $data .= $piece;
        }
        return $data;
    }

    /**
     * Decode a body sent with "Transfer-Encoding: chunked".
     *
     * @return string The concatenated chunk payloads.
     */
    private function readChunkedBody()
    {
        $body = '';
        while (($line = fgets($this->conn, 1024)) !== false) {
            // Chunk header: hexadecimal size, optionally followed by extensions.
            if (!preg_match('/^\s*([0-9a-fA-F]+)/', $line, $m)) {
                break;
            }
            $size = (int) hexdec($m[1]);
            if ($size === 0) {
                // Last chunk: skip optional trailer lines up to the empty line.
                while (($trailer = fgets($this->conn, 1024)) !== false) {
                    if (rtrim($trailer, "\r\n") === '') {
                        break;
                    }
                }
                break;
            }

            $chunk = $this->readBytes($size);
            $body .= $chunk;
            if (strlen($chunk) < $size) {
                break;
            }
            // Every chunk payload is followed by a CRLF.
            fgets($this->conn, 1024);
        }
        return $body;
    }

    /**
     * Decompress the body if needed, write it to ./download/<group>/<page>.txt
     * and extract the next-page link from the saved file.
     *
     * The group directory is the first all-digit path segment of the URL, or
     * today's date when the URL has none.
     *
     * @param string $body Raw response body.
     * @return void
     */
    private function storePage($body)
    {
        if ($this->is_gzip && $body !== '' && function_exists("gzdecode")) {
            $inflated = gzdecode($body);
            if ($inflated !== false) {
                $body = $inflated;
            } else {
                $this->status = "解压失败";
            }
        }
        $this->response_body = $body;

        if ($body === '') {
            $this->status = "响应内容为空";
            return;
        }

        if (preg_match('#/(\d+)/#', $this->url, $sub_dir)) {
            $dirname = "./download/" . $sub_dir[1];
        } else {
            $dirname = "./download/" . date("Ymd");
        }
        if (!is_dir($dirname) && !mkdir($dirname, 0777, true) && !is_dir($dirname)) {
            $this->status = "创建目录失败";
            return;
        }

        $filename = $dirname . '/' . basename($this->url, '.html') . '.txt';
        // Overwrite instead of append so that a re-run does not duplicate content.
        if (file_put_contents($filename, $body) === false) {
            $this->status = "保存文件失败";
            return;
        }
        $this->filename = $filename;

        echo str_repeat(" ", 2048);
        echo "对链接" . htmlspecialchars($this->url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "发起请求<br />";
        sleep(1);
        $this->getRollLink($filename);
    }

    /**
     * Load a saved page and return it as UTF-8 text.
     *
     * Content that is already valid UTF-8 is returned unchanged; otherwise the
     * encoding is detected over the whole file and converted.
     *
     * @param string $filename Path of the saved page.
     * @return string|null UTF-8 content, or null when the file cannot be read.
     */
    private function readAsUtf8($filename)
    {
        if (!is_string($filename) || !is_file($filename)) {
            return null;
        }
        $raw = file_get_contents($filename);
        if ($raw === false) {
            return null;
        }
        if ($raw === '' || mb_check_encoding($raw, 'UTF-8')) {
            return $raw;
        }

        $encoding = mb_detect_encoding($raw, array('GB2312', 'GBK', 'BIG5', 'LATIN1'), true);
        if ($encoding === false) {
            return $raw;
        }
        return mb_convert_encoding($raw, 'UTF-8', $encoding);
    }

    /**
     * Set $this->roll_link to the href of the "下一页" link inside
     * <ul class="image">, or to false when the page has no such link.
     *
     * @param string $filename Path of the saved page.
     * @return void
     */
    private function getRollLink($filename)
    {
        $this->roll_link = false;

        $content = $this->readAsUtf8($filename);
        if ($content === null) {
            $this->status = $filename . '文件不存在';
            return;
        }

        if (preg_match('#<ul\s+class="image"[^>]*?>.*?</ul>#is', $content, $match)
            && preg_match('#<a\s+href="([^"]+?)">下一页</a>#ui', $match[0], $next)
        ) {
            $this->roll_link = $next[1];
        }
    }

    /**
     * Work out the URL of the next group from the links in <ul class="page">.
     *
     * With two links the one whose numeric file name is smaller is chosen; with
     * a single link that link is used. Only links starting with ".." are
     * resolved.
     *
     * @param string $filename Path of the saved last page of the current group.
     * @return string|null Absolute URL of the next group, or null when none was found.
     */
    private function getNextGroup($filename)
    {
        $content = $this->readAsUtf8($filename);
        if ($content === null) {
            $this->status = $filename . '文件不存在';
            return null;
        }

        if (!preg_match('#<ul\s+class="page"[^>]*?>.*?</ul>#is', $content, $match)) {
            $this->status = "failed to match class=page";
            return null;
        }
        if (!preg_match_all('#<a\s+href="([^"]*?)">.*?</a>#usi', $match[0], $next)) {
            $this->status = "failed to match href";
            return null;
        }

        $links = $next[1];
        if (count($links) == 2) {
            $first = basename($links[0], ".html");
            $second = basename($links[1], ".html");
            // Paging runs towards smaller ids, so the smaller one is the next group.
            $chosen = (intval($first) < intval($second)) ? $links[0] : $links[1];
            return $this->resolveGroupLink($chosen);
        }
        if (count($links) == 1) {
            // Only one neighbouring group exists.
            return $this->resolveGroupLink($links[0]);
        }
        return null;
    }

    /**
     * Turn a "../<dir>/<file>" link into an absolute URL below the first path
     * segment of the current page.
     *
     * @param string $href Link taken from the group navigation.
     * @return string|null Absolute URL, or null when $href does not start with "..".
     */
    private function resolveGroupLink($href)
    {
        if (substr($href, 0, 2) !== '..') {
            return null;
        }
        $segments = explode('/', $this->path);
        $root = isset($segments[1]) ? $segments[1] : '';
        return $this->scheme . '://' . $this->host . '/' . $root . substr($href, 2);
    }
}
// Driver: lift the execution time limit and flush output after every echo so
// that progress messages show up while the crawl is still running.
set_time_limit(0);
ob_implicit_flush(true);

// Page the crawl starts from.
$url = "http://www.mmkao.com/Beautyleg/201412/7044.html";

// Configure the client with the request headers it should send, then crawl.
$http = new HttpWrap();
$http->referer = "www.mmkao.com";
$http->cookie = "safedog-flow-item=41E2DBFEF121A8A2835ADB4476E5D3EC";
$http->init($url);
?>