PHP网页采集:相对高效版

相对于前面写的版本,这一版极大地减小了IO开销,也减少了对主机名的解析次数。

<?php
// Declare the response as UTF-8 HTML; the class below echoes UTF-8 progress
// messages straight to the client.
header("content-type: text/html; charset=utf-8");
/**
 * Socket-based HTTP client that downloads a page, stores it under ./download
 * and keeps following the "next page" / "next group" links found in the HTML.
 *
 * Typical use: set the public request properties (cookie, referer, ...) and
 * call init($url). Progress is echoed to the output while crawling.
 *
 * Only plain HTTP is spoken; the socket is opened without TLS.
 */
class HttpWrap
{
    /** Maximum number of consecutive same-host redirects that are followed. */
    const MAX_REDIRECTS = 10;

    /** Upper bound, in bytes, of a single socket read. */
    const READ_CHUNK = 8192;

    /** Connect and read timeout in seconds. */
    public $timeout=10;
    /** Human-readable description of the last notable event or error. */
    public $status='';

    public $host;
    /** Sticky: only replaced when a requested URL carries an explicit port. */
    public $port=80;
    private $ip;
    private $conn;
    private $path;
    private $query='';
    private $url;
    private $scheme;

    public $http_method='GET';
    public $http_version="HTTP/1.1";
    public $agent="Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0";
    public $accept="image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*";
    /** Non-empty to advertise gzip support (when zlib is available). */
    public $gzip="gzip";
    public $referer;
    /** Either a ready-made cookie string or an array of name => value pairs. */
    public $cookie;
    public $submit_type="application/x-www-form-urlencoded";
    private $accept_language="zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    public $connection="keep-alive";

    private $cmd_line;
    private $header;
    public $post_content;

    private $redirect;
    private $is_gzip;

    /** Status code of the last response, as captured from the status line. */
    public $response_num;
    /** Raw header lines of the last response. */
    public $response_header;
    /** Value of the last Content-Length header, 0 when none was sent. */
    public $response_body_length=0;
    /** Complete (decoded) body of the last 200 response. */
    public $response_body;

    /** Relative link to the next page; false when the page has no image list. */
    public $roll_link;
    public $roll_group;
    public $filename;
    /** Source encoding of the crawled pages; detected once when left empty. */
    public $encoding;

    /**
     * Crawl starting at $url: fetch it, follow same-host redirects, then keep
     * following the "next page" link and, at the end of a group, the next group.
     *
     * The pages are fetched in a loop (one socket per request, closed after
     * each response) instead of by unbounded recursion.
     *
     * @param string $url Absolute http:// URL of the first page.
     * @return bool False when the crawl aborts on an error ($this->status
     *              describes it). When the crawl runs out of pages the script
     *              is terminated with die('没有下一页'), as before.
     */
    public function init($url)
    {
        $visited = array();
        $redirects = 0;

        while (true)
        {
            if(!$this->prepare($url))
            {
                return false;
            }
            // Requesting a URL twice means the site sent us in a circle.
            if(isset($visited[$this->url]))
            {
                die('没有下一页');
            }
            $visited[$this->url] = true;

            if(!$this->connect())
            {
                return false;
            }
            $sent = $this->sendRequest();
            $this->disconnect();
            if(!$sent)
            {
                return false;
            }

            // Follow a redirect only when it stays on the current host.
            if($this->redirect && $this->isSameHostRedirect($this->redirect))
            {
                if(++$redirects > self::MAX_REDIRECTS)
                {
                    $this->status = "重定向次数过多";
                    return false;
                }
                $target_path = parse_url($this->redirect, PHP_URL_PATH);
                $this->referer = $this->host."/".ltrim((string)$target_path, '/');
                $url = $this->redirect;
                continue;
            }
            $redirects = 0;

            if(!$this->roll_link)
            {
                die('没有下一页');
            }

            $next_url = substr($this->url,0,strripos($this->url, '/')+1).$this->roll_link;
            $current_name = strtolower(trim(basename($this->url,'.html')));
            $next_name = strtolower(trim(basename($next_url,'.html')));

            if($current_name == $next_name)
            {
                // The "next page" is the current page: this group is finished.
                $next_group = $this->getNextGroup($this->response_body);
                if(empty($next_group))
                {
                    die('没有下一页');
                }
                echo "<font color='color'>即将采集下一组</font><br />";
                sleep(1);
                $url = $next_group;
            }
            else
            {
                $url = $next_url;
            }
        }
    }

    /**
     * Split $url into the host/path/query/scheme/port used for the next request.
     *
     * The resolved IP is cached and only looked up again when the host changes.
     *
     * @param mixed $url
     * @return bool False (with status set) when $url has no usable host.
     */
    private function prepare($url)
    {
        $parts = is_string($url) ? parse_url($url) : false;
        if($parts === false || empty($parts['host']))
        {
            $this->status = "无效的URL";
            return false;
        }

        if(empty($this->ip) || $parts['host'] !== $this->host)
        {
            $this->ip = gethostbyname($parts['host']);
        }

        $this->url = $url;
        $this->host = $parts['host'];
        $this->path = isset($parts['path']) ? $parts['path'] : '/';
        $this->query = isset($parts['query']) ? $parts['query'] : '';
        $this->scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
        if(!empty($parts['port']))
        {
            $this->port = $parts['port'];
        }
        return true;
    }

    /**
     * Tell whether $target is an http:// URL on the host of the current request.
     *
     * @param string $target
     * @return bool
     */
    private function isSameHostRedirect($target)
    {
        // Anchor the end of the host so "host.evil.tld" does not pass as "host".
        $pattern = "#^http://".preg_quote($this->host, '#')."(?::\d+)?(?:[/?]|$)#i";
        return preg_match($pattern, $target) === 1;
    }

    /**
     * Open the socket to the resolved IP and apply the read timeout.
     *
     * @return bool True on success; false with $this->status set otherwise.
     */
    private function connect()
    {
        $errno = 0;
        $errstr = '';
        $this->conn = fsockopen($this->ip,$this->port,$errno,$errstr,$this->timeout);
        if($this->conn)
        {
            // Without a read timeout a stalled server would block fread() forever.
            stream_set_timeout($this->conn, (int)$this->timeout);
            $this->status = '链接成功';
            return true;
        }

        switch($errno)
        {
            case -3:
                $this->status="创建socket链接失败";
                break;
            case -4:
                $this->status="dns查询失败";
                break;
            case -5:
                $this->status="链接被拒绝或超时";
                break;
            default:
                $this->status="创建连接失败";
        }
        return false;
    }

    /** Close the socket of the finished request, if one is open. */
    private function disconnect()
    {
        if(is_resource($this->conn))
        {
            fclose($this->conn);
        }
        $this->conn = null;
    }

    /**
     * Send the request for the prepared URL and process the response.
     *
     * On a 200 response the body is read completely, gunzipped when needed,
     * written to ./download/... and scanned for the "next page" link.
     *
     * @return bool False when the request could not be sent.
     */
    private function sendRequest()
    {
        if(!is_resource($this->conn))
        {
            $this->status = "创建连接失败";
            return false;
        }

        // Per-request state must not leak from the previous request.
        $this->redirect = null;
        $this->is_gzip = false;
        $this->response_num = null;
        $this->response_header = array();
        $this->response_body_length = 0;

        if(!$this->writeAll($this->buildRequest()))
        {
            $this->status = "发送请求failed";
            return false;
        }

        $meta = $this->readResponseHeaders();

        if((int)$this->response_num !== 200)
        {
            return true;
        }

        $body = $meta['chunked']
            ? $this->readChunkedBody()
            : $this->readPlainBody($meta['length']);

        if($this->is_gzip && $body !== '')
        {
            // "Content-Encoding: gzip" is gzip-framed, which gzdecode() handles.
            $decoded = function_exists("gzdecode") ? gzdecode($body) : false;
            if($decoded === false)
            {
                $this->status = "解压失败";
            }
            else
            {
                $body = $decoded;
            }
        }

        $this->response_body = $body;
        $this->saveBody($body);

        // Padding so that buffering browsers/proxies show the progress line at once.
        echo str_repeat("  ", 2048);
        echo "对链接".$this->url."发起请求<br />";
        $this->getRollLink($this->response_body);
        return true;
    }

    /**
     * Build the request line, headers and optional body from the public settings.
     *
     * @return string The complete request as it is written to the socket.
     */
    private function buildRequest()
    {
        if(empty($this->path))
        {
            $this->path="/";
        }
        $target = $this->path;
        if($this->query !== '' && $this->query !== null)
        {
            $target .= "?".$this->query;
        }
        $this->cmd_line=$this->http_method." ".$target." ".$this->http_version."\r\n";

        // Start from scratch: headers of earlier requests must not be repeated.
        $this->header = '';

        if(!empty($this->host))
        {
            $this->header .= "Host: ".$this->host."\r\n";
        }
        if(!empty($this->agent))
        {
            $this->header .="User-Agent: ".$this->agent."\r\n";
        }
        if(!empty($this->accept))
        {
            $this->header .= "Accept: ". $this->accept ."\r\n";
        }
        if(!empty($this->gzip))
        {
            if(function_exists("gzdecode"))
            {
                $this->header .= "Accept-encoding: gzip\r\n";
            }
            else
            {
                $this->status = "不支持压缩";
            }
        }
        if(!empty($this->referer))
        {
            $this->header .= "Referer: ".$this->referer."\r\n";
        }
        if(!empty($this->accept_language))
        {
            $this->header .= "Accept-Language: ".$this->accept_language."\r\n";
        }
        if(!empty($this->cookie))
        {
            if(is_array($this->cookie))
            {
                $pairs = array();
                foreach($this->cookie as $key => $val)
                {
                    $pairs[] = $key."=".urlencode($val);
                }
                $this->header .= "Cookie: ".implode(";", $pairs)."\r\n";
            }
            else
            {
                $this->header .= "Cookie: ".$this->cookie."\r\n";
            }
        }
        if(!empty($this->submit_type))
        {
            $this->header .="Content-Type: ".$this->submit_type."\r\n";
        }
        if(!empty($this->post_content))
        {
            $this->header .= "Content-length: ".strlen($this->post_content)."\r\n";
        }
        if(!empty($this->connection))
        {
            $this->header .= "Connection: ".$this->connection."\r\n";
        }
        $this->header .="\r\n";

        return $this->cmd_line.$this->header.$this->post_content;
    }

    /**
     * Write $data to the socket, retrying after partial writes.
     *
     * @param string $data
     * @return bool False when the socket stops accepting data.
     */
    private function writeAll($data)
    {
        $total = strlen($data);
        $sent = 0;
        while($sent < $total)
        {
            $written = fwrite($this->conn, substr($data, $sent));
            if($written === false || $written === 0)
            {
                return false;
            }
            $sent += $written;
        }
        return true;
    }

    /**
     * Read the response header lines up to the blank line that ends them.
     *
     * Fills response_num, response_header, response_body_length, is_gzip and
     * redirect as a side effect.
     *
     * @return array 'length' => int|null (null when no Content-Length was sent),
     *               'chunked' => bool
     */
    private function readResponseHeaders()
    {
        $meta = array('length' => null, 'chunked' => false);

        while(($line = fgets($this->conn, self::READ_CHUNK)) !== false)
        {
            // A blank line terminates the header section.
            if(preg_match("/^\r?\n$/", $line))
            {
                break;
            }

            // Status line: keep the code, e.g. 200.
            if(preg_match("#^HTTP/[^\s]*\s(.*?)\s#", $line, $status))
            {
                $this->response_num = $status[1];
            }

            if(preg_match("#^(?:Location|URI):\s*(.*)$#i", trim($line), $matches))
            {
                $target = $matches[1];
                // A target without "://" is completed with scheme, host and port.
                if(!preg_match("#\:\/\/#", $target))
                {
                    $prefix = "http://".$this->host.":".$this->port;
                    $target = $prefix.(preg_match("|^/|", $target) ? "" : "/").$target;
                }
                $this->redirect = $target;
            }

            if(preg_match("#^Content-Encoding:\s*gzip#i", $line))
            {
                $this->is_gzip = true;
            }
            if(preg_match('#^Content-Length:\s*(\d+)#i', $line, $len))
            {
                $this->response_body_length = (int)$len[1];
                $meta['length'] = (int)$len[1];
            }
            if(preg_match('#^Transfer-Encoding:.*\bchunked\b#i', $line))
            {
                $meta['chunked'] = true;
            }

            $this->response_header[] = $line;
        }
        return $meta;
    }

    /**
     * Read up to $length bytes, stopping early on EOF or read timeout.
     *
     * @param int $length
     * @return string
     */
    private function readBytes($length)
    {
        $data = '';
        while($length > 0 && !feof($this->conn))
        {
            $chunk = fread($this->conn, min($length, self::READ_CHUNK));
            if($chunk === false || $chunk === '')
            {
                break;
            }
            $data .= $chunk;
            $length -= strlen($chunk);
        }
        return $data;
    }

    /**
     * Read a body that is not chunk-encoded.
     *
     * @param int|null $length Announced Content-Length, or null when absent;
     *                         then the body runs until the server closes the
     *                         connection (or the read timeout hits).
     * @return string
     */
    private function readPlainBody($length)
    {
        if($length !== null)
        {
            // Stop at the announced length: on a keep-alive connection
            // reading further would block.
            return $this->readBytes($length);
        }

        $body = '';
        while(!feof($this->conn))
        {
            $chunk = fread($this->conn, self::READ_CHUNK);
            if($chunk === false || $chunk === '')
            {
                break;
            }
            $body .= $chunk;
        }
        return $body;
    }

    /**
     * Read a "Transfer-Encoding: chunked" body and return the joined payload.
     *
     * @return string
     */
    private function readChunkedBody()
    {
        $body = '';
        while(($line = fgets($this->conn, self::READ_CHUNK)) !== false)
        {
            $line = trim($line);
            if($line === '')
            {
                continue;
            }
            // The size may be followed by ";extension" parameters.
            $semicolon = strpos($line, ';');
            $hex = trim($semicolon === false ? $line : substr($line, 0, $semicolon));
            if(!ctype_xdigit($hex))
            {
                break;
            }
            $size = (int)hexdec($hex);
            if($size === 0)
            {
                // Last chunk: skip optional trailer lines up to the blank line.
                while(($trailer = fgets($this->conn, self::READ_CHUNK)) !== false)
                {
                    if(trim($trailer) === '')
                    {
                        break;
                    }
                }
                break;
            }
            $body .= $this->readBytes($size);
            // Each chunk's data is followed by a CRLF.
            fgets($this->conn, self::READ_CHUNK);
        }
        return $body;
    }

    /**
     * Store $body as ./download/<dir>/<basename of the URL>.
     *
     * <dir> is the first all-digit path segment of the URL, or today's date
     * (Ymd) when the URL has none. An existing file is overwritten.
     *
     * @param string $body
     * @return bool False (with status set) when the file could not be written.
     */
    private function saveBody($body)
    {
        if(preg_match('#/(\d+)/#', $this->url, $sub_dir))
        {
            $dirname = "./download/".$sub_dir[1];
        }
        else
        {
            $dirname = "./download/".date("Ymd");
        }

        if(!is_dir($dirname) && !mkdir($dirname,0777,true) && !is_dir($dirname))
        {
            $this->status = "创建目录失败";
            return false;
        }

        $filename = $dirname.'/'.basename($this->url);
        if(file_put_contents($filename, $body) === false)
        {
            $this->status = "写入文件失败";
            return false;
        }
        return true;
    }

    /**
     * Return $raw converted to UTF-8.
     *
     * When $this->encoding is empty it is detected once from the whole text,
     * in strict mode and with UTF-8 tried first, and remembered for later pages.
     *
     * @param mixed $raw
     * @return string
     */
    private function toUtf8($raw)
    {
        $raw = (string)$raw;
        if($raw === '')
        {
            return '';
        }
        if(empty($this->encoding))
        {
            $detected = mb_detect_encoding($raw, array('UTF-8','GB2312','GBK','BIG5','LATIN1'), true);
            $this->encoding = ($detected === false) ? 'UTF-8' : $detected;
        }
        if(strcasecmp($this->encoding, 'UTF-8') === 0)
        {
            return $raw;
        }
        return mb_convert_encoding($raw, 'UTF-8', $this->encoding);
    }

    /**
     * Extract the "下一页" (next page) link from the page into $this->roll_link.
     *
     * roll_link becomes false when the page has no <ul class="image"> list.
     * When the list exists but holds no next-page anchor, roll_link keeps its
     * previous value; init() then sees "next page == current page" and moves
     * on to the next group.
     * NOTE(review): presumably this is how the end of a group is detected —
     * confirm against the target site.
     *
     * @param string $filename Raw page content (despite the parameter name).
     */
    private function getRollLink($filename)
    {
        $content = $this->toUtf8($filename);

        if(!preg_match('#<ul\s+class="image"[^>]*?>.*?</ul>#is', $content, $match))
        {
            $this->roll_link = false;
            return;
        }
        if(preg_match('#<a\s+href="([^"]+?)">下一页</a>#ui', $match[0], $next))
        {
            $this->roll_link = trim($next[1]);
        }
    }

    /**
     * Find the URL of the next group in the <ul class="page"> navigation.
     *
     * With two links the one whose numeric file name is smaller is chosen;
     * with a single link that link is used. Only links starting with ".."
     * are turned into URLs.
     *
     * @param string $filename Raw page content (despite the parameter name).
     * @return string|null Absolute URL, or null when none was found
     *                     ($this->status is set when the markup did not match).
     */
    private function getNextGroup($filename)
    {
        $content = $this->toUtf8($filename);

        if(!preg_match('#<ul\s+class="page"[^>]*?>.*?</ul>#is', $content, $match))
        {
            $this->status = "failed to match class=page";
            return null;
        }
        if(!preg_match_all('#<a\s+href="([^"]*?)">.*?</a>#usi', $match[0], $next))
        {
            $this->status = "failed to match href";
            return null;
        }

        $links = $next[1];
        if(count($links) == 2)
        {
            $first = basename($links[0], ".html");
            $second = basename($links[1], ".html");
            // The smaller number is the group to continue with.
            $choice = (intval($first) < intval($second)) ? $first : $second;

            foreach($links as $item)
            {
                if(strripos($item, $choice) !== false)
                {
                    $url = $this->buildGroupUrl($item);
                    if($url !== null)
                    {
                        return $url;
                    }
                }
            }
        }
        else if(count($links) == 1)
        {
            return $this->buildGroupUrl($links[0]);
        }
        return null;
    }

    /**
     * Turn a "../dir/file.html" link into an absolute URL below the first
     * path segment of the current request.
     *
     * @param string $href
     * @return string|null Null when $href does not start with "..".
     */
    private function buildGroupUrl($href)
    {
        if(substr($href, 0, 2) != '..')
        {
            return null;
        }
        $sub_path = explode('/', (string)$this->path);
        $section = isset($sub_path[1]) ? $sub_path[1] : '';
        return $this->scheme.'://'.$this->host.'/'.$section.substr($href, 2);
    }

}
// Driver: push output to the client as it is produced and lift the execution
// time limit, because the crawl runs for as long as pages keep linking onward.
ob_implicit_flush(true);
set_time_limit(0);

// First page of the crawl.
$url = "http://www.mmkao.com/Beautyleg/201412/7066.html";

$http = new HttpWrap();
// Request headers sent with every page request.
$http->cookie = "safedog-flow-item=41E2DBFEF121A8A2835ADB4476E5D3EC";
$http->referer = "www.mmkao.com";

// init() normally ends the script itself; if it hands back false, show why.
if ($http->init($url) === false) {
    echo $http->status;
}
?>


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
PHP电影自动采集运营源码是一种基于PHP语言开发的电影采集系统,它能够自动从各大电影网站上获取最新的电影信息,并将其展示在我们的网站上。这样,我们就可以方便地浏览和搜索各种电影资源。 该运营源码包含了许多功能和模块,使得我们可以轻松地管理和运营这个电影采集系统。其中一些主要功能包括: 1. 自动采集:源码能够自动定时从指定的电影网站上采集最新的电影资料,包括电影名称、海报、演员表、剧情介绍等等。这样,我们就无需手动输入这些信息,省时省力。 2. 数据存储:采集到的电影信息会被存储在数据库中,方便我们后续对这些数据进行管理、编辑和展示。 3. 电影展示:采集到的电影信息可以在我们的网站上进行展示,包括电影列表、电影详情页、分类浏览等等。用户可以根据自己的需求进行搜索和筛选,找到自己感兴趣的电影。 4. 广告投放:源码还提供了广告投放的功能,可以在网站上显示一些广告,以获取一定的收入。这对于我们的运营和盈利非常重要。 5. 用户管理:我们可以通过源码对用户进行管理,包括用户注册、登录、评论、评分等等。这样,我们可以和用户进行互动,了解他们的需求和反馈。 总之,PHP电影自动采集运营源码是一款非常实用的工具,帮助我们轻松地搭建和运营一个电影采集系统。它能够自动采集电影信息,方便用户浏览和搜索电影资源,并提供了一些运营功能,帮助我们获取一定的收入。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值